
Commit b1b96ff

[Perf] Alexsander fixes round 2 - Oct 18th (#15695)

* perf(router): Optimize prompt management model check with early exit

  Add early return for models without '/' to avoid expensive get_model_list() calls for 99% of standard model requests (gpt-4, claude-3, etc.).

  - Refactor _is_prompt_management_model() with "/" check before model lookup
  - Add unit tests to verify the optimization doesn't break detection

* perf(caching): optimize Redis batch cache operations and reduce unnecessary queries

  This commit introduces several performance optimizations to the Redis caching layer:

  **DualCache Improvements (dual_cache.py):**

  1. Increase batch cache size limit from 100 to 1000
     - Allows for larger batch operations, reducing Redis round-trips
  2. Throttle repeated Redis queries for cache misses
     - Update last_redis_batch_access_time for ALL queried keys, including those with None values
     - Prevents excessive Redis queries for frequently accessed non-existent keys
  3. Add early exit optimization
     - Short-circuit when redis_result is None or contains only None values
     - Avoids unnecessary processing when no cache hits are found
  4. Optimize key lookup performance
     - Replace O(n) keys.index() calls with O(1) dict lookup via a key_to_index mapping
     - Reduces algorithmic complexity in batch operations
  5. Streamline cache updates
     - Combine result updates and in-memory cache updates in a single loop
     - Only cache non-None values to avoid polluting the in-memory cache

  **CooldownCache Improvements (cooldown_cache.py):**

  1. Enhanced early return logic
     - Check whether all values in results are None, not just whether results is None
     - Prevents unnecessary iteration when no valid cooldown data exists

  These changes significantly improve Redis caching performance, especially for:

  - High-throughput batch operations
  - Scenarios with frequent cache misses
  - Large-scale deployments with many concurrent requests

* fix: remove unnecessary test

* refactor: move default_max_redis_batch_cache_size to constants

  - Add DEFAULT_MAX_REDIS_BATCH_CACHE_SIZE constant (default: 1000)
  - Update DualCache to use the constant from constants.py
  - Document the new environment variable in config_settings.md

* fix: only use in-memory cache when set

* fix(router): improve prompt management model detection with smart early return

  The previous early return optimization in _is_prompt_management_model() checked whether the model name parameter contained '/' and returned False if it didn't. This broke detection for model aliases (e.g., 'chatbot_actions') that don't have '/' in their name but map to prompt management models (e.g., 'langfuse/openai-gpt-3.5-turbo').

  Changed the early return logic to only exit early when:

  - the model name contains '/', AND
  - the prefix is NOT a known prompt management provider

  This maintains the performance optimization for 99% of direct model calls (avoiding expensive get_model_list lookups) while correctly handling:

  - Direct prompt management calls (e.g., 'langfuse/model')
  - Model aliases without '/' (e.g., 'chatbot_actions')
  - Regular models with or without '/' (e.g., 'gpt-3.5-turbo', 'openai/gpt-4')

  Fixes test: test_router_prompt_management_factory

* perf(router): optimize _pre_call_checks with shallow copy (1400x faster)

  Replace deepcopy with list() in _pre_call_checks, which runs on every request. The function only pops from the list and never modifies the deployment dicts, so a shallow copy is safe.

  - Performance: 1400x faster on the hot path
  - Impact: 2-5x overall throughput improvement for routing workloads
  - Tests: added a regression test to ensure no mutation occurs and filtering still works

* perf(router): replace deepcopy with shallow copy for default deployment

  Replace expensive copy.deepcopy() with a shallow copy for default_deployment in the _common_checks_available_deployment() hot path.

  Changes:
  - Use dict.copy() for the top-level deployment dict
  - Use dict.copy() for the nested litellm_params dict
  - Only the 'model' field is modified, so deep recursion is unnecessary

  Impact:
  - 100x+ faster for the default deployment path (every request when used)
  - deepcopy recursively traverses the entire object tree
  - Shallow copy only copies two dict levels (exactly what's needed)

  Test coverage:
  - Added a regression test to verify deployment isolation
  - Ensures returned deployments don't mutate the original default_deployment
  - Validates that multiple concurrent requests get independent copies

* perf(router): remove unnecessary dict copy in completion hot paths

  Remove the unnecessary deployment['litellm_params'].copy() in the _completion and _acompletion functions. The dict is only read and spread into a new dict, never modified, making the defensive copy wasteful.

  Changes:
  - Remove .copy() in _completion (sync hot path)
  - Remove .copy() in _acompletion (async hot path)

  Impact:
  - Every completion request (highest-traffic endpoints)
  - Eliminates an unnecessary dict allocation and copy on every call
  - Dict spreading already creates a new dict, so no mutation is possible

  Test coverage:
  - Added tests verifying deployment params are unchanged after calls
  - Tests both sync and async completion paths
  - Validates the optimization doesn't introduce mutations

* perf(router): optimize deployment filtering in pre-call checks

  Replace the O(n²) list pop pattern with O(n) set-based filtering in _pre_call_checks() to improve routing performance under high load (a minimal sketch of the pattern follows the commit message).

  Changes:
  - Use set() instead of list for invalid_model_indices tracking
  - Replace the reversed list.pop() loop with a single-pass list comprehension
  - Eliminate the redundant list→set conversion overhead

  Impact:
  - Hot path optimization: runs on every request through the router
  - ~2-5x faster filtering when many deployments fail validation
  - Most beneficial with 50+ deployments per model group or high invalidation rates (rate limits, context window exceeded)

  Technical details:
  - Old: O(k²) where k = invalid deployments (pop shifts the remaining elements)
  - New: O(n) single pass with O(1) set membership checks

* add: memory profiler

  feat(proxy): Add configurable GC thresholds and enhance memory debugging endpoints

  - Add PYTHON_GC_THRESHOLD env var to configure garbage collection thresholds
  - Add POST /debug/memory/gc/configure endpoint for runtime GC tuning
  - Enhance memory debugging endpoints with better structure and explanations
  - Add comprehensive router and cache memory tracking
  - Include the worker PID in all debug responses for multi-worker debugging

* refactor: reduce complexity in get_memory_details endpoint

  Extract 6 helper functions from get_memory_details to fix linter error PLR0915 (too many statements). Improves maintainability while preserving functionality.

* fix(router): remove incorrect early exit in _is_prompt_management_model

  Removes the early exit optimization that checked the model_name prefix instead of the actual litellm_params model. This incorrectly returned False for custom model aliases that map to prompt management providers.

  Example: "my-langfuse-prompt/test_id" -> "langfuse_prompt/actual_id"

  The method now correctly checks the underlying model's prefix.

  Fixes test_is_prompt_management_model_optimization

* fix(proxy): add explicit type annotations to debug_utils dictionaries

  Resolved 6 mypy type errors in proxy/common_utils/debug_utils.py by adding explicit Dict[str, Any] annotations to dictionary variables where mypy was incorrectly inferring narrow types. This allows the dictionaries to accept different value types (strings, nested dicts) for error handling and various return structures.

  Fixed:
  - Line 246: caches dictionary in get_memory_summary()
  - Line 371: cache_stats dictionary in _get_cache_memory_stats()
  - Line 439: litellm_router_memory dictionary in _get_router_memory_stats()

* fix(proxy): fix Python 3.8 compatibility in debug_utils type annotations

  - Replace tuple[...] and list[...] with Tuple[...] and List[...] from typing
  - Replace Dict | None with Optional[Dict] for Python 3.8 compatibility
  - Add missing imports: List, Optional, Tuple to the typing imports

  Fixes TypeError: 'type' object is not subscriptable in Python 3.8

---------

Co-authored-by: AlexsanderHamir <[email protected]>
1 parent 68d4f69 commit b1b96ff

File tree

11 files changed (+1094, -41 lines)


docs/my-website/docs/proxy/config_settings.md

Lines changed: 2 additions & 0 deletions
@@ -470,6 +470,7 @@ router_settings:
 | DEFAULT_MAX_RETRIES | Default maximum retry attempts. Default is 2
 | DEFAULT_MAX_TOKENS | Default maximum tokens for LLM calls. Default is 4096
 | DEFAULT_MAX_TOKENS_FOR_TRITON | Default maximum tokens for Triton models. Default is 2000
+| DEFAULT_MAX_REDIS_BATCH_CACHE_SIZE | Default maximum size for redis batch cache. Default is 1000
 | DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT | Default token count for mock response completions. Default is 20
 | DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT | Default token count for mock response prompts. Default is 10
 | DEFAULT_MODEL_CREATED_AT_TIME | Default creation timestamp for models. Default is 1677610602
@@ -717,6 +718,7 @@ router_settings:
 | PROXY_BATCH_POLLING_INTERVAL | Time in seconds to wait before polling a batch, to check if it's completed. Default is 6000s (1 hour)
 | PROXY_BUDGET_RESCHEDULER_MAX_TIME | Maximum time in seconds to wait before checking database for budget resets. Default is 605
 | PROXY_BUDGET_RESCHEDULER_MIN_TIME | Minimum time in seconds to wait before checking database for budget resets. Default is 597
+| PYTHON_GC_THRESHOLD | GC thresholds ('gen0,gen1,gen2', e.g. '1000,50,50'); defaults to Python's values.
 | PROXY_LOGOUT_URL | URL for logging out of the proxy service
 | QDRANT_API_BASE | Base URL for Qdrant API
 | QDRANT_API_KEY | API key for Qdrant service
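
Both new settings are plain environment variables, so one way to exercise them locally is to set them before litellm is imported (the constants.py diff further down reads them via os.getenv at import time). The snippet below is a minimal sketch with illustrative values, not recommendations.

```python
import os

# Illustrative values only - set these before litellm is imported,
# since litellm/constants.py reads them via os.getenv at import time.
os.environ["DEFAULT_MAX_REDIS_BATCH_CACHE_SIZE"] = "500"
os.environ["PYTHON_GC_THRESHOLD"] = "1000,50,50"  # gen0,gen1,gen2

from litellm.constants import DEFAULT_MAX_REDIS_BATCH_CACHE_SIZE

print(DEFAULT_MAX_REDIS_BATCH_CACHE_SIZE)  # -> 500
```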

litellm/caching/dual_cache.py

Lines changed: 23 additions & 15 deletions
@@ -19,6 +19,7 @@

 import litellm
 from litellm._logging import print_verbose, verbose_logger
+from litellm.constants import DEFAULT_MAX_REDIS_BATCH_CACHE_SIZE

 from .base_cache import BaseCache
 from .in_memory_cache import InMemoryCache
@@ -60,7 +61,7 @@ def __init__(
         default_in_memory_ttl: Optional[float] = None,
         default_redis_ttl: Optional[float] = None,
         default_redis_batch_cache_expiry: Optional[float] = None,
-        default_max_redis_batch_cache_size: int = 100,
+        default_max_redis_batch_cache_size: int = DEFAULT_MAX_REDIS_BATCH_CACHE_SIZE,
     ) -> None:
         super().__init__()
         # If in_memory_cache is not provided, use the default InMemoryCache
@@ -260,7 +261,7 @@ async def async_batch_get_cache(
         **kwargs,
     ):
         try:
-            result = [None for _ in range(len(keys))]
+            result = [None] * len(keys)
             if self.in_memory_cache is not None:
                 in_memory_result = await self.in_memory_cache.async_batch_get_cache(
                     keys, **kwargs
@@ -283,20 +284,27 @@ async def async_batch_get_cache(
                     redis_result = await self.redis_cache.async_batch_get_cache(
                         sublist_keys, parent_otel_span=parent_otel_span
                     )
-
-                    if redis_result is not None:
-                        # Update in-memory cache with the value from Redis
-                        for key, value in redis_result.items():
-                            if value is not None:
-                                await self.in_memory_cache.async_set_cache(
-                                    key, redis_result[key], **kwargs
-                                )
-                            # Update the last access time for each key fetched from Redis
-                            self.last_redis_batch_access_time[key] = current_time
-
+
+                    # Update the last access time for ALL queried keys
+                    # This includes keys with None values to throttle repeated Redis queries
+                    for key in sublist_keys:
+                        self.last_redis_batch_access_time[key] = current_time
+
+                    # Short-circuit if redis_result is None or contains only None values
+                    if redis_result is None or all(v is None for v in redis_result.values()):
+                        return result
+
+                    # Pre-compute key-to-index mapping for O(1) lookup
+                    key_to_index = {key: i for i, key in enumerate(keys)}
+
+                    # Update both result and in-memory cache in a single loop
                     for key, value in redis_result.items():
-                        index = keys.index(key)
-                        result[index] = value
+                        result[key_to_index[key]] = value
+
+                        if value is not None and self.in_memory_cache is not None:
+                            await self.in_memory_cache.async_set_cache(
+                                key, value, **kwargs
+                            )

             return result
         except Exception:
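
The merge loop above is easier to follow outside the class. The helper below is an illustrative, self-contained rendering of the same pattern (short-circuit on an empty Redis result, then O(1) index lookups via key_to_index); it is a sketch, not litellm code.

```python
from typing import Any, Dict, List, Optional


def merge_redis_batch(
    keys: List[str],
    result: List[Optional[Any]],
    redis_result: Optional[Dict[str, Any]],
) -> List[Optional[Any]]:
    # Short-circuit when Redis returned nothing usable (None, or only None values).
    if redis_result is None or all(v is None for v in redis_result.values()):
        return result

    # Build key -> index once, so each lookup is O(1) instead of an O(n) keys.index() call.
    key_to_index = {key: i for i, key in enumerate(keys)}

    for key, value in redis_result.items():
        result[key_to_index[key]] = value  # keys with no hit simply stay None
    return result


keys = ["a", "b", "c"]
print(merge_redis_batch(keys, [None, "b-from-memory", None], {"a": 1, "c": None}))
# -> [1, 'b-from-memory', None]
```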

litellm/constants.py

Lines changed: 7 additions & 0 deletions
@@ -199,6 +199,9 @@
 DEFAULT_IN_MEMORY_TTL = int(
     os.getenv("DEFAULT_IN_MEMORY_TTL", 5)
 )  # default time to live for the in-memory cache
+DEFAULT_MAX_REDIS_BATCH_CACHE_SIZE = int(
+    os.getenv("DEFAULT_MAX_REDIS_BATCH_CACHE_SIZE", 1000)
+)  # default max size for redis batch cache
 DEFAULT_POLLING_INTERVAL = float(
     os.getenv("DEFAULT_POLLING_INTERVAL", 0.03)
 )  # default polling interval for the scheduler
@@ -970,6 +973,10 @@
 # makes it clear this is a rate limit error for a litellm virtual key
 RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash"

+# Python garbage collection threshold configuration
+# Format: "gen0,gen1,gen2" e.g., "1000,50,50"
+PYTHON_GC_THRESHOLD = os.getenv("PYTHON_GC_THRESHOLD")
+
 # pass through route constansts
 BEDROCK_AGENT_RUNTIME_PASS_THROUGH_ROUTES = [
     "agents/",
