Commit 8197fd7
Revert "merge from upstream"
This reverts commit e6ca918.
Parent: e6ca918

File tree: 1 file changed (+58, -14)

litellm/caching/caching_handler.py

@@ -1,5 +1,5 @@
 """
-This contains LLMCachingHandler
+This contains LLMCachingHandler
 
 This exposes two methods:
 - async_get_cache
@@ -18,6 +18,7 @@
 import datetime
 import inspect
 import threading
+from functools import lru_cache, wraps
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -35,11 +36,13 @@
 
 import litellm
 from litellm._logging import print_verbose, verbose_logger
+from litellm._service_logger import ServiceLogging
+from litellm.caching import InMemoryCache
 from litellm.caching.caching import S3Cache
-from litellm.types.caching import CachedEmbedding
 from litellm.litellm_core_utils.logging_utils import (
     _assemble_complete_response_from_streaming_chunks,
 )
+from litellm.types.caching import CachedEmbedding
 from litellm.types.rerank import RerankResponse
 from litellm.types.utils import (
     CallTypes,
@@ -68,7 +71,12 @@ class CachingHandlerResponse(BaseModel):
 
     cached_result: Optional[Any] = None
     final_embedding_cached_response: Optional[EmbeddingResponse] = None
-    embedding_all_elements_cache_hit: bool = False  # this is set to True when all elements in the list have a cache hit in the embedding cache, if true return the final_embedding_cached_response no need to make an API call
+    embedding_all_elements_cache_hit: bool = (
+        False  # this is set to True when all elements in the list have a cache hit in the embedding cache, if true return the final_embedding_cached_response no need to make an API call
+    )
+
+
+in_memory_cache_obj = InMemoryCache()
 
 
 class LLMCachingHandler:
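
The revert reintroduces a module-level in_memory_cache_obj, so every LLMCachingHandler in the process shares one in-memory store instead of each instance building its own. A toy illustration of that difference (hypothetical names, not litellm's classes):

class ToyCache:
    def __init__(self) -> None:
        self.store: dict = {}


shared_cache = ToyCache()  # module level: one object for the whole process


class SharedHandler:
    def __init__(self) -> None:
        self.cache = shared_cache  # every handler sees the same entries


class PrivateHandler:
    def __init__(self) -> None:
        self.cache = ToyCache()  # fresh, empty cache per handler


a, b = SharedHandler(), SharedHandler()
a.cache.store["key"] = "value"
assert b.cache.store["key"] == "value"  # hit populated by another handler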
@@ -78,11 +86,20 @@ def __init__(
         request_kwargs: Dict[str, Any],
         start_time: datetime.datetime,
     ):
+        from litellm.caching import DualCache, RedisCache
+
         self.async_streaming_chunks: List[ModelResponse] = []
         self.sync_streaming_chunks: List[ModelResponse] = []
         self.request_kwargs = request_kwargs
         self.original_function = original_function
         self.start_time = start_time
+        if litellm.cache is not None and isinstance(litellm.cache.cache, RedisCache):
+            self.dual_cache: Optional[DualCache] = DualCache(
+                redis_cache=litellm.cache.cache,
+                in_memory_cache=in_memory_cache_obj,
+            )
+        else:
+            self.dual_cache = None
         pass
 
     async def _async_get_cache(
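
When the configured backend is Redis, the constructor pairs that shared in-memory object with it in a DualCache. A minimal read-through sketch of the dual-cache idea, assuming the usual L1 (in-process) / L2 (remote) semantics; ToyDualCache and its parameters are illustrative, not litellm's DualCache API:

from typing import Any, Dict, Optional


class ToyDualCache:
    """Read-through pair of caches: L1 in-process dict, L2 remote store."""

    def __init__(self, l2_get, l2_set) -> None:
        self._l1: Dict[str, Any] = {}  # fast, per-process layer
        self._l2_get = l2_get          # e.g. an async Redis GET
        self._l2_set = l2_set          # e.g. an async Redis SET

    async def get(self, key: str) -> Optional[Any]:
        if key in self._l1:              # L1 hit: no network round trip
            return self._l1[key]
        value = await self._l2_get(key)  # fall back to the remote cache
        if value is not None:
            self._l1[key] = value        # backfill L1 for next time
        return value

    async def set(self, key: str, value: Any) -> None:
        self._l1[key] = value            # write both layers
        await self._l2_set(key, value)

Note that the handler keeps dual_cache as None for non-Redis backends, so the later call sites can pass it through as dynamic_cache_object unconditionally.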
@@ -115,10 +132,16 @@ async def _async_get_cache(
         Raises:
             None
         """
+        from litellm.litellm_core_utils.core_helpers import (
+            _get_parent_otel_span_from_kwargs,
+        )
         from litellm.utils import CustomStreamWrapper
 
+        kwargs = kwargs.copy()
         args = args or ()
 
+        parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
+        kwargs["parent_otel_span"] = parent_otel_span
         final_embedding_cached_response: Optional[EmbeddingResponse] = None
         embedding_all_elements_cache_hit: bool = False
         cached_result: Optional[Any] = None
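
kwargs is copied before the OpenTelemetry parent span is injected, so the caller's dict is not mutated as a side effect of a cache read. A short illustration of why the copy matters:

caller_kwargs = {"model": "gpt-4"}

kwargs = caller_kwargs.copy()          # shallow copy: new top-level dict
kwargs["parent_otel_span"] = object()  # only the copy is modified

assert "parent_otel_span" not in caller_kwargs  # caller is unaffected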
@@ -306,13 +329,15 @@ def handle_kwargs_input_list_or_str(self, kwargs: Dict[str, Any]) -> List[str]:
         else:
             raise ValueError("input must be a string or a list")
 
-    def _extract_model_from_cached_results(self, non_null_list: List[Tuple[int, CachedEmbedding]]) -> Optional[str]:
+    def _extract_model_from_cached_results(
+        self, non_null_list: List[Tuple[int, CachedEmbedding]]
+    ) -> Optional[str]:
         """
         Helper method to extract the model name from cached results.
-
+
         Args:
             non_null_list: List of (idx, cr) tuples where cr is the cached result dict
-
+
         Returns:
             Optional[str]: The model name if found, None otherwise
         """
@@ -558,7 +583,12 @@ async def _retrieve_from_cache(
                     preset_cache_key = litellm.cache.get_cache_key(
                         **{**new_kwargs, "input": i}
                     )
-                    tasks.append(litellm.cache.async_get_cache(cache_key=preset_cache_key))
+                    tasks.append(
+                        litellm.cache.async_get_cache(
+                            cache_key=preset_cache_key,
+                            dynamic_cache_object=self.dual_cache,
+                        )
+                    )
                 cached_result = await asyncio.gather(*tasks)
                 ## check if cached result is None ##
                 if cached_result is not None and isinstance(cached_result, list):
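
For list-valued embedding input, the cache is queried once per element and asyncio.gather runs the lookups concurrently while preserving input order, which is what lets per-element hits and misses be lined up against the original inputs. A self-contained sketch of that fan-out (async_get is a stand-in for litellm.cache.async_get_cache):

import asyncio
from typing import Any, List, Optional

FAKE_CACHE = {"a": 1, "c": 3}


async def async_get(key: str) -> Optional[Any]:
    await asyncio.sleep(0)  # stand-in for a Redis round trip
    return FAKE_CACHE.get(key)


async def fan_out(inputs: List[str]) -> List[Optional[Any]]:
    # One lookup per input element; gather preserves ordering, so
    # results[i] lines up with inputs[i] even on partial hits.
    tasks = [async_get(i) for i in inputs]
    return await asyncio.gather(*tasks)


print(asyncio.run(fan_out(["a", "b", "c"])))  # [1, None, 3]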
@@ -567,9 +597,14 @@
                     cached_result = None
             else:
                 if litellm.cache._supports_async() is True:
-                    cached_result = await litellm.cache.async_get_cache(**new_kwargs)
+                    ## check if dual cache is supported ##
+                    cached_result = await litellm.cache.async_get_cache(
+                        dynamic_cache_object=self.dual_cache, **new_kwargs
+                    )
                 else:  # for s3 caching. [NOT RECOMMENDED IN PROD - this will slow down responses since boto3 is sync]
-                    cached_result = litellm.cache.get_cache(**new_kwargs)
+                    cached_result = litellm.cache.get_cache(
+                        dynamic_cache_object=self.dual_cache, **new_kwargs
+                    )
         return cached_result
 
     def _convert_cached_result_to_model_response(
@@ -735,6 +770,9 @@ async def async_set_cache(
         Raises:
             None
         """
+        from litellm.litellm_core_utils.core_helpers import (
+            _get_parent_otel_span_from_kwargs,
+        )
 
         if litellm.cache is None:
             return
@@ -746,6 +784,8 @@
                 args,
             )
         )
+        parent_otel_span = _get_parent_otel_span_from_kwargs(new_kwargs)
+        new_kwargs["parent_otel_span"] = parent_otel_span
         # [OPTIONAL] ADD TO CACHE
         if self._should_store_result_in_cache(
             original_function=original_function, kwargs=new_kwargs
@@ -764,7 +804,9 @@
                 )  # s3 doesn't support bulk writing. Exclude.
             ):
                 asyncio.create_task(
-                    litellm.cache.async_add_cache_pipeline(result, **new_kwargs)
+                    litellm.cache.async_add_cache_pipeline(
+                        result, dynamic_cache_object=self.dual_cache, **new_kwargs
+                    )
                 )
             elif isinstance(litellm.cache.cache, S3Cache):
                 threading.Thread(
@@ -775,7 +817,9 @@
             else:
                 asyncio.create_task(
                     litellm.cache.async_add_cache(
-                        result.model_dump_json(), **new_kwargs
+                        result.model_dump_json(),
+                        dynamic_cache_object=self.dual_cache,
+                        **new_kwargs,
                     )
                 )
         else:
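
Writes stay off the request path: async backends are written via asyncio.create_task (fire-and-forget), while the sync S3 client is pushed onto a threading.Thread. One caveat with the bare create_task pattern in general (not something this diff addresses) is that a task nobody holds a reference to can be garbage-collected before it finishes; a common defensive sketch:

import asyncio

_background_tasks: set = set()


async def write_to_cache(key: str, value: str) -> None:
    await asyncio.sleep(0)  # stand-in for the real cache write


def fire_and_forget(key: str, value: str) -> None:
    # Must be called with a running event loop. Keeping a reference
    # stops the task from being garbage-collected before it completes;
    # the callback drops the reference once the write is done.
    task = asyncio.create_task(write_to_cache(key, value))
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)


async def main() -> None:
    fire_and_forget("key", "value")
    await asyncio.sleep(0.01)  # let the background write finish


asyncio.run(main())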
@@ -933,9 +977,9 @@ def _update_litellm_logging_obj_environment(
         }
 
         if litellm.cache is not None:
-            litellm_params[
-                "preset_cache_key"
-            ] = litellm.cache._get_preset_cache_key_from_kwargs(**kwargs)
+            litellm_params["preset_cache_key"] = (
+                litellm.cache._get_preset_cache_key_from_kwargs(**kwargs)
+            )
         else:
             litellm_params["preset_cache_key"] = None
 