
Commit 5e45fcd

Merge pull request #13375 from colesmcintosh/fix/ollama-gpt-oss-thinking-field
2 parents 0b28930 + 8197fd7

2 files changed: +90, -3 lines changed


litellm/llms/ollama/completion/transformation.py

Lines changed: 14 additions & 1 deletion
@@ -24,6 +24,8 @@
     ModelResponse,
     ModelResponseStream,
     ProviderField,
+    StreamingChoices,
+    Delta,
 )
 
 from ..common_utils import OllamaError, _convert_image
@@ -423,7 +425,7 @@ def _handle_string_chunk(
     ) -> Union[GenericStreamingChunk, ModelResponseStream]:
         return self.chunk_parser(json.loads(str_line))
 
-    def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
+    def chunk_parser(self, chunk: dict) -> Union[GenericStreamingChunk, ModelResponseStream]:
         try:
             if "error" in chunk:
                 raise Exception(f"Ollama Error - {chunk}")
@@ -459,6 +461,17 @@ def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
                     finish_reason="stop",
                     usage=None,
                 )
+            elif "thinking" in chunk and not chunk["response"]:
+                # Return reasoning content as ModelResponseStream so UIs can render it
+                thinking_content = chunk.get("thinking") or ""
+                return ModelResponseStream(
+                    choices=[
+                        StreamingChoices(
+                            index=0,
+                            delta=Delta(reasoning_content=thinking_content),
+                        )
+                    ]
+                )
             else:
                 raise Exception(f"Unable to parse ollama chunk - {chunk}")
         except Exception as e:
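
For illustration (not part of the commit), the new branch can be exercised directly with the parser changed above. This is a minimal sketch that mirrors the added tests below and assumes only the constructor signature they use:

    from litellm.llms.ollama.completion.transformation import (
        OllamaTextCompletionResponseIterator,
    )

    iterator = OllamaTextCompletionResponseIterator(
        streaming_response=iter([]), sync_stream=True, json_mode=False
    )

    # A reasoning-only chunk (empty "response", non-empty "thinking") now
    # parses into a ModelResponseStream whose delta carries reasoning_content.
    thinking_chunk = iterator.chunk_parser(
        {"model": "gpt-oss:20b", "created_at": "2025-08-06T14:34:31Z",
         "response": "", "thinking": "User", "done": False}
    )
    print(thinking_chunk.choices[0].delta.reasoning_content)  # "User"

    # An ordinary text chunk still parses into the GenericStreamingChunk dict shape.
    text_chunk = iterator.chunk_parser(
        {"model": "gpt-oss:20b", "created_at": "2025-08-06T14:34:31Z",
         "response": "Hello", "done": False}
    )
    print(text_chunk["text"])  # "Hello"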

tests/test_litellm/llms/ollama/test_ollama_completion_transformation.py

Lines changed: 76 additions & 2 deletions
@@ -10,8 +10,11 @@
     0, os.path.abspath("../../../../..")
 )  # Adds the parent directory to the system path
 
-from litellm.llms.ollama.completion.transformation import OllamaConfig
-from litellm.types.utils import Message, ModelResponse
+from litellm.llms.ollama.completion.transformation import (
+    OllamaConfig,
+    OllamaTextCompletionResponseIterator,
+)
+from litellm.types.utils import Message, ModelResponse, ModelResponseStream
 
 
 class TestOllamaConfig:
@@ -155,3 +158,74 @@ def test_transform_response_regular_json(self):
         assert result.choices[0]["message"].content == expected_content
         assert result.choices[0]["finish_reason"] == "stop"
         # No usage assertions here as we don't need to test them in every case
+
+
+class TestOllamaTextCompletionResponseIterator:
+    def test_chunk_parser_with_thinking_field(self):
+        """Test that chunks with 'thinking' field and empty 'response' are handled correctly."""
+        iterator = OllamaTextCompletionResponseIterator(
+            streaming_response=iter([]), sync_stream=True, json_mode=False
+        )
+
+        # Test chunk with thinking field - this is the problematic case from the issue
+        chunk_with_thinking = {
+            "model": "gpt-oss:20b",
+            "created_at": "2025-08-06T14:34:31.5276077Z",
+            "response": "",
+            "thinking": "User",
+            "done": False,
+        }
+
+        result = iterator.chunk_parser(chunk_with_thinking)
+
+        # Should return a ModelResponseStream with reasoning content
+        assert isinstance(result, ModelResponseStream)
+        assert result.choices and result.choices[0].delta is not None
+        assert getattr(result.choices[0].delta, "reasoning_content") == "User"
+
+    def test_chunk_parser_normal_response(self):
+        """Test that normal response chunks still work."""
+        iterator = OllamaTextCompletionResponseIterator(
+            streaming_response=iter([]), sync_stream=True, json_mode=False
+        )
+
+        # Test normal chunk with response
+        normal_chunk = {
+            "model": "llama2",
+            "created_at": "2025-08-06T14:34:31.5276077Z",
+            "response": "Hello world",
+            "done": False,
+        }
+
+        result = iterator.chunk_parser(normal_chunk)
+
+        assert result["text"] == "Hello world"
+        assert result["is_finished"] is False
+        assert result["finish_reason"] == "stop"
+        assert result["usage"] is None
+
+    def test_chunk_parser_done_chunk(self):
+        """Test that done chunks work correctly."""
+        iterator = OllamaTextCompletionResponseIterator(
+            streaming_response=iter([]), sync_stream=True, json_mode=False
+        )
+
+        # Test done chunk
+        done_chunk = {
+            "model": "llama2",
+            "created_at": "2025-08-06T14:34:31.5276077Z",
+            "response": "",
+            "done": True,
+            "prompt_eval_count": 10,
+            "eval_count": 5,
+        }
+
+        result = iterator.chunk_parser(done_chunk)
+
+        assert result["text"] == ""
+        assert result["is_finished"] is True
+        assert result["finish_reason"] == "stop"
+        assert result["usage"] is not None
+        assert result["usage"]["prompt_tokens"] == 10
+        assert result["usage"]["completion_tokens"] == 5
+        assert result["usage"]["total_tokens"] == 15
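
End to end, the practical effect of this commit is that reasoning-only chunks from gpt-oss models no longer break text-completion streaming. A hedged sketch of how a caller might consume the new reasoning deltas (assumptions: a local Ollama server with a gpt-oss:20b model pulled, and that litellm's generic completion entry point routes "ollama/" models through the parser changed above):

    import litellm

    # Hypothetical streaming call; model availability is an assumption.
    stream = litellm.completion(
        model="ollama/gpt-oss:20b",
        messages=[{"role": "user", "content": "Why is the sky blue?"}],
        stream=True,
    )
    for chunk in stream:
        delta = chunk.choices[0].delta
        # With this fix, reasoning tokens surface on reasoning_content while
        # answer tokens keep arriving on content.
        reasoning = getattr(delta, "reasoning_content", None)
        if reasoning:
            print(f"[thinking] {reasoning}", flush=True)
        elif delta.content:
            print(delta.content, end="", flush=True)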
