Skip to content

Commit 16d503c

Browse files
committed
Fix: preserve usage from earlier stream chunks when later chunks have none
1 parent db68d1c commit 16d503c

File tree

2 files changed

+57
-1
lines changed

2 files changed

+57
-1
lines changed

src/agents/models/chatcmpl_stream_handler.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,9 @@ async def handle_stream(
9797
)
9898

9999
# This is always set by the OpenAI API, but not by other providers, e.g. LiteLLM
100-
usage = chunk.usage if hasattr(chunk, "usage") else None
100+
# Only update when the chunk has usage data (it is not always present in the final chunk)
101+
if hasattr(chunk, "usage") and chunk.usage is not None:
102+
usage = chunk.usage
101103

102104
if not chunk.choices or not chunk.choices[0].delta:
103105
continue

tests/test_reasoning_content.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,60 @@ async def patched_fetch_response(self, *args, **kwargs):
234234
assert resp.output[1].content[0].text == "The answer is 42"
235235

236236

237+
@pytest.mark.allow_call_model_methods
238+
@pytest.mark.asyncio
239+
async def test_stream_response_preserves_usage_from_earlier_chunk(monkeypatch) -> None:
240+
"""
241+
Test that when an earlier chunk has usage data and later chunks don't,
242+
the usage from the earlier chunk is preserved in the final response.
243+
This handles cases where some providers (e.g., LiteLLM) may not include
244+
usage in every chunk.
245+
"""
246+
# Create test chunks where first chunk has usage, last chunk doesn't
247+
chunks = [
248+
create_chunk(create_content_delta("Hello"), include_usage=True), # Has usage
249+
create_chunk(create_content_delta("")), # No usage (usage=None)
250+
]
251+
252+
async def patched_fetch_response(self, *args, **kwargs):
253+
resp = Response(
254+
id="resp-id",
255+
created_at=0,
256+
model="fake-model",
257+
object="response",
258+
output=[],
259+
tool_choice="none",
260+
tools=[],
261+
parallel_tool_calls=False,
262+
)
263+
return resp, create_fake_stream(chunks)
264+
265+
monkeypatch.setattr(OpenAIChatCompletionsModel, "_fetch_response", patched_fetch_response)
266+
model = OpenAIProvider(use_responses=False).get_model("gpt-4")
267+
output_events = []
268+
async for event in model.stream_response(
269+
system_instructions=None,
270+
input="",
271+
model_settings=ModelSettings(),
272+
tools=[],
273+
output_schema=None,
274+
handoffs=[],
275+
tracing=ModelTracing.DISABLED,
276+
previous_response_id=None,
277+
conversation_id=None,
278+
prompt=None,
279+
):
280+
output_events.append(event)
281+
282+
# Verify the final response preserves usage from the first chunk
283+
response_event = output_events[-1]
284+
assert response_event.type == "response.completed"
285+
assert response_event.response.usage is not None
286+
assert response_event.response.usage.input_tokens == 2
287+
assert response_event.response.usage.output_tokens == 4
288+
assert response_event.response.usage.total_tokens == 6
289+
290+
237291
@pytest.mark.allow_call_model_methods
238292
@pytest.mark.asyncio
239293
async def test_stream_response_with_empty_reasoning_content(monkeypatch) -> None:

0 commit comments

Comments
 (0)