Skip to content

Commit 16d503c

Browse files
committed
Fix: preserve usage from earlier stream chunks when later chunks have none
1 parent db68d1c commit 16d503c

File tree

2 files changed

+57
-1
lines changed

2 files changed

+57
-1
lines changed

src/agents/models/chatcmpl_stream_handler.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,9 @@ async def handle_stream(
9797
)
9898

9999
# This is always set by the OpenAI API, but not by other providers, e.g. LiteLLM
100-
usage = chunk.usage if hasattr(chunk, "usage") else None
100+
# Only update when the chunk has usage data (it is not always present in the final chunk)
101+
if hasattr(chunk, "usage") and chunk.usage is not None:
102+
usage = chunk.usage
101103

102104
if not chunk.choices or not chunk.choices[0].delta:
103105
continue

tests/test_reasoning_content.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,60 @@ async def patched_fetch_response(self, *args, **kwargs):
234234
assert resp.output[1].content[0].text == "The answer is 42"
235235

236236

237+
@pytest.mark.allow_call_model_methods
238+
@pytest.mark.asyncio
239+
async def test_stream_response_preserves_usage_from_earlier_chunk(monkeypatch) -> None:
240+
"""
241+
Test that when an earlier chunk has usage data and later chunks don't,
242+
the usage from the earlier chunk is preserved in the final response.
243+
This handles cases where some providers (e.g., LiteLLM) may not include
244+
usage in every chunk.
245+
"""
246+
# Create test chunks where first chunk has usage, last chunk doesn't
247+
chunks = [
248+
create_chunk(create_content_delta("Hello"), include_usage=True), # Has usage
249+
create_chunk(create_content_delta("")), # No usage (usage=None)
250+
]
251+
252+
async def patched_fetch_response(self, *args, **kwargs):
253+
resp = Response(
254+
id="resp-id",
255+
created_at=0,
256+
model="fake-model",
257+
object="response",
258+
output=[],
259+
tool_choice="none",
260+
tools=[],
261+
parallel_tool_calls=False,
262+
)
263+
return resp, create_fake_stream(chunks)
264+
265+
monkeypatch.setattr(OpenAIChatCompletionsModel, "_fetch_response", patched_fetch_response)
266+
model = OpenAIProvider(use_responses=False).get_model("gpt-4")
267+
output_events = []
268+
async for event in model.stream_response(
269+
system_instructions=None,
270+
input="",
271+
model_settings=ModelSettings(),
272+
tools=[],
273+
output_schema=None,
274+
handoffs=[],
275+
tracing=ModelTracing.DISABLED,
276+
previous_response_id=None,
277+
conversation_id=None,
278+
prompt=None,
279+
):
280+
output_events.append(event)
281+
282+
# Verify the final response preserves usage from the first chunk
283+
response_event = output_events[-1]
284+
assert response_event.type == "response.completed"
285+
assert response_event.response.usage is not None
286+
assert response_event.response.usage.input_tokens == 2
287+
assert response_event.response.usage.output_tokens == 4
288+
assert response_event.response.usage.total_tokens == 6
289+
290+
237291
@pytest.mark.allow_call_model_methods
238292
@pytest.mark.asyncio
239293
async def test_stream_response_with_empty_reasoning_content(monkeypatch) -> None:

0 commit comments

Comments
 (0)