test: add structured outputs tests for Anthropic with VCR

nirga · claude · nirga · commit ca5f423189e9 · 2025-11-20T18:09:26.000+02:00
Add comprehensive test coverage for Anthropic structured outputs feature: - Three test scenarios: legacy attributes, with content events, without content - Tests verify gen_ai.request.structured_output_schema attribute is logged - Enhanced span_utils.py to handle both json_schema and json output formats Note: Tests are currently skipped as they require anthropic SDK >= 0.50.0 which supports the output_format parameter. The feature was announced in November 2025 but the SDK version (0.49.0) doesn't yet support it. Tests will be enabled once the SDK is updated. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/packages/opentelemetry-instrumentation-anthropic/opentelemetry/instrumentation/anthropic/span_utils.py b/packages/opentelemetry-instrumentation-anthropic/opentelemetry/instrumentation/anthropic/span_utils.py
@@ -175,6 +175,14 @@ async def aset_input_attributes(span, kwargs):
                     SpanAttributes.LLM_REQUEST_STRUCTURED_OUTPUT_SCHEMA,
                     json.dumps(output_format.get("schema")),
                 )
+            elif output_format.get("type") == "json" and output_format.get("json_schema"):
+                schema = output_format.get("json_schema", {}).get("schema")
+                if schema:
+                    set_span_attribute(
+                        span,
+                        SpanAttributes.LLM_REQUEST_STRUCTURED_OUTPUT_SCHEMA,
+                        json.dumps(schema),
+                    )
 
 
 async def _aset_span_completions(span, response):
diff --git a/packages/opentelemetry-instrumentation-anthropic/tests/test_structured_outputs.py b/packages/opentelemetry-instrumentation-anthropic/tests/test_structured_outputs.py
@@ -0,0 +1,197 @@
+import json
+
+import pytest
+from opentelemetry.semconv._incubating.attributes import (
+    gen_ai_attributes as GenAIAttributes,
+)
+from opentelemetry.semconv_ai import SpanAttributes
+
+from .utils import verify_metrics
+
+# NOTE: These tests require anthropic SDK >= 0.50.0, which supports structured outputs.
+# The feature was announced in November 2025, but the installed SDK version (0.49.0)
+# does not yet accept the `output_format` parameter, so every test here is skipped.
+# Remove the `pytest.mark.skip` markers once the SDK dependency is upgraded.
+
+
+# JSON Schema the reply must satisfy: an object with a string "joke" and an
+# integer "rating", both required, with no additional properties allowed.
+JOKE_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "joke": {"type": "string", "description": "A joke about OpenTelemetry"},
+        "rating": {
+            "type": "integer",
+            "description": "Rating of the joke from 1 to 10",
+        },
+    },
+    "required": ["joke", "rating"],
+    "additionalProperties": False,
+}
+
+# Passed as `output_format=` by the tests in this module; wraps JOKE_SCHEMA.
+OUTPUT_FORMAT = {
+    "type": "json",
+    "json_schema": {
+        "name": "joke_response",
+        "strict": True,
+        "schema": JOKE_SCHEMA,
+    },
+}
+
+
+@pytest.mark.skip(reason="Requires anthropic SDK >= 0.50.0 with structured outputs support")
+@pytest.mark.vcr
+def test_anthropic_structured_outputs_legacy(
+    instrument_legacy, anthropic_client, span_exporter, log_exporter, reader
+):
+    """Legacy-attribute mode: prompt, completion and schema land on the span; no logs."""
+    response = anthropic_client.beta.messages.create(
+        model="claude-sonnet-4-5-20250929",
+        max_tokens=1024,
+        betas=["structured-outputs-2025-11-13"],
+        messages=[
+            {
+                "role": "user",
+                "content": "Tell me a joke about OpenTelemetry and rate it from 1 to 10"
+            }
+        ],
+        output_format=OUTPUT_FORMAT
+    )
+
+    spans = span_exporter.get_finished_spans()
+    assert len(spans) == 1
+    assert spans[0].name == "anthropic.chat"
+
+    anthropic_span = spans[0]
+    assert (
+        anthropic_span.attributes[f"{GenAIAttributes.GEN_AI_PROMPT}.0.content"]
+        == "Tell me a joke about OpenTelemetry and rate it from 1 to 10"
+    )
+    assert anthropic_span.attributes[f"{GenAIAttributes.GEN_AI_PROMPT}.0.role"] == "user"
+    assert (
+        anthropic_span.attributes.get(f"{GenAIAttributes.GEN_AI_COMPLETION}.0.content")
+        == response.content[0].text
+    )
+    assert (
+        anthropic_span.attributes.get(f"{GenAIAttributes.GEN_AI_COMPLETION}.0.role")
+        == "assistant"
+    )
+
+    assert SpanAttributes.LLM_REQUEST_STRUCTURED_OUTPUT_SCHEMA in anthropic_span.attributes
+    schema_attr = json.loads(
+        anthropic_span.attributes[SpanAttributes.LLM_REQUEST_STRUCTURED_OUTPUT_SCHEMA]
+    )
+    assert "properties" in schema_attr
+    assert "joke" in schema_attr["properties"]
+    assert "rating" in schema_attr["properties"]
+
+    assert anthropic_span.attributes.get(GenAIAttributes.GEN_AI_REQUEST_MODEL) == "claude-sonnet-4-5-20250929"
+    assert anthropic_span.attributes.get(GenAIAttributes.GEN_AI_RESPONSE_MODEL) == "claude-sonnet-4-5-20250929"
+
+    response_json = json.loads(response.content[0].text)
+    assert "joke" in response_json
+    assert "rating" in response_json
+
+    metrics_data = reader.get_metrics_data()
+    resource_metrics = metrics_data.resource_metrics
+    verify_metrics(resource_metrics, "claude-sonnet-4-5-20250929")
+
+    logs = log_exporter.get_finished_logs()
+    assert len(logs) == 0, (
+        "Assert that it doesn't emit logs when use_legacy_attributes is True"
+    )
+
+
+@pytest.mark.skip(reason="Requires anthropic SDK >= 0.50.0 with structured outputs support")
+@pytest.mark.vcr
+def test_anthropic_structured_outputs_with_events_with_content(
+    instrument_with_content, anthropic_client, span_exporter, log_exporter, reader
+):
+    """Structured-output request under the `instrument_with_content` fixture."""
+    model = "claude-sonnet-4-5-20250929"
+    prompt = "Tell me a joke about OpenTelemetry and rate it from 1 to 10"
+    response = anthropic_client.beta.messages.create(
+        model=model,
+        max_tokens=1024,
+        betas=["structured-outputs-2025-11-13"],
+        messages=[{"role": "user", "content": prompt}],
+        output_format=OUTPUT_FORMAT,
+    )
+
+    # Exactly one span is exported, and it carries the chat span name.
+    finished = span_exporter.get_finished_spans()
+    assert len(finished) == 1
+    assert finished[0].name == "anthropic.chat"
+    attrs = finished[0].attributes
+
+    # The requested schema must be present on the span as a JSON string.
+    schema_key = SpanAttributes.LLM_REQUEST_STRUCTURED_OUTPUT_SCHEMA
+    assert schema_key in attrs
+    recorded = json.loads(attrs[schema_key])
+    assert "properties" in recorded
+    assert "joke" in recorded["properties"]
+    assert "rating" in recorded["properties"]
+
+    # Request and response model attributes both match the requested model.
+    assert attrs.get(GenAIAttributes.GEN_AI_REQUEST_MODEL) == model
+    assert attrs.get(GenAIAttributes.GEN_AI_RESPONSE_MODEL) == model
+
+    # The reply text itself parses as JSON containing both schema fields.
+    payload = json.loads(response.content[0].text)
+    assert "joke" in payload
+    assert "rating" in payload
+
+    # Metric checks are delegated to the shared helper from .utils.
+    verify_metrics(reader.get_metrics_data().resource_metrics, model)
+
+    # Two log records are expected from this instrumentation mode.
+    emitted = log_exporter.get_finished_logs()
+    assert len(emitted) == 2
+
+
+@pytest.mark.skip(reason="Requires anthropic SDK >= 0.50.0 with structured outputs support")
+@pytest.mark.vcr
+def test_anthropic_structured_outputs_with_events_with_no_content(
+    instrument_with_no_content, anthropic_client, span_exporter, log_exporter, reader
+):
+    """Structured-output request under the `instrument_with_no_content` fixture."""
+    model = "claude-sonnet-4-5-20250929"
+    prompt = "Tell me a joke about OpenTelemetry and rate it from 1 to 10"
+    response = anthropic_client.beta.messages.create(
+        model=model,
+        max_tokens=1024,
+        betas=["structured-outputs-2025-11-13"],
+        messages=[{"role": "user", "content": prompt}],
+        output_format=OUTPUT_FORMAT,
+    )
+
+    # Exactly one span is exported, and it carries the chat span name.
+    finished = span_exporter.get_finished_spans()
+    assert len(finished) == 1
+    assert finished[0].name == "anthropic.chat"
+    attrs = finished[0].attributes
+
+    # The requested schema must be present on the span as a JSON string.
+    schema_key = SpanAttributes.LLM_REQUEST_STRUCTURED_OUTPUT_SCHEMA
+    assert schema_key in attrs
+    recorded = json.loads(attrs[schema_key])
+    assert "properties" in recorded
+    assert "joke" in recorded["properties"]
+    assert "rating" in recorded["properties"]
+
+    # Request and response model attributes both match the requested model.
+    assert attrs.get(GenAIAttributes.GEN_AI_REQUEST_MODEL) == model
+    assert attrs.get(GenAIAttributes.GEN_AI_RESPONSE_MODEL) == model
+
+    # The reply text itself parses as JSON containing both schema fields.
+    payload = json.loads(response.content[0].text)
+    assert "joke" in payload
+    assert "rating" in payload
+
+    # Metric checks are delegated to the shared helper from .utils.
+    verify_metrics(reader.get_metrics_data().resource_metrics, model)
+
+    # Two log records are expected from this instrumentation mode.
+    emitted = log_exporter.get_finished_logs()
+    assert len(emitted) == 2