diff --git a/sentry_sdk/client.py b/sentry_sdk/client.py index 9f795d2489..87504c94b1 100644 --- a/sentry_sdk/client.py +++ b/sentry_sdk/client.py @@ -2,11 +2,12 @@ import uuid import random import socket -from collections.abc import Mapping +from collections.abc import Mapping, Iterable from datetime import datetime, timezone from importlib import import_module from typing import TYPE_CHECKING, List, Dict, cast, overload import warnings +import json from sentry_sdk._compat import check_uwsgi_thread_support from sentry_sdk._metrics_batcher import MetricsBatcher @@ -30,6 +31,7 @@ ) from sentry_sdk.serializer import serialize from sentry_sdk.tracing import trace +from sentry_sdk.traces import SpanStatus from sentry_sdk.tracing_utils import has_span_streaming_enabled from sentry_sdk.transport import ( HttpTransportCore, @@ -38,6 +40,7 @@ ) from sentry_sdk.consts import ( SPANDATA, + SPANSTATUS, DEFAULT_MAX_VALUE_LENGTH, DEFAULT_OPTIONS, INSTRUMENTER, @@ -56,6 +59,8 @@ ) from sentry_sdk.scrubber import EventScrubber from sentry_sdk.monitor import Monitor +from sentry_sdk.envelope import Item, PayloadRef +from sentry_sdk.utils import datetime_from_isoformat if TYPE_CHECKING: from typing import Any @@ -66,7 +71,15 @@ from typing import Union from typing import TypeVar - from sentry_sdk._types import Event, Hint, SDKInfo, Log, Metric, EventDataCategory + from sentry_sdk._types import ( + Event, + Hint, + SDKInfo, + Log, + Metric, + EventDataCategory, + SerializedAttributeValue, + ) from sentry_sdk.integrations import Integration from sentry_sdk.scope import Scope from sentry_sdk.session import Session @@ -89,6 +102,197 @@ } +def _serialized_v1_attribute_to_serialized_v2_attribute( + attribute_value: "Any", +) -> "Optional[SerializedAttributeValue]": + if isinstance(attribute_value, bool): + return { + "value": attribute_value, + "type": "boolean", + } + + if isinstance(attribute_value, int): + return { + "value": attribute_value, + "type": "integer", + } + + if 
isinstance(attribute_value, float): + return { + "value": attribute_value, + "type": "double", + } + + if isinstance(attribute_value, str): + return { + "value": attribute_value, + "type": "string", + } + + if isinstance(attribute_value, list): + if not attribute_value: + return {"value": [], "type": "array"} + + ty = type(attribute_value[0]) + if ty in (int, str, bool, float) and all( + type(v) is ty for v in attribute_value + ): + return { + "value": attribute_value, + "type": "array", + } + + # Types returned when the serializer for V1 span attributes recurses into some container types. + if isinstance(attribute_value, (dict, list)): + return { + "value": json.dumps(attribute_value), + "type": "string", + } + + if attribute_value is None: + return { + "value": "None", + "type": "string", + } + + return None + + +def _serialized_v1_span_to_serialized_v2_span( + span: "dict[str, Any]", event: "Event" +) -> "dict[str, Any]": + # See SpanBatcher._to_transport_format() for analogous population of all entries except "attributes". 
+ res: "dict[str, Any]" = { + "status": SpanStatus.OK.value, + "is_segment": False, + } + + if "trace_id" in span: + res["trace_id"] = span["trace_id"] + + if "span_id" in span: + res["span_id"] = span["span_id"] + + if "description" in span: + res["name"] = span["description"] + + if "start_timestamp" in span: + start_timestamp = None + try: + start_timestamp = datetime_from_isoformat(span["start_timestamp"]) + except Exception: + pass + + if start_timestamp is not None: + res["start_timestamp"] = start_timestamp.timestamp() + + if "timestamp" in span: + end_timestamp = None + try: + end_timestamp = datetime_from_isoformat(span["timestamp"]) + except Exception: + pass + + if end_timestamp is not None: + res["end_timestamp"] = end_timestamp.timestamp() + + if "parent_span_id" in span: + res["parent_span_id"] = span["parent_span_id"] + + if "status" in span and span["status"] != SPANSTATUS.OK: + res["status"] = "error" + + attributes: "Dict[str, Any]" = {} + + if "op" in span: + attributes["sentry.op"] = span["op"] + if "origin" in span: + attributes["sentry.origin"] = span["origin"] + + span_data = span.get("data") + if isinstance(span_data, dict): + attributes.update(span_data) + + span_tags = span.get("tags") + if isinstance(span_tags, dict): + attributes.update(span_tags) + + # See Scope._apply_user_attributes_to_telemetry() for user attributes. + user = event.get("user") + if isinstance(user, dict): + if "id" in user: + attributes["user.id"] = user["id"] + if "username" in user: + attributes["user.name"] = user["username"] + if "email" in user: + attributes["user.email"] = user["email"] + + # See Scope.set_global_attributes() for release, environment, and SDK metadata. 
+ if "release" in event: + attributes["sentry.release"] = event["release"] + if "environment" in event: + attributes["sentry.environment"] = event["environment"] + if "transaction" in event: + attributes["sentry.segment.name"] = event["transaction"] + + trace_context = event.get("contexts", {}).get("trace", {}) + if "span_id" in trace_context: + attributes["sentry.segment.id"] = trace_context["span_id"] + + sdk_info = event.get("sdk") + if isinstance(sdk_info, dict): + if "name" in sdk_info: + attributes["sentry.sdk.name"] = sdk_info["name"] + if "version" in sdk_info: + attributes["sentry.sdk.version"] = sdk_info["version"] + + if not attributes: + return res + + res["attributes"] = {} + for key, value in attributes.items(): + converted_value = _serialized_v1_attribute_to_serialized_v2_attribute(value) + if converted_value is None: + continue + + res["attributes"][key] = converted_value + + # Remove redundant attribute, as status is stored in the status field. + if "status" in res["attributes"]: + del res["attributes"]["status"] + + return res + + +def _split_gen_ai_spans( + event_opt: "Event", +) -> "Optional[tuple[List[Dict[str, object]], List[Dict[str, object]]]]": + if "spans" not in event_opt: + return None + + spans: "Any" = event_opt["spans"] + if isinstance(spans, AnnotatedValue): + spans = spans.value + + if not isinstance(spans, Iterable): + return None + + non_gen_ai_spans = [] + gen_ai_spans = [] + for span in spans: + if not isinstance(span, dict): + non_gen_ai_spans.append(span) + continue + + span_op = span.get("op") + if isinstance(span_op, str) and span_op.startswith("gen_ai."): + gen_ai_spans.append(span) + else: + non_gen_ai_spans.append(span) + + return non_gen_ai_spans, gen_ai_spans + + def _get_options(*args: "Optional[str]", **kwargs: "Any") -> "Dict[str, Any]": if args and (isinstance(args[0], (bytes, str)) or args[0] is None): dsn: "Optional[str]" = args[0] @@ -909,10 +1113,42 @@ def capture_event( envelope = Envelope(headers=headers) - if 
is_transaction: - if isinstance(profile, Profile): - envelope.add_profile(profile.to_json(event_opt, self.options)) + if is_transaction and isinstance(profile, Profile): + envelope.add_profile(profile.to_json(event_opt, self.options)) + + if is_transaction and not self.options["_experiments"].get( + "gen_ai_as_v2_spans", False + ): envelope.add_transaction(event_opt) + elif is_transaction: + split_spans = _split_gen_ai_spans(event_opt) + if split_spans is None or not split_spans[1]: + envelope.add_transaction(event_opt) + else: + non_gen_ai_spans, gen_ai_spans = split_spans + + event_opt["spans"] = non_gen_ai_spans + envelope.add_transaction(event_opt) + + converted_gen_ai_spans = [ + _serialized_v1_span_to_serialized_v2_span(span, event_opt) + for span in gen_ai_spans + if isinstance(span, dict) + ] + + envelope.add_item( + Item( + type=SpanBatcher.TYPE, + content_type=SpanBatcher.CONTENT_TYPE, + headers={ + "item_count": len(converted_gen_ai_spans), + }, + payload=PayloadRef( + json={"items": converted_gen_ai_spans}, + ), + ) + ) + elif is_checkin: envelope.add_checkin(event_opt) else: diff --git a/sentry_sdk/consts.py b/sentry_sdk/consts.py index 73e5a6d9cb..82107b49ee 100644 --- a/sentry_sdk/consts.py +++ b/sentry_sdk/consts.py @@ -86,6 +86,7 @@ class CompressionAlgo(Enum): "trace_lifecycle": Optional[Literal["static", "stream"]], "ignore_spans": Optional[IgnoreSpansConfig], "suppress_asgi_chained_exceptions": Optional[bool], + "gen_ai_as_v2_spans": Optional[bool], }, total=False, ) diff --git a/tests/integrations/anthropic/test_anthropic.py b/tests/integrations/anthropic/test_anthropic.py index e86f7e1fa9..b19cca9347 100644 --- a/tests/integrations/anthropic/test_anthropic.py +++ b/tests/integrations/anthropic/test_anthropic.py @@ -91,14 +91,15 @@ async def __call__(self, *args, **kwargs): ], ) def test_nonstreaming_create_message( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): 
sentry_init( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -120,37 +121,38 @@ def test_nonstreaming_create_message( assert usage.input_tokens == 10 assert usage.output_tokens == 20 - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." 
else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["end_turn"] + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["end_turn"] @pytest.mark.asyncio @@ -164,14 +166,15 @@ def test_nonstreaming_create_message( ], ) async def test_nonstreaming_create_message_async( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): sentry_init( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = AsyncAnthropic(api_key="z") client.messages._post = AsyncMock(return_value=EXAMPLE_MESSAGE) @@ -193,36 +196,37 @@ async def test_nonstreaming_create_message_async( assert usage.input_tokens == 10 assert usage.output_tokens == 20 - assert len(events) == 1 - (event,) = events - - assert event["type"] == 
"transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." 
else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) @pytest.mark.parametrize( @@ -236,7 +240,7 @@ async def test_nonstreaming_create_message_async( ) def test_streaming_create_message( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -285,8 +289,9 @@ def test_streaming_create_message( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -308,42 +313,45 @@ def test_streaming_create_message( for _ in message: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if item.type == "span"] + span = next( + 
span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["max_tokens"] + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["max_tokens"] def test_streaming_create_message_close( sentry_init, - capture_events, + capture_items, get_model_response, server_side_event_chunks, ): @@ -390,8 +398,9 @@ def test_streaming_create_message_close( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -415,31 +424,34 @@ def test_streaming_create_message_close( messages.close() - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in 
items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) @pytest.mark.skipif( @@ -448,7 +460,7 @@ def test_streaming_create_message_close( ) def test_streaming_create_message_api_error( sentry_init, - capture_events, + capture_items, get_model_response, server_side_event_chunks, ): @@ -490,8 +502,9 @@ def 
test_streaming_create_message_api_error( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -513,34 +526,36 @@ def test_streaming_create_message_api_error( for _ in message: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
- assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) - assert span["status"] == "internal_error" - assert span["tags"]["status"] == "internal_error" + assert span["status"] == "error" assert event["contexts"]["trace"]["status"] == "internal_error" @@ -555,7 +570,7 @@ def test_streaming_create_message_api_error( ) def test_stream_messages( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -604,8 +619,9 @@ def test_stream_messages( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -628,42 +644,45 @@ def test_stream_messages( for event in stream: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert span["op"] == OP.GEN_AI_CHAT - 
assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["max_tokens"] + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + assert 
span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["max_tokens"] def test_stream_messages_close( sentry_init, - capture_events, + capture_items, get_model_response, server_side_event_chunks, ): @@ -710,8 +729,9 @@ def test_stream_messages_close( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -740,31 +760,34 @@ def test_stream_messages_close( stream.close() - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
- assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) @pytest.mark.skipif( @@ -773,7 +796,7 @@ def test_stream_messages_close( ) def test_stream_messages_api_error( sentry_init, - capture_events, + capture_items, get_model_response, server_side_event_chunks, ): @@ -815,8 +838,9 @@ def test_stream_messages_api_error( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -839,34 +863,36 @@ def test_stream_messages_api_error( for event in stream: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert 
span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) - assert span["status"] == "internal_error" - assert span["tags"]["status"] == "internal_error" + assert span["status"] == "error" assert event["contexts"]["trace"]["status"] == "internal_error" @@ -882,7 +908,7 @@ def test_stream_messages_api_error( ) async def test_streaming_create_message_async( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -935,8 +961,9 @@ async def test_streaming_create_message_async( traces_sample_rate=1.0, default_integrations=False, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = 
capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -958,44 +985,45 @@ async def test_streaming_create_message_async( async for _ in message: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["max_tokens"] + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["max_tokens"] @pytest.mark.asyncio async def test_streaming_create_message_async_close( sentry_init, - capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, @@ -1045,8 +1073,9 @@ async def test_streaming_create_message_async_close( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1069,31 +1098,34 @@ async def test_streaming_create_message_async_close( await messages.__anext__() await messages.close() - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - span 
= next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
- assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) @pytest.mark.skipif( @@ -1103,7 +1135,7 @@ async def test_streaming_create_message_async_close( @pytest.mark.asyncio async def test_streaming_create_message_async_api_error( sentry_init, - capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, @@ -1148,8 +1180,9 @@ async def test_streaming_create_message_async_api_error( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1171,34 +1204,36 @@ async def test_streaming_create_message_async_api_error( async for _ in message: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert 
span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) - assert span["status"] == "internal_error" - assert span["tags"]["status"] == "internal_error" + assert span["status"] == "error" assert event["contexts"]["trace"]["status"] == "internal_error" @@ -1214,7 +1249,7 @@ async def test_streaming_create_message_async_api_error( ) async def test_stream_message_async( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -1266,8 +1301,9 @@ async def test_stream_message_async( 
integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1290,37 +1326,38 @@ async def test_stream_message_async( async for event in stream: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) @pytest.mark.skipif( @@ -1330,7 +1367,7 @@ async def test_stream_message_async( @pytest.mark.asyncio async def test_stream_messages_async_api_error( sentry_init, - capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, @@ -1375,8 +1412,9 @@ async def test_stream_messages_async_api_error( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1399,41 +1437,43 @@ async def test_stream_messages_async_api_error( async for event in stream: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if 
item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) - assert span["status"] == "internal_error" - assert span["tags"]["status"] == "internal_error" + assert span["status"] == "error" assert event["contexts"]["trace"]["status"] == "internal_error" @pytest.mark.asyncio async def test_stream_messages_async_close( sentry_init, - 
capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, @@ -1483,8 +1523,9 @@ async def test_stream_messages_async_close( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1515,31 +1556,34 @@ async def test_stream_messages_async_close( await stream.close() - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "Hello, Claude"}]' ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
- assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) @pytest.mark.skipif( @@ -1557,7 +1601,7 @@ async def test_stream_messages_async_close( ) def test_streaming_create_message_with_input_json_delta( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -1636,8 +1680,9 @@ def test_streaming_create_message_with_input_json_delta( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1659,38 +1704,36 @@ def test_streaming_create_message_with_input_json_delta( for _ in message: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert 
span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' ) assert ( - span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == '{"location": "San Francisco, CA"}' ) else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True @pytest.mark.skipif( @@ -1708,7 +1751,7 @@ def test_streaming_create_message_with_input_json_delta( ) def test_stream_messages_with_input_json_delta( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -1787,8 +1830,9 @@ def test_stream_messages_with_input_json_delta( 
integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1811,38 +1855,36 @@ def test_stream_messages_with_input_json_delta( for event in stream: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' ) assert ( - span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == '{"location": "San Francisco, CA"}' ) else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert 
span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True @pytest.mark.asyncio @@ -1861,7 +1903,7 @@ def test_stream_messages_with_input_json_delta( ) async def test_streaming_create_message_with_input_json_delta_async( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -1946,8 +1988,9 @@ async def test_streaming_create_message_with_input_json_delta_async( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1969,39 +2012,37 @@ async def test_streaming_create_message_with_input_json_delta_async( async for _ in message: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert 
span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' ) assert ( - span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == '{"location": "San Francisco, CA"}' ) else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True @pytest.mark.asyncio @@ -2020,7 +2061,7 @@ async def test_streaming_create_message_with_input_json_delta_async( ) async def test_stream_message_with_input_json_delta_async( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -2105,8 +2146,9 @@ async def test_stream_message_with_input_json_delta_async( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = 
capture_items("transaction", "span") messages = [ { @@ -2129,44 +2171,46 @@ async def test_stream_message_with_input_json_delta_async( async for event in stream: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' ) assert ( - span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == '{"location": "San Francisco, CA"}' ) else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 - assert 
span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True -def test_exception_message_create(sentry_init, capture_events): - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() +def test_exception_message_create(sentry_init, capture_items): + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("event", "transaction") client = Anthropic(api_key="z") client.messages._post = mock.Mock( @@ -2179,14 +2223,20 @@ def test_exception_message_create(sentry_init, capture_events): max_tokens=1024, ) - (event, transaction) = events + (event,) = (item.payload for item in items if item.type == "event") assert event["level"] == "error" + + (transaction,) = (item.payload for item in items if item.type == "transaction") assert transaction["contexts"]["trace"]["status"] == "internal_error" -def test_span_status_error(sentry_init, capture_events): - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() +def test_span_status_error(sentry_init, capture_items): + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("event", "span") with start_transaction(name="anthropic"): client = Anthropic(api_key="z") @@ -2200,18 +2250,23 @@ def test_span_status_error(sentry_init, capture_events): max_tokens=1024, ) - (error, transaction) = events + (error,) = (item.payload for item in items if item.type == "event") assert error["level"] == "error" - assert transaction["spans"][0]["status"] == "internal_error" - assert 
transaction["spans"][0]["tags"]["status"] == "internal_error" - assert transaction["spans"][0]["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert transaction["spans"][0]["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["status"] == "error" + assert spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert spans[0]["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" @pytest.mark.asyncio -async def test_span_status_error_async(sentry_init, capture_events): - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() +async def test_span_status_error_async(sentry_init, capture_items): + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("event", "span") with start_transaction(name="anthropic"): client = AsyncAnthropic(api_key="z") @@ -2225,18 +2280,23 @@ async def test_span_status_error_async(sentry_init, capture_events): max_tokens=1024, ) - (error, transaction) = events + (error,) = (item.payload for item in items if item.type == "event") assert error["level"] == "error" - assert transaction["spans"][0]["status"] == "internal_error" - assert transaction["spans"][0]["tags"]["status"] == "internal_error" - assert transaction["spans"][0]["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert transaction["spans"][0]["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["status"] == "error" + assert spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert spans[0]["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" @pytest.mark.asyncio -async def test_exception_message_create_async(sentry_init, capture_events): - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() +async 
def test_exception_message_create_async(sentry_init, capture_items): + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("event", "transaction") client = AsyncAnthropic(api_key="z") client.messages._post = AsyncMock( @@ -2249,17 +2309,20 @@ async def test_exception_message_create_async(sentry_init, capture_events): max_tokens=1024, ) - (event, transaction) = events + (event,) = (item.payload for item in items if item.type == "event") assert event["level"] == "error" + + (transaction,) = (item.payload for item in items if item.type == "transaction") assert transaction["contexts"]["trace"]["status"] == "internal_error" -def test_span_origin(sentry_init, capture_events): +def test_span_origin(sentry_init, capture_items): sentry_init( integrations=[AnthropicIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -2274,21 +2337,23 @@ def test_span_origin(sentry_init, capture_events): with start_transaction(name="anthropic"): client.messages.create(max_tokens=1024, messages=messages, model="model") - (event,) = events - + (event,) = (item.payload for item in items if item.type == "transaction") assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.anthropic" - assert event["spans"][0]["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert event["spans"][0]["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.anthropic" + assert spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert spans[0]["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" @pytest.mark.asyncio -async def 
test_span_origin_async(sentry_init, capture_events): +async def test_span_origin_async(sentry_init, capture_items): sentry_init( integrations=[AnthropicIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = AsyncAnthropic(api_key="z") client.messages._post = AsyncMock(return_value=EXAMPLE_MESSAGE) @@ -2303,12 +2368,13 @@ async def test_span_origin_async(sentry_init, capture_events): with start_transaction(name="anthropic"): await client.messages.create(max_tokens=1024, messages=messages, model="model") - (event,) = events - + (event,) = (item.payload for item in items if item.type == "transaction") assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.anthropic" - assert event["spans"][0]["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert event["spans"][0]["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.anthropic" + assert spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert spans[0]["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" @pytest.mark.skipif( @@ -2349,6 +2415,7 @@ def test_set_output_data_with_input_json_delta(sentry_init): integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with start_transaction(name="test"): @@ -2392,15 +2459,16 @@ def test_set_output_data_with_input_json_delta(sentry_init): ], ) def test_anthropic_message_role_mapping( - sentry_init, capture_events, test_message, expected_role + sentry_init, capture_items, test_message, expected_role ): """Test that Anthropic integration properly maps message roles like 'ai' to 'assistant'""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], 
traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") @@ -2425,29 +2493,29 @@ def mock_messages_create(*args, **kwargs): model="claude-3-opus", max_tokens=10, messages=test_messages ) - (event,) = events - span = event["spans"][0] + span = next(item.payload for item in items if item.type == "span") # Verify that the span was created correctly - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] # Parse the stored messages - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + stored_messages = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert stored_messages[0]["role"] == expected_role -def test_anthropic_message_truncation(sentry_init, capture_events): +def test_anthropic_message_truncation(sentry_init, capture_items): """Test that large messages are truncated properly in Anthropic integration.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -2466,21 +2534,18 @@ def test_anthropic_message_truncation(sentry_init, capture_events): with start_transaction(): client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) > 0 - tx = 
events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] chat_spans = [ - span for span in tx.get("spans", []) if span.get("op") == OP.GEN_AI_CHAT + span for span in spans if span["attributes"].get("sentry.op") == OP.GEN_AI_CHAT ] assert len(chat_spans) > 0 chat_span = chat_spans[0] - assert chat_span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert chat_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["data"] + assert chat_span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert chat_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["attributes"] - messages_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + messages_data = chat_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] assert isinstance(messages_data, str) parsed_messages = json.loads(messages_data) @@ -2488,18 +2553,20 @@ def test_anthropic_message_truncation(sentry_init, capture_events): assert len(parsed_messages) == 1 assert "small message 5" in str(parsed_messages[0]) + tx = next(item.payload for item in items if item.type == "transaction") assert tx["_meta"]["spans"]["0"]["data"]["gen_ai.request.messages"][""]["len"] == 5 @pytest.mark.asyncio -async def test_anthropic_message_truncation_async(sentry_init, capture_events): +async def test_anthropic_message_truncation_async(sentry_init, capture_items): """Test that large messages are truncated properly in Anthropic integration.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = AsyncAnthropic(api_key="z") client.messages._post = mock.AsyncMock(return_value=EXAMPLE_MESSAGE) @@ -2518,21 +2585,18 @@ async def 
test_anthropic_message_truncation_async(sentry_init, capture_events): with start_transaction(): await client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] chat_spans = [ - span for span in tx.get("spans", []) if span.get("op") == OP.GEN_AI_CHAT + span for span in spans if span["attributes"].get("sentry.op") == OP.GEN_AI_CHAT ] assert len(chat_spans) > 0 chat_span = chat_spans[0] - assert chat_span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert chat_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["data"] + assert chat_span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert chat_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["attributes"] - messages_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + messages_data = chat_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] assert isinstance(messages_data, str) parsed_messages = json.loads(messages_data) @@ -2540,6 +2604,7 @@ async def test_anthropic_message_truncation_async(sentry_init, capture_events): assert len(parsed_messages) == 1 assert "small message 5" in str(parsed_messages[0]) + tx = next(item.payload for item in items if item.type == "transaction") assert tx["_meta"]["spans"]["0"]["data"]["gen_ai.request.messages"][""]["len"] == 5 @@ -2553,15 +2618,16 @@ async def test_anthropic_message_truncation_async(sentry_init, capture_events): ], ) def test_nonstreaming_create_message_with_system_prompt( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): """Test that system prompts are properly captured in GEN_AI_REQUEST_MESSAGES.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=include_prompts)], 
traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -2586,46 +2652,46 @@ def test_nonstreaming_create_message_with_system_prompt( assert usage.input_tokens == 10 assert usage.output_tokens == 20 - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] system_instructions = json.loads( - span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] ) assert system_instructions == [ {"type": "text", "content": "You are a helpful assistant."} ] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + 
stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) assert len(stored_messages) == 1 assert stored_messages[0]["role"] == "user" assert stored_messages[0]["content"] == "Hello, Claude" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["end_turn"] + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["end_turn"] @pytest.mark.asyncio @@ -2639,15 +2705,16 @@ def test_nonstreaming_create_message_with_system_prompt( ], ) async def test_nonstreaming_create_message_with_system_prompt_async( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): """Test that system prompts are properly captured in GEN_AI_REQUEST_MESSAGES (async).""" sentry_init( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, 
send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = AsyncAnthropic(api_key="z") client.messages._post = AsyncMock(return_value=EXAMPLE_MESSAGE) @@ -2672,46 +2739,46 @@ async def test_nonstreaming_create_message_with_system_prompt_async( assert usage.input_tokens == 10 assert usage.output_tokens == 20 - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] system_instructions = json.loads( - span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] ) assert system_instructions == [ {"type": "text", "content": "You are a helpful assistant."} ] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = 
json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) assert len(stored_messages) == 1 assert stored_messages[0]["role"] == "user" assert stored_messages[0]["content"] == "Hello, Claude" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["end_turn"] + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["end_turn"] @pytest.mark.parametrize( @@ -2725,7 +2792,7 @@ async def test_nonstreaming_create_message_with_system_prompt_async( ) def test_streaming_create_message_with_system_prompt( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -2775,8 +2842,9 @@ def test_streaming_create_message_with_system_prompt( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events 
= capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -2802,46 +2870,46 @@ def test_streaming_create_message_with_system_prompt( for _ in message: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] system_instructions = json.loads( - span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] ) assert system_instructions == [ {"type": "text", "content": "You are a helpful assistant."} ] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) assert len(stored_messages) == 1 assert stored_messages[0]["role"] == "user" assert stored_messages[0]["content"] == "Hello, Claude" - assert 
span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True @pytest.mark.parametrize( @@ -2855,7 +2923,7 @@ def test_streaming_create_message_with_system_prompt( ) def test_stream_messages_with_system_prompt( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -2905,8 +2973,9 @@ def test_stream_messages_with_system_prompt( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -2930,46 +2999,46 @@ def test_stream_messages_with_system_prompt( for event in stream: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) 
== 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] system_instructions = json.loads( - span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] ) assert system_instructions == [ {"type": "text", "content": "You are a helpful assistant."} ] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) assert len(stored_messages) == 1 assert stored_messages[0]["role"] == "user" assert stored_messages[0]["content"] == "Hello, Claude" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True @pytest.mark.asyncio @@ -2984,7 +3053,7 @@ def test_stream_messages_with_system_prompt( ) async def test_stream_message_with_system_prompt_async( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -3037,8 +3106,9 @@ async def test_stream_message_with_system_prompt_async( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -3062,46 +3132,46 @@ async def test_stream_message_with_system_prompt_async( async for event in stream: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 
1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] system_instructions = json.loads( - span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] ) assert system_instructions == [ {"type": "text", "content": "You are a helpful assistant."} ] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) assert len(stored_messages) == 1 assert stored_messages[0]["role"] == "user" assert stored_messages[0]["content"] == "Hello, Claude" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True @pytest.mark.asyncio @@ -3116,7 +3186,7 @@ async def test_stream_message_with_system_prompt_async( ) async def test_streaming_create_message_with_system_prompt_async( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -3169,8 +3239,9 @@ async def test_streaming_create_message_with_system_prompt_async( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -3196,56 +3267,57 @@ async def test_streaming_create_message_with_system_prompt_async( async for _ in message: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if 
item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] system_instructions = json.loads( - span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] ) assert system_instructions == [ {"type": "text", "content": "You are a helpful assistant."} ] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) assert len(stored_messages) == 1 assert stored_messages[0]["role"] == "user" assert stored_messages[0]["content"] == "Hello, Claude" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True -def test_system_prompt_with_complex_structure(sentry_init, capture_events): +def test_system_prompt_with_complex_structure(sentry_init, capture_items): """Test that complex system prompt structures (list of text blocks) are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3268,17 +3340,18 @@ def test_system_prompt_with_complex_structure(sentry_init, capture_events): ) assert response == EXAMPLE_MESSAGE - assert len(events) == 1 - (event,) = events - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" 
+ assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] - system_instructions = json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] + system_instructions = json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) # System content should be a list of text blocks assert isinstance(system_instructions, list) @@ -3287,8 +3360,8 @@ def test_system_prompt_with_complex_structure(sentry_init, capture_events): {"type": "text", "content": "Be concise and clear."}, ] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(stored_messages) == 1 assert stored_messages[0]["role"] == "user" @@ -3490,14 +3563,15 @@ def test_transform_message_content_list_anthropic(): # Integration tests for binary data in messages -def test_message_with_base64_image(sentry_init, capture_events): +def test_message_with_base64_image(sentry_init, capture_items): """Test that messages with base64 images are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3521,12 +3595,11 @@ def test_message_with_base64_image(sentry_init, capture_events): with start_transaction(name="anthropic"): client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + 
spans = [item.payload for item in items if item.type == "span"] + (span,) = spans - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(stored_messages) == 1 assert stored_messages[0]["role"] == "user" @@ -3541,14 +3614,15 @@ def test_message_with_base64_image(sentry_init, capture_events): } -def test_message_with_url_image(sentry_init, capture_events): +def test_message_with_url_image(sentry_init, capture_items): """Test that messages with URL-referenced images are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3571,11 +3645,10 @@ def test_message_with_url_image(sentry_init, capture_events): with start_transaction(name="anthropic"): client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + stored_messages = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert content[1] == { "type": "uri", @@ -3585,14 +3658,15 @@ def test_message_with_url_image(sentry_init, capture_events): } -def test_message_with_file_image(sentry_init, capture_events): +def test_message_with_file_image(sentry_init, capture_items): """Test that messages with file_id-referenced images are properly captured.""" sentry_init( 
integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3616,11 +3690,10 @@ def test_message_with_file_image(sentry_init, capture_events): with start_transaction(name="anthropic"): client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + stored_messages = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert content[1] == { "type": "file", @@ -3630,14 +3703,15 @@ def test_message_with_file_image(sentry_init, capture_events): } -def test_message_with_base64_pdf(sentry_init, capture_events): +def test_message_with_base64_pdf(sentry_init, capture_items): """Test that messages with base64-encoded PDF documents are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3651,7 +3725,7 @@ def test_message_with_base64_pdf(sentry_init, capture_events): "source": { "type": "base64", "media_type": "application/pdf", - "data": "JVBERi0xLjQKJeLj...base64pdfdata", + "data": "JVBERi0xLjQKJeLj...base64pdfdata", }, }, ], @@ -3661,11 +3735,10 @@ def test_message_with_base64_pdf(sentry_init, capture_events): with start_transaction(name="anthropic"): client.messages.create(max_tokens=1024, 
messages=messages, model="model") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + stored_messages = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert content[1] == { "type": "blob", @@ -3675,14 +3748,15 @@ def test_message_with_base64_pdf(sentry_init, capture_events): } -def test_message_with_url_pdf(sentry_init, capture_events): +def test_message_with_url_pdf(sentry_init, capture_items): """Test that messages with URL-referenced PDF documents are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3705,11 +3779,10 @@ def test_message_with_url_pdf(sentry_init, capture_events): with start_transaction(name="anthropic"): client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + stored_messages = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert content[1] == { "type": "uri", @@ -3719,14 +3792,15 @@ def test_message_with_url_pdf(sentry_init, capture_events): } -def test_message_with_file_document(sentry_init, capture_events): +def test_message_with_file_document(sentry_init, capture_items): """Test that messages with file_id-referenced documents are properly captured.""" sentry_init( 
integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3750,11 +3824,10 @@ def test_message_with_file_document(sentry_init, capture_events): with start_transaction(name="anthropic"): client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + stored_messages = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert content[1] == { "type": "file", @@ -3764,14 +3837,15 @@ def test_message_with_file_document(sentry_init, capture_events): } -def test_message_with_mixed_content(sentry_init, capture_events): +def test_message_with_mixed_content(sentry_init, capture_items): """Test that messages with mixed content (text, images, documents) are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3785,7 +3859,7 @@ def test_message_with_mixed_content(sentry_init, capture_events): "source": { "type": "base64", "media_type": "image/png", - "data": "iVBORw0KGgo...base64imagedata", + "data": "iVBORw0KGgo...base64imagedata", }, }, { @@ -3800,7 +3874,7 @@ def test_message_with_mixed_content(sentry_init, capture_events): "source": { "type": "base64", "media_type": "application/pdf", - "data": 
"JVBERi0xLjQK...base64pdfdata", + "data": "JVBERi0xLjQK...base64pdfdata", }, }, {"type": "text", "text": "Please provide a detailed analysis."}, @@ -3811,11 +3885,10 @@ def test_message_with_mixed_content(sentry_init, capture_events): with start_transaction(name="anthropic"): client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + stored_messages = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert len(content) == 5 @@ -3847,14 +3920,15 @@ def test_message_with_mixed_content(sentry_init, capture_events): } -def test_message_with_multiple_images_different_formats(sentry_init, capture_events): +def test_message_with_multiple_images_different_formats(sentry_init, capture_items): """Test that messages with multiple images of different source types are handled.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3867,7 +3941,7 @@ def test_message_with_multiple_images_different_formats(sentry_init, capture_eve "source": { "type": "base64", "media_type": "image/jpeg", - "data": "base64data1...", + "data": "base64data1...", }, }, { @@ -3893,11 +3967,10 @@ def test_message_with_multiple_images_different_formats(sentry_init, capture_eve with start_transaction(name="anthropic"): client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + spans = [item.payload for item in items if 
item.type == "span"] + (span,) = spans - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + stored_messages = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert len(content) == 4 @@ -3922,14 +3995,15 @@ def test_message_with_multiple_images_different_formats(sentry_init, capture_eve assert content[3] == {"type": "text", "text": "Compare these three images."} -def test_binary_content_not_stored_when_pii_disabled(sentry_init, capture_events): +def test_binary_content_not_stored_when_pii_disabled(sentry_init, capture_items): """Test that binary content is not stored when send_default_pii is False.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=False, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3943,7 +4017,7 @@ def test_binary_content_not_stored_when_pii_disabled(sentry_init, capture_events "source": { "type": "base64", "media_type": "image/jpeg", - "data": "base64encodeddatahere...", + "data": "base64encodeddatahere...", }, }, ], @@ -3953,22 +4027,22 @@ def test_binary_content_not_stored_when_pii_disabled(sentry_init, capture_events with start_transaction(name="anthropic"): client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans # Messages should not be stored - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] -def test_binary_content_not_stored_when_prompts_disabled(sentry_init, capture_events): +def test_binary_content_not_stored_when_prompts_disabled(sentry_init, capture_items): 
"""Test that binary content is not stored when include_prompts is False.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3982,7 +4056,7 @@ def test_binary_content_not_stored_when_prompts_disabled(sentry_init, capture_ev "source": { "type": "base64", "media_type": "image/jpeg", - "data": "base64encodeddatahere...", + "data": "base64encodeddatahere...", }, }, ], @@ -3992,18 +4066,21 @@ def test_binary_content_not_stored_when_prompts_disabled(sentry_init, capture_ev with start_transaction(name="anthropic"): client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans # Messages should not be stored - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] -def test_cache_tokens_nonstreaming(sentry_init, capture_events): +def test_cache_tokens_nonstreaming(sentry_init, capture_items): """Test cache read/write tokens are tracked for non-streaming responses.""" - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock( @@ -4029,16 +4106,16 @@ def test_cache_tokens_nonstreaming(sentry_init, capture_events): model="claude-3-5-sonnet-20241022", ) - (span,) = events[0]["spans"] + (span,) = [item.payload for item in items if item.type == "span"] # 
input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 50 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 50 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 -def test_input_tokens_include_cache_write_nonstreaming(sentry_init, capture_events): +def test_input_tokens_include_cache_write_nonstreaming(sentry_init, capture_items): """ Test that gen_ai.usage.input_tokens includes cache_write tokens (non-streaming). 
@@ -4050,8 +4127,12 @@ def test_input_tokens_include_cache_write_nonstreaming(sentry_init, capture_even Usage(input_tokens=19, output_tokens=14, cache_creation_input_tokens=2846, cache_read_input_tokens=0) """ - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock( @@ -4077,16 +4158,16 @@ def test_input_tokens_include_cache_write_nonstreaming(sentry_init, capture_even model="claude-sonnet-4-20250514", ) - (span,) = events[0]["spans"] + (span,) = [item.payload for item in items if item.type == "span"] # input_tokens should be total: 19 (non-cached) + 2846 (cache_write) = 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 0 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 2846 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 0 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 2846 -def test_input_tokens_include_cache_read_nonstreaming(sentry_init, capture_events): +def test_input_tokens_include_cache_read_nonstreaming(sentry_init, capture_items): """ Test that gen_ai.usage.input_tokens includes cache_read tokens (non-streaming). 
@@ -4098,8 +4179,12 @@ def test_input_tokens_include_cache_read_nonstreaming(sentry_init, capture_event Usage(input_tokens=19, output_tokens=14, cache_creation_input_tokens=0, cache_read_input_tokens=2846) """ - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock( @@ -4125,18 +4210,18 @@ def test_input_tokens_include_cache_read_nonstreaming(sentry_init, capture_event model="claude-sonnet-4-20250514", ) - (span,) = events[0]["spans"] + (span,) = [item.payload for item in items if item.type == "span"] # input_tokens should be total: 19 (non-cached) + 2846 (cache_read) = 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 def test_input_tokens_include_cache_read_streaming( sentry_init, - capture_events, + capture_items, get_model_response, server_side_event_chunks, ): @@ -4175,8 +4260,12 @@ def test_input_tokens_include_cache_read_streaming( ) ) - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("transaction", "span") with 
mock.patch.object( client._client, @@ -4192,18 +4281,18 @@ def test_input_tokens_include_cache_read_streaming( ): pass - (span,) = events[0]["spans"] + (span,) = [item.payload for item in items if item.type == "span"] # input_tokens should be total: 19 + 2846 = test_stream_messages_input_tokens_include_cache_read_streaming - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 def test_stream_messages_input_tokens_include_cache_read_streaming( sentry_init, - capture_events, + capture_items, get_model_response, server_side_event_chunks, ): @@ -4241,8 +4330,12 @@ def test_stream_messages_input_tokens_include_cache_read_streaming( ) ) - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("transaction", "span") with mock.patch.object( client._client, @@ -4258,24 +4351,28 @@ def test_stream_messages_input_tokens_include_cache_read_streaming( for event in stream: pass - (span,) = events[0]["spans"] + (span,) = [item.payload for item in items if item.type == "span"] # input_tokens should be total: 19 + 2846 = 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 
- assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 -def test_input_tokens_unchanged_without_caching(sentry_init, capture_events): +def test_input_tokens_unchanged_without_caching(sentry_init, capture_items): """ Test that input_tokens is unchanged when there are no cached tokens. Real Anthropic response (from E2E test, simple call without caching): Usage(input_tokens=20, output_tokens=12) """ - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("transaction", "span") client = Anthropic(api_key="z") client.messages._post = mock.Mock( @@ -4299,15 +4396,15 @@ def test_input_tokens_unchanged_without_caching(sentry_init, capture_events): model="claude-sonnet-4-20250514", ) - (span,) = events[0]["spans"] + (span,) = [item.payload for item in items if item.type == "span"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 32 # 20 + 12 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 32 # 20 + 12 def test_cache_tokens_streaming( sentry_init, - capture_events, + capture_items, get_model_response, server_side_event_chunks, ): @@ -4342,8 +4439,12 @@ def test_cache_tokens_streaming( ) ) - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + 
_experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("transaction", "span") with mock.patch.object( client._client, @@ -4359,17 +4460,17 @@ def test_cache_tokens_streaming( ): pass - (span,) = events[0]["spans"] + (span,) = [item.payload for item in items if item.type == "span"] # input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 210 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 210 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 def test_stream_messages_cache_tokens( - sentry_init, capture_events, get_model_response, server_side_event_chunks + sentry_init, capture_items, get_model_response, server_side_event_chunks ): """Test cache tokens are tracked for streaming responses.""" client = Anthropic(api_key="z") @@ -4402,8 +4503,12 @@ def test_stream_messages_cache_tokens( ) ) - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("transaction", "span") with mock.patch.object( client._client, @@ -4419,10 +4524,10 @@ def test_stream_messages_cache_tokens( for event in stream: pass - (span,) = events[0]["spans"] + (span,) = [item.payload for item in items if item.type == "span"] # input_tokens normalized: 100 + 80 (cache_read) + 
20 (cache_write) = 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 210 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 210 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 diff --git a/tests/integrations/google_genai/test_google_genai.py b/tests/integrations/google_genai/test_google_genai.py index 6e91ba6634..ae31fe565b 100644 --- a/tests/integrations/google_genai/test_google_genai.py +++ b/tests/integrations/google_genai/test_google_genai.py @@ -124,14 +124,15 @@ def create_test_config( ], ) def test_nonstreaming_generate_content( - sentry_init, capture_events, send_default_pii, include_prompts, mock_genai_client + sentry_init, capture_items, send_default_pii, include_prompts, mock_genai_client ): sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Mock the HTTP response at the _api_client.request() level mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -146,38 +147,37 @@ def test_nonstreaming_generate_content( mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents="Tell me a joke", config=config ) - assert len(events) == 1 - (event,) = events - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == 
"transaction") assert event["transaction"] == "google_genai" - assert len(event["spans"]) == 1 - chat_span = event["spans"][0] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + chat_span = next(item.payload for item in items if item.type == "span") # Check chat span - assert chat_span["op"] == OP.GEN_AI_CHAT - assert chat_span["description"] == "chat gemini-1.5-flash" - assert chat_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert chat_span["data"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" - assert chat_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gemini-1.5-flash" + assert chat_span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert chat_span["name"] == "chat gemini-1.5-flash" + assert chat_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert chat_span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" + assert chat_span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gemini-1.5-flash" if send_default_pii and include_prompts: # Response text is stored as a JSON array - response_text = chat_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + response_text = chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] # Parse the JSON array response_texts = json.loads(response_text) assert response_texts == ["Hello! 
How can I help you today?"] else: - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_span["attributes"] # Check token usage - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 # Output tokens now include reasoning tokens: candidates_token_count (20) + thoughts_token_count (3) = 23 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 23 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 23 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3 @pytest.mark.parametrize("generate_content_config", (False, True)) @@ -210,7 +210,7 @@ def test_nonstreaming_generate_content( ) def test_generate_content_with_system_instruction( sentry_init, - capture_events, + capture_items, mock_genai_client, generate_content_config, system_instructions, @@ -220,8 +220,9 @@ def test_generate_content_with_system_instruction( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -243,16 +244,15 @@ def test_generate_content_with_system_instruction( config=config, ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") if expected_texts is None: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in 
invoke_span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_span["attributes"] return # (PII is enabled and include_prompts is True in this test) system_instructions = json.loads( - invoke_span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + invoke_span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] ) assert system_instructions == [ @@ -260,12 +260,13 @@ def test_generate_content_with_system_instruction( ] -def test_generate_content_with_tools(sentry_init, capture_events, mock_genai_client): +def test_generate_content_with_tools(sentry_init, capture_items, mock_genai_client): sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") # Create a mock tool function def get_weather(location: str) -> str: @@ -319,18 +320,17 @@ def get_weather(location: str) -> str: model="gemini-1.5-flash", contents="What's the weather?", config=config ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") # Check that tools are recorded (data is serialized as a string) - tools_data_str = invoke_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] + tools_data_str = invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] # Parse the JSON string to verify content tools_data = json.loads(tools_data_str) assert len(tools_data) == 2 # The order of tools may not be guaranteed, so sort by name and description for comparison sorted_tools = sorted( - tools_data, key=lambda t: (t.get("name", ""), t.get("description", "")) + tools_data, key=lambda t: (t.get("name", ""), t.get("description", "")) ) # The function tool @@ -342,13 +342,14 @@ def get_weather(location: str) -> str: assert sorted_tools[1]["description"] == "Get weather information (tool object)" -def test_tool_execution(sentry_init, capture_events): +def test_tool_execution(sentry_init, capture_items): 
sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") # Create a mock tool function def get_weather(location: str) -> str: @@ -366,25 +367,26 @@ def get_weather(location: str) -> str: assert result == "The weather in San Francisco is sunny" - (event,) = events - assert len(event["spans"]) == 1 - tool_span = event["spans"][0] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + tool_span = next(item.payload for item in items if item.type == "span") - assert tool_span["op"] == OP.GEN_AI_EXECUTE_TOOL - assert tool_span["description"] == "execute_tool get_weather" - assert tool_span["data"][SPANDATA.GEN_AI_TOOL_NAME] == "get_weather" + assert tool_span["attributes"]["sentry.op"] == OP.GEN_AI_EXECUTE_TOOL + assert tool_span["name"] == "execute_tool get_weather" + assert tool_span["attributes"][SPANDATA.GEN_AI_TOOL_NAME] == "get_weather" assert ( - tool_span["data"][SPANDATA.GEN_AI_TOOL_DESCRIPTION] + tool_span["attributes"][SPANDATA.GEN_AI_TOOL_DESCRIPTION] == "Get the weather for a location" ) -def test_error_handling(sentry_init, capture_events, mock_genai_client): +def test_error_handling(sentry_init, capture_items, mock_genai_client): sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("event", "transaction") # Mock an error at the HTTP level with mock.patch.object( @@ -399,8 +401,8 @@ def test_error_handling(sentry_init, capture_events, mock_genai_client): ) # Should have both transaction and error events - assert len(events) == 2 - error_event, transaction_event = events + assert len([item for item in items if item.type == "transaction"]) == 1 + (error_event,) = (item.payload for item in items if item.type == "event") assert 
error_event["level"] == "error" assert error_event["exception"]["values"][0]["type"] == "Exception" @@ -408,14 +410,15 @@ def test_error_handling(sentry_init, capture_events, mock_genai_client): assert error_event["exception"]["values"][0]["mechanism"]["type"] == "google_genai" -def test_streaming_generate_content(sentry_init, capture_events, mock_genai_client): +def test_streaming_generate_content(sentry_init, capture_items, mock_genai_client): """Test streaming with generate_content_stream, verifying chunk accumulation.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") # Create streaming chunks - simulating a multi-chunk response # Chunk 1: First part of text with partial usage metadata @@ -497,40 +500,42 @@ def test_streaming_generate_content(sentry_init, capture_events, mock_genai_clie assert collected_chunks[1].candidates[0].content.parts[0].text == "How can I " assert collected_chunks[2].candidates[0].content.parts[0].text == "help you today?" - (event,) = events - - assert len(event["spans"]) == 1 - chat_span = event["spans"][0] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + chat_span = next(item.payload for item in items if item.type == "span") # Check that streaming flag is set on both spans - assert chat_span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True # Verify accumulated response text (all chunks combined) expected_full_text = "Hello! How can I help you today?" 
# Response text is stored as a JSON string - chat_response_text = json.loads(chat_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]) + chat_response_text = json.loads( + chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + ) assert chat_response_text == [expected_full_text] # Verify finish reasons (only the final chunk has a finish reason) # When there's a single finish reason, it's stored as a plain string (not JSON) - assert SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS in chat_span["data"] - assert chat_span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == "STOP" - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 25 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3 + assert SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS in chat_span["attributes"] + assert chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == "STOP" + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 25 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3 # Verify model name - assert chat_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gemini-1.5-flash" + assert chat_span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gemini-1.5-flash" -def test_span_origin(sentry_init, capture_events, mock_genai_client): +def test_span_origin(sentry_init, capture_items, mock_genai_client): sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", 
"transaction") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -543,22 +548,22 @@ def test_span_origin(sentry_init, capture_events, mock_genai_client): model="gemini-1.5-flash", contents="Test origin", config=config ) - (event,) = events - + (event,) = (item.payload for item in items if item.type == "transaction") assert event["contexts"]["trace"]["origin"] == "manual" - for span in event["spans"]: - assert span["origin"] == "auto.ai.google_genai" + spans = [item.payload for item in items if item.type == "span"] + for span in spans: + assert span["attributes"]["sentry.origin"] == "auto.ai.google_genai" -def test_response_without_usage_metadata( - sentry_init, capture_events, mock_genai_client -): + +def test_response_without_usage_metadata(sentry_init, capture_items, mock_genai_client): """Test handling of responses without usage metadata""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") # Response without usage metadata response_json = { @@ -584,23 +589,23 @@ def test_response_without_usage_metadata( model="gemini-1.5-flash", contents="Test", config=config ) - (event,) = events - chat_span = event["spans"][0] + chat_span = next(item.payload for item in items if item.type == "span") # Usage data should not be present - assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in chat_span["data"] - assert SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS not in chat_span["data"] - assert SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS not in chat_span["data"] + assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in chat_span["attributes"] + assert SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS not in chat_span["attributes"] + assert SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS not in chat_span["attributes"] -def test_multiple_candidates(sentry_init, capture_events, mock_genai_client): +def test_multiple_candidates(sentry_init, capture_items, mock_genai_client): 
"""Test handling of multiple response candidates""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") # Response with multiple candidates multi_candidate_json = { @@ -638,12 +643,11 @@ def test_multiple_candidates(sentry_init, capture_events, mock_genai_client): model="gemini-1.5-flash", contents="Generate multiple", config=config ) - (event,) = events - chat_span = event["spans"][0] + chat_span = next(item.payload for item in items if item.type == "span") # Should capture all responses # Response text is stored as a JSON string when there are multiple responses - response_text = chat_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + response_text = chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] if isinstance(response_text, str) and response_text.startswith("["): # It's a JSON array response_list = json.loads(response_text) @@ -654,18 +658,19 @@ def test_multiple_candidates(sentry_init, capture_events, mock_genai_client): # Finish reasons are serialized as JSON finish_reasons = json.loads( - chat_span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] + chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] ) assert finish_reasons == ["STOP", "MAX_TOKENS"] -def test_all_configuration_parameters(sentry_init, capture_events, mock_genai_client): +def test_all_configuration_parameters(sentry_init, capture_items, mock_genai_client): """Test that all configuration parameters are properly recorded""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -686,26 +691,26 @@ def test_all_configuration_parameters(sentry_init, capture_events, mock_genai_cl model="gemini-1.5-flash", 
contents="Test all params", config=config ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") # Check all parameters are recorded - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.8 - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.95 - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_TOP_K] == 40 - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 2048 - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_SEED] == 12345 + assert invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.8 + assert invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.95 + assert invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_K] == 40 + assert invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 2048 + assert invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_SEED] == 12345 -def test_empty_response(sentry_init, capture_events, mock_genai_client): +def test_empty_response(sentry_init, capture_items, mock_genai_client): """Test handling of minimal response with no content""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") # Minimal response with empty candidates array minimal_response_json = {"candidates": []} @@ -723,20 +728,21 @@ def test_empty_response(sentry_init, capture_events, mock_genai_client): assert response is not None assert len(response.candidates) == 0 - (event,) = events # Should still create spans even with empty candidates - assert 
len(event["spans"]) == 1 + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 def test_response_with_different_id_fields( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """Test handling of different response ID field names""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") # Response with response_id and model_version response_json = { @@ -763,20 +769,22 @@ def test_response_with_different_id_fields( model="gemini-1.5-flash", contents="Test", config=create_test_config() ) - (event,) = events - chat_span = event["spans"][0] + chat_span = next(item.payload for item in items if item.type == "span") - assert chat_span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "resp-456" - assert chat_span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gemini-1.5-flash-001" + assert chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] == "resp-456" + assert ( + chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] + == "gemini-1.5-flash-001" + ) -def test_tool_with_async_function(sentry_init, capture_events): +def test_tool_with_async_function(sentry_init): """Test that async tool functions are properly wrapped""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - capture_events() # Create an async tool function async def async_tool(param: str) -> str: @@ -792,14 +800,15 @@ async def async_tool(param: str) -> str: assert hasattr(wrapped_async_tool, "__wrapped__") # Should preserve original -def test_contents_as_none(sentry_init, capture_events, mock_genai_client): +def test_contents_as_none(sentry_init, capture_items, mock_genai_client): """Test handling when contents parameter is None""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, 
send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -811,22 +820,22 @@ def test_contents_as_none(sentry_init, capture_events, mock_genai_client): model="gemini-1.5-flash", contents=None, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") # Should handle None contents gracefully - messages = invoke_span["data"].get(SPANDATA.GEN_AI_REQUEST_MESSAGES, []) + messages = invoke_span["attributes"].get(SPANDATA.GEN_AI_REQUEST_MESSAGES, []) # Should only have system message if any, not user message assert all(msg["role"] != "user" or msg["content"] is not None for msg in messages) -def test_tool_calls_extraction(sentry_init, capture_events, mock_genai_client): +def test_tool_calls_extraction(sentry_init, capture_items, mock_genai_client): """Test extraction of tool/function calls from response""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") # Response with function calls function_call_response_json = { @@ -875,14 +884,17 @@ def test_tool_calls_extraction(sentry_init, capture_events, mock_genai_client): config=create_test_config(), ) - (event,) = events - chat_span = event["spans"][0] # The chat span + chat_span = next( + item.payload for item in items if item.type == "span" + ) # The chat span # Check that tool calls are extracted and stored - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_span["attributes"] # Parse the JSON string to verify content - tool_calls = json.loads(chat_span["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS]) + tool_calls = json.loads( + 
chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + ) assert len(tool_calls) == 2 @@ -902,16 +914,15 @@ def test_tool_calls_extraction(sentry_init, capture_events, mock_genai_client): assert json.loads(tool_calls[1]["arguments"]) == {"timezone": "PST"} -def test_google_genai_message_truncation( - sentry_init, capture_events, mock_genai_client -): +def test_google_genai_message_truncation(sentry_init, capture_items, mock_genai_client): """Test that large messages are truncated properly in Google GenAI integration.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") large_content = ( "This is a very long message that will exceed our size limits. " * 1000 @@ -930,11 +941,10 @@ def test_google_genai_message_truncation( config=create_test_config(), ) - (event,) = events - invoke_span = event["spans"][0] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["data"] + invoke_span = next(item.payload for item in items if item.type == "span") + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["attributes"] - messages_data = invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + messages_data = invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] assert isinstance(messages_data, str) parsed_messages = json.loads(messages_data) @@ -980,14 +990,15 @@ def test_google_genai_message_truncation( ], ) def test_embed_content( - sentry_init, capture_events, send_default_pii, include_prompts, mock_genai_client + sentry_init, capture_items, send_default_pii, include_prompts, mock_genai_client ): sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Mock the HTTP response 
at the _api_client.request() level mock_http_response = create_mock_http_response(EXAMPLE_EMBED_RESPONSE_JSON) @@ -1006,47 +1017,50 @@ def test_embed_content( ], ) - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "google_genai_embeddings" # Should have 1 span for embeddings - assert len(event["spans"]) == 1 - (embed_span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (embed_span,) = spans # Check embeddings span - assert embed_span["op"] == OP.GEN_AI_EMBEDDINGS - assert embed_span["description"] == "embeddings text-embedding-004" - assert embed_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" - assert embed_span["data"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" - assert embed_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-004" + assert embed_span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + assert embed_span["name"] == "embeddings text-embedding-004" + assert embed_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert embed_span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" + assert ( + embed_span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-004" + ) # Check input texts if PII is allowed if send_default_pii and include_prompts: - input_texts = json.loads(embed_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) + input_texts = json.loads( + embed_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + ) assert input_texts == [ "What is your name?", "What is your favorite color?", ] else: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embed_span["data"] + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embed_span["attributes"] # Check usage data (sum of token counts from statistics: 10 + 15 = 25) # Note: Only available in newer versions with ContentEmbeddingStatistics - if 
SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["data"]: - assert embed_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 25 + if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["attributes"]: + assert embed_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 25 -def test_embed_content_string_input(sentry_init, capture_events, mock_genai_client): +def test_embed_content_string_input(sentry_init, capture_items, mock_genai_client): """Test embed_content with a single string instead of list.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") # Mock response with single embedding single_embed_response = { @@ -1074,25 +1088,26 @@ def test_embed_content_string_input(sentry_init, capture_events, mock_genai_clie contents="Single text input", ) - (event,) = events - (embed_span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (embed_span,) = spans # Check that single string is handled correctly - input_texts = json.loads(embed_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) + input_texts = json.loads(embed_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) assert input_texts == ["Single text input"] # Should use token_count from statistics (5), not billable_character_count (10) # Note: Only available in newer versions with ContentEmbeddingStatistics - if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["data"]: - assert embed_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["attributes"]: + assert embed_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 -def test_embed_content_error_handling(sentry_init, capture_events, mock_genai_client): +def test_embed_content_error_handling(sentry_init, capture_items, mock_genai_client): """Test error handling in embed_content.""" sentry_init( 
integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "event") # Mock an error at the HTTP level with mock.patch.object( @@ -1108,8 +1123,8 @@ def test_embed_content_error_handling(sentry_init, capture_events, mock_genai_cl ) # Should have both transaction and error events - assert len(events) == 2 - error_event, _ = events + assert len([item for item in items if item.type == "transaction"]) == 1 + (error_event,) = (item.payload for item in items if item.type == "event") assert error_event["level"] == "error" assert error_event["exception"]["values"][0]["type"] == "Exception" @@ -1118,14 +1133,15 @@ def test_embed_content_error_handling(sentry_init, capture_events, mock_genai_cl def test_embed_content_without_statistics( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """Test embed_content response without statistics (older package versions).""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") # Response without statistics (typical for older google-genai versions) # Embeddings exist but don't have the statistics field @@ -1150,21 +1166,22 @@ def test_embed_content_without_statistics( contents=["Test without statistics", "Another test"], ) - (event,) = events - (embed_span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (embed_span,) = spans # No usage tokens since there are no statistics in older versions # This is expected and the integration should handle it gracefully - assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in embed_span["data"] + assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in embed_span["attributes"] -def test_embed_content_span_origin(sentry_init, capture_events, mock_genai_client): +def 
test_embed_content_span_origin(sentry_init, capture_items, mock_genai_client): """Test that embed_content spans have correct origin.""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") mock_http_response = create_mock_http_response(EXAMPLE_EMBED_RESPONSE_JSON) @@ -1177,11 +1194,12 @@ def test_embed_content_span_origin(sentry_init, capture_events, mock_genai_clien contents=["Test origin"], ) - (event,) = events - + (event,) = (item.payload for item in items if item.type == "transaction") assert event["contexts"]["trace"]["origin"] == "manual" - for span in event["spans"]: - assert span["origin"] == "auto.ai.google_genai" + + spans = [item.payload for item in items if item.type == "span"] + for span in spans: + assert span["attributes"]["sentry.origin"] == "auto.ai.google_genai" @pytest.mark.asyncio @@ -1195,15 +1213,16 @@ def test_embed_content_span_origin(sentry_init, capture_events, mock_genai_clien ], ) async def test_async_embed_content( - sentry_init, capture_events, send_default_pii, include_prompts, mock_genai_client + sentry_init, capture_items, send_default_pii, include_prompts, mock_genai_client ): """Test async embed_content method.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Mock the async HTTP response mock_http_response = create_mock_http_response(EXAMPLE_EMBED_RESPONSE_JSON) @@ -1222,50 +1241,53 @@ async def test_async_embed_content( ], ) - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "google_genai_embeddings_async" # Should have 1 
span for embeddings - assert len(event["spans"]) == 1 - (embed_span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (embed_span,) = spans # Check embeddings span - assert embed_span["op"] == OP.GEN_AI_EMBEDDINGS - assert embed_span["description"] == "embeddings text-embedding-004" - assert embed_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" - assert embed_span["data"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" - assert embed_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-004" + assert embed_span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + assert embed_span["name"] == "embeddings text-embedding-004" + assert embed_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert embed_span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" + assert ( + embed_span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-004" + ) # Check input texts if PII is allowed if send_default_pii and include_prompts: - input_texts = json.loads(embed_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) + input_texts = json.loads( + embed_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + ) assert input_texts == [ "What is your name?", "What is your favorite color?", ] else: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embed_span["data"] + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embed_span["attributes"] # Check usage data (sum of token counts from statistics: 10 + 15 = 25) # Note: Only available in newer versions with ContentEmbeddingStatistics - if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["data"]: - assert embed_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 25 + if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["attributes"]: + assert embed_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 25 @pytest.mark.asyncio async def test_async_embed_content_string_input( - sentry_init, capture_events, mock_genai_client + sentry_init, 
capture_items, mock_genai_client ): """Test async embed_content with a single string instead of list.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") # Mock response with single embedding single_embed_response = { @@ -1293,28 +1315,29 @@ async def test_async_embed_content_string_input( contents="Single text input", ) - (event,) = events - (embed_span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (embed_span,) = spans # Check that single string is handled correctly - input_texts = json.loads(embed_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) + input_texts = json.loads(embed_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) assert input_texts == ["Single text input"] # Should use token_count from statistics (5), not billable_character_count (10) # Note: Only available in newer versions with ContentEmbeddingStatistics - if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["data"]: - assert embed_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["attributes"]: + assert embed_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 @pytest.mark.asyncio async def test_async_embed_content_error_handling( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """Test error handling in async embed_content.""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "event") # Mock an error at the HTTP level with mock.patch.object( @@ -1330,8 +1353,8 @@ async def test_async_embed_content_error_handling( ) # Should have both transaction and error events - assert len(events) == 2 - error_event, _ = events + assert 
len([item for item in items if item.type == "transaction"]) == 1 + (error_event,) = (item.payload for item in items if item.type == "event") assert error_event["level"] == "error" assert error_event["exception"]["values"][0]["type"] == "Exception" @@ -1341,14 +1364,15 @@ async def test_async_embed_content_error_handling( @pytest.mark.asyncio async def test_async_embed_content_without_statistics( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """Test async embed_content response without statistics (older package versions).""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") # Response without statistics (typical for older google-genai versions) # Embeddings exist but don't have the statistics field @@ -1373,24 +1397,25 @@ async def test_async_embed_content_without_statistics( contents=["Test without statistics", "Another test"], ) - (event,) = events - (embed_span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (embed_span,) = spans # No usage tokens since there are no statistics in older versions # This is expected and the integration should handle it gracefully - assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in embed_span["data"] + assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in embed_span["attributes"] @pytest.mark.asyncio async def test_async_embed_content_span_origin( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """Test that async embed_content spans have correct origin.""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") mock_http_response = create_mock_http_response(EXAMPLE_EMBED_RESPONSE_JSON) @@ -1403,24 +1428,26 @@ async def 
test_async_embed_content_span_origin( contents=["Test origin"], ) - (event,) = events - + (event,) = [item.payload for item in items if item.type == "transaction"] assert event["contexts"]["trace"]["origin"] == "manual" - for span in event["spans"]: - assert span["origin"] == "auto.ai.google_genai" + + spans = [item.payload for item in items if item.type == "span"] + for span in spans: + assert span["attributes"]["sentry.origin"] == "auto.ai.google_genai" # Integration tests for generate_content with different input message formats def test_generate_content_with_content_object( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """Test generate_content with Content object input.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1437,10 +1464,9 @@ def test_generate_content_with_content_object( model="gemini-1.5-flash", contents=content, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages = json.loads(invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert messages[0]["content"] == [ @@ -1449,15 +1475,16 @@ def test_generate_content_with_content_object( def test_generate_content_with_dict_format( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """Test generate_content with dict format input (ContentDict).""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + 
_experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1472,10 +1499,9 @@ def test_generate_content_with_dict_format( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages = json.loads(invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert messages[0]["content"] == [ @@ -1483,16 +1509,15 @@ def test_generate_content_with_dict_format( ] -def test_generate_content_with_file_data( - sentry_init, capture_events, mock_genai_client -): +def test_generate_content_with_file_data(sentry_init, capture_items, mock_genai_client): """Test generate_content with file_data (external file reference).""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1516,10 +1541,9 @@ def test_generate_content_with_file_data( model="gemini-1.5-flash", contents=content, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages = json.loads(invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert len(messages[0]["content"]) == 2 @@ -1534,15 +1558,16 @@ def test_generate_content_with_file_data( def test_generate_content_with_inline_data( - 
sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """Test generate_content with inline_data (binary data).""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1565,10 +1590,9 @@ def test_generate_content_with_inline_data( model="gemini-1.5-flash", contents=content, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages = json.loads(invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert len(messages[0]["content"]) == 2 @@ -1580,15 +1604,16 @@ def test_generate_content_with_inline_data( def test_generate_content_with_function_response( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """Test generate_content with function_response (tool result).""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1622,10 +1647,9 @@ def test_generate_content_with_function_response( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages = 
json.loads(invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 # First message is user message assert messages[0]["role"] == "tool" @@ -1635,15 +1659,16 @@ def test_generate_content_with_function_response( def test_generate_content_with_mixed_string_and_content( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """Test generate_content with mixed string and Content objects in list.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1668,10 +1693,9 @@ def test_generate_content_with_mixed_string_and_content( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages = json.loads(invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 # User message assert messages[0]["role"] == "user" @@ -1679,15 +1703,16 @@ def test_generate_content_with_mixed_string_and_content( def test_generate_content_with_part_object_directly( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """Test generate_content with Part object directly (not wrapped in Content).""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1702,17 +1727,16 @@ def test_generate_content_with_part_object_directly( 
model="gemini-1.5-flash", contents=part, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages = json.loads(invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert messages[0]["content"] == [{"text": "Direct Part object", "type": "text"}] def test_generate_content_with_list_of_dicts( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """ Test generate_content with list of dict format inputs. @@ -1725,8 +1749,9 @@ def test_generate_content_with_list_of_dicts( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1745,25 +1770,25 @@ def test_generate_content_with_list_of_dicts( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages = json.loads(invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert messages[0]["content"] == [{"text": "Second user message", "type": "text"}] def test_generate_content_with_dict_inline_data( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): """Test generate_content with dict format containing inline_data.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + 
_experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1784,10 +1809,9 @@ def test_generate_content_with_dict_inline_data( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages = json.loads(invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert len(messages[0]["content"]) == 2 @@ -1801,14 +1825,15 @@ def test_generate_content_with_dict_inline_data( def test_generate_content_without_parts_property_inline_data( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, mock_genai_client ): sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1825,10 +1850,9 @@ def test_generate_content_without_parts_property_inline_data( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages = json.loads(invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 @@ -1845,14 +1869,15 @@ def test_generate_content_without_parts_property_inline_data( def test_generate_content_without_parts_property_inline_data_and_binary_data_within_string( - sentry_init, capture_events, mock_genai_client + sentry_init, capture_items, 
mock_genai_client ): sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1874,10 +1899,9 @@ def test_generate_content_without_parts_property_inline_data_and_binary_data_wit model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages = json.loads(invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" @@ -2162,7 +2186,9 @@ def test_extract_contents_messages_dict_inline_data(): """Test extract_contents_messages with dict containing inline_data""" content_dict = { "role": "user", - "parts": [{"inline_data": {"data": b"binary_data", "mime_type": "image/gif"}}], + "parts": [ + {"inline_data": {"attributes": b"binary_data", "mime_type": "image/gif"}} + ], } result = extract_contents_messages(content_dict) diff --git a/tests/integrations/huggingface_hub/test_huggingface_hub.py b/tests/integrations/huggingface_hub/test_huggingface_hub.py index 9dd15ca4b5..eaac8c1ab1 100644 --- a/tests/integrations/huggingface_hub/test_huggingface_hub.py +++ b/tests/integrations/huggingface_hub/test_huggingface_hub.py @@ -471,7 +471,7 @@ def mock_hf_chat_completion_api_streaming_tools(httpx_mock): @pytest.mark.parametrize("include_prompts", [True, False]) def test_text_generation( sentry_init: "Any", - capture_events: "Any", + capture_items: "Any", send_default_pii: "Any", include_prompts: "Any", mock_hf_text_generation_api: "Any", @@ -480,8 +480,9 @@ def test_text_generation( traces_sample_rate=1.0, send_default_pii=send_default_pii, 
integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = InferenceClient(model="test-model") @@ -492,23 +493,22 @@ def test_text_generation( details=True, ) - (transaction,) = events - + spans = [item.payload for item in items if item.type == "span"] span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): assert span is None, "there is exactly one gen_ai span" span = sp else: # there should be no other spans, just the gen_ai span # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" + assert sp["attributes"]["sentry.op"] == "http.client" assert span is not None - assert span["op"] == "gen_ai.text_completion" - assert span["description"] == "text_completion test-model" - assert span["origin"] == "auto.ai.huggingface_hub" + assert span["attributes"]["sentry.op"] == "gen_ai.text_completion" + assert span["name"] == "text_completion test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" expected_data = { "gen_ai.operation.name": "text_completion", @@ -516,6 +516,14 @@ def test_text_generation( "gen_ai.response.finish_reasons": "length", "gen_ai.response.streaming": False, "gen_ai.usage.total_tokens": 10, + "sentry.environment": "production", + "sentry.op": "gen_ai.text_completion", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": "2.58.0", + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", "thread.id": mock.ANY, "thread.name": mock.ANY, } @@ -528,10 +536,10 @@ def test_text_generation( assert "gen_ai.request.messages" not in expected_data assert "gen_ai.response.text" not in expected_data - assert span["data"] == expected_data + assert 
span["attributes"] == expected_data # text generation does not set the response model - assert "gen_ai.response.model" not in span["data"] + assert "gen_ai.response.model" not in span["attributes"] @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) @@ -539,7 +547,7 @@ def test_text_generation( @pytest.mark.parametrize("include_prompts", [True, False]) def test_text_generation_streaming( sentry_init: "Any", - capture_events: "Any", + capture_items: "Any", send_default_pii: "Any", include_prompts: "Any", mock_hf_text_generation_api_streaming: "Any", @@ -548,8 +556,9 @@ def test_text_generation_streaming( traces_sample_rate=1.0, send_default_pii=send_default_pii, integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = InferenceClient(model="test-model") @@ -561,23 +570,22 @@ def test_text_generation_streaming( ): pass - (transaction,) = events - + spans = [item.payload for item in items if item.type == "span"] span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): assert span is None, "there is exactly one gen_ai span" span = sp else: # there should be no other spans, just the gen_ai span # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" + assert sp["attributes"]["sentry.op"] == "http.client" assert span is not None - assert span["op"] == "gen_ai.text_completion" - assert span["description"] == "text_completion test-model" - assert span["origin"] == "auto.ai.huggingface_hub" + assert span["attributes"]["sentry.op"] == "gen_ai.text_completion" + assert span["name"] == "text_completion test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" expected_data = { "gen_ai.operation.name": "text_completion", @@ -585,6 +593,14 @@ def 
test_text_generation_streaming( "gen_ai.response.finish_reasons": "length", "gen_ai.response.streaming": True, "gen_ai.usage.total_tokens": 10, + "sentry.environment": "production", + "sentry.op": "gen_ai.text_completion", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", "thread.id": mock.ANY, "thread.name": mock.ANY, } @@ -597,10 +613,10 @@ def test_text_generation_streaming( assert "gen_ai.request.messages" not in expected_data assert "gen_ai.response.text" not in expected_data - assert span["data"] == expected_data + assert span["attributes"] == expected_data # text generation does not set the response model - assert "gen_ai.response.model" not in span["data"] + assert "gen_ai.response.model" not in span["attributes"] @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) @@ -608,7 +624,7 @@ def test_text_generation_streaming( @pytest.mark.parametrize("include_prompts", [True, False]) def test_chat_completion( sentry_init: "Any", - capture_events: "Any", + capture_items: "Any", send_default_pii: "Any", include_prompts: "Any", mock_hf_chat_completion_api: "Any", @@ -617,8 +633,9 @@ def test_chat_completion( traces_sample_rate=1.0, send_default_pii=send_default_pii, integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = get_hf_provider_inference_client() @@ -628,23 +645,22 @@ def test_chat_completion( stream=False, ) - (transaction,) = events - + spans = [item.payload for item in items if item.type == "span"] span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): assert span is None, "there is exactly one gen_ai span" span = sp else: # there 
should be no other spans, just the gen_ai span # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" + assert sp["attributes"]["sentry.op"] == "http.client" assert span is not None - assert span["op"] == "gen_ai.chat" - assert span["description"] == "chat test-model" - assert span["origin"] == "auto.ai.huggingface_hub" + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["name"] == "chat test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" expected_data = { "gen_ai.operation.name": "chat", @@ -655,6 +671,14 @@ def test_chat_completion( "gen_ai.usage.input_tokens": 10, "gen_ai.usage.output_tokens": 8, "gen_ai.usage.total_tokens": 18, + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", "thread.id": mock.ANY, "thread.name": mock.ANY, } @@ -671,7 +695,7 @@ def test_chat_completion( assert "gen_ai.request.messages" not in expected_data assert "gen_ai.response.text" not in expected_data - assert span["data"] == expected_data + assert span["attributes"] == expected_data @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) @@ -679,7 +703,7 @@ def test_chat_completion( @pytest.mark.parametrize("include_prompts", [True, False]) def test_chat_completion_streaming( sentry_init: "Any", - capture_events: "Any", + capture_items: "Any", send_default_pii: "Any", include_prompts: "Any", mock_hf_chat_completion_api_streaming: "Any", @@ -688,8 +712,9 @@ def test_chat_completion_streaming( traces_sample_rate=1.0, send_default_pii=send_default_pii, integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = 
get_hf_provider_inference_client() @@ -701,23 +726,22 @@ def test_chat_completion_streaming( ) ) - (transaction,) = events - + spans = [item.payload for item in items if item.type == "span"] span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): assert span is None, "there is exactly one gen_ai span" span = sp else: # there should be no other spans, just the gen_ai span # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" + assert sp["attributes"]["sentry.op"] == "http.client" assert span is not None - assert span["op"] == "gen_ai.chat" - assert span["description"] == "chat test-model" - assert span["origin"] == "auto.ai.huggingface_hub" + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["name"] == "chat test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" expected_data = { "gen_ai.operation.name": "chat", @@ -725,6 +749,14 @@ def test_chat_completion_streaming( "gen_ai.response.finish_reasons": "stop", "gen_ai.response.model": "test-model-123", "gen_ai.response.streaming": True, + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", "thread.id": mock.ANY, "thread.name": mock.ANY, } @@ -744,15 +776,15 @@ def test_chat_completion_streaming( assert "gen_ai.request.messages" not in expected_data assert "gen_ai.response.text" not in expected_data - assert span["data"] == expected_data + assert span["attributes"] == expected_data @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) def test_chat_completion_api_error( - sentry_init: "Any", capture_events: "Any", mock_hf_api_with_errors: "Any" + sentry_init: "Any", capture_items: 
"Any", mock_hf_api_with_errors: "Any" ) -> None: - sentry_init(traces_sample_rate=1.0) - events = capture_events() + sentry_init(traces_sample_rate=1.0, _experiments={"gen_ai_as_v2_spans": True}) + items = capture_items("event", "transaction", "span") client = get_hf_provider_inference_client() @@ -762,32 +794,29 @@ def test_chat_completion_api_error( messages=[{"role": "user", "content": "Hello!"}], ) - ( - error, - transaction, - ) = events - + (error,) = (item.payload for item in items if item.type == "event") assert error["exception"]["values"][0]["mechanism"]["type"] == "huggingface_hub" assert not error["exception"]["values"][0]["mechanism"]["handled"] + spans = [item.payload for item in items if item.type == "span"] span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): assert span is None, "there is exactly one gen_ai span" span = sp else: # there should be no other spans, just the gen_ai span # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" + assert sp["attributes"]["sentry.op"] == "http.client" assert span is not None - assert span["op"] == "gen_ai.chat" - assert span["description"] == "chat test-model" - assert span["origin"] == "auto.ai.huggingface_hub" - assert span["status"] == "internal_error" - assert span.get("tags", {}).get("status") == "internal_error" + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["name"] == "chat test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" + assert span["status"] == "error" + (transaction,) = (item.payload for item in items if item.type == "transaction") assert ( error["contexts"]["trace"]["trace_id"] == transaction["contexts"]["trace"]["trace_id"] @@ -795,18 +824,26 @@ def test_chat_completion_api_error( expected_data = { "gen_ai.operation.name": "chat", "gen_ai.request.model": "test-model", + 
"sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", "thread.id": mock.ANY, "thread.name": mock.ANY, } - assert span["data"] == expected_data + assert span["attributes"] == expected_data @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) def test_span_status_error( - sentry_init: "Any", capture_events: "Any", mock_hf_api_with_errors: "Any" + sentry_init: "Any", capture_items: "Any", mock_hf_api_with_errors: "Any" ) -> None: - sentry_init(traces_sample_rate=1.0) - events = capture_events() + sentry_init(traces_sample_rate=1.0, _experiments={"gen_ai_as_v2_spans": True}) + items = capture_items("event", "transaction", "span") client = get_hf_provider_inference_client() @@ -816,22 +853,22 @@ def test_span_status_error( messages=[{"role": "user", "content": "Hello!"}], ) - (error, transaction) = events + (error,) = [item.payload for item in items if item.type == "event"] assert error["level"] == "error" + spans = [item.payload for item in items if item.type == "span"] span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): assert span is None, "there is exactly one gen_ai span" span = sp else: # there should be no other spans, just the gen_ai span # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" + assert sp["attributes"]["sentry.op"] == "http.client" assert span is not None - assert span["status"] == "internal_error" - assert span["tags"]["status"] == "internal_error" + assert span["status"] == "error" @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) @@ -839,7 +876,7 @@ def test_span_status_error( @pytest.mark.parametrize("include_prompts", [True, False]) def 
test_chat_completion_with_tools( sentry_init: "Any", - capture_events: "Any", + capture_items: "Any", send_default_pii: "Any", include_prompts: "Any", mock_hf_chat_completion_api_tools: "Any", @@ -848,8 +885,9 @@ def test_chat_completion_with_tools( traces_sample_rate=1.0, send_default_pii=send_default_pii, integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = get_hf_provider_inference_client() @@ -875,23 +913,22 @@ def test_chat_completion_with_tools( tool_choice="auto", ) - (transaction,) = events - + spans = [item.payload for item in items if item.type == "span"] span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): assert span is None, "there is exactly one gen_ai span" span = sp else: # there should be no other spans, just the gen_ai span # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" + assert sp["attributes"]["sentry.op"] == "http.client" assert span is not None - assert span["op"] == "gen_ai.chat" - assert span["description"] == "chat test-model" - assert span["origin"] == "auto.ai.huggingface_hub" + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["name"] == "chat test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" expected_data = { "gen_ai.operation.name": "chat", @@ -902,6 +939,14 @@ def test_chat_completion_with_tools( "gen_ai.usage.input_tokens": 10, "gen_ai.usage.output_tokens": 8, "gen_ai.usage.total_tokens": 18, + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", 
"thread.id": mock.ANY, "thread.name": mock.ANY, } @@ -919,7 +964,7 @@ def test_chat_completion_with_tools( assert "gen_ai.response.text" not in expected_data assert "gen_ai.response.tool_calls" not in expected_data - assert span["data"] == expected_data + assert span["attributes"] == expected_data @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) @@ -927,7 +972,7 @@ def test_chat_completion_with_tools( @pytest.mark.parametrize("include_prompts", [True, False]) def test_chat_completion_streaming_with_tools( sentry_init: "Any", - capture_events: "Any", + capture_items: "Any", send_default_pii: "Any", include_prompts: "Any", mock_hf_chat_completion_api_streaming_tools: "Any", @@ -936,8 +981,9 @@ def test_chat_completion_streaming_with_tools( traces_sample_rate=1.0, send_default_pii=send_default_pii, integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = get_hf_provider_inference_client() @@ -966,23 +1012,22 @@ def test_chat_completion_streaming_with_tools( ) ) - (transaction,) = events - + spans = [item.payload for item in items if item.type == "span"] span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): assert span is None, "there is exactly one gen_ai span" span = sp else: # there should be no other spans, just the gen_ai span # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" + assert sp["attributes"]["sentry.op"] == "http.client" assert span is not None - assert span["op"] == "gen_ai.chat" - assert span["description"] == "chat test-model" - assert span["origin"] == "auto.ai.huggingface_hub" + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["name"] == "chat test-model" + assert span["attributes"]["sentry.origin"] == 
"auto.ai.huggingface_hub" expected_data = { "gen_ai.operation.name": "chat", @@ -991,6 +1036,14 @@ def test_chat_completion_streaming_with_tools( "gen_ai.response.finish_reasons": "tool_calls", "gen_ai.response.model": "test-model-123", "gen_ai.response.streaming": True, + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", "thread.id": mock.ANY, "thread.name": mock.ANY, } @@ -1014,4 +1067,4 @@ def test_chat_completion_streaming_with_tools( assert "gen_ai.response.text" not in expected_data assert "gen_ai.response.tool_calls" not in expected_data - assert span["data"] == expected_data + assert span["attributes"] == expected_data diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py index 498a5d6f4a..ef27d45767 100644 --- a/tests/integrations/langchain/test_langchain.py +++ b/tests/integrations/langchain/test_langchain.py @@ -97,7 +97,7 @@ def _llm_type(self) -> str: def test_langchain_text_completion( sentry_init, - capture_events, + capture_items, get_model_response, ): sentry_init( @@ -108,8 +108,9 @@ def test_langchain_text_completion( ], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") model_response = get_model_response( Completion( @@ -149,25 +150,29 @@ def test_langchain_text_completion( input_text = "What is the capital of France?" 
model.invoke(input_text, config={"run_name": "my-snazzy-pipeline"}) - tx = events[0] + tx = next(item.payload for item in items if item.type == "transaction") assert tx["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] llm_spans = [ span - for span in tx.get("spans", []) - if span.get("op") == "gen_ai.text_completion" + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.text_completion" ] assert len(llm_spans) > 0 llm_span = llm_spans[0] - assert llm_span["description"] == "text_completion gpt-3.5-turbo" - assert llm_span["data"]["gen_ai.system"] == "openai" - assert llm_span["data"]["gen_ai.pipeline.name"] == "my-snazzy-pipeline" - assert llm_span["data"]["gen_ai.request.model"] == "gpt-3.5-turbo" - assert llm_span["data"]["gen_ai.response.text"] == "The capital of France is Paris." - assert llm_span["data"]["gen_ai.usage.total_tokens"] == 25 - assert llm_span["data"]["gen_ai.usage.input_tokens"] == 10 - assert llm_span["data"]["gen_ai.usage.output_tokens"] == 15 + assert llm_span["name"] == "text_completion gpt-3.5-turbo" + assert llm_span["attributes"]["gen_ai.system"] == "openai" + assert llm_span["attributes"]["gen_ai.pipeline.name"] == "my-snazzy-pipeline" + assert llm_span["attributes"]["gen_ai.request.model"] == "gpt-3.5-turbo" + assert ( + llm_span["attributes"]["gen_ai.response.text"] + == "The capital of France is Paris." 
+ ) + assert llm_span["attributes"]["gen_ai.usage.total_tokens"] == 25 + assert llm_span["attributes"]["gen_ai.usage.input_tokens"] == 10 + assert llm_span["attributes"]["gen_ai.usage.output_tokens"] == 15 @pytest.mark.skipif( @@ -196,7 +201,7 @@ def test_langchain_text_completion( ) def test_langchain_create_agent( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, system_instructions_content, @@ -212,8 +217,9 @@ def test_langchain_create_agent( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") model_response = get_model_response( nonstreaming_responses_model_response, @@ -250,22 +256,23 @@ def test_langchain_create_agent( }, ) - tx = events[0] + tx = next(item.payload for item in items if item.type == "transaction") assert tx["type"] == "transaction" assert tx["contexts"]["trace"]["origin"] == "manual" - chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list(x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat") assert len(chat_spans) == 1 - assert chat_spans[0]["origin"] == "auto.ai.langchain" + assert chat_spans[0]["attributes"]["sentry.origin"] == "auto.ai.langchain" - assert chat_spans[0]["data"]["gen_ai.system"] == "openai-chat" - assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 10 - assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 20 - assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 30 + assert chat_spans[0]["attributes"]["gen_ai.system"] == "openai-chat" + assert chat_spans[0]["attributes"]["gen_ai.usage.input_tokens"] == 10 + assert chat_spans[0]["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert chat_spans[0]["attributes"]["gen_ai.usage.total_tokens"] == 30 if send_default_pii and include_prompts: assert ( - 
chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + chat_spans[0]["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hello, how can I help you?" ) @@ -276,7 +283,9 @@ def test_langchain_create_agent( "type": "text", "content": "You are very powerful assistant, but don't know current events", } - ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + ] == json.loads( + chat_spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) else: assert [ { @@ -287,11 +296,17 @@ def test_langchain_create_agent( "type": "text", "content": "Be concise and clear.", }, - ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + ] == json.loads( + chat_spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {}) + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("attributes", {}) @pytest.mark.skipif( @@ -309,7 +324,7 @@ def test_langchain_create_agent( ) def test_tool_execution_span( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -323,8 +338,9 @@ def test_tool_execution_span( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") responses = responses_tool_call_model_responses( tool_name="get_word_length", @@ -400,60 +416,71 @@ def test_tool_execution_span( }, ) - tx = events[0] + tx = next(item.payload for item in items if item.type == "transaction") assert tx["type"] == "transaction" assert 
tx["contexts"]["trace"]["origin"] == "manual" - chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list(x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat") assert len(chat_spans) == 2 - tool_exec_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool") + tool_exec_spans = list( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.execute_tool" + ) assert len(tool_exec_spans) == 1 tool_exec_span = tool_exec_spans[0] - assert chat_spans[0]["origin"] == "auto.ai.langchain" - assert chat_spans[1]["origin"] == "auto.ai.langchain" - assert tool_exec_span["origin"] == "auto.ai.langchain" + assert chat_spans[0]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert chat_spans[1]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert tool_exec_span["attributes"]["sentry.origin"] == "auto.ai.langchain" - assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142 - assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50 - assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192 - assert chat_spans[0]["data"]["gen_ai.system"] == "openai-chat" + assert chat_spans[0]["attributes"]["gen_ai.usage.input_tokens"] == 142 + assert chat_spans[0]["attributes"]["gen_ai.usage.output_tokens"] == 50 + assert chat_spans[0]["attributes"]["gen_ai.usage.total_tokens"] == 192 + assert chat_spans[0]["attributes"]["gen_ai.system"] == "openai-chat" - assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89 - assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28 - assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117 - assert chat_spans[1]["data"]["gen_ai.system"] == "openai-chat" + assert chat_spans[1]["attributes"]["gen_ai.usage.input_tokens"] == 89 + assert chat_spans[1]["attributes"]["gen_ai.usage.output_tokens"] == 28 + assert chat_spans[1]["attributes"]["gen_ai.usage.total_tokens"] == 117 
+ assert chat_spans[1]["attributes"]["gen_ai.system"] == "openai-chat" if send_default_pii and include_prompts: - assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT] + assert "word" in tool_exec_span["attributes"][SPANDATA.GEN_AI_TOOL_INPUT] - assert "5" in chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "5" in chat_spans[1]["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] # Verify tool calls are recorded when PII is enabled - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get("data", {}), ( + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get( + "attributes", {} + ), ( "Tool calls should be recorded when send_default_pii=True and include_prompts=True" ) - tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + tool_calls_data = chat_spans[0]["attributes"][ + SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS + ] assert isinstance(tool_calls_data, str) assert "get_word_length" in tool_calls_data else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("data", {}) - assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("data", {}) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("attributes", {}) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("attributes", {}) + assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("attributes", {}) + assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("attributes", {}) # Verify tool calls are NOT recorded when PII is disabled 
assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get( - "data", {} + "attributes", {} ), ( f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " f"and include_prompts={include_prompts}" ) assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get( - "data", {} + "attributes", {} ), ( f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " f"and include_prompts={include_prompts}" @@ -461,7 +488,7 @@ def test_tool_execution_span( # Verify that available tools are always recorded regardless of PII settings for chat_span in chat_spans: - tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] + tools_data = chat_span["attributes"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] assert "get_word_length" in tools_data @@ -488,7 +515,7 @@ def test_tool_execution_span( ) def test_langchain_openai_tools_agent( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, system_instructions_content, @@ -504,8 +531,9 @@ def test_langchain_openai_tools_agent( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") prompt = ChatPromptTemplate.from_messages( [ @@ -700,40 +728,47 @@ def test_langchain_openai_tools_agent( with start_transaction(): list(agent_executor.stream({"input": "How many letters in the word eudca"})) - tx = events[0] + tx = next(item.payload for item in items if item.type == "transaction") assert tx["type"] == "transaction" assert tx["contexts"]["trace"]["origin"] == "manual" - invoke_agent_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.invoke_agent") - chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") - tool_exec_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool") + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + x for x in spans if 
x["attributes"]["sentry.op"] == "gen_ai.invoke_agent" + ) + chat_spans = list(x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat") + tool_exec_span = next( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.execute_tool" + ) assert len(chat_spans) == 2 - assert invoke_agent_span["origin"] == "auto.ai.langchain" - assert chat_spans[0]["origin"] == "auto.ai.langchain" - assert chat_spans[1]["origin"] == "auto.ai.langchain" - assert tool_exec_span["origin"] == "auto.ai.langchain" + assert invoke_agent_span["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert chat_spans[0]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert chat_spans[1]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert tool_exec_span["attributes"]["sentry.origin"] == "auto.ai.langchain" # We can't guarantee anything about the "shape" of the langchain execution graph - assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0 + assert ( + len(list(x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat")) > 0 + ) # Token usage is only available in newer versions of langchain (v0.2+) # where usage_metadata is supported on AIMessageChunk - if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]: - assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142 - assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50 - assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192 + if "gen_ai.usage.input_tokens" in chat_spans[0]["attributes"]: + assert chat_spans[0]["attributes"]["gen_ai.usage.input_tokens"] == 142 + assert chat_spans[0]["attributes"]["gen_ai.usage.output_tokens"] == 50 + assert chat_spans[0]["attributes"]["gen_ai.usage.total_tokens"] == 192 - if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]: - assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89 - assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28 - assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 
117 + if "gen_ai.usage.input_tokens" in chat_spans[1]["attributes"]: + assert chat_spans[1]["attributes"]["gen_ai.usage.input_tokens"] == 89 + assert chat_spans[1]["attributes"]["gen_ai.usage.output_tokens"] == 28 + assert chat_spans[1]["attributes"]["gen_ai.usage.total_tokens"] == 117 if send_default_pii and include_prompts: - assert "5" in chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT] - assert 5 == int(tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_OUTPUT]) + assert "5" in chat_spans[0]["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "word" in tool_exec_span["attributes"][SPANDATA.GEN_AI_TOOL_INPUT] + assert 5 == int(tool_exec_span["attributes"][SPANDATA.GEN_AI_TOOL_OUTPUT]) param_id = request.node.callspec.id if "string" in param_id: @@ -742,7 +777,9 @@ def test_langchain_openai_tools_agent( "type": "text", "content": "You are very powerful assistant, but don't know current events", } - ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + ] == json.loads( + chat_spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) else: assert [ { @@ -753,15 +790,21 @@ def test_langchain_openai_tools_agent( "type": "text", "content": "Be concise and clear.", }, - ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + ] == json.loads( + chat_spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) - assert "5" in chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "5" in chat_spans[1]["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] # Verify tool calls are recorded when PII is enabled - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get("data", {}), ( + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get( + "attributes", {} + ), ( "Tool calls should be recorded when send_default_pii=True and include_prompts=True" ) - tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + 
tool_calls_data = chat_spans[0]["attributes"][ + SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS + ] assert isinstance(tool_calls_data, (list, str)) # Could be serialized if isinstance(tool_calls_data, str): assert "get_word_length" in tool_calls_data @@ -770,45 +813,55 @@ def test_langchain_openai_tools_agent( tool_call_str = str(tool_calls_data) assert "get_word_length" in tool_call_str else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("data", {}) - assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("data", {}) + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("attributes", {}) + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[1].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("attributes", {}) + assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("attributes", {}) + assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("attributes", {}) # Verify tool calls are NOT recorded when PII is disabled assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get( - "data", {} + "attributes", {} ), ( f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " f"and include_prompts={include_prompts}" ) assert 
SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get( - "data", {} + "attributes", {} ), ( f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " f"and include_prompts={include_prompts}" ) # Verify finish_reasons is always an array of strings - assert chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + assert chat_spans[0]["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ "function_call" ] - assert chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["stop"] + assert chat_spans[1]["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "stop" + ] # Verify that available tools are always recorded regardless of PII settings for chat_span in chat_spans: - tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] + tools_data = chat_span["attributes"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] assert tools_data is not None, ( "Available tools should always be recorded regardless of PII settings" ) assert "get_word_length" in tools_data -def test_langchain_error(sentry_init, capture_events): +def test_langchain_error(sentry_init, capture_items): global llm_type llm_type = "acme-llm" @@ -816,8 +869,9 @@ def test_langchain_error(sentry_init, capture_events): integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("event", "transaction", "span") prompt = ChatPromptTemplate.from_messages( [ @@ -843,19 +897,20 @@ def test_langchain_error(sentry_init, capture_events): with start_transaction(), pytest.raises(ValueError): list(agent_executor.stream({"input": "How many letters in the word eudca"})) - error = events[0] + error = next(item.payload for item in items if item.type == "event") assert error["level"] == "error" -def test_span_status_error(sentry_init, capture_events): +def test_span_status_error(sentry_init, capture_items): global 
llm_type llm_type = "acme-llm" sentry_init( integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("event", "transaction", "span") with start_transaction(name="test"): prompt = ChatPromptTemplate.from_messages( @@ -884,10 +939,13 @@ def test_span_status_error(sentry_init, capture_events): with pytest.raises(ValueError): list(agent_executor.stream({"input": "How many letters in the word eudca"})) - (error, transaction) = events + error = next(item.payload for item in items if item.type == "event") assert error["level"] == "error" - assert transaction["spans"][0]["status"] == "internal_error" - assert transaction["spans"][0]["tags"]["status"] == "internal_error" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["status"] == "error" + + (transaction,) = (item.payload for item in items if item.type == "transaction") assert transaction["contexts"]["trace"]["status"] == "internal_error" @@ -935,7 +993,9 @@ def _llm_type(self): def _identifying_params(self): return {} - sentry_init(integrations=[LangchainIntegration()]) + sentry_init( + integrations=[LangchainIntegration()], _experiments={"gen_ai_as_v2_spans": True} + ) # Create a manual SentryLangchainCallback manual_callback = SentryLangchainCallback( @@ -976,6 +1036,7 @@ def test_langchain_callback_manager(sentry_init): sentry_init( integrations=[LangchainIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) local_manager = BaseCallbackManager(handlers=[]) @@ -1008,6 +1069,7 @@ def test_langchain_callback_manager_with_sentry_callback(sentry_init): sentry_init( integrations=[LangchainIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) sentry_callback = SentryLangchainCallback(0, False) local_manager = BaseCallbackManager(handlers=[sentry_callback]) @@ -1040,6 +1102,7 @@ def 
test_langchain_callback_list(sentry_init): sentry_init( integrations=[LangchainIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) local_callbacks = [] @@ -1072,6 +1135,7 @@ def test_langchain_callback_list_existing_callback(sentry_init): sentry_init( integrations=[LangchainIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) sentry_callback = SentryLangchainCallback(0, False) local_callbacks = [sentry_callback] @@ -1100,7 +1164,7 @@ def test_langchain_callback_list_existing_callback(sentry_init): assert handler is sentry_callback -def test_langchain_message_role_mapping(sentry_init, capture_events): +def test_langchain_message_role_mapping(sentry_init, capture_items): """Test that message roles are properly normalized in langchain integration.""" global llm_type llm_type = "openai-chat" @@ -1109,8 +1173,9 @@ def test_langchain_message_role_mapping(sentry_init, capture_events): integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") prompt = ChatPromptTemplate.from_messages( [ @@ -1146,19 +1211,18 @@ def test_langchain_message_role_mapping(sentry_init, capture_events): with start_transaction(): list(agent_executor.stream({"input": test_input})) - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] # Find spans with gen_ai operation that should have message data gen_ai_spans = [ - span for span in tx.get("spans", []) if span.get("op", "").startswith("gen_ai") + span + for span in spans + if span["attributes"].get("sentry.op", "").startswith("gen_ai") ] # Check if any span has message data with normalized roles message_data_found = False for span in gen_ai_spans: - span_data = span.get("data", {}) + span_data = span.get("attributes", {}) 
if SPANDATA.GEN_AI_REQUEST_MESSAGES in span_data: message_data_found = True messages_data = span_data[SPANDATA.GEN_AI_REQUEST_MESSAGES] @@ -1239,7 +1303,7 @@ def test_langchain_message_role_normalization_units(): assert normalized[5] == "string message" # String message unchanged -def test_langchain_message_truncation(sentry_init, capture_events): +def test_langchain_message_truncation(sentry_init, capture_items): """Test that large messages are truncated properly in Langchain integration.""" from langchain_core.outputs import LLMResult, Generation @@ -1247,8 +1311,9 @@ def test_langchain_message_truncation(sentry_init, capture_events): integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") callback = SentryLangchainCallback(max_span_map_size=100, include_prompts=True) @@ -1291,23 +1356,23 @@ def test_langchain_message_truncation(sentry_init, capture_events): ) callback.on_llm_end(response=response, run_id=run_id) - assert len(events) > 0 - tx = events[0] + tx = next(item.payload for item in items if item.type == "transaction") assert tx["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] llm_spans = [ span - for span in tx.get("spans", []) - if span.get("op") == "gen_ai.text_completion" + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.text_completion" ] assert len(llm_spans) > 0 llm_span = llm_spans[0] - assert llm_span["data"]["gen_ai.operation.name"] == "text_completion" - assert llm_span["data"][SPANDATA.GEN_AI_PIPELINE_NAME] == "my_pipeline" + assert llm_span["attributes"]["gen_ai.operation.name"] == "text_completion" + assert llm_span["attributes"][SPANDATA.GEN_AI_PIPELINE_NAME] == "my_pipeline" - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in llm_span["data"] - messages_data = llm_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + 
assert SPANDATA.GEN_AI_REQUEST_MESSAGES in llm_span["attributes"] + messages_data = llm_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] assert isinstance(messages_data, str) parsed_messages = json.loads(messages_data) @@ -1327,7 +1392,7 @@ def test_langchain_message_truncation(sentry_init, capture_events): ], ) def test_langchain_embeddings_sync( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): """Test that sync embedding methods (embed_documents, embed_query) are properly traced.""" try: @@ -1339,8 +1404,9 @@ def test_langchain_embeddings_sync( integrations=[LangchainIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Mock the actual API call with mock.patch.object( @@ -1362,27 +1428,28 @@ def test_langchain_embeddings_sync( assert len(result) == 2 mock_embed_documents.assert_called_once() - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] # Find embeddings span embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" ] assert len(embeddings_spans) == 1 embeddings_span = embeddings_spans[0] - assert embeddings_span["description"] == "embeddings text-embedding-ada-002" - assert embeddings_span["origin"] == "auto.ai.langchain" - assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" - assert embeddings_span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + assert embeddings_span["name"] == "embeddings text-embedding-ada-002" + assert embeddings_span["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert 
embeddings_span["attributes"]["gen_ai.operation.name"] == "embeddings" + assert ( + embeddings_span["attributes"]["gen_ai.request.model"] + == "text-embedding-ada-002" + ) # Check if input is captured based on PII settings if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["data"] - input_data = embeddings_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["attributes"] + input_data = embeddings_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] # Could be serialized as string if isinstance(input_data, str): assert "Hello world" in input_data @@ -1391,7 +1458,9 @@ def test_langchain_embeddings_sync( assert "Hello world" in input_data assert "Test document" in input_data else: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get("data", {}) + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get( + "attributes", {} + ) @pytest.mark.parametrize( @@ -1402,7 +1471,7 @@ def test_langchain_embeddings_sync( ], ) def test_langchain_embeddings_embed_query( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): """Test that embed_query method is properly traced.""" try: @@ -1414,8 +1483,9 @@ def test_langchain_embeddings_embed_query( integrations=[LangchainIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Mock the actual API call with mock.patch.object( @@ -1436,32 +1506,35 @@ def test_langchain_embeddings_embed_query( assert len(result) == 3 mock_embed_query.assert_called_once() - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] # Find embeddings span 
embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" ] assert len(embeddings_spans) == 1 embeddings_span = embeddings_spans[0] - assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" - assert embeddings_span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + assert embeddings_span["attributes"]["gen_ai.operation.name"] == "embeddings" + assert ( + embeddings_span["attributes"]["gen_ai.request.model"] + == "text-embedding-ada-002" + ) # Check if input is captured based on PII settings if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["data"] - input_data = embeddings_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["attributes"] + input_data = embeddings_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] # Could be serialized as string if isinstance(input_data, str): assert "What is the capital of France?" in input_data else: assert "What is the capital of France?" 
in input_data else: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get("data", {}) + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get( + "attributes", {} + ) @pytest.mark.parametrize( @@ -1473,7 +1546,7 @@ def test_langchain_embeddings_embed_query( ) @pytest.mark.asyncio async def test_langchain_embeddings_async( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): """Test that async embedding methods (aembed_documents, aembed_query) are properly traced.""" try: @@ -1485,8 +1558,9 @@ async def test_langchain_embeddings_async( integrations=[LangchainIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") async def mock_aembed_documents(self, texts): return [[0.1, 0.2, 0.3] for _ in texts] @@ -1512,38 +1586,41 @@ async def mock_aembed_documents(self, texts): assert len(result) == 2 mock_aembed.assert_called_once() - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] # Find embeddings span embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" ] assert len(embeddings_spans) == 1 embeddings_span = embeddings_spans[0] - assert embeddings_span["description"] == "embeddings text-embedding-ada-002" - assert embeddings_span["origin"] == "auto.ai.langchain" - assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" - assert embeddings_span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + assert embeddings_span["name"] == "embeddings text-embedding-ada-002" + assert embeddings_span["attributes"]["sentry.origin"] 
== "auto.ai.langchain" + assert embeddings_span["attributes"]["gen_ai.operation.name"] == "embeddings" + assert ( + embeddings_span["attributes"]["gen_ai.request.model"] + == "text-embedding-ada-002" + ) # Check if input is captured based on PII settings if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["data"] - input_data = embeddings_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["attributes"] + input_data = embeddings_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] # Could be serialized as string if isinstance(input_data, str): assert "Async hello" in input_data or "Async test document" in input_data else: assert "Async hello" in input_data or "Async test document" in input_data else: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get("data", {}) + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get( + "attributes", {} + ) @pytest.mark.asyncio -async def test_langchain_embeddings_aembed_query(sentry_init, capture_events): +async def test_langchain_embeddings_aembed_query(sentry_init, capture_items): """Test that aembed_query method is properly traced.""" try: from langchain_openai import OpenAIEmbeddings @@ -1554,8 +1631,9 @@ async def test_langchain_embeddings_aembed_query(sentry_init, capture_events): integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") async def mock_aembed_query(self, text): return [0.1, 0.2, 0.3] @@ -1579,24 +1657,25 @@ async def mock_aembed_query(self, text): assert len(result) == 3 mock_aembed.assert_called_once() - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] # Find embeddings span 
embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" ] assert len(embeddings_spans) == 1 embeddings_span = embeddings_spans[0] - assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" - assert embeddings_span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + assert embeddings_span["attributes"]["gen_ai.operation.name"] == "embeddings" + assert ( + embeddings_span["attributes"]["gen_ai.request.model"] + == "text-embedding-ada-002" + ) # Check if input is captured - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["data"] - input_data = embeddings_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["attributes"] + input_data = embeddings_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] # Could be serialized as string if isinstance(input_data, str): assert "Async query test" in input_data @@ -1604,7 +1683,7 @@ async def mock_aembed_query(self, text): assert "Async query test" in input_data -def test_langchain_embeddings_no_model_name(sentry_init, capture_events): +def test_langchain_embeddings_no_model_name(sentry_init, capture_items): """Test embeddings when model name is not available.""" try: from langchain_openai import OpenAIEmbeddings @@ -1614,8 +1693,9 @@ def test_langchain_embeddings_no_model_name(sentry_init, capture_events): sentry_init( integrations=[LangchainIntegration(include_prompts=False)], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Mock the actual API call and remove model attribute with mock.patch.object( @@ -1635,28 +1715,26 @@ def test_langchain_embeddings_no_model_name(sentry_init, capture_events): with start_transaction(name="test_embeddings_no_model"): embeddings.embed_documents(["Test"]) - # Check captured 
events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] # Find embeddings span embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" ] assert len(embeddings_spans) == 1 embeddings_span = embeddings_spans[0] - assert embeddings_span["description"] == "embeddings" - assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" + assert embeddings_span["name"] == "embeddings" + assert embeddings_span["attributes"]["gen_ai.operation.name"] == "embeddings" # Model name should not be set if not available assert ( - "gen_ai.request.model" not in embeddings_span["data"] - or embeddings_span["data"]["gen_ai.request.model"] is None + "gen_ai.request.model" not in embeddings_span["attributes"] + or embeddings_span["attributes"]["gen_ai.request.model"] is None ) -def test_langchain_embeddings_integration_disabled(sentry_init, capture_events): +def test_langchain_embeddings_integration_disabled(sentry_init, capture_items): """Test that embeddings are not traced when integration is disabled.""" try: from langchain_openai import OpenAIEmbeddings @@ -1664,8 +1742,8 @@ def test_langchain_embeddings_integration_disabled(sentry_init, capture_events): pytest.skip("langchain_openai not installed") # Initialize without LangchainIntegration - sentry_init(traces_sample_rate=1.0) - events = capture_events() + sentry_init(traces_sample_rate=1.0, _experiments={"gen_ai_as_v2_spans": True}) + items = capture_items("transaction", "span") with mock.patch.object( OpenAIEmbeddings, @@ -1680,18 +1758,17 @@ def test_langchain_embeddings_integration_disabled(sentry_init, capture_events): embeddings.embed_documents(["Test"]) # Check that no embeddings spans were created - if events: - tx = events[0] - embeddings_spans = [ - span - for span in tx.get("spans", 
[]) - if span.get("op") == "gen_ai.embeddings" - ] - # Should be empty since integration is disabled - assert len(embeddings_spans) == 0 + spans = [item.payload for item in items if item.type == "span"] + embeddings_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" + ] + # Should be empty since integration is disabled + assert len(embeddings_spans) == 0 -def test_langchain_embeddings_multiple_providers(sentry_init, capture_events): +def test_langchain_embeddings_multiple_providers(sentry_init, capture_items): """Test that embeddings work with different providers.""" try: from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings @@ -1702,8 +1779,9 @@ def test_langchain_embeddings_multiple_providers(sentry_init, capture_events): integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Mock both providers with mock.patch.object( @@ -1731,26 +1809,24 @@ def test_langchain_embeddings_multiple_providers(sentry_init, capture_events): openai_embeddings.embed_documents(["OpenAI test"]) azure_embeddings.embed_documents(["Azure test"]) - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] # Find embeddings spans embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" ] # Should have 2 spans, one for each provider assert len(embeddings_spans) == 2 # Verify both spans have proper data for span in embeddings_spans: - assert span["data"]["gen_ai.operation.name"] == "embeddings" - assert span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in 
span["data"] + assert span["attributes"]["gen_ai.operation.name"] == "embeddings" + assert span["attributes"]["gen_ai.request.model"] == "text-embedding-ada-002" + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["attributes"] -def test_langchain_embeddings_error_handling(sentry_init, capture_events): +def test_langchain_embeddings_error_handling(sentry_init, capture_items): """Test that errors in embeddings are properly captured.""" try: from langchain_openai import OpenAIEmbeddings @@ -1761,8 +1837,9 @@ def test_langchain_embeddings_error_handling(sentry_init, capture_events): integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Mock the API call to raise an error with mock.patch.object( @@ -1781,15 +1858,16 @@ def test_langchain_embeddings_error_handling(sentry_init, capture_events): with pytest.raises(ValueError): embeddings.embed_documents(["Test"]) - # The error should be captured - assert len(events) >= 1 - # We should have both the transaction and potentially an error event - [e for e in events if e.get("level") == "error"] + [ + item.payload + for item in items + if item.type == "event" and item.payload.get("level") == "error" + ] # Note: errors might not be auto-captured depending on SDK settings, # but the span should still be created -def test_langchain_embeddings_multiple_calls(sentry_init, capture_events): +def test_langchain_embeddings_multiple_calls(sentry_init, capture_items): """Test that multiple embeddings calls within a transaction are all traced.""" try: from langchain_openai import OpenAIEmbeddings @@ -1800,8 +1878,9 @@ def test_langchain_embeddings_multiple_calls(sentry_init, capture_events): integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = 
capture_events() + items = capture_items("transaction", "span") # Mock the actual API calls with mock.patch.object( @@ -1828,32 +1907,31 @@ def test_langchain_embeddings_multiple_calls(sentry_init, capture_events): # Call embed_documents again embeddings.embed_documents(["Third batch"]) - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] # Find embeddings spans - should have 3 (2 embed_documents + 1 embed_query) embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" ] assert len(embeddings_spans) == 3 # Verify all spans have proper data for span in embeddings_spans: - assert span["data"]["gen_ai.operation.name"] == "embeddings" - assert span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["data"] + assert span["attributes"]["gen_ai.operation.name"] == "embeddings" + assert span["attributes"]["gen_ai.request.model"] == "text-embedding-ada-002" + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["attributes"] # Verify the input data is different for each span input_data_list = [ - span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] for span in embeddings_spans + span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + for span in embeddings_spans ] # They should all be different (different inputs) assert len(set(str(data) for data in input_data_list)) == 3 -def test_langchain_embeddings_span_hierarchy(sentry_init, capture_events): +def test_langchain_embeddings_span_hierarchy(sentry_init, capture_items): """Test that embeddings spans are properly nested within parent spans.""" try: from langchain_openai import OpenAIEmbeddings @@ -1864,8 +1942,9 @@ def test_langchain_embeddings_span_hierarchy(sentry_init, capture_events): 
integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Mock the actual API call with mock.patch.object( @@ -1884,15 +1963,15 @@ def test_langchain_embeddings_span_hierarchy(sentry_init, capture_events): with sentry_sdk.start_span(op="custom", name="custom operation"): embeddings.embed_documents(["Test within custom span"]) - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] # Find all spans embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" ] + + tx = next(item.payload for item in items if item.type == "transaction") custom_spans = [span for span in tx.get("spans", []) if span.get("op") == "custom"] assert len(embeddings_spans) == 1 @@ -1902,11 +1981,11 @@ def test_langchain_embeddings_span_hierarchy(sentry_init, capture_events): embeddings_span = embeddings_spans[0] custom_span = custom_spans[0] - assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" + assert embeddings_span["attributes"]["gen_ai.operation.name"] == "embeddings" assert custom_span["description"] == "custom operation" -def test_langchain_embeddings_with_list_and_string_inputs(sentry_init, capture_events): +def test_langchain_embeddings_with_list_and_string_inputs(sentry_init, capture_items): """Test that embeddings correctly handle both list and string inputs.""" try: from langchain_openai import OpenAIEmbeddings @@ -1917,8 +1996,9 @@ def test_langchain_embeddings_with_list_and_string_inputs(sentry_init, capture_e integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + 
_experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Mock the actual API calls with mock.patch.object( @@ -1943,21 +2023,19 @@ def test_langchain_embeddings_with_list_and_string_inputs(sentry_init, capture_e # embed_query takes a string embeddings.embed_query("Single string query") - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] # Find embeddings spans embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" ] assert len(embeddings_spans) == 2 # Both should have input data captured as lists for span in embeddings_spans: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["data"] - input_data = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["attributes"] + input_data = span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] # Input should be normalized to list format if isinstance(input_data, str): # If serialized, should contain the input text @@ -1975,7 +2053,7 @@ def test_langchain_embeddings_with_list_and_string_inputs(sentry_init, capture_e ) def test_langchain_response_model_extraction( sentry_init, - capture_events, + capture_items, response_metadata_model, expected_model, ): @@ -1983,8 +2061,9 @@ def test_langchain_response_model_extraction( integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") callback = SentryLangchainCallback(max_span_map_size=100, include_prompts=True) @@ -2009,25 +2088,22 @@ def test_langchain_response_model_extraction( response = Mock(generations=[[generation]]) 
callback.on_llm_end(response=response, run_id=run_id) - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] llm_spans = [ span - for span in tx.get("spans", []) - if span.get("op") == "gen_ai.text_completion" + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.text_completion" ] assert len(llm_spans) > 0 llm_span = llm_spans[0] - assert llm_span["data"]["gen_ai.operation.name"] == "text_completion" + assert llm_span["attributes"]["gen_ai.operation.name"] == "text_completion" if expected_model is not None: - assert SPANDATA.GEN_AI_RESPONSE_MODEL in llm_span["data"] - assert llm_span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == expected_model + assert SPANDATA.GEN_AI_RESPONSE_MODEL in llm_span["attributes"] + assert llm_span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == expected_model else: - assert SPANDATA.GEN_AI_RESPONSE_MODEL not in llm_span.get("data", {}) + assert SPANDATA.GEN_AI_RESPONSE_MODEL not in llm_span.get("attributes", {}) # Tests for multimodal content transformation functions @@ -2286,13 +2362,14 @@ def test_transform_google_file_data(self): ], ) def test_langchain_ai_system_detection( - sentry_init, capture_events, ai_type, expected_system + sentry_init, capture_items, ai_type, expected_system ): sentry_init( integrations=[LangchainIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") callback = SentryLangchainCallback(max_span_map_size=100, include_prompts=True) @@ -2312,23 +2389,20 @@ def test_langchain_ai_system_detection( response = Mock(generations=[[generation]]) callback.on_llm_end(response=response, run_id=run_id) - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] llm_spans = [ span - for span in tx.get("spans", []) - if 
span.get("op") == "gen_ai.text_completion" + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.text_completion" ] assert len(llm_spans) > 0 llm_span = llm_spans[0] if expected_system is not None: - assert llm_span["data"][SPANDATA.GEN_AI_SYSTEM] == expected_system + assert llm_span["attributes"][SPANDATA.GEN_AI_SYSTEM] == expected_system else: - assert SPANDATA.GEN_AI_SYSTEM not in llm_span.get("data", {}) + assert SPANDATA.GEN_AI_SYSTEM not in llm_span.get("attributes", {}) class TestTransformLangchainMessageContent: diff --git a/tests/integrations/langgraph/test_langgraph.py b/tests/integrations/langgraph/test_langgraph.py index 2a385d8a78..b70889548f 100644 --- a/tests/integrations/langgraph/test_langgraph.py +++ b/tests/integrations/langgraph/test_langgraph.py @@ -147,15 +147,16 @@ def test_langgraph_integration_init(): ], ) def test_state_graph_compile( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): """Test StateGraph.compile() wrapper creates proper create_agent span.""" sentry_init( integrations=[LanggraphIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") graph = MockStateGraph() def original_compile(self, *args, **kwargs): @@ -171,21 +172,23 @@ def original_compile(self, *args, **kwargs): assert compiled_graph is not None assert compiled_graph.name == "test_graph" - tx = events[0] - assert tx["type"] == "transaction" - - agent_spans = [span for span in tx["spans"] if span["op"] == OP.GEN_AI_CREATE_AGENT] + spans = [item.payload for item in items if item.type == "span"] + agent_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_CREATE_AGENT + ] assert len(agent_spans) == 1 agent_span = agent_spans[0] - assert agent_span["description"] == 
"create_agent test_graph" - assert agent_span["origin"] == "auto.ai.langgraph" - assert agent_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "create_agent" - assert agent_span["data"][SPANDATA.GEN_AI_AGENT_NAME] == "test_graph" - assert agent_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "test-model" - assert SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS in agent_span["data"] - - tools_data = agent_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] + assert agent_span["name"] == "create_agent test_graph" + assert agent_span["attributes"]["sentry.origin"] == "auto.ai.langgraph" + assert agent_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "create_agent" + assert agent_span["attributes"][SPANDATA.GEN_AI_AGENT_NAME] == "test_graph" + assert agent_span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "test-model" + assert SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS in agent_span["attributes"] + + tools_data = agent_span["attributes"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] assert tools_data == ["search_tool", "calculator"] assert len(tools_data) == 2 assert "search_tool" in tools_data @@ -201,14 +204,15 @@ def original_compile(self, *args, **kwargs): (False, False), ], ) -def test_pregel_invoke(sentry_init, capture_events, send_default_pii, include_prompts): +def test_pregel_invoke(sentry_init, capture_items, send_default_pii, include_prompts): """Test Pregel.invoke() wrapper creates proper invoke_agent span.""" sentry_init( integrations=[LanggraphIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_state = { "messages": [ @@ -245,26 +249,26 @@ def original_invoke(self, *args, **kwargs): assert result is not None - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if 
span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_span = invoke_spans[0] - assert invoke_span["description"] == "invoke_agent test_graph" - assert invoke_span["origin"] == "auto.ai.langgraph" - assert invoke_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "invoke_agent" - assert invoke_span["data"][SPANDATA.GEN_AI_PIPELINE_NAME] == "test_graph" - assert invoke_span["data"][SPANDATA.GEN_AI_AGENT_NAME] == "test_graph" + assert invoke_span["name"] == "invoke_agent test_graph" + assert invoke_span["attributes"]["sentry.origin"] == "auto.ai.langgraph" + assert invoke_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "invoke_agent" + assert invoke_span["attributes"][SPANDATA.GEN_AI_PIPELINE_NAME] == "test_graph" + assert invoke_span["attributes"][SPANDATA.GEN_AI_AGENT_NAME] == "test_graph" if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["attributes"] - request_messages = invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + request_messages = invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] if isinstance(request_messages, str): import json @@ -273,11 +277,11 @@ def original_invoke(self, *args, **kwargs): assert len(request_messages) == 1 assert request_messages[0]["content"] == "Of course! How can I assist you?" 
- response_text = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + response_text = invoke_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] assert response_text == expected_assistant_response - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["data"] - tool_calls_data = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["attributes"] + tool_calls_data = invoke_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] if isinstance(tool_calls_data, str): import json @@ -287,9 +291,11 @@ def original_invoke(self, *args, **kwargs): assert tool_calls_data[0]["id"] == "call_test_123" assert tool_calls_data[0]["function"]["name"] == "search_tool" else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in invoke_span.get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in invoke_span.get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in invoke_span.get("data", {}) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in invoke_span.get("attributes", {}) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in invoke_span.get("attributes", {}) + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in invoke_span.get( + "attributes", {} + ) @pytest.mark.parametrize( @@ -301,14 +307,15 @@ def original_invoke(self, *args, **kwargs): (False, False), ], ) -def test_pregel_ainvoke(sentry_init, capture_events, send_default_pii, include_prompts): +def test_pregel_ainvoke(sentry_init, capture_items, send_default_pii, include_prompts): """Test Pregel.ainvoke() async wrapper creates proper invoke_agent span.""" sentry_init( integrations=[LanggraphIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_state = {"messages": [MockMessage("What's the weather like?", name="user")]} pregel = MockPregelInstance("async_graph") @@ -341,30 +348,30 @@ 
async def run_test(): result = asyncio.run(run_test()) assert result is not None - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_span = invoke_spans[0] - assert invoke_span["description"] == "invoke_agent async_graph" - assert invoke_span["origin"] == "auto.ai.langgraph" - assert invoke_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "invoke_agent" - assert invoke_span["data"][SPANDATA.GEN_AI_PIPELINE_NAME] == "async_graph" - assert invoke_span["data"][SPANDATA.GEN_AI_AGENT_NAME] == "async_graph" + assert invoke_span["name"] == "invoke_agent async_graph" + assert invoke_span["attributes"]["sentry.origin"] == "auto.ai.langgraph" + assert invoke_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "invoke_agent" + assert invoke_span["attributes"][SPANDATA.GEN_AI_PIPELINE_NAME] == "async_graph" + assert invoke_span["attributes"][SPANDATA.GEN_AI_AGENT_NAME] == "async_graph" if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["attributes"] - response_text = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + response_text = invoke_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] assert response_text == expected_assistant_response - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["data"] - tool_calls_data = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["attributes"] + tool_calls_data = invoke_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] if 
isinstance(tool_calls_data, str): import json @@ -374,19 +381,22 @@ async def run_test(): assert tool_calls_data[0]["id"] == "call_weather_456" assert tool_calls_data[0]["function"]["name"] == "get_weather" else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in invoke_span.get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in invoke_span.get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in invoke_span.get("data", {}) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in invoke_span.get("attributes", {}) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in invoke_span.get("attributes", {}) + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in invoke_span.get( + "attributes", {} + ) -def test_pregel_invoke_error(sentry_init, capture_events): +def test_pregel_invoke_error(sentry_init, capture_items): """Test error handling during graph execution.""" sentry_init( integrations=[LanggraphIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_state = {"messages": [MockMessage("This will fail")]} pregel = MockPregelInstance("error_graph") @@ -397,25 +407,27 @@ def original_invoke(self, *args, **kwargs): wrapped_invoke = _wrap_pregel_invoke(original_invoke) wrapped_invoke(pregel, test_state) - tx = events[0] + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_span = invoke_spans[0] - assert invoke_span.get("status") == "internal_error" - assert invoke_span.get("tags", {}).get("status") == "internal_error" + assert invoke_span.get("status") == "error" -def test_pregel_ainvoke_error(sentry_init, capture_events): +def test_pregel_ainvoke_error(sentry_init, capture_items): """Test error 
handling during async graph execution.""" sentry_init( integrations=[LanggraphIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_state = {"messages": [MockMessage("This will fail async")]} pregel = MockPregelInstance("async_error_graph") @@ -431,24 +443,26 @@ async def run_error_test(): asyncio.run(run_error_test()) - tx = events[0] + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_span = invoke_spans[0] - assert invoke_span.get("status") == "internal_error" - assert invoke_span.get("tags", {}).get("status") == "internal_error" + assert invoke_span.get("status") == "error" -def test_span_origin(sentry_init, capture_events): +def test_span_origin(sentry_init, capture_items): """Test that span origins are correctly set.""" sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") graph = MockStateGraph() @@ -461,24 +475,26 @@ def original_compile(self, *args, **kwargs): wrapped_compile = _wrap_state_graph_compile(original_compile) wrapped_compile(graph) - tx = events[0] + tx = next(item.payload for item in items if item.type == "transaction") assert tx["contexts"]["trace"]["origin"] == "manual" - for span in tx["spans"]: - assert span["origin"] == "auto.ai.langgraph" + spans = [item.payload for item in items if item.type == "span"] + for span in spans: + assert span["attributes"]["sentry.origin"] == "auto.ai.langgraph" @pytest.mark.parametrize("graph_name", ["my_graph", None, ""]) def test_pregel_invoke_with_different_graph_names( - sentry_init, 
capture_events, graph_name + sentry_init, capture_items, graph_name ): """Test Pregel.invoke() with different graph name scenarios.""" sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") pregel = MockPregelInstance(graph_name) if graph_name else MockPregelInstance() if not graph_name: @@ -492,25 +508,27 @@ def original_invoke(self, *args, **kwargs): wrapped_invoke = _wrap_pregel_invoke(original_invoke) wrapped_invoke(pregel, {"messages": []}) - tx = events[0] + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_span = invoke_spans[0] if graph_name and graph_name.strip(): - assert invoke_span["description"] == "invoke_agent my_graph" - assert invoke_span["data"][SPANDATA.GEN_AI_PIPELINE_NAME] == graph_name - assert invoke_span["data"][SPANDATA.GEN_AI_AGENT_NAME] == graph_name + assert invoke_span["name"] == "invoke_agent my_graph" + assert invoke_span["attributes"][SPANDATA.GEN_AI_PIPELINE_NAME] == graph_name + assert invoke_span["attributes"][SPANDATA.GEN_AI_AGENT_NAME] == graph_name else: - assert invoke_span["description"] == "invoke_agent" - assert SPANDATA.GEN_AI_PIPELINE_NAME not in invoke_span.get("data", {}) - assert SPANDATA.GEN_AI_AGENT_NAME not in invoke_span.get("data", {}) + assert invoke_span["name"] == "invoke_agent" + assert SPANDATA.GEN_AI_PIPELINE_NAME not in invoke_span.get("attributes", {}) + assert SPANDATA.GEN_AI_AGENT_NAME not in invoke_span.get("attributes", {}) -def test_pregel_invoke_span_includes_usage_data(sentry_init, capture_events): +def test_pregel_invoke_span_includes_usage_data(sentry_init, capture_items): """ Test that invoke_agent spans 
include aggregated usage data from context_wrapper. This verifies the new functionality added to track token usage in invoke_agent spans. @@ -518,8 +536,9 @@ def test_pregel_invoke_span_includes_usage_data(sentry_init, capture_events): sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_state = { "messages": [ @@ -564,29 +583,29 @@ def original_invoke(self, *args, **kwargs): assert result is not None - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_agent_span = invoke_spans[0] # Verify invoke_agent span has usage data - assert invoke_agent_span["description"] == "invoke_agent test_graph" - assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] + assert invoke_agent_span["name"] == "invoke_agent test_graph" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["attributes"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["attributes"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["attributes"] # The usage should match the mock_usage values (aggregated across all calls) - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 - assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30 + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens"] == 10 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert 
invoke_agent_span["attributes"]["gen_ai.usage.total_tokens"] == 30 -def test_pregel_ainvoke_span_includes_usage_data(sentry_init, capture_events): +def test_pregel_ainvoke_span_includes_usage_data(sentry_init, capture_items): """ Test that invoke_agent spans include aggregated usage data from context_wrapper. This verifies the new functionality added to track token usage in invoke_agent spans. @@ -594,8 +613,9 @@ def test_pregel_ainvoke_span_includes_usage_data(sentry_init, capture_events): sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_state = { "messages": [ @@ -643,29 +663,29 @@ async def run_test(): result = asyncio.run(run_test()) assert result is not None - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_agent_span = invoke_spans[0] # Verify invoke_agent span has usage data - assert invoke_agent_span["description"] == "invoke_agent test_graph" - assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] + assert invoke_agent_span["name"] == "invoke_agent test_graph" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["attributes"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["attributes"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["attributes"] # The usage should match the mock_usage values (aggregated across all calls) - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] 
== 20 - assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30 + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens"] == 10 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["attributes"]["gen_ai.usage.total_tokens"] == 30 -def test_pregel_invoke_multiple_llm_calls_aggregate_usage(sentry_init, capture_events): +def test_pregel_invoke_multiple_llm_calls_aggregate_usage(sentry_init, capture_items): """ Test that invoke_agent spans show aggregated usage across multiple LLM calls (e.g., when tools are used and multiple API calls are made). @@ -673,8 +693,9 @@ def test_pregel_invoke_multiple_llm_calls_aggregate_usage(sentry_init, capture_e sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_state = { "messages": [ @@ -730,23 +751,23 @@ def original_invoke(self, *args, **kwargs): assert result is not None - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_agent_span = invoke_spans[0] # Verify invoke_agent span has aggregated usage from both API calls # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 - assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens"] == 30 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["attributes"]["gen_ai.usage.total_tokens"] == 
50 -def test_pregel_ainvoke_multiple_llm_calls_aggregate_usage(sentry_init, capture_events): +def test_pregel_ainvoke_multiple_llm_calls_aggregate_usage(sentry_init, capture_items): """ Test that invoke_agent spans show aggregated usage across multiple LLM calls (e.g., when tools are used and multiple API calls are made). @@ -754,8 +775,9 @@ def test_pregel_ainvoke_multiple_llm_calls_aggregate_usage(sentry_init, capture_ sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_state = { "messages": [ @@ -814,23 +836,23 @@ async def run_test(): result = asyncio.run(run_test()) assert result is not None - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_agent_span = invoke_spans[0] # Verify invoke_agent span has aggregated usage from both API calls # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 - assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens"] == 30 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["attributes"]["gen_ai.usage.total_tokens"] == 50 -def test_pregel_invoke_span_includes_response_model(sentry_init, capture_events): +def test_pregel_invoke_span_includes_response_model(sentry_init, capture_items): """ Test that invoke_agent spans include the response model. 
When an agent makes multiple LLM calls, it should report the last model used. @@ -838,8 +860,9 @@ def test_pregel_invoke_span_includes_response_model(sentry_init, capture_events) sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_state = { "messages": [ @@ -884,23 +907,25 @@ def original_invoke(self, *args, **kwargs): assert result is not None - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_agent_span = invoke_spans[0] # Verify invoke_agent span has response model - assert invoke_agent_span["description"] == "invoke_agent test_graph" - assert "gen_ai.response.model" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + assert invoke_agent_span["name"] == "invoke_agent test_graph" + assert "gen_ai.response.model" in invoke_agent_span["attributes"] + assert ( + invoke_agent_span["attributes"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + ) -def test_pregel_ainvoke_span_includes_response_model(sentry_init, capture_events): +def test_pregel_ainvoke_span_includes_response_model(sentry_init, capture_items): """ Test that invoke_agent spans include the response model. When an agent makes multiple LLM calls, it should report the last model used. 
@@ -908,8 +933,9 @@ def test_pregel_ainvoke_span_includes_response_model(sentry_init, capture_events sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_state = { "messages": [ @@ -957,23 +983,25 @@ async def run_test(): result = asyncio.run(run_test()) assert result is not None - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_agent_span = invoke_spans[0] # Verify invoke_agent span has response model - assert invoke_agent_span["description"] == "invoke_agent test_graph" - assert "gen_ai.response.model" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + assert invoke_agent_span["name"] == "invoke_agent test_graph" + assert "gen_ai.response.model" in invoke_agent_span["attributes"] + assert ( + invoke_agent_span["attributes"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + ) -def test_pregel_invoke_span_uses_last_response_model(sentry_init, capture_events): +def test_pregel_invoke_span_uses_last_response_model(sentry_init, capture_items): """ Test that when an agent makes multiple LLM calls (e.g., with tools), the invoke_agent span reports the last response model used. 
@@ -981,8 +1009,9 @@ def test_pregel_invoke_span_uses_last_response_model(sentry_init, capture_events sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_state = { "messages": [ @@ -1040,22 +1069,24 @@ def original_invoke(self, *args, **kwargs): assert result is not None - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_agent_span = invoke_spans[0] # Verify invoke_agent span uses the LAST response model - assert "gen_ai.response.model" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + assert "gen_ai.response.model" in invoke_agent_span["attributes"] + assert ( + invoke_agent_span["attributes"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + ) -def test_pregel_ainvoke_span_uses_last_response_model(sentry_init, capture_events): +def test_pregel_ainvoke_span_uses_last_response_model(sentry_init, capture_items): """ Test that when an agent makes multiple LLM calls (e.g., with tools), the invoke_agent span reports the last response model used. 
@@ -1063,8 +1094,9 @@ def test_pregel_ainvoke_span_uses_last_response_model(sentry_init, capture_event sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_state = { "messages": [ @@ -1125,19 +1157,21 @@ async def run_test(): result = asyncio.run(run_test()) assert result is not None - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_agent_span = invoke_spans[0] # Verify invoke_agent span uses the LAST response model - assert "gen_ai.response.model" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + assert "gen_ai.response.model" in invoke_agent_span["attributes"] + assert ( + invoke_agent_span["attributes"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + ) def test_complex_message_parsing(): @@ -1187,14 +1221,15 @@ def test_complex_message_parsing(): assert result[2]["function_call"]["name"] == "search" -def test_extraction_functions_complex_scenario(sentry_init, capture_events): +def test_extraction_functions_complex_scenario(sentry_init, capture_items): """Test extraction functions with complex scenarios including multiple messages and edge cases.""" sentry_init( integrations=[LanggraphIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") pregel = MockPregelInstance("complex_graph") test_state = {"messages": [MockMessage("Complex request", name="user")]} @@ -1235,21 +1270,23 @@ def original_invoke(self, *args, 
**kwargs): assert result is not None - tx = events[0] + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 invoke_span = invoke_spans[0] - assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["data"] - response_text = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["attributes"] + response_text = invoke_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] assert response_text == "Final response" - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["attributes"] import json - tool_calls_data = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + tool_calls_data = invoke_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] if isinstance(tool_calls_data, str): tool_calls_data = json.loads(tool_calls_data) @@ -1260,14 +1297,15 @@ def original_invoke(self, *args, **kwargs): assert tool_calls_data[1]["function"]["name"] == "calculate" -def test_langgraph_message_role_mapping(sentry_init, capture_events): +def test_langgraph_message_role_mapping(sentry_init, capture_items): """Test that Langgraph integration properly maps message roles like 'ai' to 'assistant'""" sentry_init( integrations=[LanggraphIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Mock a langgraph message with mixed roles class MockMessage: @@ -1297,17 +1335,18 @@ def __init__(self, content, message_type="human"): ) wrapped_invoke(pregel, state_data) - (event,) = events - span = event["spans"][0] + span = next(item.payload for item in items if item.type == "span") # Verify that the span was created 
correctly - assert span["op"] == "gen_ai.invoke_agent" + assert span["attributes"]["sentry.op"] == "gen_ai.invoke_agent" # If messages were captured, verify role mapping - if SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"]: + if SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"]: import json - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) # Find messages with specific content to verify role mapping ai_message = next( @@ -1331,7 +1370,7 @@ def __init__(self, content, message_type="human"): assert "ai" not in roles -def test_langgraph_message_truncation(sentry_init, capture_events): +def test_langgraph_message_truncation(sentry_init, capture_items): """Test that large messages are truncated properly in Langgraph integration.""" import json @@ -1339,8 +1378,9 @@ def test_langgraph_message_truncation(sentry_init, capture_events): integrations=[LanggraphIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") large_content = ( "This is a very long message that will exceed our size limits. 
" * 1000 @@ -1365,23 +1405,25 @@ def original_invoke(self, *args, **kwargs): result = wrapped_invoke(pregel, test_state) assert result is not None - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] invoke_spans = [ - span for span in tx.get("spans", []) if span.get("op") == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"].get("sentry.op") == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) > 0 invoke_span = invoke_spans[0] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["attributes"] - messages_data = invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + messages_data = invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] assert isinstance(messages_data, str) parsed_messages = json.loads(messages_data) assert isinstance(parsed_messages, list) assert len(parsed_messages) == 1 assert "small message 5" in str(parsed_messages[0]) + + (tx,) = (item.payload for item in items if item.type == "transaction") assert tx["_meta"]["spans"]["0"]["data"]["gen_ai.request.messages"][""]["len"] == 5 diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py index a8df5891ce..b9365e7008 100644 --- a/tests/integrations/litellm/test_litellm.py +++ b/tests/integrations/litellm/test_litellm.py @@ -142,7 +142,7 @@ def __init__( def test_nonstreaming_chat_completion( reset_litellm_executor, sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -152,8 +152,9 @@ def test_nonstreaming_chat_completion( integrations=[LiteLLMIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [{"role": "user", "content": "Hello!"}] @@ 
-179,37 +180,36 @@ def test_nonstreaming_chat_completion( litellm_utils.executor.shutdown(wait=True) - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "litellm test" + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 1 span = chat_spans[0] - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat gpt-3.5-turbo" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gpt-3.5-turbo" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gpt-3.5-turbo" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat gpt-3.5-turbo" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gpt-3.5-turbo" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gpt-3.5-turbo" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in span["attributes"] else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert 
span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 @pytest.mark.asyncio(loop_scope="session") @@ -224,7 +224,7 @@ def test_nonstreaming_chat_completion( ) async def test_async_nonstreaming_chat_completion( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -234,8 +234,9 @@ async def test_async_nonstreaming_chat_completion( integrations=[LiteLLMIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [{"role": "user", "content": "Hello!"}] @@ -262,37 +263,36 @@ async def test_async_nonstreaming_chat_completion( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + (event,) = (item.payload for item in items if item.type == "transaction") assert event["transaction"] == "litellm test" + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 1 span = chat_spans[0] - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat gpt-3.5-turbo" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gpt-3.5-turbo" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gpt-3.5-turbo" - assert 
span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat gpt-3.5-turbo" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gpt-3.5-turbo" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gpt-3.5-turbo" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in span["attributes"] else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 @pytest.mark.parametrize( @@ -307,7 +307,7 @@ async def test_async_nonstreaming_chat_completion( def test_streaming_chat_completion( reset_litellm_executor, sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -318,8 +318,9 @@ def test_streaming_chat_completion( integrations=[LiteLLMIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = 
capture_items("span") messages = [{"role": "user", "content": "Hello!"}] @@ -350,20 +351,18 @@ def test_streaming_chat_completion( streaming_handler.executor.shutdown(wait=True) - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 1 span = chat_spans[0] - assert span["op"] == OP.GEN_AI_CHAT - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True @pytest.mark.asyncio(loop_scope="session") @@ -378,7 +377,7 @@ def test_streaming_chat_completion( ) async def test_async_streaming_chat_completion( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -390,8 +389,9 @@ async def test_async_streaming_chat_completion( integrations=[LiteLLMIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [{"role": "user", "content": "Hello!"}] @@ -425,25 +425,23 @@ async def test_async_streaming_chat_completion( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == 
"auto.ai.litellm" ) assert len(chat_spans) == 1 span = chat_spans[0] - assert span["op"] == OP.GEN_AI_CHAT - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True def test_embeddings_create( sentry_init, - capture_events, + capture_items, get_model_response, openai_embedding_model_response, clear_litellm_cache, @@ -458,8 +456,9 @@ def test_embeddings_create( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = OpenAI(api_key="test-key") @@ -485,32 +484,34 @@ def test_embeddings_create( # Response is processed by litellm, so just check it exists assert response is not None - assert len(events) == 1 - (event,) = events - assert event["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(spans) == 1 span = spans[0] - assert span["op"] == OP.GEN_AI_EMBEDDINGS - assert span["description"] == "embeddings text-embedding-ada-002" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-ada-002" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + assert span["name"] == "embeddings text-embedding-ada-002" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + assert ( + 
span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] + == "text-embedding-ada-002" + ) # Check that embeddings input is captured (it's JSON serialized) - embeddings_input = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + embeddings_input = span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] assert json.loads(embeddings_input) == ["Hello, world!"] @pytest.mark.asyncio(loop_scope="session") async def test_async_embeddings_create( sentry_init, - capture_events, + capture_items, get_model_response, openai_embedding_model_response, clear_litellm_cache, @@ -525,8 +526,9 @@ async def test_async_embeddings_create( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = AsyncOpenAI(api_key="test-key") @@ -553,31 +555,33 @@ async def test_async_embeddings_create( # Response is processed by litellm, so just check it exists assert response is not None - assert len(events) == 1 - (event,) = events - assert event["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(spans) == 1 span = spans[0] - assert span["op"] == OP.GEN_AI_EMBEDDINGS - assert span["description"] == "embeddings text-embedding-ada-002" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-ada-002" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + assert span["name"] == "embeddings text-embedding-ada-002" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == 
"embeddings" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] + == "text-embedding-ada-002" + ) # Check that embeddings input is captured (it's JSON serialized) - embeddings_input = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + embeddings_input = span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] assert json.loads(embeddings_input) == ["Hello, world!"] def test_embeddings_create_with_list_input( sentry_init, - capture_events, + capture_items, get_model_response, openai_embedding_model_response, clear_litellm_cache, @@ -587,8 +591,9 @@ def test_embeddings_create_with_list_input( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = OpenAI(api_key="test-key") @@ -614,22 +619,21 @@ def test_embeddings_create_with_list_input( # Response is processed by litellm, so just check it exists assert response is not None - assert len(events) == 1 - (event,) = events - assert event["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(spans) == 1 span = spans[0] - assert span["op"] == OP.GEN_AI_EMBEDDINGS - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" # Check that list of embeddings input is captured (it's JSON serialized) - embeddings_input = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + embeddings_input = 
span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] assert json.loads(embeddings_input) == [ "First text", "Second text", @@ -640,7 +644,7 @@ def test_embeddings_create_with_list_input( @pytest.mark.asyncio(loop_scope="session") async def test_async_embeddings_create_with_list_input( sentry_init, - capture_events, + capture_items, get_model_response, openai_embedding_model_response, clear_litellm_cache, @@ -650,8 +654,9 @@ async def test_async_embeddings_create_with_list_input( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = AsyncOpenAI(api_key="test-key") @@ -678,22 +683,21 @@ async def test_async_embeddings_create_with_list_input( # Response is processed by litellm, so just check it exists assert response is not None - assert len(events) == 1 - (event,) = events - assert event["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(spans) == 1 span = spans[0] - assert span["op"] == OP.GEN_AI_EMBEDDINGS - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" # Check that list of embeddings input is captured (it's JSON serialized) - embeddings_input = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + embeddings_input = span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] assert json.loads(embeddings_input) == [ "First text", "Second text", @@ -703,7 +707,7 @@ async def test_async_embeddings_create_with_list_input( def 
test_embeddings_no_pii( sentry_init, - capture_events, + capture_items, get_model_response, openai_embedding_model_response, clear_litellm_cache, @@ -713,8 +717,9 @@ def test_embeddings_no_pii( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=False, # PII disabled + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = OpenAI(api_key="test-key") @@ -740,27 +745,26 @@ def test_embeddings_no_pii( # Response is processed by litellm, so just check it exists assert response is not None - assert len(events) == 1 - (event,) = events - assert event["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(spans) == 1 span = spans[0] - assert span["op"] == OP.GEN_AI_EMBEDDINGS + assert span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS # Check that embeddings input is NOT captured when PII is disabled - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["data"] + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["attributes"] @pytest.mark.asyncio(loop_scope="session") async def test_async_embeddings_no_pii( sentry_init, - capture_events, + capture_items, get_model_response, openai_embedding_model_response, clear_litellm_cache, @@ -770,8 +774,9 @@ async def test_async_embeddings_no_pii( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=False, # PII disabled + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = AsyncOpenAI(api_key="test-key") @@ -798,31 +803,31 @@ async def test_async_embeddings_no_pii( # Response is processed by 
litellm, so just check it exists assert response is not None - assert len(events) == 1 - (event,) = events - assert event["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(spans) == 1 span = spans[0] - assert span["op"] == OP.GEN_AI_EMBEDDINGS + assert span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS # Check that embeddings input is NOT captured when PII is disabled - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["data"] + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["attributes"] def test_exception_handling( - reset_litellm_executor, sentry_init, capture_events, get_rate_limit_model_response + reset_litellm_executor, sentry_init, capture_items, get_rate_limit_model_response ): sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("event") messages = [{"role": "user", "content": "Hello!"}] @@ -843,22 +848,25 @@ def test_exception_handling( client=client, ) - # Should have error event and transaction - assert len(events) >= 1 # Find the error event - error_events = [e for e in events if e.get("level") == "error"] + error_events = [ + item.payload + for item in items + if item.type == "event" and item.payload.get("level") == "error" + ] assert len(error_events) == 1 @pytest.mark.asyncio(loop_scope="session") async def test_async_exception_handling( - sentry_init, capture_events, get_rate_limit_model_response + sentry_init, capture_items, get_rate_limit_model_response ): sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = 
capture_items("event") messages = [{"role": "user", "content": "Hello!"}] @@ -879,25 +887,28 @@ async def test_async_exception_handling( client=client, ) - # Should have error event and transaction - assert len(events) >= 1 # Find the error event - error_events = [e for e in events if e.get("level") == "error"] + error_events = [ + item.payload + for item in items + if item.type == "event" and item.payload.get("level") == "error" + ] assert len(error_events) == 1 def test_span_origin( reset_litellm_executor, sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, ): sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [{"role": "user", "content": "Hello!"}] @@ -923,16 +934,17 @@ def test_span_origin( litellm_utils.executor.shutdown(wait=True) - (event,) = events - + (event,) = (item.payload for item in items if item.type == "transaction") assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.litellm" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.litellm" def test_multiple_providers( reset_litellm_executor, sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, nonstreaming_anthropic_model_response, @@ -942,8 +954,9 @@ def test_multiple_providers( sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction") messages = [{"role": "user", "content": "Hello!"}] @@ -1015,18 +1028,19 @@ def test_multiple_providers( litellm_utils.executor.shutdown(wait=True) + events = [item.payload for item in items if item.type == "transaction"] assert len(events) 
== 3 - for i in range(3): - span = events[i]["spans"][0] + spans = [item.payload for item in items if item.type == "span"] + for span in spans: # The provider should be detected by litellm.get_llm_provider - assert SPANDATA.GEN_AI_SYSTEM in span["data"] + assert SPANDATA.GEN_AI_SYSTEM in span["attributes"] @pytest.mark.asyncio(loop_scope="session") async def test_async_multiple_providers( sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, nonstreaming_anthropic_model_response, @@ -1036,8 +1050,9 @@ async def test_async_multiple_providers( sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [{"role": "user", "content": "Hello!"}] @@ -1112,18 +1127,19 @@ async def test_async_multiple_providers( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) + events = [item.payload for item in items if item.type == "transaction"] assert len(events) == 3 - for i in range(3): - span = events[i]["spans"][0] + spans = [item.payload for item in items if item.type == "span"] + for span in spans: # The provider should be detected by litellm.get_llm_provider - assert SPANDATA.GEN_AI_SYSTEM in span["data"] + assert SPANDATA.GEN_AI_SYSTEM in span["attributes"] def test_additional_parameters( reset_litellm_executor, sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, ): @@ -1131,8 +1147,9 @@ def test_additional_parameters( sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [{"role": "user", "content": "Hello!"}] client = OpenAI(api_key="test-key") @@ -1162,26 +1179,27 @@ def test_additional_parameters( litellm_utils.executor.shutdown(wait=True) - 
(event,) = events + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 1 span = chat_spans[0] - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.5 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.5 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.5 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.5 @pytest.mark.asyncio(loop_scope="session") async def test_async_additional_parameters( sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, ): @@ -1189,8 +1207,9 @@ async def test_async_additional_parameters( sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [{"role": "user", "content": "Hello!"}] client = AsyncOpenAI(api_key="test-key") @@ -1221,34 +1240,36 @@ async def test_async_additional_parameters( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - (event,) = events + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if 
x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 1 span = chat_spans[0] - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.5 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.5 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.5 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.5 def test_no_integration( reset_litellm_executor, sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, ): """Test that when integration is not enabled, callbacks don't break.""" sentry_init( traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [{"role": "user", "content": "Hello!"}] client = OpenAI(api_key="test-key") @@ -1273,13 +1294,12 @@ def test_no_integration( litellm_utils.executor.shutdown(wait=True) - (event,) = events - # Should still have the transaction, but no child spans since integration is off - assert event["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 0 @@ -1287,15 +1307,16 @@ def test_no_integration( 
@pytest.mark.asyncio(loop_scope="session") async def test_async_no_integration( sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, ): """Test that when integration is not enabled, callbacks don't break.""" sentry_init( traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [{"role": "user", "content": "Hello!"}] client = AsyncOpenAI(api_key="test-key") @@ -1321,24 +1342,24 @@ async def test_async_no_integration( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - (event,) = events - # Should still have the transaction, but no child spans since integration is off - assert event["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 0 -def test_response_without_usage(sentry_init, capture_events): +def test_response_without_usage(sentry_init, capture_items): """Test handling of responses without usage information.""" sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [{"role": "user", "content": "Hello!"}] @@ -1366,12 +1387,11 @@ def test_response_without_usage(sentry_init, capture_events): datetime.now(), ) - (event,) = events - (span,) = event["spans"] + (span,) = (item.payload for item in items if item.type == "span") # Span should still be created even without usage info - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat gpt-3.5-turbo" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert 
span["name"] == "chat gpt-3.5-turbo" def test_integration_setup(sentry_init): @@ -1379,6 +1399,7 @@ def test_integration_setup(sentry_init): sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) # Check that callbacks are registered @@ -1387,14 +1408,15 @@ def test_integration_setup(sentry_init): assert _failure_callback in (litellm.failure_callback or []) -def test_litellm_message_truncation(sentry_init, capture_events): +def test_litellm_message_truncation(sentry_init, capture_items): """Test that large messages are truncated properly in LiteLLM integration.""" sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") large_content = ( "This is a very long message that will exceed our size limits. " * 1000 @@ -1422,25 +1444,24 @@ def test_litellm_message_truncation(sentry_init, capture_events): datetime.now(), ) - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" - + spans = [item.payload for item in items if item.type == "span"] chat_spans = [ - span for span in tx.get("spans", []) if span.get("op") == OP.GEN_AI_CHAT + span for span in spans if span["attributes"].get("sentry.op") == OP.GEN_AI_CHAT ] assert len(chat_spans) > 0 chat_span = chat_spans[0] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["attributes"] - messages_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + messages_data = chat_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] assert isinstance(messages_data, str) parsed_messages = json.loads(messages_data) assert isinstance(parsed_messages, list) assert len(parsed_messages) == 1 assert "small message 5" in str(parsed_messages[0]) + + tx = next(item.payload for item in items if item.type == 
"transaction") assert tx["_meta"]["spans"]["0"]["data"]["gen_ai.request.messages"][""]["len"] == 5 @@ -1452,7 +1473,7 @@ def test_litellm_message_truncation(sentry_init, capture_events): def test_binary_content_encoding_image_url( reset_litellm_executor, sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, ): @@ -1460,8 +1481,9 @@ def test_binary_content_encoding_image_url( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1498,15 +1520,16 @@ def test_binary_content_encoding_image_url( litellm_utils.executor.shutdown(wait=True) - (event,) = events + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 1 span = chat_spans[0] - messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages_data = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) blob_item = next( ( @@ -1530,7 +1553,7 @@ def test_binary_content_encoding_image_url( @pytest.mark.asyncio(loop_scope="session") async def test_async_binary_content_encoding_image_url( sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, ): @@ -1538,8 +1561,9 @@ async def test_async_binary_content_encoding_image_url( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1577,15 +1601,16 @@ async def 
test_async_binary_content_encoding_image_url( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - (event,) = events + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 1 span = chat_spans[0] - messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages_data = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) blob_item = next( ( @@ -1609,7 +1634,7 @@ async def test_async_binary_content_encoding_image_url( def test_binary_content_encoding_mixed_content( reset_litellm_executor, sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, ): @@ -1617,8 +1642,9 @@ def test_binary_content_encoding_mixed_content( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1656,15 +1682,16 @@ def test_binary_content_encoding_mixed_content( litellm_utils.executor.shutdown(wait=True) - (event,) = events + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 1 span = chat_spans[0] - messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages_data = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content_items = [ item for msg in messages_data if "content" in msg for item in msg["content"] @@ 
-1676,7 +1703,7 @@ def test_binary_content_encoding_mixed_content( @pytest.mark.asyncio(loop_scope="session") async def test_async_binary_content_encoding_mixed_content( sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, ): @@ -1684,8 +1711,9 @@ async def test_async_binary_content_encoding_mixed_content( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1724,15 +1752,16 @@ async def test_async_binary_content_encoding_mixed_content( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - (event,) = events + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 1 span = chat_spans[0] - messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages_data = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content_items = [ item for msg in messages_data if "content" in msg for item in msg["content"] @@ -1744,7 +1773,7 @@ async def test_async_binary_content_encoding_mixed_content( def test_binary_content_encoding_uri_type( reset_litellm_executor, sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, ): @@ -1752,8 +1781,9 @@ def test_binary_content_encoding_uri_type( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1789,15 +1819,16 @@ def 
test_binary_content_encoding_uri_type( litellm_utils.executor.shutdown(wait=True) - (event,) = events + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 1 span = chat_spans[0] - messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages_data = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) uri_item = next( ( @@ -1816,7 +1847,7 @@ def test_binary_content_encoding_uri_type( @pytest.mark.asyncio(loop_scope="session") async def test_async_binary_content_encoding_uri_type( sentry_init, - capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, ): @@ -1824,8 +1855,9 @@ async def test_async_binary_content_encoding_uri_type( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") messages = [ { @@ -1862,15 +1894,16 @@ async def test_async_binary_content_encoding_uri_type( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - (event,) = events + spans = [item.payload for item in items if item.type == "span"] chat_spans = list( x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) assert len(chat_spans) == 1 span = chat_spans[0] - messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages_data = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) uri_item = next( ( diff --git a/tests/integrations/openai/test_openai.py 
b/tests/integrations/openai/test_openai.py index ada2e633de..c4d77db5c8 100644 --- a/tests/integrations/openai/test_openai.py +++ b/tests/integrations/openai/test_openai.py @@ -132,14 +132,15 @@ async def __call__(self, *args, **kwargs): ], ) def test_nonstreaming_chat_completion_no_prompts( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): sentry_init( integrations=[OpenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock(return_value=EXAMPLE_CHAT_COMPLETION) @@ -163,27 +164,26 @@ def test_nonstreaming_chat_completion_no_prompts( ) assert response == "the model response" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert 
span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.parametrize( @@ -229,13 +229,14 @@ def test_nonstreaming_chat_completion_no_prompts( ), ], ) -def test_nonstreaming_chat_completion(sentry_init, capture_events, messages, request): +def test_nonstreaming_chat_completion(sentry_init, capture_items, messages, request): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock(return_value=EXAMPLE_CHAT_COMPLETION) @@ -256,30 +257,29 @@ def test_nonstreaming_chat_completion(sentry_init, capture_events, messages, req ) assert response == "the model response" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - - 
assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 param_id = request.node.callspec.id if "blocks" in param_id: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ { "type": "text", "content": "You are a helpful assistant.", } ] else: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ { "type": "text", "content": "You are a helpful assistant.", @@ -290,12 +290,12 @@ def test_nonstreaming_chat_completion(sentry_init, capture_events, messages, req }, ] - assert "hello" in span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "hello" in span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert "the model response" in 
span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.asyncio @@ -308,14 +308,15 @@ def test_nonstreaming_chat_completion(sentry_init, capture_events, messages, req ], ) async def test_nonstreaming_chat_completion_async_no_prompts( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): sentry_init( integrations=[OpenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") client.chat.completions._post = mock.AsyncMock(return_value=EXAMPLE_CHAT_COMPLETION) @@ -336,27 +337,26 @@ async def test_nonstreaming_chat_completion_async_no_prompts( response = response.choices[0].message.content assert response == "the model response" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + span = next(item.payload for item in items if item.type == "span") + assert 
span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.asyncio @@ -404,14 +404,15 @@ async def test_nonstreaming_chat_completion_async_no_prompts( ], ) async def test_nonstreaming_chat_completion_async( - sentry_init, capture_events, messages, request + sentry_init, capture_items, messages, request ): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") client.chat.completions._post = AsyncMock(return_value=EXAMPLE_CHAT_COMPLETION) @@ -429,30 +430,29 @@ 
async def test_nonstreaming_chat_completion_async( response = response.choices[0].message.content assert response == "the model response" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 param_id = request.node.callspec.id if "blocks" in param_id: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ { "type": "text", "content": "You are a helpful assistant.", } ] else: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ { "type": "text", "content": "You are a helpful assistant.", 
@@ -463,12 +463,12 @@ async def test_nonstreaming_chat_completion_async( }, ] - assert "hello" in span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "hello" in span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert "the model response" in span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 def tiktoken_encoding_if_installed(): @@ -491,7 +491,7 @@ def tiktoken_encoding_if_installed(): ) def test_streaming_chat_completion_no_prompts( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -506,8 +506,9 @@ def test_streaming_chat_completion_no_prompts( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -581,32 +582,31 @@ def test_streaming_chat_completion_no_prompts( ) assert response_string == "hello world" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert 
span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" - - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" + + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] try: import tiktoken # type: ignore # noqa # pylint: disable=unused-import - assert span["data"]["gen_ai.usage.output_tokens"] == 2 - assert span["data"]["gen_ai.usage.input_tokens"] == 7 - assert span["data"]["gen_ai.usage.total_tokens"] == 9 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 2 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 7 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 9 except ImportError: pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly @@ -617,7 +617,7 @@ def test_streaming_chat_completion_no_prompts( ) def test_streaming_chat_completion_with_usage_in_stream( sentry_init, - 
capture_events, + capture_items, get_model_response, server_side_event_chunks, ): @@ -626,8 +626,9 @@ def test_streaming_chat_completion_with_usage_in_stream( integrations=[OpenAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=False, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -684,13 +685,11 @@ def test_streaming_chat_completion_with_usage_in_stream( for _ in response_stream: pass - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.skipif( @@ -699,7 +698,7 @@ def test_streaming_chat_completion_with_usage_in_stream( ) def test_streaming_chat_completion_empty_content_preserves_token_usage( sentry_init, - capture_events, + capture_items, get_model_response, server_side_event_chunks, ): @@ -708,8 +707,9 @@ def test_streaming_chat_completion_empty_content_preserves_token_usage( integrations=[OpenAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=False, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -747,13 +747,11 @@ def test_streaming_chat_completion_empty_content_preserves_token_usage( for _ in response_stream: pass - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - 
assert span["op"] == "gen_ai.chat" - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert "gen_ai.usage.output_tokens" not in span["data"] - assert span["data"]["gen_ai.usage.total_tokens"] == 20 + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert "gen_ai.usage.output_tokens" not in span["attributes"] + assert span["attributes"]["gen_ai.usage.total_tokens"] == 20 @pytest.mark.skipif( @@ -763,7 +761,7 @@ def test_streaming_chat_completion_empty_content_preserves_token_usage( @pytest.mark.asyncio async def test_streaming_chat_completion_empty_content_preserves_token_usage_async( sentry_init, - capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, @@ -773,8 +771,9 @@ async def test_streaming_chat_completion_empty_content_preserves_token_usage_asy integrations=[OpenAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=False, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( @@ -814,13 +813,11 @@ async def test_streaming_chat_completion_empty_content_preserves_token_usage_asy async for _ in response_stream: pass - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert "gen_ai.usage.output_tokens" not in span["data"] - assert span["data"]["gen_ai.usage.total_tokens"] == 20 + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert "gen_ai.usage.output_tokens" not in span["attributes"] + assert span["attributes"]["gen_ai.usage.total_tokens"] == 20 
@pytest.mark.skipif( @@ -830,7 +827,7 @@ async def test_streaming_chat_completion_empty_content_preserves_token_usage_asy @pytest.mark.asyncio async def test_streaming_chat_completion_async_with_usage_in_stream( sentry_init, - capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, @@ -840,8 +837,9 @@ async def test_streaming_chat_completion_async_with_usage_in_stream( integrations=[OpenAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=False, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( @@ -900,13 +898,11 @@ async def test_streaming_chat_completion_async_with_usage_in_stream( async for _ in response_stream: pass - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 # noinspection PyTypeChecker @@ -955,7 +951,7 @@ async def test_streaming_chat_completion_async_with_usage_in_stream( ) def test_streaming_chat_completion( sentry_init, - capture_events, + capture_items, messages, request, get_model_response, @@ -970,8 +966,9 @@ def test_streaming_chat_completion( ], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -1041,30 +1038,29 @@ def 
test_streaming_chat_completion( map(lambda x: x.choices[0].delta.content, response_stream) ) assert response_string == "hello world" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 param_id = request.node.callspec.id if "blocks" in param_id: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ { "type": "text", "content": "You are a helpful assistant.", } ] else: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ { "type": "text", "content": "You are a helpful assistant.", @@ 
-1075,22 +1071,22 @@ def test_streaming_chat_completion( }, ] - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" - assert "hello" in span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert "hello world" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "hello" in span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert "hello world" in span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] try: import tiktoken # type: ignore # noqa # pylint: disable=unused-import if "blocks" in param_id: - assert span["data"]["gen_ai.usage.output_tokens"] == 2 - assert span["data"]["gen_ai.usage.input_tokens"] == 7 - assert span["data"]["gen_ai.usage.total_tokens"] == 9 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 2 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 7 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 9 else: - assert span["data"]["gen_ai.usage.output_tokens"] == 2 - assert span["data"]["gen_ai.usage.input_tokens"] == 12 - assert span["data"]["gen_ai.usage.total_tokens"] == 14 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 2 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 12 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 14 except ImportError: pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly @@ -1107,7 +1103,7 @@ def test_streaming_chat_completion( ) async def test_streaming_chat_completion_async_no_prompts( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -1123,8 +1119,9 @@ async def test_streaming_chat_completion_async_no_prompts( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( @@ -1201,32 +1198,31 @@ 
async def test_streaming_chat_completion_async_no_prompts( response_string += x.choices[0].delta.content assert response_string == "hello world" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" - - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" + + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert 
SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] try: import tiktoken # type: ignore # noqa # pylint: disable=unused-import - assert span["data"]["gen_ai.usage.output_tokens"] == 2 - assert span["data"]["gen_ai.usage.input_tokens"] == 7 - assert span["data"]["gen_ai.usage.total_tokens"] == 9 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 2 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 7 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 9 except ImportError: pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly @@ -1279,7 +1275,7 @@ async def test_streaming_chat_completion_async_no_prompts( ) async def test_streaming_chat_completion_async( sentry_init, - capture_events, + capture_items, messages, request, get_model_response, @@ -1295,8 +1291,9 @@ async def test_streaming_chat_completion_async( ], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") @@ -1371,32 +1368,31 @@ async def test_streaming_chat_completion_async( response_string += x.choices[0].delta.content assert response_string == "hello world" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" + + span = next(item.payload for item in 
items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" param_id = request.node.callspec.id if "blocks" in param_id: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ { "type": "text", "content": "You are a helpful assistant.", } ] else: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ { "type": "text", "content": "You are a helpful assistant.", @@ -1407,28 +1403,32 @@ async def test_streaming_chat_completion_async( }, ] - assert "hello" in span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert "hello world" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "hello" in span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert "hello world" in span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] try: import tiktoken # type: ignore # noqa # pylint: disable=unused-import if "blocks" in param_id: - assert span["data"]["gen_ai.usage.output_tokens"] == 2 - assert span["data"]["gen_ai.usage.input_tokens"] == 7 - assert span["data"]["gen_ai.usage.total_tokens"] == 9 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 2 + assert 
span["attributes"]["gen_ai.usage.input_tokens"] == 7 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 9 else: - assert span["data"]["gen_ai.usage.output_tokens"] == 2 - assert span["data"]["gen_ai.usage.input_tokens"] == 12 - assert span["data"]["gen_ai.usage.total_tokens"] == 14 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 2 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 12 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 14 except ImportError: pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly -def test_bad_chat_completion(sentry_init, capture_events): - sentry_init(integrations=[OpenAIIntegration()], traces_sample_rate=1.0) - events = capture_events() +def test_bad_chat_completion(sentry_init, capture_items): + sentry_init( + integrations=[OpenAIIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("event") client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock( @@ -1440,13 +1440,17 @@ def test_bad_chat_completion(sentry_init, capture_events): messages=[{"role": "system", "content": "hello"}], ) - (event,) = events + (event,) = (item.payload for item in items if item.type == "event") assert event["level"] == "error" -def test_span_status_error(sentry_init, capture_events): - sentry_init(integrations=[OpenAIIntegration()], traces_sample_rate=1.0) - events = capture_events() +def test_span_status_error(sentry_init, capture_items): + sentry_init( + integrations=[OpenAIIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("event", "transaction", "span") with start_transaction(name="test"): client = OpenAI(api_key="z") @@ -1458,17 +1462,24 @@ def test_span_status_error(sentry_init, capture_events): model="some-model", messages=[{"role": "system", "content": "hello"}] ) - (error, transaction) = events - assert error["level"] == "error" - 
assert transaction["spans"][0]["status"] == "internal_error" - assert transaction["spans"][0]["tags"]["status"] == "internal_error" + (event,) = (item.payload for item in items if item.type == "event") + assert event["level"] == "error" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["status"] == "error" + + (transaction,) = (item.payload for item in items if item.type == "transaction") assert transaction["contexts"]["trace"]["status"] == "internal_error" @pytest.mark.asyncio -async def test_bad_chat_completion_async(sentry_init, capture_events): - sentry_init(integrations=[OpenAIIntegration()], traces_sample_rate=1.0) - events = capture_events() +async def test_bad_chat_completion_async(sentry_init, capture_items): + sentry_init( + integrations=[OpenAIIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("event") client = AsyncOpenAI(api_key="z") client.chat.completions._post = AsyncMock( @@ -1479,7 +1490,7 @@ async def test_bad_chat_completion_async(sentry_init, capture_events): model="some-model", messages=[{"role": "system", "content": "hello"}] ) - (event,) = events + (event,) = (item.payload for item in items if item.type == "event") assert event["level"] == "error" @@ -1492,14 +1503,15 @@ async def test_bad_chat_completion_async(sentry_init, capture_events): ], ) def test_embeddings_create_no_pii( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): sentry_init( integrations=[OpenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") @@ -1521,17 +1533,15 @@ def test_embeddings_create_no_pii( assert len(response.data[0].embedding) == 3 - tx = events[0] - assert tx["type"] == "transaction" - span = 
tx["spans"][0] - assert span["op"] == "gen_ai.embeddings" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.embeddings" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["data"] + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["attributes"] - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.parametrize( @@ -1577,13 +1587,14 @@ def test_embeddings_create_no_pii( ), ], ) -def test_embeddings_create(sentry_init, capture_events, input, request): +def test_embeddings_create(sentry_init, capture_items, input, request): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") @@ -1603,24 +1614,24 @@ def test_embeddings_create(sentry_init, capture_events, input, request): assert len(response.data[0].embedding) == 3 - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.embeddings" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.embeddings" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] 
== "text-embedding-3-large" param_id = request.node.callspec.id if param_id == "string": - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == ["hello"] + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + "hello" + ] elif param_id == "string_sequence" or param_id == "string_iterable": - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ "First text", "Second text", "Third text", ] elif param_id == "tokens" or param_id == "token_iterable": - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ 5, 8, 13, @@ -1628,13 +1639,13 @@ def test_embeddings_create(sentry_init, capture_events, input, request): 34, ] else: - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ [5, 8, 13, 21, 34], [8, 13, 21, 34, 55], ] - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.asyncio @@ -1647,14 +1658,15 @@ def test_embeddings_create(sentry_init, capture_events, input, request): ], ) async def test_embeddings_create_async_no_pii( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): sentry_init( integrations=[OpenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") @@ -1676,17 +1688,15 @@ async def test_embeddings_create_async_no_pii( assert len(response.data[0].embedding) == 3 - tx = 
events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.embeddings" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.embeddings" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["data"] + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["attributes"] - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.asyncio @@ -1733,13 +1743,14 @@ async def test_embeddings_create_async_no_pii( ), ], ) -async def test_embeddings_create_async(sentry_init, capture_events, input, request): +async def test_embeddings_create_async(sentry_init, capture_items, input, request): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") @@ -1761,24 +1772,24 @@ async def test_embeddings_create_async(sentry_init, capture_events, input, reque assert len(response.data[0].embedding) == 3 - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.embeddings" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.embeddings" + assert 
span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" param_id = request.node.callspec.id if param_id == "string": - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == ["hello"] + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + "hello" + ] elif param_id == "string_sequence" or param_id == "string_iterable": - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ "First text", "Second text", "Third text", ] elif param_id == "tokens" or param_id == "token_iterable": - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ 5, 8, 13, @@ -1786,13 +1797,13 @@ async def test_embeddings_create_async(sentry_init, capture_events, input, reque 34, ] else: - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ [5, 8, 13, 21, 34], [8, 13, 21, 34, 55], ] - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.parametrize( @@ -1800,14 +1811,15 @@ async def test_embeddings_create_async(sentry_init, capture_events, input, reque [(True, True), (True, False), (False, True), (False, False)], ) def test_embeddings_create_raises_error( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): sentry_init( integrations=[OpenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + 
items = capture_items("event") client = OpenAI(api_key="z") @@ -1818,7 +1830,7 @@ def test_embeddings_create_raises_error( with pytest.raises(OpenAIError): client.embeddings.create(input="hello", model="text-embedding-3-large") - (event,) = events + (event,) = (item.payload for item in items if item.type == "event") assert event["level"] == "error" @@ -1828,14 +1840,15 @@ def test_embeddings_create_raises_error( [(True, True), (True, False), (False, True), (False, False)], ) async def test_embeddings_create_raises_error_async( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): sentry_init( integrations=[OpenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("event") client = AsyncOpenAI(api_key="z") @@ -1846,16 +1859,17 @@ async def test_embeddings_create_raises_error_async( with pytest.raises(OpenAIError): await client.embeddings.create(input="hello", model="text-embedding-3-large") - (event,) = events + (event,) = (item.payload for item in items if item.type == "event") assert event["level"] == "error" -def test_span_origin_nonstreaming_chat(sentry_init, capture_events): +def test_span_origin_nonstreaming_chat(sentry_init, capture_items): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock(return_value=EXAMPLE_CHAT_COMPLETION) @@ -1865,19 +1879,21 @@ def test_span_origin_nonstreaming_chat(sentry_init, capture_events): model="some-model", messages=[{"role": "system", "content": "hello"}] ) - (event,) = events - + (event,) = (item.payload for item in items if item.type == "transaction") assert 
event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.openai" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.openai" @pytest.mark.asyncio -async def test_span_origin_nonstreaming_chat_async(sentry_init, capture_events): +async def test_span_origin_nonstreaming_chat_async(sentry_init, capture_items): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = AsyncOpenAI(api_key="z") client.chat.completions._post = AsyncMock(return_value=EXAMPLE_CHAT_COMPLETION) @@ -1887,18 +1903,20 @@ async def test_span_origin_nonstreaming_chat_async(sentry_init, capture_events): model="some-model", messages=[{"role": "system", "content": "hello"}] ) - (event,) = events - + (event,) = (item.payload for item in items if item.type == "transaction") assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.openai" + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.openai" -def test_span_origin_streaming_chat(sentry_init, capture_events): + +def test_span_origin_streaming_chat(sentry_init, capture_items): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = OpenAI(api_key="z") returned_stream = Stream(cast_to=None, response=None, client=client) @@ -1946,21 +1964,23 @@ def test_span_origin_streaming_chat(sentry_init, capture_events): "".join(map(lambda x: x.choices[0].delta.content, response_stream)) - (event,) = events + (transaction,) = (item.payload for item in items if item.type == "transaction") + assert transaction["contexts"]["trace"]["origin"] 
== "manual" - assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.openai" + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.openai" @pytest.mark.asyncio async def test_span_origin_streaming_chat_async( - sentry_init, capture_events, async_iterator + sentry_init, capture_items, async_iterator ): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = AsyncOpenAI(api_key="z") returned_stream = AsyncStream(cast_to=None, response=None, client=client) @@ -2014,18 +2034,20 @@ async def test_span_origin_streaming_chat_async( # "".join(map(lambda x: x.choices[0].delta.content, response_stream)) - (event,) = events - + (event,) = (item.payload for item in items if item.type == "transaction") assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.openai" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.openai" -def test_span_origin_embeddings(sentry_init, capture_events): +def test_span_origin_embeddings(sentry_init, capture_items): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = OpenAI(api_key="z") @@ -2043,19 +2065,21 @@ def test_span_origin_embeddings(sentry_init, capture_events): with start_transaction(name="openai tx"): client.embeddings.create(input="hello", model="text-embedding-3-large") - (event,) = events - + (event,) = [item.payload for item in items if item.type == "transaction"] assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.openai" + + spans = 
[item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.openai" @pytest.mark.asyncio -async def test_span_origin_embeddings_async(sentry_init, capture_events): +async def test_span_origin_embeddings_async(sentry_init, capture_items): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = AsyncOpenAI(api_key="z") @@ -2073,10 +2097,11 @@ async def test_span_origin_embeddings_async(sentry_init, capture_events): with start_transaction(name="openai tx"): await client.embeddings.create(input="hello", model="text-embedding-3-large") - (event,) = events - + (event,) = [item.payload for item in items if item.type == "transaction"] assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.openai" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.openai" def test_completions_token_usage_from_response(): @@ -2442,12 +2467,13 @@ def count_tokens(msg): @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") -def test_ai_client_span_responses_api_no_pii(sentry_init, capture_events): +def test_ai_client_span_responses_api_no_pii(sentry_init, capture_items): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") client.responses._post = mock.Mock(return_value=EXAMPLE_RESPONSE) @@ -2462,13 +2488,10 @@ def test_ai_client_span_responses_api_no_pii(sentry_init, capture_events): top_p=0.9, ) - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] assert len(spans) == 1 - assert spans[0]["op"] == "gen_ai.responses" - 
assert spans[0]["origin"] == "auto.ai.openai" - assert spans[0]["data"] == { + assert spans[0]["attributes"] == { "gen_ai.operation.name": "responses", "gen_ai.request.max_tokens": 100, "gen_ai.request.temperature": 0.7, @@ -2482,13 +2505,21 @@ def test_ai_client_span_responses_api_no_pii(sentry_init, capture_events): "gen_ai.usage.output_tokens": 10, "gen_ai.usage.output_tokens.reasoning": 8, "gen_ai.usage.total_tokens": 30, + "sentry.environment": "production", + "sentry.op": "gen_ai.responses", + "sentry.origin": "auto.ai.openai", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "openai tx", "thread.id": mock.ANY, "thread.name": mock.ANY, } - assert "gen_ai.system_instructions" not in spans[0]["data"] - assert "gen_ai.request.messages" not in spans[0]["data"] - assert "gen_ai.response.text" not in spans[0]["data"] + assert "gen_ai.system_instructions" not in spans[0]["attributes"] + assert "gen_ai.request.messages" not in spans[0]["attributes"] + assert "gen_ai.response.text" not in spans[0]["attributes"] @pytest.mark.parametrize( @@ -2557,14 +2588,15 @@ def test_ai_client_span_responses_api_no_pii(sentry_init, capture_events): ) @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") def test_ai_client_span_responses_api( - sentry_init, capture_events, instructions, input, request + sentry_init, capture_items, instructions, input, request ): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") client.responses._post = mock.Mock(return_value=EXAMPLE_RESPONSE) @@ -2579,12 +2611,9 @@ def test_ai_client_span_responses_api( top_p=0.9, ) - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if 
item.type == "span"] assert len(spans) == 1 - assert spans[0]["op"] == "gen_ai.responses" - assert spans[0]["origin"] == "auto.ai.openai" expected_data = { "gen_ai.operation.name": "responses", @@ -2601,6 +2630,14 @@ def test_ai_client_span_responses_api( "gen_ai.usage.total_tokens": 30, "gen_ai.request.model": "gpt-4o", "gen_ai.response.text": "the model response", + "sentry.environment": "production", + "sentry.op": "gen_ai.responses", + "sentry.origin": "auto.ai.openai", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "openai tx", "thread.id": mock.ANY, "thread.name": mock.ANY, } @@ -2759,17 +2796,18 @@ def test_ai_client_span_responses_api( } ) - assert spans[0]["data"] == expected_data + assert spans[0]["attributes"] == expected_data @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") -def test_error_in_responses_api(sentry_init, capture_events): +def test_error_in_responses_api(sentry_init, capture_items): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("event", "transaction", "span") client = OpenAI(api_key="z") client.responses._post = mock.Mock( @@ -2784,15 +2822,17 @@ def test_error_in_responses_api(sentry_init, capture_events): input="How do I check if a Python object is an instance of a class?", ) - (error_event, transaction_event) = events - - assert transaction_event["type"] == "transaction" # make sure the span where the error occurred is captured - assert transaction_event["spans"][0]["op"] == "gen_ai.responses" + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.op"] == "gen_ai.responses" + (error_event,) = (item.payload for item in items if item.type == "event") assert error_event["level"] == 
"error" assert error_event["exception"]["values"][0]["type"] == "OpenAIError" + (transaction_event,) = ( + item.payload for item in items if item.type == "transaction" + ) assert ( error_event["contexts"]["trace"]["trace_id"] == transaction_event["contexts"]["trace"]["trace_id"] @@ -2866,14 +2906,15 @@ def test_error_in_responses_api(sentry_init, capture_events): ], ) async def test_ai_client_span_responses_async_api( - sentry_init, capture_events, instructions, input, request + sentry_init, capture_items, instructions, input, request ): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") client.responses._post = AsyncMock(return_value=EXAMPLE_RESPONSE) @@ -2888,12 +2929,9 @@ async def test_ai_client_span_responses_async_api( top_p=0.9, ) - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] assert len(spans) == 1 - assert spans[0]["op"] == "gen_ai.responses" - assert spans[0]["origin"] == "auto.ai.openai" expected_data = { "gen_ai.operation.name": "responses", @@ -2911,6 +2949,14 @@ async def test_ai_client_span_responses_async_api( "gen_ai.usage.output_tokens.reasoning": 8, "gen_ai.usage.total_tokens": 30, "gen_ai.response.text": "the model response", + "sentry.environment": "production", + "sentry.op": "gen_ai.responses", + "sentry.origin": "auto.ai.openai", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "openai tx", "thread.id": mock.ANY, "thread.name": mock.ANY, } @@ -3069,7 +3115,7 @@ async def test_ai_client_span_responses_async_api( } ) - assert spans[0]["data"] == expected_data + assert spans[0]["attributes"] == expected_data @pytest.mark.asyncio @@ -3140,7 +3186,7 @@ async 
def test_ai_client_span_responses_async_api( @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") async def test_ai_client_span_streaming_responses_async_api( sentry_init, - capture_events, + capture_items, instructions, input, request, @@ -3152,8 +3198,9 @@ async def test_ai_client_span_streaming_responses_async_api( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( @@ -3178,11 +3225,12 @@ async def test_ai_client_span_streaming_responses_async_api( async for _ in result: pass - (transaction,) = events - spans = [span for span in transaction["spans"] if span["op"] == OP.GEN_AI_RESPONSES] + spans = [item.payload for item in items if item.type == "span"] + spans = [ + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_RESPONSES + ] assert len(spans) == 1 - assert spans[0]["origin"] == "auto.ai.openai" expected_data = { "gen_ai.operation.name": "responses", @@ -3200,6 +3248,14 @@ async def test_ai_client_span_streaming_responses_async_api( "gen_ai.usage.total_tokens": 30, "gen_ai.request.model": "gpt-4o", "gen_ai.response.text": "hello world", + "sentry.environment": "production", + "sentry.op": "gen_ai.responses", + "sentry.origin": "auto.ai.openai", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "openai tx", "thread.id": mock.ANY, "thread.name": mock.ANY, } @@ -3358,18 +3414,19 @@ async def test_ai_client_span_streaming_responses_async_api( } ) - assert spans[0]["data"] == expected_data + assert spans[0]["attributes"] == expected_data @pytest.mark.asyncio @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") -async def 
test_error_in_responses_async_api(sentry_init, capture_events): +async def test_error_in_responses_async_api(sentry_init, capture_items): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("event", "transaction", "span") client = AsyncOpenAI(api_key="z") client.responses._post = AsyncMock( @@ -3384,15 +3441,17 @@ async def test_error_in_responses_async_api(sentry_init, capture_events): input="How do I check if a Python object is an instance of a class?", ) - (error_event, transaction_event) = events - - assert transaction_event["type"] == "transaction" # make sure the span where the error occurred is captured - assert transaction_event["spans"][0]["op"] == "gen_ai.responses" + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.op"] == "gen_ai.responses" + (error_event,) = (item.payload for item in items if item.type == "event") assert error_event["level"] == "error" assert error_event["exception"]["values"][0]["type"] == "OpenAIError" + (transaction_event,) = ( + item.payload for item in items if item.type == "transaction" + ) assert ( error_event["contexts"]["trace"]["trace_id"] == transaction_event["contexts"]["trace"]["trace_id"] @@ -3479,7 +3538,7 @@ async def test_error_in_responses_async_api(sentry_init, capture_events): @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") def test_streaming_responses_api( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -3493,8 +3552,9 @@ def test_streaming_responses_api( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -3525,26 +3585,25 @@ 
def test_streaming_responses_api( assert response_string == "hello world" - (transaction,) = events - (span,) = transaction["spans"] - assert span["op"] == "gen_ai.responses" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + (span,) = (item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.responses" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "response-model-id" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "response-model-id" if send_default_pii and include_prompts: - assert span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '["hello"]' - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "hello world" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '["hello"]' + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "hello world" else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.asyncio @@ -3555,7 +3614,7 @@ def 
test_streaming_responses_api( @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") async def test_streaming_responses_api_async( sentry_init, - capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, @@ -3570,8 +3629,9 @@ async def test_streaming_responses_api_async( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( @@ -3600,26 +3660,25 @@ async def test_streaming_responses_api_async( assert response_string == "hello world" - (transaction,) = events - (span,) = transaction["spans"] - assert span["op"] == "gen_ai.responses" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + (span,) = (item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.responses" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "response-model-id" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "response-model-id" if send_default_pii and include_prompts: - assert span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '["hello"]' - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "hello world" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '["hello"]' + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "hello world" else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - 
assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.skipif( @@ -3630,12 +3689,13 @@ async def test_streaming_responses_api_async( "tools", [[], None, NOT_GIVEN, omit], ) -def test_empty_tools_in_chat_completion(sentry_init, capture_events, tools): +def test_empty_tools_in_chat_completion(sentry_init, capture_items, tools): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock(return_value=EXAMPLE_CHAT_COMPLETION) @@ -3647,10 +3707,9 @@ def test_empty_tools_in_chat_completion(sentry_init, capture_events, tools): tools=tools, ) - (event,) = events - span = event["spans"][0] + span = next(item.payload for item in items if item.type == "span") - assert "gen_ai.request.available_tools" not in span["data"] + assert "gen_ai.request.available_tools" not in span["attributes"] # Test messages with mixed roles including "ai" that should be mapped to "assistant" @@ -3669,7 +3728,7 @@ def test_empty_tools_in_chat_completion(sentry_init, capture_events, tools): ], ) def test_openai_message_role_mapping( - sentry_init, capture_events, test_message, expected_role + sentry_init, capture_items, test_message, expected_role ): """Test that OpenAI integration properly maps message roles like 'ai' to 'assistant'""" @@ -3677,8 +3736,9 @@ def test_openai_message_role_mapping( 
integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock(return_value=EXAMPLE_CHAT_COMPLETION) @@ -3688,28 +3748,28 @@ def test_openai_message_role_mapping( with start_transaction(name="openai tx"): client.chat.completions.create(model="test-model", messages=test_messages) # Verify that the span was created correctly - (event,) = events - span = event["spans"][0] - assert span["op"] == "gen_ai.chat" - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] # Parse the stored messages import json - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + stored_messages = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(stored_messages) == 1 assert stored_messages[0]["role"] == expected_role -def test_openai_message_truncation(sentry_init, capture_events): +def test_openai_message_truncation(sentry_init, capture_items): """Test that large messages are truncated properly in OpenAI integration.""" sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock(return_value=EXAMPLE_CHAT_COMPLETION) @@ -3730,17 +3790,17 @@ def test_openai_message_truncation(sentry_init, capture_events): messages=large_messages, ) - (event,) = events - span = event["spans"][0] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + span = next(item.payload for item in items if item.type 
== "span") + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] - messages_data = span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + messages_data = span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] assert isinstance(messages_data, str) parsed_messages = json.loads(messages_data) assert isinstance(parsed_messages, list) assert len(parsed_messages) <= len(large_messages) + (event,) = (item.payload for item in items if item.type == "transaction") meta_path = event["_meta"] span_meta = meta_path["spans"]["0"]["data"] messages_meta = span_meta[SPANDATA.GEN_AI_REQUEST_MESSAGES] @@ -3749,7 +3809,7 @@ def test_openai_message_truncation(sentry_init, capture_events): # noinspection PyTypeChecker def test_streaming_chat_completion_ttft( - sentry_init, capture_events, get_model_response, server_side_event_chunks + sentry_init, capture_items, get_model_response, server_side_event_chunks ): """ Test that streaming chat completions capture time-to-first-token (TTFT). @@ -3757,8 +3817,9 @@ def test_streaming_chat_completion_ttft( sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -3810,13 +3871,12 @@ def test_streaming_chat_completion_ttft( for _ in response_stream: pass - (tx,) = events - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" # Verify TTFT is captured - assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] - ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["attributes"] + ttft = span["attributes"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] assert isinstance(ttft, float) assert ttft > 0 @@ -3825,7 +3885,7 @@ def 
test_streaming_chat_completion_ttft( @pytest.mark.asyncio async def test_streaming_chat_completion_ttft_async( sentry_init, - capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, @@ -3836,8 +3896,9 @@ async def test_streaming_chat_completion_ttft_async( sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( @@ -3891,13 +3952,12 @@ async def test_streaming_chat_completion_ttft_async( async for _ in response_stream: pass - (tx,) = events - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" # Verify TTFT is captured - assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] - ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["attributes"] + ttft = span["attributes"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] assert isinstance(ttft, float) assert ttft > 0 @@ -3905,7 +3965,7 @@ async def test_streaming_chat_completion_ttft_async( # noinspection PyTypeChecker @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") def test_streaming_responses_api_ttft( - sentry_init, capture_events, get_model_response, server_side_event_chunks + sentry_init, capture_items, get_model_response, server_side_event_chunks ): """ Test that streaming responses API captures time-to-first-token (TTFT). 
@@ -3913,8 +3973,9 @@ def test_streaming_responses_api_ttft( sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -3936,13 +3997,12 @@ def test_streaming_responses_api_ttft( for _ in response_stream: pass - (tx,) = events - span = tx["spans"][0] - assert span["op"] == "gen_ai.responses" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.responses" # Verify TTFT is captured - assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] - ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["attributes"] + ttft = span["attributes"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] assert isinstance(ttft, float) assert ttft > 0 @@ -3952,7 +4012,7 @@ def test_streaming_responses_api_ttft( @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") async def test_streaming_responses_api_ttft_async( sentry_init, - capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, @@ -3963,8 +4023,9 @@ async def test_streaming_responses_api_ttft_async( sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( @@ -3986,12 +4047,11 @@ async def test_streaming_responses_api_ttft_async( async for _ in response_stream: pass - (tx,) = events - span = tx["spans"][0] - assert span["op"] == "gen_ai.responses" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.responses" # Verify TTFT is captured - assert 
SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] - ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["attributes"] + ttft = span["attributes"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] assert isinstance(ttft, float) assert ttft > 0 diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py index 7310e86df5..9e74848a04 100644 --- a/tests/integrations/openai_agents/test_openai_agents.py +++ b/tests/integrations/openai_agents/test_openai_agents.py @@ -160,7 +160,7 @@ def test_agent_custom_model(): @pytest.mark.asyncio async def test_agent_invocation_span_no_pii( sentry_init, - capture_events, + capture_items, test_agent, nonstreaming_responses_model_response, get_model_response, @@ -182,9 +182,10 @@ async def test_agent_invocation_span_no_pii( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=False, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") result = await agents.Runner.run( agent, "Test input", run_config=test_run_config @@ -193,38 +194,44 @@ async def test_agent_invocation_span_no_pii( assert result is not None assert result.final_output == "Hello, how can I help you?" 
- (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] invoke_agent_span = next( - span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT ) - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + transactions = [item.payload for item in items if item.type == "transaction"] + assert len(transactions) == 1 + transaction = transactions[0] assert transaction["transaction"] == "test_agent workflow" assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - assert invoke_agent_span["description"] == "invoke_agent test_agent" + assert invoke_agent_span["name"] == "invoke_agent test_agent" - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_agent_span["data"] - assert "gen_ai.request.messages" not in invoke_agent_span["data"] - assert "gen_ai.response.text" not in invoke_agent_span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_agent_span["attributes"] + assert "gen_ai.request.messages" not in invoke_agent_span["attributes"] + assert "gen_ai.response.text" not in invoke_agent_span["attributes"] - assert invoke_agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" - assert invoke_agent_span["data"]["gen_ai.system"] == "openai" - assert invoke_agent_span["data"]["gen_ai.agent.name"] == "test_agent" - assert invoke_agent_span["data"]["gen_ai.request.max_tokens"] == 100 - assert invoke_agent_span["data"]["gen_ai.request.model"] == "gpt-4" - assert invoke_agent_span["data"]["gen_ai.request.temperature"] == 0.7 - assert invoke_agent_span["data"]["gen_ai.request.top_p"] == 1.0 + assert invoke_agent_span["attributes"]["gen_ai.operation.name"] == "invoke_agent" + assert invoke_agent_span["attributes"]["gen_ai.system"] == "openai" + 
assert invoke_agent_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert invoke_agent_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert invoke_agent_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert invoke_agent_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert invoke_agent_span["attributes"]["gen_ai.request.top_p"] == 1.0 - assert ai_client_span["description"] == "chat gpt-4" - assert ai_client_span["data"]["gen_ai.operation.name"] == "chat" - assert ai_client_span["data"]["gen_ai.system"] == "openai" - assert ai_client_span["data"]["gen_ai.agent.name"] == "test_agent" - assert ai_client_span["data"]["gen_ai.request.max_tokens"] == 100 - assert ai_client_span["data"]["gen_ai.request.model"] == "gpt-4" - assert ai_client_span["data"]["gen_ai.request.temperature"] == 0.7 - assert ai_client_span["data"]["gen_ai.request.top_p"] == 1.0 + assert ai_client_span["name"] == "chat gpt-4" + assert ai_client_span["attributes"]["gen_ai.operation.name"] == "chat" + assert ai_client_span["attributes"]["gen_ai.system"] == "openai" + assert ai_client_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span["attributes"]["gen_ai.request.top_p"] == 1.0 @pytest.mark.asyncio @@ -305,7 +312,7 @@ async def test_agent_invocation_span_no_pii( ) async def test_agent_invocation_span( sentry_init, - capture_events, + capture_items, test_agent_with_instructions, nonstreaming_responses_model_response, instructions, @@ -333,9 +340,10 @@ async def test_agent_invocation_span( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") result = await 
agents.Runner.run( agent, @@ -346,28 +354,34 @@ async def test_agent_invocation_span( assert result is not None assert result.final_output == "Hello, how can I help you?" - (transaction,) = events - spans = transaction["spans"] - invoke_agent_span, ai_client_span = spans - + transactions = [item.payload for item in items if item.type == "transaction"] + assert len(transactions) == 1 + transaction = transactions[0] assert transaction["transaction"] == "test_agent workflow" assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - assert invoke_agent_span["description"] == "invoke_agent test_agent" + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span, ai_client_span = spans + + assert invoke_agent_span["name"] == "invoke_agent test_agent" # Only first case checks "gen_ai.request.messages" until further input handling work. param_id = request.node.callspec.id if "string" in param_id and instructions is None: # type: ignore - assert "gen_ai.system_instructions" not in ai_client_span["data"] + assert "gen_ai.system_instructions" not in ai_client_span["attributes"] - assert invoke_agent_span["data"]["gen_ai.request.messages"] == safe_serialize( + assert invoke_agent_span["attributes"][ + "gen_ai.request.messages" + ] == safe_serialize( [ {"content": [{"text": "Test input", "type": "text"}], "role": "user"}, ] ) elif "string" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ { "type": "text", @@ -376,13 +390,17 @@ async def test_agent_invocation_span( ] ) elif "blocks_no_type" in param_id and instructions is None: # type: ignore - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ {"type": "text", "content": "You are a helpful assistant."}, ] ) elif 
"blocks_no_type" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ { "type": "text", @@ -392,13 +410,17 @@ async def test_agent_invocation_span( ] ) elif "blocks" in param_id and instructions is None: # type: ignore - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ {"type": "text", "content": "You are a helpful assistant."}, ] ) elif "blocks" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ { "type": "text", @@ -408,14 +430,18 @@ async def test_agent_invocation_span( ] ) elif "parts_no_type" in param_id and instructions is None: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ {"type": "text", "content": "You are a helpful assistant."}, {"type": "text", "content": "Be concise and clear."}, ] ) elif "parts_no_type" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ { "type": "text", @@ -426,14 +452,18 @@ async def test_agent_invocation_span( ] ) elif instructions is None: # type: ignore - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ {"type": "text", "content": "You are a helpful assistant."}, {"type": "text", "content": "Be concise and clear."}, ] ) else: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + 
"gen_ai.system_instructions" + ] == safe_serialize( [ { "type": "text", @@ -445,32 +475,32 @@ async def test_agent_invocation_span( ) assert ( - invoke_agent_span["data"]["gen_ai.response.text"] + invoke_agent_span["attributes"]["gen_ai.response.text"] == "Hello, how can I help you?" ) - assert invoke_agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" - assert invoke_agent_span["data"]["gen_ai.system"] == "openai" - assert invoke_agent_span["data"]["gen_ai.agent.name"] == "test_agent" - assert invoke_agent_span["data"]["gen_ai.request.max_tokens"] == 100 - assert invoke_agent_span["data"]["gen_ai.request.model"] == "gpt-4" - assert invoke_agent_span["data"]["gen_ai.request.temperature"] == 0.7 - assert invoke_agent_span["data"]["gen_ai.request.top_p"] == 1.0 + assert invoke_agent_span["attributes"]["gen_ai.operation.name"] == "invoke_agent" + assert invoke_agent_span["attributes"]["gen_ai.system"] == "openai" + assert invoke_agent_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert invoke_agent_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert invoke_agent_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert invoke_agent_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert invoke_agent_span["attributes"]["gen_ai.request.top_p"] == 1.0 - assert ai_client_span["description"] == "chat gpt-4" - assert ai_client_span["data"]["gen_ai.operation.name"] == "chat" - assert ai_client_span["data"]["gen_ai.system"] == "openai" - assert ai_client_span["data"]["gen_ai.agent.name"] == "test_agent" - assert ai_client_span["data"]["gen_ai.request.max_tokens"] == 100 - assert ai_client_span["data"]["gen_ai.request.model"] == "gpt-4" - assert ai_client_span["data"]["gen_ai.request.temperature"] == 0.7 - assert ai_client_span["data"]["gen_ai.request.top_p"] == 1.0 + assert ai_client_span["name"] == "chat gpt-4" + assert ai_client_span["attributes"]["gen_ai.operation.name"] == "chat" + assert 
ai_client_span["attributes"]["gen_ai.system"] == "openai" + assert ai_client_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span["attributes"]["gen_ai.request.top_p"] == 1.0 @pytest.mark.asyncio async def test_client_span_custom_model( sentry_init, - capture_events, + capture_items, test_agent_custom_model, nonstreaming_responses_model_response, get_model_response, @@ -495,9 +525,10 @@ async def test_client_span_custom_model( sentry_init( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span") result = await agents.Runner.run( agent, "Test input", run_config=test_run_config @@ -506,17 +537,18 @@ async def test_client_span_custom_model( assert result is not None assert result.final_output == "Hello, how can I help you?" 
- (transaction,) = events - spans = transaction["spans"] - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if item.type == "span"] + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert ai_client_span["description"] == "chat my-custom-model" - assert ai_client_span["data"]["gen_ai.request.model"] == "my-custom-model" + assert ai_client_span["name"] == "chat my-custom-model" + assert ai_client_span["attributes"]["gen_ai.request.model"] == "my-custom-model" def test_agent_invocation_span_sync_no_pii( sentry_init, - capture_events, + capture_items, test_agent, nonstreaming_responses_model_response, get_model_response, @@ -541,44 +573,51 @@ def test_agent_invocation_span_sync_no_pii( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=False, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") result = agents.Runner.run_sync(agent, "Test input", run_config=test_run_config) assert result is not None assert result.final_output == "Hello, how can I help you?" 
- (transaction,) = events - spans = transaction["spans"] - invoke_agent_span = next( - span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT - ) - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) - + transactions = [item.payload for item in items if item.type == "transaction"] + assert len(transactions) == 1 + transaction = transactions[0] assert transaction["transaction"] == "test_agent workflow" assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - assert invoke_agent_span["description"] == "invoke_agent test_agent" - assert invoke_agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" - assert invoke_agent_span["data"]["gen_ai.system"] == "openai" - assert invoke_agent_span["data"]["gen_ai.agent.name"] == "test_agent" - assert invoke_agent_span["data"]["gen_ai.request.max_tokens"] == 100 - assert invoke_agent_span["data"]["gen_ai.request.model"] == "gpt-4" - assert invoke_agent_span["data"]["gen_ai.request.temperature"] == 0.7 - assert invoke_agent_span["data"]["gen_ai.request.top_p"] == 1.0 + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) + + assert invoke_agent_span["name"] == "invoke_agent test_agent" + assert invoke_agent_span["attributes"]["gen_ai.operation.name"] == "invoke_agent" + assert invoke_agent_span["attributes"]["gen_ai.system"] == "openai" + assert invoke_agent_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert invoke_agent_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert invoke_agent_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert invoke_agent_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert invoke_agent_span["attributes"]["gen_ai.request.top_p"] == 1.0 - assert 
ai_client_span["description"] == "chat gpt-4" - assert ai_client_span["data"]["gen_ai.operation.name"] == "chat" - assert ai_client_span["data"]["gen_ai.system"] == "openai" - assert ai_client_span["data"]["gen_ai.agent.name"] == "test_agent" - assert ai_client_span["data"]["gen_ai.request.max_tokens"] == 100 - assert ai_client_span["data"]["gen_ai.request.model"] == "gpt-4" - assert ai_client_span["data"]["gen_ai.request.temperature"] == 0.7 - assert ai_client_span["data"]["gen_ai.request.top_p"] == 1.0 + assert ai_client_span["name"] == "chat gpt-4" + assert ai_client_span["attributes"]["gen_ai.operation.name"] == "chat" + assert ai_client_span["attributes"]["gen_ai.system"] == "openai" + assert ai_client_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span["attributes"]["gen_ai.request.top_p"] == 1.0 - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_agent_span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_agent_span["attributes"] @pytest.mark.parametrize( @@ -658,7 +697,7 @@ def test_agent_invocation_span_sync_no_pii( ) def test_agent_invocation_span_sync( sentry_init, - capture_events, + capture_items, test_agent_with_instructions, nonstreaming_responses_model_response, instructions, @@ -686,9 +725,10 @@ def test_agent_invocation_span_sync( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") result = agents.Runner.run_sync( agent, @@ -699,36 +739,40 @@ def test_agent_invocation_span_sync( assert result is not None assert result.final_output == "Hello, how can I help you?" 
- (transaction,) = events - spans = transaction["spans"] - invoke_agent_span, ai_client_span = spans - + transactions = [item.payload for item in items if item.type == "transaction"] + assert len(transactions) == 1 + transaction = transactions[0] assert transaction["transaction"] == "test_agent workflow" assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - assert invoke_agent_span["description"] == "invoke_agent test_agent" - assert invoke_agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" - assert invoke_agent_span["data"]["gen_ai.system"] == "openai" - assert invoke_agent_span["data"]["gen_ai.agent.name"] == "test_agent" - assert invoke_agent_span["data"]["gen_ai.request.max_tokens"] == 100 - assert invoke_agent_span["data"]["gen_ai.request.model"] == "gpt-4" - assert invoke_agent_span["data"]["gen_ai.request.temperature"] == 0.7 - assert invoke_agent_span["data"]["gen_ai.request.top_p"] == 1.0 - - assert ai_client_span["description"] == "chat gpt-4" - assert ai_client_span["data"]["gen_ai.operation.name"] == "chat" - assert ai_client_span["data"]["gen_ai.system"] == "openai" - assert ai_client_span["data"]["gen_ai.agent.name"] == "test_agent" - assert ai_client_span["data"]["gen_ai.request.max_tokens"] == 100 - assert ai_client_span["data"]["gen_ai.request.model"] == "gpt-4" - assert ai_client_span["data"]["gen_ai.request.temperature"] == 0.7 - assert ai_client_span["data"]["gen_ai.request.top_p"] == 1.0 + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span, ai_client_span = spans + + assert invoke_agent_span["name"] == "invoke_agent test_agent" + assert invoke_agent_span["attributes"]["gen_ai.operation.name"] == "invoke_agent" + assert invoke_agent_span["attributes"]["gen_ai.system"] == "openai" + assert invoke_agent_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert invoke_agent_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert 
invoke_agent_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert invoke_agent_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert invoke_agent_span["attributes"]["gen_ai.request.top_p"] == 1.0 + + assert ai_client_span["name"] == "chat gpt-4" + assert ai_client_span["attributes"]["gen_ai.operation.name"] == "chat" + assert ai_client_span["attributes"]["gen_ai.system"] == "openai" + assert ai_client_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span["attributes"]["gen_ai.request.top_p"] == 1.0 param_id = request.node.callspec.id if "string" in param_id and instructions is None: # type: ignore - assert "gen_ai.system_instructions" not in ai_client_span["data"] + assert "gen_ai.system_instructions" not in ai_client_span["attributes"] elif "string" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ { "type": "text", @@ -737,13 +781,17 @@ def test_agent_invocation_span_sync( ] ) elif "blocks_no_type" in param_id and instructions is None: # type: ignore - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ {"type": "text", "content": "You are a helpful assistant."}, ] ) elif "blocks_no_type" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ { "type": "text", @@ -753,13 +801,17 @@ def test_agent_invocation_span_sync( ] ) elif "blocks" in param_id and instructions is None: # type: ignore - assert 
ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ {"type": "text", "content": "You are a helpful assistant."}, ] ) elif "blocks" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ { "type": "text", @@ -769,14 +821,18 @@ def test_agent_invocation_span_sync( ] ) elif "parts_no_type" in param_id and instructions is None: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ {"type": "text", "content": "You are a helpful assistant."}, {"type": "text", "content": "Be concise and clear."}, ] ) elif "parts_no_type" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ { "type": "text", @@ -787,14 +843,18 @@ def test_agent_invocation_span_sync( ] ) elif instructions is None: # type: ignore - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ {"type": "text", "content": "You are a helpful assistant."}, {"type": "text", "content": "Be concise and clear."}, ] ) else: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( [ { "type": "text", @@ -807,7 +867,7 @@ def test_agent_invocation_span_sync( @pytest.mark.asyncio -async def test_handoff_span(sentry_init, capture_events, get_model_response): +async def test_handoff_span(sentry_init, capture_items, get_model_response): """ Test that handoff spans are created when agents hand off to 
other agents. """ @@ -908,9 +968,10 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response): sentry_init( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") result = await agents.Runner.run( primary_agent, @@ -920,21 +981,22 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response): assert result is not None - (transaction,) = events - spans = transaction["spans"] - handoff_span = next(span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF) + spans = [item.payload for item in items if item.type == "span"] + handoff_span = next( + span + for span in spans + if span["attributes"].get("sentry.op") == OP.GEN_AI_HANDOFF + ) # Verify handoff span was created assert handoff_span is not None - assert ( - handoff_span["description"] == "handoff from primary_agent to secondary_agent" - ) - assert handoff_span["data"]["gen_ai.operation.name"] == "handoff" + assert handoff_span["name"] == "handoff from primary_agent to secondary_agent" + assert handoff_span["attributes"]["gen_ai.operation.name"] == "handoff" @pytest.mark.asyncio async def test_max_turns_before_handoff_span( - sentry_init, capture_events, get_model_response + sentry_init, capture_items, get_model_response ): """ Example raising agents.exceptions.AgentsException after the agent invocation span is complete. 
@@ -1036,9 +1098,10 @@ async def test_max_turns_before_handoff_span( sentry_init( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") with pytest.raises(MaxTurnsExceeded): await agents.Runner.run( @@ -1048,22 +1111,23 @@ async def test_max_turns_before_handoff_span( max_turns=1, ) - (error, transaction) = events - spans = transaction["spans"] - handoff_span = next(span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF) + spans = [item.payload for item in items if item.type == "span"] + handoff_span = next( + span + for span in spans + if span["attributes"].get("sentry.op") == OP.GEN_AI_HANDOFF + ) # Verify handoff span was created assert handoff_span is not None - assert ( - handoff_span["description"] == "handoff from primary_agent to secondary_agent" - ) - assert handoff_span["data"]["gen_ai.operation.name"] == "handoff" + assert handoff_span["name"] == "handoff from primary_agent to secondary_agent" + assert handoff_span["attributes"]["gen_ai.operation.name"] == "handoff" @pytest.mark.asyncio async def test_tool_execution_span( sentry_init, - capture_events, + capture_items, test_agent, get_model_response, responses_tool_call_model_responses, @@ -1133,9 +1197,10 @@ def simple_test_tool(message: str) -> str: integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") await agents.Runner.run( agent_with_tool, @@ -1143,13 +1208,26 @@ def simple_test_tool(message: str) -> str: run_config=test_run_config, ) - (transaction,) = events - spans = transaction["spans"] - agent_span = next(span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT) + transactions = [item.payload for item in items if item.type == "transaction"] + assert len(transactions) == 1 + transaction = 
transactions[0] + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + + spans = [item.payload for item in items if item.type == "span"] + agent_span = next( + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ) ai_client_span1, ai_client_span2 = ( - span for span in spans if span["op"] == OP.GEN_AI_CHAT + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) + tool_span = next( + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_EXECUTE_TOOL ) - tool_span = next(span for span in spans if span["op"] == OP.GEN_AI_EXECUTE_TOOL) available_tool = { "name": "simple_test_tool", @@ -1189,39 +1267,36 @@ def simple_test_tool(message: str) -> str: } ) - assert transaction["transaction"] == "test_agent workflow" - assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - - assert agent_span["description"] == "invoke_agent test_agent" - assert agent_span["origin"] == "auto.ai.openai_agents" - assert agent_span["data"]["gen_ai.agent.name"] == "test_agent" - assert agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" + assert agent_span["name"] == "invoke_agent test_agent" + assert agent_span["attributes"]["sentry.origin"] == "auto.ai.openai_agents" + assert agent_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert agent_span["attributes"]["gen_ai.operation.name"] == "invoke_agent" agent_span_available_tool = json.loads( - agent_span["data"]["gen_ai.request.available_tools"] + agent_span["attributes"]["gen_ai.request.available_tools"] )[0] assert all(agent_span_available_tool[k] == v for k, v in available_tool.items()) - assert agent_span["data"]["gen_ai.request.max_tokens"] == 100 - assert agent_span["data"]["gen_ai.request.model"] == "gpt-4" - assert agent_span["data"]["gen_ai.request.temperature"] == 0.7 - assert agent_span["data"]["gen_ai.request.top_p"] == 
1.0 - assert agent_span["data"]["gen_ai.system"] == "openai" + assert agent_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert agent_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert agent_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert agent_span["attributes"]["gen_ai.request.top_p"] == 1.0 + assert agent_span["attributes"]["gen_ai.system"] == "openai" - assert ai_client_span1["description"] == "chat gpt-4" - assert ai_client_span1["data"]["gen_ai.operation.name"] == "chat" - assert ai_client_span1["data"]["gen_ai.system"] == "openai" - assert ai_client_span1["data"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span1["name"] == "chat gpt-4" + assert ai_client_span1["attributes"]["gen_ai.operation.name"] == "chat" + assert ai_client_span1["attributes"]["gen_ai.system"] == "openai" + assert ai_client_span1["attributes"]["gen_ai.agent.name"] == "test_agent" ai_client_span1_available_tool = json.loads( - ai_client_span1["data"]["gen_ai.request.available_tools"] + ai_client_span1["attributes"]["gen_ai.request.available_tools"] )[0] assert all( ai_client_span1_available_tool[k] == v for k, v in available_tool.items() ) - assert ai_client_span1["data"]["gen_ai.request.max_tokens"] == 100 - assert ai_client_span1["data"]["gen_ai.request.messages"] == safe_serialize( + assert ai_client_span1["attributes"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span1["attributes"]["gen_ai.request.messages"] == safe_serialize( [ { "role": "user", @@ -1231,14 +1306,14 @@ def simple_test_tool(message: str) -> str: }, ] ) - assert ai_client_span1["data"]["gen_ai.request.model"] == "gpt-4" - assert ai_client_span1["data"]["gen_ai.request.temperature"] == 0.7 - assert ai_client_span1["data"]["gen_ai.request.top_p"] == 1.0 - assert ai_client_span1["data"]["gen_ai.usage.input_tokens"] == 10 - assert ai_client_span1["data"]["gen_ai.usage.input_tokens.cached"] == 0 - assert ai_client_span1["data"]["gen_ai.usage.output_tokens"] == 5 
- assert ai_client_span1["data"]["gen_ai.usage.output_tokens.reasoning"] == 0 - assert ai_client_span1["data"]["gen_ai.usage.total_tokens"] == 15 + assert ai_client_span1["attributes"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span1["attributes"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span1["attributes"]["gen_ai.request.top_p"] == 1.0 + assert ai_client_span1["attributes"]["gen_ai.usage.input_tokens"] == 10 + assert ai_client_span1["attributes"]["gen_ai.usage.input_tokens.cached"] == 0 + assert ai_client_span1["attributes"]["gen_ai.usage.output_tokens"] == 5 + assert ai_client_span1["attributes"]["gen_ai.usage.output_tokens.reasoning"] == 0 + assert ai_client_span1["attributes"]["gen_ai.usage.total_tokens"] == 15 tool_call = { "arguments": '{"message": "hello"}', @@ -1252,41 +1327,41 @@ def simple_test_tool(message: str) -> str: if OPENAI_VERSION >= (2, 25, 0): tool_call["namespace"] = None - assert json.loads(ai_client_span1["data"]["gen_ai.response.tool_calls"]) == [ + assert json.loads(ai_client_span1["attributes"]["gen_ai.response.tool_calls"]) == [ tool_call ] - assert tool_span["description"] == "execute_tool simple_test_tool" - assert tool_span["data"]["gen_ai.agent.name"] == "test_agent" - assert tool_span["data"]["gen_ai.operation.name"] == "execute_tool" + assert tool_span["name"] == "execute_tool simple_test_tool" + assert tool_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert tool_span["attributes"]["gen_ai.operation.name"] == "execute_tool" tool_span_available_tool = json.loads( - tool_span["data"]["gen_ai.request.available_tools"] + tool_span["attributes"]["gen_ai.request.available_tools"] )[0] assert all(tool_span_available_tool[k] == v for k, v in available_tool.items()) - assert tool_span["data"]["gen_ai.request.max_tokens"] == 100 - assert tool_span["data"]["gen_ai.request.model"] == "gpt-4" - assert tool_span["data"]["gen_ai.request.temperature"] == 0.7 - assert tool_span["data"]["gen_ai.request.top_p"] 
== 1.0 - assert tool_span["data"]["gen_ai.system"] == "openai" - assert tool_span["data"]["gen_ai.tool.description"] == "A simple tool" - assert tool_span["data"]["gen_ai.tool.input"] == '{"message": "hello"}' - assert tool_span["data"]["gen_ai.tool.name"] == "simple_test_tool" - assert tool_span["data"]["gen_ai.tool.output"] == "Tool executed with: hello" - assert ai_client_span2["description"] == "chat gpt-4" - assert ai_client_span2["data"]["gen_ai.agent.name"] == "test_agent" - assert ai_client_span2["data"]["gen_ai.operation.name"] == "chat" + assert tool_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert tool_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert tool_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert tool_span["attributes"]["gen_ai.request.top_p"] == 1.0 + assert tool_span["attributes"]["gen_ai.system"] == "openai" + assert tool_span["attributes"]["gen_ai.tool.description"] == "A simple tool" + assert tool_span["attributes"]["gen_ai.tool.input"] == '{"message": "hello"}' + assert tool_span["attributes"]["gen_ai.tool.name"] == "simple_test_tool" + assert tool_span["attributes"]["gen_ai.tool.output"] == "Tool executed with: hello" + assert ai_client_span2["name"] == "chat gpt-4" + assert ai_client_span2["attributes"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span2["attributes"]["gen_ai.operation.name"] == "chat" ai_client_span2_available_tool = json.loads( - ai_client_span2["data"]["gen_ai.request.available_tools"] + ai_client_span2["attributes"]["gen_ai.request.available_tools"] )[0] assert all( ai_client_span2_available_tool[k] == v for k, v in available_tool.items() ) - assert ai_client_span2["data"]["gen_ai.request.max_tokens"] == 100 - assert ai_client_span2["data"]["gen_ai.request.messages"] == safe_serialize( + assert ai_client_span2["attributes"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span2["attributes"]["gen_ai.request.messages"] == safe_serialize( [ { "role": "tool", @@ 
-1300,19 +1375,19 @@ def simple_test_tool(message: str) -> str: }, ] ) - assert ai_client_span2["data"]["gen_ai.request.model"] == "gpt-4" - assert ai_client_span2["data"]["gen_ai.request.temperature"] == 0.7 - assert ai_client_span2["data"]["gen_ai.request.top_p"] == 1.0 + assert ai_client_span2["attributes"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span2["attributes"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span2["attributes"]["gen_ai.request.top_p"] == 1.0 assert ( - ai_client_span2["data"]["gen_ai.response.text"] + ai_client_span2["attributes"]["gen_ai.response.text"] == "Task completed using the tool" ) - assert ai_client_span2["data"]["gen_ai.system"] == "openai" - assert ai_client_span2["data"]["gen_ai.usage.input_tokens.cached"] == 0 - assert ai_client_span2["data"]["gen_ai.usage.input_tokens"] == 15 - assert ai_client_span2["data"]["gen_ai.usage.output_tokens.reasoning"] == 0 - assert ai_client_span2["data"]["gen_ai.usage.output_tokens"] == 10 - assert ai_client_span2["data"]["gen_ai.usage.total_tokens"] == 25 + assert ai_client_span2["attributes"]["gen_ai.system"] == "openai" + assert ai_client_span2["attributes"]["gen_ai.usage.input_tokens.cached"] == 0 + assert ai_client_span2["attributes"]["gen_ai.usage.input_tokens"] == 15 + assert ai_client_span2["attributes"]["gen_ai.usage.output_tokens.reasoning"] == 0 + assert ai_client_span2["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert ai_client_span2["attributes"]["gen_ai.usage.total_tokens"] == 25 @pytest.mark.asyncio @@ -1351,6 +1426,7 @@ async def test_hosted_mcp_tool_propagation_header_streamed( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, release="d08ebdb9309e1b004c6f52202de58a09c2268e42", + _experiments={"gen_ai_as_v2_spans": True}, ) request_headers = {} @@ -1513,6 +1589,7 @@ async def test_hosted_mcp_tool_propagation_headers( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, release="d08ebdb9309e1b004c6f52202de58a09c2268e42", 
+ _experiments={"gen_ai_as_v2_spans": True}, ) response = get_model_response(EXAMPLE_RESPONSE, serialize_pydantic=True) @@ -1570,7 +1647,7 @@ async def test_hosted_mcp_tool_propagation_headers( @pytest.mark.asyncio -async def test_model_behavior_error(sentry_init, capture_events, test_agent): +async def test_model_behavior_error(sentry_init, capture_items, test_agent): """ Example raising agents.exceptions.AgentsException before the agent invocation span is complete. The mocked API response indicates that "wrong_tool" was called. @@ -1611,9 +1688,10 @@ def simple_test_tool(message: str) -> str: integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") with pytest.raises(ModelBehaviorError): await agents.Runner.run( @@ -1622,26 +1700,27 @@ def simple_test_tool(message: str) -> str: run_config=test_run_config, ) - (error, transaction) = events - spans = transaction["spans"] + transactions = [item.payload for item in items if item.type == "transaction"] + assert len(transactions) == 1 + transaction = transactions[0] + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + + spans = [item.payload for item in items if item.type == "span"] ( agent_span, ai_client_span1, ) = spans - assert transaction["transaction"] == "test_agent workflow" - assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - - assert agent_span["description"] == "invoke_agent test_agent" - assert agent_span["origin"] == "auto.ai.openai_agents" + assert agent_span["name"] == "invoke_agent test_agent" + assert agent_span["attributes"]["sentry.origin"] == "auto.ai.openai_agents" # Error due to unrecognized tool in model response. 
- assert agent_span["status"] == "internal_error" - assert agent_span["tags"]["status"] == "internal_error" + assert agent_span["status"] == "error" @pytest.mark.asyncio -async def test_error_handling(sentry_init, capture_events, test_agent): +async def test_error_handling(sentry_init, capture_items, test_agent): """ Test error handling in agent execution. """ @@ -1658,41 +1737,42 @@ async def test_error_handling(sentry_init, capture_events, test_agent): LoggingIntegration(event_level=logging.CRITICAL), ], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("event", "span", "transaction") with pytest.raises(Exception, match="Model Error"): await agents.Runner.run( test_agent, "Test input", run_config=test_run_config ) - ( - error_event, - transaction, - ) = events - + error_events = [item.payload for item in items if item.type == "event"] + assert len(error_events) == 1 + error_event = error_events[0] assert error_event["exception"]["values"][0]["type"] == "Exception" assert error_event["exception"]["values"][0]["value"] == "Model Error" assert error_event["exception"]["values"][0]["mechanism"]["type"] == "openai_agents" - spans = transaction["spans"] - (invoke_agent_span, ai_client_span) = spans - + transactions = [item.payload for item in items if item.type == "transaction"] + assert len(transactions) == 1 + transaction = transactions[0] assert transaction["transaction"] == "test_agent workflow" assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - assert invoke_agent_span["description"] == "invoke_agent test_agent" - assert invoke_agent_span["origin"] == "auto.ai.openai_agents" + spans = [item.payload for item in items if item.type == "span"] + (invoke_agent_span, ai_client_span) = spans + + assert invoke_agent_span["name"] == "invoke_agent test_agent" + assert invoke_agent_span["attributes"]["sentry.origin"] == "auto.ai.openai_agents" - assert 
ai_client_span["description"] == "chat gpt-4" - assert ai_client_span["origin"] == "auto.ai.openai_agents" - assert ai_client_span["status"] == "internal_error" - assert ai_client_span["tags"]["status"] == "internal_error" + assert ai_client_span["name"] == "chat gpt-4" + assert ai_client_span["attributes"]["sentry.origin"] == "auto.ai.openai_agents" + assert ai_client_span["status"] == "error" @pytest.mark.asyncio -async def test_error_captures_input_data(sentry_init, capture_events, test_agent): +async def test_error_captures_input_data(sentry_init, capture_items, test_agent): """ Test that input data is captured even when the API call raises an exception. This verifies that _set_input_data is called before the API call. @@ -1723,39 +1803,39 @@ async def test_error_captures_input_data(sentry_init, capture_events, test_agent ], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("event", "span") with pytest.raises(InternalServerError, match="Error code: 500"): await agents.Runner.run(agent, "Test input", run_config=test_run_config) - ( - error_event, - transaction, - ) = events - + error_events = [item.payload for item in items if item.type == "event"] + assert len(error_events) == 1 + error_event = error_events[0] assert error_event["exception"]["values"][0]["type"] == "InternalServerError" assert error_event["exception"]["values"][0]["value"] == "Error code: 500" - spans = transaction["spans"] - ai_client_span = [s for s in spans if s["op"] == "gen_ai.chat"][0] + spans = [item.payload for item in items if item.type == "span"] + ai_client_span = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ][0] - assert ai_client_span["description"] == "chat gpt-4" - assert ai_client_span["status"] == "internal_error" - assert ai_client_span["tags"]["status"] == "internal_error" + assert ai_client_span["name"] == "chat gpt-4" + assert 
ai_client_span["status"] == "error" - assert "gen_ai.request.messages" in ai_client_span["data"] + assert "gen_ai.request.messages" in ai_client_span["attributes"] request_messages = safe_serialize( [ {"role": "user", "content": [{"type": "text", "text": "Test input"}]}, ] ) - assert ai_client_span["data"]["gen_ai.request.messages"] == request_messages + assert ai_client_span["attributes"]["gen_ai.request.messages"] == request_messages @pytest.mark.asyncio -async def test_span_status_error(sentry_init, capture_events, test_agent): +async def test_span_status_error(sentry_init, capture_items, test_agent): with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): with patch( "agents.models.openai_responses.OpenAIResponsesModel.get_response" @@ -1768,25 +1848,29 @@ async def test_span_status_error(sentry_init, capture_events, test_agent): LoggingIntegration(event_level=logging.CRITICAL), ], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("event", "transaction", "span") with pytest.raises(ValueError, match="Model Error"): await agents.Runner.run( test_agent, "Test input", run_config=test_run_config ) - (error, transaction) = events + (error,) = (item.payload for item in items if item.type == "event") assert error["level"] == "error" - assert transaction["spans"][0]["status"] == "internal_error" - assert transaction["spans"][0]["tags"]["status"] == "internal_error" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["status"] == "error" + + (transaction,) = (item.payload for item in items if item.type == "transaction") assert transaction["contexts"]["trace"]["status"] == "internal_error" @pytest.mark.asyncio async def test_mcp_tool_execution_spans( - sentry_init, capture_events, test_agent, get_model_response + sentry_init, capture_items, test_agent, get_model_response ): """ Test that MCP (Model Context Protocol) tool calls create execute_tool spans. 
@@ -1878,9 +1962,10 @@ async def test_mcp_tool_execution_spans( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") await agents.Runner.run( agent, @@ -1888,33 +1973,35 @@ async def test_mcp_tool_execution_spans( run_config=test_run_config, ) - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find the MCP execute_tool span mcp_tool_span = None for span in spans: - if span.get("description") == "execute_tool test_mcp_tool": + if span.get("name") == "execute_tool test_mcp_tool": mcp_tool_span = span break # Verify the MCP tool span was created assert mcp_tool_span is not None, "MCP execute_tool span was not created" - assert mcp_tool_span["description"] == "execute_tool test_mcp_tool" - assert mcp_tool_span["data"]["gen_ai.tool.name"] == "test_mcp_tool" - assert mcp_tool_span["data"]["gen_ai.tool.input"] == '{"query": "search term"}' + assert mcp_tool_span["name"] == "execute_tool test_mcp_tool" + assert mcp_tool_span["attributes"]["gen_ai.tool.name"] == "test_mcp_tool" assert ( - mcp_tool_span["data"]["gen_ai.tool.output"] == "MCP tool executed successfully" + mcp_tool_span["attributes"]["gen_ai.tool.input"] == '{"query": "search term"}' + ) + assert ( + mcp_tool_span["attributes"]["gen_ai.tool.output"] + == "MCP tool executed successfully" ) # Verify no error status since error was None - assert mcp_tool_span.get("status") != "internal_error" - assert mcp_tool_span.get("tags", {}).get("status") != "internal_error" + assert mcp_tool_span.get("status") != "error" + assert mcp_tool_span.get("tags", {}).get("status") != "error" @pytest.mark.asyncio async def test_mcp_tool_execution_with_error( - sentry_init, capture_events, test_agent, get_model_response + sentry_init, capture_items, test_agent, get_model_response ): """ Test that MCP tool 
calls with errors are tracked with error status. @@ -2006,9 +2093,10 @@ async def test_mcp_tool_execution_with_error( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") await agents.Runner.run( agent, @@ -2016,31 +2104,29 @@ async def test_mcp_tool_execution_with_error( run_config=test_run_config, ) - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find the MCP execute_tool span with error mcp_tool_span = None for span in spans: - if span.get("description") == "execute_tool failing_mcp_tool": + if span.get("name") == "execute_tool failing_mcp_tool": mcp_tool_span = span break # Verify the MCP tool span was created with error status assert mcp_tool_span is not None, "MCP execute_tool span was not created" - assert mcp_tool_span["description"] == "execute_tool failing_mcp_tool" - assert mcp_tool_span["data"]["gen_ai.tool.name"] == "failing_mcp_tool" - assert mcp_tool_span["data"]["gen_ai.tool.input"] == '{"query": "test"}' - assert mcp_tool_span["data"]["gen_ai.tool.output"] is None + assert mcp_tool_span["name"] == "execute_tool failing_mcp_tool" + assert mcp_tool_span["attributes"]["gen_ai.tool.name"] == "failing_mcp_tool" + assert mcp_tool_span["attributes"]["gen_ai.tool.input"] == '{"query": "test"}' + assert mcp_tool_span["attributes"]["gen_ai.tool.output"] == "None" # Verify error status was set - assert mcp_tool_span["status"] == "internal_error" - assert mcp_tool_span["tags"]["status"] == "internal_error" + assert mcp_tool_span["status"] == "error" @pytest.mark.asyncio async def test_mcp_tool_execution_without_pii( - sentry_init, capture_events, test_agent, get_model_response + sentry_init, capture_items, test_agent, get_model_response ): """ Test that MCP tool input/output are not included when send_default_pii is False. 
@@ -2132,9 +2218,10 @@ async def test_mcp_tool_execution_without_pii( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=False, # PII disabled + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") await agents.Runner.run( agent, @@ -2142,30 +2229,29 @@ async def test_mcp_tool_execution_without_pii( run_config=test_run_config, ) - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find the MCP execute_tool span mcp_tool_span = None for span in spans: - if span.get("description") == "execute_tool test_mcp_tool": + if span.get("name") == "execute_tool test_mcp_tool": mcp_tool_span = span break # Verify the MCP tool span was created but without input/output assert mcp_tool_span is not None, "MCP execute_tool span was not created" - assert mcp_tool_span["description"] == "execute_tool test_mcp_tool" - assert mcp_tool_span["data"]["gen_ai.tool.name"] == "test_mcp_tool" + assert mcp_tool_span["name"] == "execute_tool test_mcp_tool" + assert mcp_tool_span["attributes"]["gen_ai.tool.name"] == "test_mcp_tool" # Verify input and output are not included when send_default_pii is False - assert "gen_ai.tool.input" not in mcp_tool_span["data"] - assert "gen_ai.tool.output" not in mcp_tool_span["data"] + assert "gen_ai.tool.input" not in mcp_tool_span["attributes"] + assert "gen_ai.tool.output" not in mcp_tool_span["attributes"] @pytest.mark.asyncio async def test_multiple_agents_asyncio( sentry_init, - capture_events, + capture_items, test_agent, nonstreaming_responses_model_response, get_model_response, @@ -2190,9 +2276,10 @@ async def test_multiple_agents_asyncio( sentry_init( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") async def run(): await agents.Runner.run( @@ 
-2203,14 +2290,10 @@ async def run(): await asyncio.gather(*[run() for _ in range(3)]) - assert len(events) == 3 - txn1, txn2, txn3 = events + txn1, txn2, txn3 = (item.payload for item in items if item.type == "transaction") - assert txn1["type"] == "transaction" assert txn1["transaction"] == "test_agent workflow" - assert txn2["type"] == "transaction" assert txn2["transaction"] == "test_agent workflow" - assert txn3["type"] == "transaction" assert txn3["transaction"] == "test_agent workflow" @@ -2230,13 +2313,14 @@ async def run(): ], ) def test_openai_agents_message_role_mapping( - sentry_init, capture_events, test_message, expected_role + sentry_init, capture_items, test_message, expected_role ): """Test that OpenAI Agents integration properly maps message roles like 'ai' to 'assistant'""" sentry_init( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) get_response_kwargs = {"input": [test_message]} @@ -2259,7 +2343,7 @@ def test_openai_agents_message_role_mapping( @pytest.mark.asyncio async def test_tool_execution_error_tracing( sentry_init, - capture_events, + capture_items, test_agent, get_model_response, responses_tool_call_model_responses, @@ -2336,9 +2420,10 @@ def failing_tool(message: str) -> str: integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") # Note: The agents library catches tool exceptions internally, # so we don't expect this to raise @@ -2348,13 +2433,12 @@ def failing_tool(message: str) -> str: run_config=test_run_config, ) - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find the execute_tool span execute_tool_span = None for span in spans: - description = span.get("description", "") + description = span.get("name", "") if 
description is not None and description.startswith( "execute_tool failing_tool" ): @@ -2363,19 +2447,18 @@ def failing_tool(message: str) -> str: # Verify the execute_tool span was created assert execute_tool_span is not None, "execute_tool span was not created" - assert execute_tool_span["description"] == "execute_tool failing_tool" - assert execute_tool_span["data"]["gen_ai.tool.name"] == "failing_tool" + assert execute_tool_span["name"] == "execute_tool failing_tool" + assert execute_tool_span["attributes"]["gen_ai.tool.name"] == "failing_tool" # Verify error status was set (this is the key test for our patch) # The span should be marked as error because the tool execution failed - assert execute_tool_span["status"] == "internal_error" - assert execute_tool_span["tags"]["status"] == "internal_error" + assert execute_tool_span["status"] == "error" @pytest.mark.asyncio async def test_invoke_agent_span_includes_usage_data( sentry_init, - capture_events, + capture_items, test_agent, get_model_response, ): @@ -2435,9 +2518,10 @@ async def test_invoke_agent_span_includes_usage_data( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") result = await agents.Runner.run( agent, "Test input", run_config=test_run_config @@ -2445,29 +2529,30 @@ async def test_invoke_agent_span_includes_usage_data( assert result is not None - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] invoke_agent_span = next( - span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ) # Verify invoke_agent span has usage data from context_wrapper - assert invoke_agent_span["description"] == "invoke_agent test_agent" - assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] 
- assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] + assert invoke_agent_span["name"] == "invoke_agent test_agent" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["attributes"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["attributes"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["attributes"] - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 - assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30 - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens.cached"] == 0 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens.reasoning"] == 5 + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens"] == 10 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["attributes"]["gen_ai.usage.total_tokens"] == 30 + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens.cached"] == 0 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens.reasoning"] == 5 @pytest.mark.asyncio async def test_ai_client_span_includes_response_model( sentry_init, - capture_events, + capture_items, test_agent, get_model_response, ): @@ -2527,9 +2612,10 @@ async def test_ai_client_span_includes_response_model( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") result = await agents.Runner.run( agent, "Test input", run_config=test_run_config @@ -2537,20 +2623,21 @@ async def test_ai_client_span_includes_response_model( assert result is not None - (transaction,) = events - spans = transaction["spans"] - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if 
item.type == "span"] + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) # Verify ai_client span has response model from API response - assert ai_client_span["description"] == "chat gpt-4" - assert "gen_ai.response.model" in ai_client_span["data"] - assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + assert ai_client_span["name"] == "chat gpt-4" + assert "gen_ai.response.model" in ai_client_span["attributes"] + assert ai_client_span["attributes"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" @pytest.mark.asyncio async def test_ai_client_span_response_model_with_chat_completions( sentry_init, - capture_events, + capture_items, get_model_response, ): """ @@ -2614,9 +2701,10 @@ async def test_ai_client_span_response_model_with_chat_completions( sentry_init( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") result = await agents.Runner.run( agent, "Test input", run_config=test_run_config @@ -2624,18 +2712,22 @@ async def test_ai_client_span_response_model_with_chat_completions( assert result is not None - (transaction,) = events - spans = transaction["spans"] - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if item.type == "span"] + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) # Verify response model from API response is captured - assert "gen_ai.response.model" in ai_client_span["data"] - assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4o-mini-2024-07-18" + assert "gen_ai.response.model" in ai_client_span["attributes"] + assert ( + ai_client_span["attributes"]["gen_ai.response.model"] + == "gpt-4o-mini-2024-07-18" + ) @pytest.mark.asyncio async def test_multiple_llm_calls_aggregate_usage( - 
sentry_init, capture_events, test_agent, get_model_response + sentry_init, capture_items, test_agent, get_model_response ): """ Test that invoke_agent spans show aggregated usage across multiple LLM calls @@ -2732,9 +2824,10 @@ def calculator(a: int, b: int) -> int: integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") result = await agents.Runner.run( agent_with_tool, @@ -2744,25 +2837,24 @@ def calculator(a: int, b: int) -> int: assert result is not None - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] invoke_agent_span = spans[0] # Verify invoke_agent span has aggregated usage from both API calls # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 - assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens"] == 30 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["attributes"]["gen_ai.usage.total_tokens"] == 50 # Cached tokens should be aggregated: 0 + 5 = 5 - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens.cached"] == 5 + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens.cached"] == 5 # Reasoning tokens should be aggregated: 0 + 3 = 3 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens.reasoning"] == 3 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens.reasoning"] == 3 @pytest.mark.asyncio async def test_invoke_agent_span_includes_response_model( sentry_init, - capture_events, + capture_items, test_agent, get_model_response, ): @@ -2821,9 +2913,10 @@ async def 
test_invoke_agent_span_includes_response_model( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") result = await agents.Runner.run( agent, "Test input", run_config=test_run_config @@ -2831,27 +2924,32 @@ async def test_invoke_agent_span_includes_response_model( assert result is not None - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] invoke_agent_span = next( - span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT ) - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) # Verify invoke_agent span has response model from API - assert invoke_agent_span["description"] == "invoke_agent test_agent" - assert "gen_ai.response.model" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + assert invoke_agent_span["name"] == "invoke_agent test_agent" + assert "gen_ai.response.model" in invoke_agent_span["attributes"] + assert ( + invoke_agent_span["attributes"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + ) # Also verify ai_client span has it - assert "gen_ai.response.model" in ai_client_span["data"] - assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + assert "gen_ai.response.model" in ai_client_span["attributes"] + assert ai_client_span["attributes"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" @pytest.mark.asyncio async def test_invoke_agent_span_uses_last_response_model( sentry_init, - capture_events, + capture_items, test_agent, get_model_response, ): @@ -2950,9 +3048,10 @@ def calculator(a: int, b: int) -> int: 
integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") result = await agents.Runner.run( agent_with_tool, @@ -2962,24 +3061,26 @@ def calculator(a: int, b: int) -> int: assert result is not None - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] invoke_agent_span = spans[0] first_ai_client_span = spans[1] second_ai_client_span = spans[3] # After tool span # Invoke_agent span uses the LAST response model - assert "gen_ai.response.model" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + assert "gen_ai.response.model" in invoke_agent_span["attributes"] + assert ( + invoke_agent_span["attributes"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + ) # Each ai_client span has its own response model from the API - assert first_ai_client_span["data"]["gen_ai.response.model"] == "gpt-4-0613" + assert first_ai_client_span["attributes"]["gen_ai.response.model"] == "gpt-4-0613" assert ( - second_ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + second_ai_client_span["attributes"]["gen_ai.response.model"] + == "gpt-4.1-2025-04-14" ) -def test_openai_agents_message_truncation(sentry_init, capture_events): +def test_openai_agents_message_truncation(sentry_init, capture_items): """Test that large messages are truncated properly in OpenAI Agents integration.""" large_content = ( @@ -2990,6 +3091,7 @@ def test_openai_agents_message_truncation(sentry_init, capture_events): integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) test_messages = [ @@ -3036,6 +3138,7 @@ async def test_streaming_span_update_captures_response_data( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, 
send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) # Create a mock streaming response object (similar to what we'd get from ResponseCompletedEvent) @@ -3101,6 +3204,7 @@ async def test_streaming_ttft_on_chat_span( sentry_init( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) request_headers = {} @@ -3230,7 +3334,7 @@ async def test_streaming_ttft_on_chat_span( @pytest.mark.asyncio async def test_conversation_id_on_all_spans( sentry_init, - capture_events, + capture_items, test_agent, nonstreaming_responses_model_response, get_model_response, @@ -3255,9 +3359,10 @@ async def test_conversation_id_on_all_spans( sentry_init( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") result = await agents.Runner.run( agent, @@ -3268,24 +3373,28 @@ async def test_conversation_id_on_all_spans( assert result is not None - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] invoke_agent_span = next( - span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT ) - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) # Verify workflow span (transaction) has conversation_id + (transaction,) = (item.payload for item in items if item.type == "transaction") assert ( transaction["contexts"]["trace"]["data"]["gen_ai.conversation.id"] == "conv_test_123" ) # Verify invoke_agent span has conversation_id - assert invoke_agent_span["data"]["gen_ai.conversation.id"] == "conv_test_123" + assert invoke_agent_span["attributes"]["gen_ai.conversation.id"] == "conv_test_123" # Verify ai_client 
span has conversation_id - assert ai_client_span["data"]["gen_ai.conversation.id"] == "conv_test_123" + assert ai_client_span["attributes"]["gen_ai.conversation.id"] == "conv_test_123" @pytest.mark.skipif( @@ -3294,7 +3403,7 @@ async def test_conversation_id_on_all_spans( ) @pytest.mark.asyncio async def test_conversation_id_on_tool_span( - sentry_init, capture_events, test_agent, get_model_response + sentry_init, capture_items, test_agent, get_model_response ): """ Test that gen_ai.conversation.id is set on tool execution spans when passed to Runner.run(). @@ -3389,9 +3498,10 @@ def simple_tool(message: str) -> str: sentry_init( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") await agents.Runner.run( agent_with_tool, @@ -3400,21 +3510,20 @@ def simple_tool(message: str) -> str: conversation_id="conv_tool_test_456", ) - (transaction,) = events - spans = transaction["spans"] - + spans = [item.payload for item in items if item.type == "span"] # Find the tool span tool_span = None for span in spans: - if span.get("description", "").startswith("execute_tool"): + if span.get("name", "").startswith("execute_tool"): tool_span = span break assert tool_span is not None # Tool span should have the conversation_id passed to Runner.run() - assert tool_span["data"]["gen_ai.conversation.id"] == "conv_tool_test_456" + assert tool_span["attributes"]["gen_ai.conversation.id"] == "conv_tool_test_456" # Workflow span (transaction) should have the same conversation_id + (transaction,) = (item.payload for item in items if item.type == "transaction") assert ( transaction["contexts"]["trace"]["data"]["gen_ai.conversation.id"] == "conv_tool_test_456" @@ -3428,7 +3537,7 @@ def simple_tool(message: str) -> str: @pytest.mark.asyncio async def test_no_conversation_id_when_not_provided( sentry_init, - capture_events, + capture_items, test_agent, 
nonstreaming_responses_model_response, get_model_response, @@ -3453,9 +3562,10 @@ async def test_no_conversation_id_when_not_provided( sentry_init( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("span", "transaction") # Don't pass conversation_id result = await agents.Runner.run( @@ -3464,16 +3574,23 @@ async def test_no_conversation_id_when_not_provided( assert result is not None - (transaction,) = events - spans = transaction["spans"] + transactions = [item.payload for item in items if item.type == "transaction"] + assert len(transactions) == 1 + transaction = transactions[0] + + spans = [item.payload for item in items if item.type == "span"] invoke_agent_span = next( - span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT ) - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) # Verify conversation_id is NOT set on any spans assert "gen_ai.conversation.id" not in transaction["contexts"]["trace"].get( - "data", {} + "attributes", {} ) - assert "gen_ai.conversation.id" not in invoke_agent_span.get("data", {}) - assert "gen_ai.conversation.id" not in ai_client_span.get("data", {}) + assert "gen_ai.conversation.id" not in invoke_agent_span.get("attributes", {}) + assert "gen_ai.conversation.id" not in ai_client_span.get("attributes", {}) diff --git a/tests/integrations/pydantic_ai/test_pydantic_ai.py b/tests/integrations/pydantic_ai/test_pydantic_ai.py index 50ce155f5b..9faccb0a84 100644 --- a/tests/integrations/pydantic_ai/test_pydantic_ai.py +++ b/tests/integrations/pydantic_ai/test_pydantic_ai.py @@ -53,7 +53,7 @@ def inner(): @pytest.mark.asyncio -async def test_agent_run_async(sentry_init, capture_events, 
get_test_agent): +async def test_agent_run_async(sentry_init, capture_items, get_test_agent): """ Test that the integration creates spans for async agent runs. """ @@ -61,9 +61,10 @@ async def test_agent_run_async(sentry_init, capture_events, get_test_agent): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_agent = get_test_agent() result = await test_agent.run("Test input") @@ -71,8 +72,7 @@ async def test_agent_run_async(sentry_init, capture_events, get_test_agent): assert result is not None assert result.output is not None - (transaction,) = events - spans = transaction["spans"] + (transaction,) = (item.payload for item in items if item.type == "transaction") # Verify transaction (the transaction IS the invoke_agent span) assert transaction["transaction"] == "invoke_agent test_agent" @@ -81,28 +81,32 @@ async def test_agent_run_async(sentry_init, capture_events, get_test_agent): # The transaction itself should have invoke_agent data assert transaction["contexts"]["trace"]["op"] == "gen_ai.invoke_agent" + spans = [item.payload for item in items if item.type == "span"] # Find child span types (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] assert len(chat_spans) >= 1 # Check chat span chat_span = chat_spans[0] - assert "chat" in chat_span["description"] - assert chat_span["data"]["gen_ai.operation.name"] == "chat" - assert chat_span["data"]["gen_ai.response.streaming"] is False - assert "gen_ai.request.messages" in chat_span["data"] - assert "gen_ai.usage.input_tokens" in chat_span["data"] - assert "gen_ai.usage.output_tokens" in chat_span["data"] + assert "chat" in chat_span["name"] + assert chat_span["attributes"]["gen_ai.operation.name"] 
== "chat" + assert chat_span["attributes"]["gen_ai.response.streaming"] is False + assert "gen_ai.request.messages" in chat_span["attributes"] + assert "gen_ai.usage.input_tokens" in chat_span["attributes"] + assert "gen_ai.usage.output_tokens" in chat_span["attributes"] @pytest.mark.asyncio -async def test_agent_run_async_model_error(sentry_init, capture_events): +async def test_agent_run_async_model_error(sentry_init, capture_items): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("event", "transaction", "span") def failing_model(messages, info): raise RuntimeError("model exploded") @@ -115,17 +119,17 @@ def failing_model(messages, info): with pytest.raises(RuntimeError, match="model exploded"): await agent.run("Test input") - (error, transaction) = events + (error,) = (item.payload for item in items if item.type == "event") assert error["level"] == "error" - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] assert len(spans) == 1 - assert spans[0]["status"] == "internal_error" + assert spans[0]["status"] == "error" @pytest.mark.asyncio -async def test_agent_run_async_usage_data(sentry_init, capture_events, get_test_agent): +async def test_agent_run_async_usage_data(sentry_init, capture_items, get_test_agent): """ Test that the invoke_agent span includes token usage and model data. 
""" @@ -133,9 +137,10 @@ async def test_agent_run_async_usage_data(sentry_init, capture_events, get_test_ integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_agent = get_test_agent() result = await test_agent.run("Test input") @@ -143,8 +148,7 @@ async def test_agent_run_async_usage_data(sentry_init, capture_events, get_test_ assert result is not None assert result.output is not None - (transaction,) = events - + (transaction,) = (item.payload for item in items if item.type == "transaction") # Verify transaction (the transaction IS the invoke_agent span) assert transaction["transaction"] == "invoke_agent test_agent" @@ -170,7 +174,7 @@ async def test_agent_run_async_usage_data(sentry_init, capture_events, get_test_ assert trace_data["gen_ai.response.model"] == "test" # Test model name -def test_agent_run_sync(sentry_init, capture_events, get_test_agent): +def test_agent_run_sync(sentry_init, capture_items, get_test_agent): """ Test that the integration creates spans for sync agent runs. 
""" @@ -178,9 +182,10 @@ def test_agent_run_sync(sentry_init, capture_events, get_test_agent): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_agent = get_test_agent() result = test_agent.run_sync("Test input") @@ -188,29 +193,32 @@ def test_agent_run_sync(sentry_init, capture_events, get_test_agent): assert result is not None assert result.output is not None - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Verify transaction + (transaction,) = (item.payload for item in items if item.type == "transaction") assert transaction["transaction"] == "invoke_agent test_agent" assert transaction["contexts"]["trace"]["origin"] == "auto.ai.pydantic_ai" # Find span types - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] assert len(chat_spans) >= 1 # Verify streaming flag is False for sync for chat_span in chat_spans: - assert chat_span["data"]["gen_ai.response.streaming"] is False + assert chat_span["attributes"]["gen_ai.response.streaming"] is False -def test_agent_run_sync_model_error(sentry_init, capture_events): +def test_agent_run_sync_model_error(sentry_init, capture_items): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("event", "transaction", "span") def failing_model(messages, info): raise RuntimeError("model exploded") @@ -223,17 +231,17 @@ def failing_model(messages, info): with pytest.raises(RuntimeError, match="model exploded"): agent.run_sync("Test input") - (error, transaction) = events + (error,) = (item.payload for item in items if item.type == "event") assert error["level"] == "error" - 
spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] assert len(spans) == 1 - assert spans[0]["status"] == "internal_error" + assert spans[0]["status"] == "error" @pytest.mark.asyncio -async def test_agent_run_stream(sentry_init, capture_events, get_test_agent): +async def test_agent_run_stream(sentry_init, capture_items, get_test_agent): """ Test that the integration creates spans for streaming agent runs. """ @@ -241,9 +249,10 @@ async def test_agent_run_stream(sentry_init, capture_events, get_test_agent): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_agent = get_test_agent() async with test_agent.run_stream("Test input") as result: @@ -251,31 +260,33 @@ async def test_agent_run_stream(sentry_init, capture_events, get_test_agent): async for _ in result.stream_output(): pass - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Verify transaction + (transaction,) = (item.payload for item in items if item.type == "transaction") assert transaction["transaction"] == "invoke_agent test_agent" assert transaction["contexts"]["trace"]["origin"] == "auto.ai.pydantic_ai" # Find chat spans - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] assert len(chat_spans) >= 1 # Verify streaming flag is True for streaming for chat_span in chat_spans: - assert chat_span["data"]["gen_ai.response.streaming"] is True - assert "gen_ai.request.messages" in chat_span["data"] - assert "gen_ai.usage.input_tokens" in chat_span["data"] + assert chat_span["attributes"]["gen_ai.response.streaming"] is True + assert "gen_ai.request.messages" in chat_span["attributes"] + assert "gen_ai.usage.input_tokens" in 
chat_span["attributes"] # Streaming responses should still have output data assert ( - "gen_ai.response.text" in chat_span["data"] - or "gen_ai.response.model" in chat_span["data"] + "gen_ai.response.text" in chat_span["attributes"] + or "gen_ai.response.model" in chat_span["attributes"] ) @pytest.mark.asyncio -async def test_agent_run_stream_events(sentry_init, capture_events, get_test_agent): +async def test_agent_run_stream_events(sentry_init, capture_items, get_test_agent): """ Test that run_stream_events creates spans (it uses run internally, so non-streaming). """ @@ -283,32 +294,34 @@ async def test_agent_run_stream_events(sentry_init, capture_events, get_test_age integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Consume all events test_agent = get_test_agent() async for _ in test_agent.run_stream_events("Test input"): pass - (transaction,) = events - # Verify transaction + (transaction,) = (item.payload for item in items if item.type == "transaction") assert transaction["transaction"] == "invoke_agent test_agent" # Find chat spans - spans = transaction["spans"] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] assert len(chat_spans) >= 1 # run_stream_events uses run() internally, so streaming should be False for chat_span in chat_spans: - assert chat_span["data"]["gen_ai.response.streaming"] is False + assert chat_span["attributes"]["gen_ai.response.streaming"] is False @pytest.mark.asyncio -async def test_agent_with_tools(sentry_init, capture_events, get_test_agent): +async def test_agent_with_tools(sentry_init, capture_items, get_test_agent): """ Test that tool execution creates execute_tool spans. 
""" @@ -316,6 +329,7 @@ async def test_agent_with_tools(sentry_init, capture_events, get_test_agent): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) test_agent = get_test_agent() @@ -325,34 +339,39 @@ def add_numbers(a: int, b: int) -> int: """Add two numbers together.""" return a + b - events = capture_events() + items = capture_items("transaction", "span") result = await test_agent.run("What is 5 + 3?") assert result is not None - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find child span types (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] # Should have tool spans assert len(tool_spans) >= 1 # Check tool span tool_span = tool_spans[0] - assert "execute_tool" in tool_span["description"] - assert tool_span["data"]["gen_ai.operation.name"] == "execute_tool" - assert tool_span["data"]["gen_ai.tool.name"] == "add_numbers" - assert "gen_ai.tool.input" in tool_span["data"] - assert "gen_ai.tool.output" in tool_span["data"] + assert "execute_tool" in tool_span["name"] + assert tool_span["attributes"]["gen_ai.operation.name"] == "execute_tool" + assert tool_span["attributes"]["gen_ai.tool.name"] == "add_numbers" + assert "gen_ai.tool.input" in tool_span["attributes"] + assert "gen_ai.tool.output" in tool_span["attributes"] # Check chat spans have available_tools for chat_span in chat_spans: - assert "gen_ai.request.available_tools" in chat_span["data"] - available_tools_str = chat_span["data"]["gen_ai.request.available_tools"] + assert "gen_ai.request.available_tools" in 
chat_span["attributes"] + available_tools_str = chat_span["attributes"]["gen_ai.request.available_tools"] # Available tools is serialized as a string assert "add_numbers" in available_tools_str @@ -363,7 +382,7 @@ def add_numbers(a: int, b: int) -> int: ) @pytest.mark.asyncio async def test_agent_with_tool_model_retry( - sentry_init, capture_events, get_test_agent, handled_tool_call_exceptions + sentry_init, capture_items, get_test_agent, handled_tool_call_exceptions ): """ Test that a handled exception is captured when a tool raises ModelRetry. @@ -376,6 +395,7 @@ async def test_agent_with_tool_model_retry( ], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) retries = 0 @@ -391,47 +411,51 @@ def add_numbers(a: int, b: int) -> float: raise ModelRetry(message="Try again with the same arguments.") return a + b - events = capture_events() + items = capture_items("event", "transaction", "span") result = await test_agent.run("What is 5 + 3?") assert result is not None if handled_tool_call_exceptions: - (error, transaction) = events - else: - (transaction,) = events - spans = transaction["spans"] - - if handled_tool_call_exceptions: + (error,) = (item.payload for item in items if item.type == "event") assert error["level"] == "error" assert error["exception"]["values"][0]["mechanism"]["handled"] + spans = [item.payload for item in items if item.type == "span"] # Find child span types (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] # Should have tool spans assert len(tool_spans) >= 1 # Check tool spans model_retry_tool_span = tool_spans[0] - assert "execute_tool" in 
model_retry_tool_span["description"] - assert model_retry_tool_span["data"]["gen_ai.operation.name"] == "execute_tool" - assert model_retry_tool_span["data"]["gen_ai.tool.name"] == "add_numbers" - assert "gen_ai.tool.input" in model_retry_tool_span["data"] + assert "execute_tool" in model_retry_tool_span["name"] + assert ( + model_retry_tool_span["attributes"]["gen_ai.operation.name"] == "execute_tool" + ) + assert model_retry_tool_span["attributes"]["gen_ai.tool.name"] == "add_numbers" + assert "gen_ai.tool.input" in model_retry_tool_span["attributes"] tool_span = tool_spans[1] - assert "execute_tool" in tool_span["description"] - assert tool_span["data"]["gen_ai.operation.name"] == "execute_tool" - assert tool_span["data"]["gen_ai.tool.name"] == "add_numbers" - assert "gen_ai.tool.input" in tool_span["data"] - assert "gen_ai.tool.output" in tool_span["data"] + assert "execute_tool" in tool_span["name"] + assert tool_span["attributes"]["gen_ai.operation.name"] == "execute_tool" + assert tool_span["attributes"]["gen_ai.tool.name"] == "add_numbers" + assert "gen_ai.tool.input" in tool_span["attributes"] + assert "gen_ai.tool.output" in tool_span["attributes"] # Check chat spans have available_tools for chat_span in chat_spans: - assert "gen_ai.request.available_tools" in chat_span["data"] - available_tools_str = chat_span["data"]["gen_ai.request.available_tools"] + assert "gen_ai.request.available_tools" in chat_span["attributes"] + available_tools_str = chat_span["attributes"]["gen_ai.request.available_tools"] # Available tools is serialized as a string assert "add_numbers" in available_tools_str @@ -442,7 +466,7 @@ def add_numbers(a: int, b: int) -> float: ) @pytest.mark.asyncio async def test_agent_with_tool_validation_error( - sentry_init, capture_events, get_test_agent, handled_tool_call_exceptions + sentry_init, capture_items, get_test_agent, handled_tool_call_exceptions ): """ Test that a handled exception is captured when a tool has unsatisfiable 
constraints. @@ -455,6 +479,7 @@ async def test_agent_with_tool_validation_error( ], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) test_agent = get_test_agent() @@ -464,7 +489,7 @@ def add_numbers(a: Annotated[int, Field(gt=0, lt=0)], b: int) -> int: """Add two numbers together.""" return a + b - events = capture_events() + items = capture_items("event", "transaction", "span") result = None with pytest.raises(UnexpectedModelBehavior): @@ -473,42 +498,45 @@ def add_numbers(a: Annotated[int, Field(gt=0, lt=0)], b: int) -> int: assert result is None if handled_tool_call_exceptions: - (error, model_behaviour_error, transaction) = events - else: ( + error, model_behaviour_error, - transaction, - ) = events - spans = transaction["spans"] - - if handled_tool_call_exceptions: + ) = (item.payload for item in items if item.type == "event") assert error["level"] == "error" assert error["exception"]["values"][0]["mechanism"]["handled"] - # Find child span types (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] # Should have tool spans assert len(tool_spans) >= 1 # Check tool spans model_retry_tool_span = tool_spans[0] - assert "execute_tool" in model_retry_tool_span["description"] - assert model_retry_tool_span["data"]["gen_ai.operation.name"] == "execute_tool" - assert model_retry_tool_span["data"]["gen_ai.tool.name"] == "add_numbers" - assert "gen_ai.tool.input" in model_retry_tool_span["data"] + assert "execute_tool" in model_retry_tool_span["name"] + assert ( + model_retry_tool_span["attributes"]["gen_ai.operation.name"] == 
"execute_tool" + ) + assert model_retry_tool_span["attributes"]["gen_ai.tool.name"] == "add_numbers" + assert "gen_ai.tool.input" in model_retry_tool_span["attributes"] # Check chat spans have available_tools for chat_span in chat_spans: - assert "gen_ai.request.available_tools" in chat_span["data"] - available_tools_str = chat_span["data"]["gen_ai.request.available_tools"] + assert "gen_ai.request.available_tools" in chat_span["attributes"] + available_tools_str = chat_span["attributes"]["gen_ai.request.available_tools"] # Available tools is serialized as a string assert "add_numbers" in available_tools_str @pytest.mark.asyncio -async def test_agent_with_tools_streaming(sentry_init, capture_events, get_test_agent): +async def test_agent_with_tools_streaming(sentry_init, capture_items, get_test_agent): """ Test that tool execution works correctly with streaming. """ @@ -516,6 +544,7 @@ async def test_agent_with_tools_streaming(sentry_init, capture_events, get_test_ integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) test_agent = get_test_agent() @@ -525,62 +554,67 @@ def multiply(a: int, b: int) -> int: """Multiply two numbers.""" return a * b - events = capture_events() + items = capture_items("transaction", "span") async with test_agent.run_stream("What is 7 times 8?") as result: async for _ in result.stream_output(): pass - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find span types - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] # Should have tool spans assert len(tool_spans) >= 1 # Verify streaming flag is True for 
chat_span in chat_spans: - assert chat_span["data"]["gen_ai.response.streaming"] is True + assert chat_span["attributes"]["gen_ai.response.streaming"] is True # Check tool span tool_span = tool_spans[0] - assert tool_span["data"]["gen_ai.tool.name"] == "multiply" - assert "gen_ai.tool.input" in tool_span["data"] - assert "gen_ai.tool.output" in tool_span["data"] + assert tool_span["attributes"]["gen_ai.tool.name"] == "multiply" + assert "gen_ai.tool.input" in tool_span["attributes"] + assert "gen_ai.tool.output" in tool_span["attributes"] @pytest.mark.asyncio -async def test_model_settings( - sentry_init, capture_events, get_test_agent_with_settings -): +async def test_model_settings(sentry_init, capture_items, get_test_agent_with_settings): """ Test that model settings are captured in spans. """ sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_agent_with_settings = get_test_agent_with_settings() await test_agent_with_settings.run("Test input") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find chat span - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] assert len(chat_spans) >= 1 chat_span = chat_spans[0] # Check that model settings are captured - assert chat_span["data"].get("gen_ai.request.temperature") == 0.7 - assert chat_span["data"].get("gen_ai.request.max_tokens") == 100 - assert chat_span["data"].get("gen_ai.request.top_p") == 0.9 + assert chat_span["attributes"].get("gen_ai.request.temperature") == 0.7 + assert chat_span["attributes"].get("gen_ai.request.max_tokens") == 100 + assert chat_span["attributes"].get("gen_ai.request.top_p") == 0.9 @pytest.mark.asyncio @@ -594,7 +628,7 @@ async def test_model_settings( ], ) 
async def test_system_prompt_attribute( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): """ Test that system prompts are included as the first message. @@ -609,23 +643,27 @@ async def test_system_prompt_attribute( integrations=[PydanticAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") await agent.run("Hello") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # The transaction IS the invoke_agent span, check for messages in chat spans instead - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] assert len(chat_spans) >= 1 chat_span = chat_spans[0] if send_default_pii and include_prompts: - system_instructions = chat_span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + system_instructions = chat_span["attributes"][ + SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS + ] assert json.loads(system_instructions) == [ { "type": "text", @@ -633,11 +671,11 @@ async def test_system_prompt_attribute( } ] else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_span["attributes"] @pytest.mark.asyncio -async def test_error_handling(sentry_init, capture_events): +async def test_error_handling(sentry_init, capture_items): """ Test error handling in agent execution. 
""" @@ -651,16 +689,16 @@ async def test_error_handling(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Simple run that should succeed await agent.run("Hello") # At minimum, we should have a transaction - assert len(events) >= 1 - transaction = [e for e in events if e.get("type") == "transaction"][0] + transaction = next(item.payload for item in items if item.type == "transaction") assert transaction["transaction"] == "invoke_agent test_error" # Transaction should complete successfully (status key may not exist if no error) trace_status = transaction["contexts"]["trace"].get("status") @@ -668,7 +706,7 @@ async def test_error_handling(sentry_init, capture_events): @pytest.mark.asyncio -async def test_without_pii(sentry_init, capture_events, get_test_agent): +async def test_without_pii(sentry_init, capture_items, get_test_agent): """ Test that PII is not captured when send_default_pii is False. 
""" @@ -676,27 +714,29 @@ async def test_without_pii(sentry_init, capture_events, get_test_agent): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=False, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_agent = get_test_agent() await test_agent.run("Sensitive input") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find child spans (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] # Verify that messages and response text are not captured for span in chat_spans: - assert "gen_ai.request.messages" not in span["data"] - assert "gen_ai.response.text" not in span["data"] + assert "gen_ai.request.messages" not in span["attributes"] + assert "gen_ai.response.text" not in span["attributes"] @pytest.mark.asyncio -async def test_without_pii_tools(sentry_init, capture_events, get_test_agent): +async def test_without_pii_tools(sentry_init, capture_items, get_test_agent): """ Test that tool input/output are not captured when send_default_pii is False. 
""" @@ -704,6 +744,7 @@ async def test_without_pii_tools(sentry_init, capture_events, get_test_agent): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=False, + _experiments={"gen_ai_as_v2_spans": True}, ) test_agent = get_test_agent() @@ -713,33 +754,37 @@ def sensitive_tool(data: str) -> str: """A tool with sensitive data.""" return f"Processed: {data}" - events = capture_events() + items = capture_items("transaction", "span") await test_agent.run("Use sensitive tool with private data") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find tool spans - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] # If tool was executed, verify input/output are not captured for tool_span in tool_spans: - assert "gen_ai.tool.input" not in tool_span["data"] - assert "gen_ai.tool.output" not in tool_span["data"] + assert "gen_ai.tool.input" not in tool_span["attributes"] + assert "gen_ai.tool.output" not in tool_span["attributes"] @pytest.mark.asyncio -async def test_multiple_agents_concurrent(sentry_init, capture_events, get_test_agent): +async def test_multiple_agents_concurrent(sentry_init, capture_items, get_test_agent): """ Test that multiple agents can run concurrently without interfering. 
""" sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_agent = get_test_agent() @@ -750,18 +795,15 @@ async def run_agent(input_text): results = await asyncio.gather(*[run_agent(f"Input {i}") for i in range(3)]) assert len(results) == 3 - assert len(events) == 3 # Verify each transaction is separate + events = [item.payload for item in items if item.type == "transaction"] for i, transaction in enumerate(events): - assert transaction["type"] == "transaction" assert transaction["transaction"] == "invoke_agent test_agent" - # Each should have its own spans - assert len(transaction["spans"]) >= 1 @pytest.mark.asyncio -async def test_message_history(sentry_init, capture_events): +async def test_message_history(sentry_init, capture_items): """ Test that full conversation history is captured in chat spans. """ @@ -774,9 +816,10 @@ async def test_message_history(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # First message await agent.run("Hello, I'm Alice") @@ -797,51 +840,56 @@ async def test_message_history(sentry_init, capture_events): await agent.run("What is my name?", message_history=history) # We should have 2 transactions + events = [item.payload for item in items if item.type == "transaction"] assert len(events) >= 2 # Check the second transaction has the full history second_transaction = events[1] spans = second_transaction["spans"] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] if chat_spans: chat_span = chat_spans[0] - if "gen_ai.request.messages" in chat_span["data"]: - messages_data = 
chat_span["data"]["gen_ai.request.messages"] + if "gen_ai.request.messages" in chat_span["attributes"]: + messages_data = chat_span["attributes"]["gen_ai.request.messages"] # Should have multiple messages including history assert len(messages_data) > 1 @pytest.mark.asyncio -async def test_gen_ai_system(sentry_init, capture_events, get_test_agent): +async def test_gen_ai_system(sentry_init, capture_items, get_test_agent): """ Test that gen_ai.system is set from the model. """ sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_agent = get_test_agent() await test_agent.run("Test input") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find chat span - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] assert len(chat_spans) >= 1 chat_span = chat_spans[0] # gen_ai.system should be set from the model (TestModel -> 'test') - assert "gen_ai.system" in chat_span["data"] - assert chat_span["data"]["gen_ai.system"] == "test" + assert "gen_ai.system" in chat_span["attributes"] + assert chat_span["attributes"]["gen_ai.system"] == "test" @pytest.mark.asyncio -async def test_include_prompts_false(sentry_init, capture_events, get_test_agent): +async def test_include_prompts_false(sentry_init, capture_items, get_test_agent): """ Test that prompts are not captured when include_prompts=False. 
""" @@ -849,27 +897,29 @@ async def test_include_prompts_false(sentry_init, capture_events, get_test_agent integrations=[PydanticAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=True, # Even with PII enabled, prompts should not be captured + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_agent = get_test_agent() await test_agent.run("Sensitive prompt") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find child spans (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] # Verify that messages and response text are not captured for span in chat_spans: - assert "gen_ai.request.messages" not in span["data"] - assert "gen_ai.response.text" not in span["data"] + assert "gen_ai.request.messages" not in span["attributes"] + assert "gen_ai.response.text" not in span["attributes"] @pytest.mark.asyncio -async def test_include_prompts_true(sentry_init, capture_events, get_test_agent): +async def test_include_prompts_true(sentry_init, capture_items, get_test_agent): """ Test that prompts are captured when include_prompts=True (default). 
""" @@ -877,28 +927,30 @@ async def test_include_prompts_true(sentry_init, capture_events, get_test_agent) integrations=[PydanticAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_agent = get_test_agent() await test_agent.run("Test prompt") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find child spans (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] # Verify that messages are captured in chat spans assert len(chat_spans) >= 1 for chat_span in chat_spans: - assert "gen_ai.request.messages" in chat_span["data"] + assert "gen_ai.request.messages" in chat_span["attributes"] @pytest.mark.asyncio async def test_include_prompts_false_with_tools( - sentry_init, capture_events, get_test_agent + sentry_init, capture_items, get_test_agent ): """ Test that tool input/output are not captured when include_prompts=False. 
@@ -907,6 +959,7 @@ async def test_include_prompts_false_with_tools( integrations=[PydanticAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) test_agent = get_test_agent() @@ -916,26 +969,27 @@ def test_tool(value: int) -> int: """A test tool.""" return value * 2 - events = capture_events() + items = capture_items("transaction", "span") await test_agent.run("Use the test tool with value 5") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find tool spans - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] # If tool was executed, verify input/output are not captured for tool_span in tool_spans: - assert "gen_ai.tool.input" not in tool_span["data"] - assert "gen_ai.tool.output" not in tool_span["data"] + assert "gen_ai.tool.input" not in tool_span["attributes"] + assert "gen_ai.tool.output" not in tool_span["attributes"] @pytest.mark.asyncio -async def test_include_prompts_requires_pii( - sentry_init, capture_events, get_test_agent -): +async def test_include_prompts_requires_pii(sentry_init, capture_items, get_test_agent): """ Test that include_prompts requires send_default_pii=True. 
""" @@ -943,27 +997,29 @@ async def test_include_prompts_requires_pii( integrations=[PydanticAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=False, # PII disabled + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") test_agent = get_test_agent() await test_agent.run("Test prompt") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # Find child spans (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] # Even with include_prompts=True, if PII is disabled, messages should not be captured for span in chat_spans: - assert "gen_ai.request.messages" not in span["data"] - assert "gen_ai.response.text" not in span["data"] + assert "gen_ai.request.messages" not in span["attributes"] + assert "gen_ai.response.text" not in span["attributes"] @pytest.mark.asyncio -async def test_mcp_tool_execution_spans(sentry_init, capture_events): +async def test_mcp_tool_execution_spans(sentry_init, capture_items): """ Test that MCP (Model Context Protocol) tool calls create execute_tool spans. 
@@ -1033,14 +1089,13 @@ async def mock_map_tool_result_part(part): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Simulate MCP tool execution within a transaction through CombinedToolset - with sentry_sdk.start_transaction( - op="ai.run", name="invoke_agent test_mcp_agent" - ) as transaction: + with sentry_sdk.start_transaction(op="ai.run", name="invoke_agent test_mcp_agent"): # Set up the agent context scope = sentry_sdk.get_current_scope() scope._contexts["pydantic_ai_agent"] = { @@ -1080,13 +1135,10 @@ async def mock_map_tool_result_part(part): # MCP tool might raise if not fully mocked, that's okay pass - events_list = events + events_list = items if len(events_list) == 0: pytest.skip("No events captured, MCP test setup incomplete") - (transaction,) = events_list - transaction["spans"] - # Note: This test manually calls combined.call_tool which doesn't go through # ToolManager._call_tool (which is what the integration patches). # In real-world usage, MCP tools are called through agent.run() which uses ToolManager. 
@@ -1107,6 +1159,7 @@ async def test_context_cleanup_after_run(sentry_init, get_test_agent): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) # Verify context is not set before run @@ -1130,6 +1183,7 @@ def test_context_cleanup_after_run_sync(sentry_init, get_test_agent): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) # Verify context is not set before run @@ -1154,6 +1208,7 @@ async def test_context_cleanup_after_streaming(sentry_init, get_test_agent): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) # Verify context is not set before run @@ -1180,6 +1235,7 @@ async def test_context_cleanup_on_error(sentry_init, get_test_agent): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) test_agent = get_test_agent() @@ -1214,6 +1270,7 @@ async def test_context_isolation_concurrent_agents(sentry_init, get_test_agent): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) # Create a second agent @@ -1256,7 +1313,7 @@ async def run_and_check_context(agent, agent_name): @pytest.mark.asyncio -async def test_invoke_agent_with_list_user_prompt(sentry_init, capture_events): +async def test_invoke_agent_with_list_user_prompt(sentry_init, capture_items): """ Test that invoke_agent span handles list user prompts correctly. 
""" @@ -1269,17 +1326,17 @@ async def test_invoke_agent_with_list_user_prompt(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Use a list as user prompt await agent.run(["First part", "Second part"]) - (transaction,) = events - # Check that the invoke_agent transaction has messages data # The invoke_agent is the transaction itself + (transaction,) = [item.payload for item in items if item.type == "transaction"] if "gen_ai.request.messages" in transaction["contexts"]["trace"]["data"]: messages_str = transaction["contexts"]["trace"]["data"][ "gen_ai.request.messages" @@ -1299,7 +1356,7 @@ async def test_invoke_agent_with_list_user_prompt(sentry_init, capture_events): ], ) async def test_invoke_agent_with_instructions( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, capture_items, send_default_pii, include_prompts ): """ Test that invoke_agent span handles instructions correctly. 
@@ -1320,33 +1377,37 @@ async def test_invoke_agent_with_instructions( integrations=[PydanticAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") await agent.run("Test input") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] # The transaction IS the invoke_agent span, check for messages in chat spans instead - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] assert len(chat_spans) >= 1 chat_span = chat_spans[0] if send_default_pii and include_prompts: - system_instructions = chat_span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + system_instructions = chat_span["attributes"][ + SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS + ] assert json.loads(system_instructions) == [ {"type": "text", "content": "System prompt"}, {"type": "text", "content": "Instruction 1\nInstruction 2"}, ] else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_span["attributes"] @pytest.mark.asyncio -async def test_model_name_extraction_with_callable(sentry_init, capture_events): +async def test_model_name_extraction_with_callable(sentry_init, capture_items): """ Test model name extraction when model has a callable name() method. 
""" @@ -1356,6 +1417,7 @@ async def test_model_name_extraction_with_callable(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) # Test the utility function directly @@ -1372,7 +1434,7 @@ async def test_model_name_extraction_with_callable(sentry_init, capture_events): @pytest.mark.asyncio -async def test_model_name_extraction_fallback_to_str(sentry_init, capture_events): +async def test_model_name_extraction_fallback_to_str(sentry_init, capture_items): """ Test model name extraction falls back to str() when no name attribute exists. """ @@ -1382,6 +1444,7 @@ async def test_model_name_extraction_fallback_to_str(sentry_init, capture_events sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) # Test the utility function directly @@ -1399,7 +1462,7 @@ async def test_model_name_extraction_fallback_to_str(sentry_init, capture_events @pytest.mark.asyncio -async def test_model_settings_object_style(sentry_init, capture_events): +async def test_model_settings_object_style(sentry_init, capture_items): """ Test that object-style model settings (non-dict) are handled correctly. """ @@ -1410,6 +1473,7 @@ async def test_model_settings_object_style(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -1433,7 +1497,7 @@ async def test_model_settings_object_style(sentry_init, capture_events): @pytest.mark.asyncio -async def test_usage_data_partial(sentry_init, capture_events): +async def test_usage_data_partial(sentry_init, capture_items): """ Test that usage data is correctly handled when only some fields are present. 
""" @@ -1445,16 +1509,18 @@ async def test_usage_data_partial(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") await agent.run("Test input") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] assert len(chat_spans) >= 1 # Check that usage data fields exist (they may or may not be set depending on TestModel) @@ -1464,7 +1530,7 @@ async def test_usage_data_partial(sentry_init, capture_events): @pytest.mark.asyncio -async def test_agent_data_from_scope(sentry_init, capture_events): +async def test_agent_data_from_scope(sentry_init, capture_items): """ Test that agent data can be retrieved from Sentry scope when not passed directly. """ @@ -1477,22 +1543,22 @@ async def test_agent_data_from_scope(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # The integration automatically sets agent in scope during execution await agent.run("Test input") - (transaction,) = events - - # Verify agent name is captured + # Verify agent name is capture + (transaction,) = (item.payload for item in items if item.type == "transaction") assert transaction["transaction"] == "invoke_agent test_scope_agent" @pytest.mark.asyncio async def test_available_tools_without_description( - sentry_init, capture_events, get_test_agent + sentry_init, capture_items, get_test_agent ): """ Test that available tools are captured even when description is missing. 
@@ -1500,6 +1566,7 @@ async def test_available_tools_without_description( sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) test_agent = get_test_agent() @@ -1509,23 +1576,24 @@ def tool_without_desc(x: int) -> int: # No docstring = no description return x * 2 - events = capture_events() + items = capture_items("transaction", "span") await test_agent.run("Use the tool with 5") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] if chat_spans: chat_span = chat_spans[0] - if "gen_ai.request.available_tools" in chat_span["data"]: - tools_str = chat_span["data"]["gen_ai.request.available_tools"] + if "gen_ai.request.available_tools" in chat_span["attributes"]: + tools_str = chat_span["attributes"]["gen_ai.request.available_tools"] assert "tool_without_desc" in tools_str @pytest.mark.asyncio -async def test_output_with_tool_calls(sentry_init, capture_events, get_test_agent): +async def test_output_with_tool_calls(sentry_init, capture_items, get_test_agent): """ Test that tool calls in model response are captured correctly. 
""" @@ -1533,6 +1601,7 @@ async def test_output_with_tool_calls(sentry_init, capture_events, get_test_agen integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) test_agent = get_test_agent() @@ -1542,14 +1611,15 @@ def calc_tool(value: int) -> int: """Calculate something.""" return value + 10 - events = capture_events() + items = capture_items("transaction", "span") await test_agent.run("Use calc_tool with 5") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] # At least one chat span should exist assert len(chat_spans) >= 1 @@ -1558,11 +1628,11 @@ def calc_tool(value: int) -> int: for chat_span in chat_spans: # Tool calls may or may not be in response depending on TestModel behavior # Just verify the span was created and has basic data - assert "gen_ai.operation.name" in chat_span["data"] + assert "gen_ai.operation.name" in chat_span["attributes"] @pytest.mark.asyncio -async def test_message_formatting_with_different_parts(sentry_init, capture_events): +async def test_message_formatting_with_different_parts(sentry_init, capture_items): """ Test that different message part types are handled correctly in ai_client span. 
""" @@ -1577,9 +1647,10 @@ async def test_message_formatting_with_different_parts(sentry_init, capture_even integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Create message history with different part types history = [ @@ -1594,24 +1665,25 @@ async def test_message_formatting_with_different_parts(sentry_init, capture_even await agent.run("What did I say?", message_history=history) - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] # Should have chat spans assert len(chat_spans) >= 1 # Check that messages are captured chat_span = chat_spans[0] - if "gen_ai.request.messages" in chat_span["data"]: - messages_data = chat_span["data"]["gen_ai.request.messages"] + if "gen_ai.request.messages" in chat_span["attributes"]: + messages_data = chat_span["attributes"]["gen_ai.request.messages"] # Should contain message history assert messages_data is not None @pytest.mark.asyncio -async def test_update_invoke_agent_span_with_none_output(sentry_init, capture_events): +async def test_update_invoke_agent_span_with_none_output(sentry_init, capture_items): """ Test that update_invoke_agent_span handles None output gracefully. 
""" @@ -1624,6 +1696,7 @@ async def test_update_invoke_agent_span_with_none_output(sentry_init, capture_ev integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -1639,7 +1712,7 @@ async def test_update_invoke_agent_span_with_none_output(sentry_init, capture_ev @pytest.mark.asyncio -async def test_update_ai_client_span_with_none_response(sentry_init, capture_events): +async def test_update_ai_client_span_with_none_response(sentry_init, capture_items): """ Test that update_ai_client_span handles None response gracefully. """ @@ -1651,6 +1724,7 @@ async def test_update_ai_client_span_with_none_response(sentry_init, capture_eve sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -1666,7 +1740,7 @@ async def test_update_ai_client_span_with_none_response(sentry_init, capture_eve @pytest.mark.asyncio -async def test_agent_without_name(sentry_init, capture_events): +async def test_agent_without_name(sentry_init, capture_items): """ Test that agent without a name is handled correctly. 
""" @@ -1676,22 +1750,21 @@ async def test_agent_without_name(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") await agent.run("Test input") - (transaction,) = events - # Should still create transaction, just with default name - assert transaction["type"] == "transaction" + (transaction,) = (item.payload for item in items if item.type == "transaction") # Transaction name should be "invoke_agent agent" or similar default assert "invoke_agent" in transaction["transaction"] @pytest.mark.asyncio -async def test_model_response_without_parts(sentry_init, capture_events): +async def test_model_response_without_parts(sentry_init, capture_items): """ Test handling of model response without parts attribute. """ @@ -1703,6 +1776,7 @@ async def test_model_response_without_parts(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -1723,7 +1797,7 @@ async def test_model_response_without_parts(sentry_init, capture_events): @pytest.mark.asyncio -async def test_input_messages_error_handling(sentry_init, capture_events): +async def test_input_messages_error_handling(sentry_init, capture_items): """ Test that _set_input_messages handles errors gracefully. 
""" @@ -1733,6 +1807,7 @@ async def test_input_messages_error_handling(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -1751,7 +1826,7 @@ async def test_input_messages_error_handling(sentry_init, capture_events): @pytest.mark.asyncio -async def test_available_tools_error_handling(sentry_init, capture_events): +async def test_available_tools_error_handling(sentry_init, capture_items): """ Test that _set_available_tools handles errors gracefully. """ @@ -1762,6 +1837,7 @@ async def test_available_tools_error_handling(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -1781,7 +1857,7 @@ async def test_available_tools_error_handling(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_usage_data_with_none_usage(sentry_init, capture_events): +async def test_set_usage_data_with_none_usage(sentry_init, capture_items): """ Test that _set_usage_data handles None usage gracefully. """ @@ -1791,6 +1867,7 @@ async def test_set_usage_data_with_none_usage(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -1806,7 +1883,7 @@ async def test_set_usage_data_with_none_usage(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_usage_data_with_partial_fields(sentry_init, capture_events): +async def test_set_usage_data_with_partial_fields(sentry_init, capture_items): """ Test that _set_usage_data handles usage with only some fields. 
""" @@ -1817,6 +1894,7 @@ async def test_set_usage_data_with_partial_fields(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -1838,7 +1916,7 @@ async def test_set_usage_data_with_partial_fields(sentry_init, capture_events): @pytest.mark.asyncio -async def test_message_parts_with_tool_return(sentry_init, capture_events): +async def test_message_parts_with_tool_return(sentry_init, capture_items): """ Test that ToolReturnPart messages are handled correctly. """ @@ -1858,24 +1936,26 @@ def test_tool(x: int) -> int: integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") # Run with history containing tool return await agent.run("Use test_tool with 5") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] # Should have chat spans assert len(chat_spans) >= 1 @pytest.mark.asyncio -async def test_message_parts_with_list_content(sentry_init, capture_events): +async def test_message_parts_with_list_content(sentry_init, capture_items): """ Test that message parts with list content are handled correctly. 
""" @@ -1886,6 +1966,7 @@ async def test_message_parts_with_list_content(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -1910,7 +1991,7 @@ async def test_message_parts_with_list_content(sentry_init, capture_events): @pytest.mark.asyncio -async def test_output_data_with_text_and_tool_calls(sentry_init, capture_events): +async def test_output_data_with_text_and_tool_calls(sentry_init, capture_items): """ Test that _set_output_data handles both text and tool calls in response. """ @@ -1922,6 +2003,7 @@ async def test_output_data_with_text_and_tool_calls(sentry_init, capture_events) integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -1949,7 +2031,7 @@ async def test_output_data_with_text_and_tool_calls(sentry_init, capture_events) @pytest.mark.asyncio -async def test_output_data_error_handling(sentry_init, capture_events): +async def test_output_data_error_handling(sentry_init, capture_items): """ Test that _set_output_data handles errors in formatting gracefully. """ @@ -1961,6 +2043,7 @@ async def test_output_data_error_handling(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -1981,7 +2064,7 @@ async def test_output_data_error_handling(sentry_init, capture_events): @pytest.mark.asyncio -async def test_message_with_system_prompt_part(sentry_init, capture_events): +async def test_message_with_system_prompt_part(sentry_init, capture_items): """ Test that SystemPromptPart is handled with correct role. 
""" @@ -1993,6 +2076,7 @@ async def test_message_with_system_prompt_part(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2017,7 +2101,7 @@ async def test_message_with_system_prompt_part(sentry_init, capture_events): @pytest.mark.asyncio -async def test_message_with_instructions(sentry_init, capture_events): +async def test_message_with_instructions(sentry_init, capture_items): """ Test that messages with instructions field are handled correctly. """ @@ -2028,6 +2112,7 @@ async def test_message_with_instructions(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2052,7 +2137,7 @@ async def test_message_with_instructions(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_input_messages_without_prompts(sentry_init, capture_events): +async def test_set_input_messages_without_prompts(sentry_init, capture_items): """ Test that _set_input_messages respects _should_send_prompts(). """ @@ -2062,6 +2147,7 @@ async def test_set_input_messages_without_prompts(sentry_init, capture_events): integrations=[PydanticAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2078,7 +2164,7 @@ async def test_set_input_messages_without_prompts(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_output_data_without_prompts(sentry_init, capture_events): +async def test_set_output_data_without_prompts(sentry_init, capture_items): """ Test that _set_output_data respects _should_send_prompts(). 
""" @@ -2090,6 +2176,7 @@ async def test_set_output_data_without_prompts(sentry_init, capture_events): integrations=[PydanticAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2107,7 +2194,7 @@ async def test_set_output_data_without_prompts(sentry_init, capture_events): @pytest.mark.asyncio -async def test_get_model_name_with_exception_in_callable(sentry_init, capture_events): +async def test_get_model_name_with_exception_in_callable(sentry_init, capture_items): """ Test that _get_model_name handles exceptions in name() callable. """ @@ -2117,6 +2204,7 @@ async def test_get_model_name_with_exception_in_callable(sentry_init, capture_ev sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) # Create model with callable name that raises exception @@ -2131,7 +2219,7 @@ async def test_get_model_name_with_exception_in_callable(sentry_init, capture_ev @pytest.mark.asyncio -async def test_get_model_name_with_string_model(sentry_init, capture_events): +async def test_get_model_name_with_string_model(sentry_init, capture_items): """ Test that _get_model_name handles string models. """ @@ -2140,6 +2228,7 @@ async def test_get_model_name_with_string_model(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) # Pass a string as model @@ -2150,7 +2239,7 @@ async def test_get_model_name_with_string_model(sentry_init, capture_events): @pytest.mark.asyncio -async def test_get_model_name_with_none(sentry_init, capture_events): +async def test_get_model_name_with_none(sentry_init, capture_items): """ Test that _get_model_name handles None model. 
""" @@ -2159,6 +2248,7 @@ async def test_get_model_name_with_none(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) # Pass None @@ -2169,7 +2259,7 @@ async def test_get_model_name_with_none(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_model_data_with_system(sentry_init, capture_events): +async def test_set_model_data_with_system(sentry_init, capture_items): """ Test that _set_model_data captures system from model. """ @@ -2180,6 +2270,7 @@ async def test_set_model_data_with_system(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2200,7 +2291,7 @@ async def test_set_model_data_with_system(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_model_data_from_agent_scope(sentry_init, capture_events): +async def test_set_model_data_from_agent_scope(sentry_init, capture_items): """ Test that _set_model_data retrieves model from agent in scope when not passed. """ @@ -2211,6 +2302,7 @@ async def test_set_model_data_from_agent_scope(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2234,7 +2326,7 @@ async def test_set_model_data_from_agent_scope(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_model_data_with_none_settings_values(sentry_init, capture_events): +async def test_set_model_data_with_none_settings_values(sentry_init, capture_items): """ Test that _set_model_data skips None values in settings. 
""" @@ -2244,6 +2336,7 @@ async def test_set_model_data_with_none_settings_values(sentry_init, capture_eve sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2266,7 +2359,7 @@ async def test_set_model_data_with_none_settings_values(sentry_init, capture_eve @pytest.mark.asyncio -async def test_should_send_prompts_without_pii(sentry_init, capture_events): +async def test_should_send_prompts_without_pii(sentry_init, capture_items): """ Test that _should_send_prompts returns False when PII disabled. """ @@ -2276,6 +2369,7 @@ async def test_should_send_prompts_without_pii(sentry_init, capture_events): integrations=[PydanticAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=False, # PII disabled + _experiments={"gen_ai_as_v2_spans": True}, ) # Should return False @@ -2284,7 +2378,7 @@ async def test_should_send_prompts_without_pii(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_agent_data_without_agent(sentry_init, capture_events): +async def test_set_agent_data_without_agent(sentry_init, capture_items): """ Test that _set_agent_data handles None agent gracefully. """ @@ -2294,6 +2388,7 @@ async def test_set_agent_data_without_agent(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2309,7 +2404,7 @@ async def test_set_agent_data_without_agent(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_agent_data_from_scope(sentry_init, capture_events): +async def test_set_agent_data_from_scope(sentry_init, capture_items): """ Test that _set_agent_data retrieves agent from scope when not passed. 
""" @@ -2320,6 +2415,7 @@ async def test_set_agent_data_from_scope(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2341,7 +2437,7 @@ async def test_set_agent_data_from_scope(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_agent_data_without_name(sentry_init, capture_events): +async def test_set_agent_data_without_name(sentry_init, capture_items): """ Test that _set_agent_data handles agent without name attribute. """ @@ -2352,6 +2448,7 @@ async def test_set_agent_data_without_name(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2371,7 +2468,7 @@ async def test_set_agent_data_without_name(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_available_tools_without_toolset(sentry_init, capture_events): +async def test_set_available_tools_without_toolset(sentry_init, capture_items): """ Test that _set_available_tools handles agent without toolset. """ @@ -2382,6 +2479,7 @@ async def test_set_available_tools_without_toolset(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2401,7 +2499,7 @@ async def test_set_available_tools_without_toolset(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_available_tools_with_schema(sentry_init, capture_events): +async def test_set_available_tools_with_schema(sentry_init, capture_items): """ Test that _set_available_tools extracts tool schema correctly. 
""" @@ -2412,6 +2510,7 @@ async def test_set_available_tools_with_schema(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2437,7 +2536,7 @@ async def test_set_available_tools_with_schema(sentry_init, capture_events): @pytest.mark.asyncio -async def test_execute_tool_span_creation(sentry_init, capture_events): +async def test_execute_tool_span_creation(sentry_init, capture_items): """ Test direct creation of execute_tool span. """ @@ -2451,6 +2550,7 @@ async def test_execute_tool_span_creation(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2464,7 +2564,7 @@ async def test_execute_tool_span_creation(sentry_init, capture_events): @pytest.mark.asyncio -async def test_execute_tool_span_with_mcp_type(sentry_init, capture_events): +async def test_execute_tool_span_with_mcp_type(sentry_init, capture_items): """ Test execute_tool span with MCP tool type. """ @@ -2477,6 +2577,7 @@ async def test_execute_tool_span_with_mcp_type(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2490,7 +2591,7 @@ async def test_execute_tool_span_with_mcp_type(sentry_init, capture_events): @pytest.mark.asyncio -async def test_execute_tool_span_without_prompts(sentry_init, capture_events): +async def test_execute_tool_span_without_prompts(sentry_init, capture_items): """ Test that execute_tool span respects _should_send_prompts(). 
""" @@ -2504,6 +2605,7 @@ async def test_execute_tool_span_without_prompts(sentry_init, capture_events): integrations=[PydanticAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2517,7 +2619,7 @@ async def test_execute_tool_span_without_prompts(sentry_init, capture_events): @pytest.mark.asyncio -async def test_execute_tool_span_with_none_args(sentry_init, capture_events): +async def test_execute_tool_span_with_none_args(sentry_init, capture_items): """ Test execute_tool span with None args. """ @@ -2528,6 +2630,7 @@ async def test_execute_tool_span_with_none_args(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2540,7 +2643,7 @@ async def test_execute_tool_span_with_none_args(sentry_init, capture_events): @pytest.mark.asyncio -async def test_update_execute_tool_span_with_none_span(sentry_init, capture_events): +async def test_update_execute_tool_span_with_none_span(sentry_init, capture_items): """ Test that update_execute_tool_span handles None span gracefully. """ @@ -2551,6 +2654,7 @@ async def test_update_execute_tool_span_with_none_span(sentry_init, capture_even sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) # Update with None span - should not raise @@ -2561,7 +2665,7 @@ async def test_update_execute_tool_span_with_none_span(sentry_init, capture_even @pytest.mark.asyncio -async def test_update_execute_tool_span_with_none_result(sentry_init, capture_events): +async def test_update_execute_tool_span_with_none_result(sentry_init, capture_items): """ Test that update_execute_tool_span handles None result gracefully. 
""" @@ -2575,6 +2679,7 @@ async def test_update_execute_tool_span_with_none_result(sentry_init, capture_ev integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2588,7 +2693,7 @@ async def test_update_execute_tool_span_with_none_result(sentry_init, capture_ev @pytest.mark.asyncio -async def test_tool_execution_without_span_context(sentry_init, capture_events): +async def test_tool_execution_without_span_context(sentry_init, capture_items): """ Test that tool execution patch handles case when no span context exists. This tests the code path where current_span is None in _patch_tool_execution. @@ -2598,6 +2703,7 @@ async def test_tool_execution_without_span_context(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) # Create a simple agent with no tools (won't have function_toolset) @@ -2617,7 +2723,7 @@ async def test_tool_execution_without_span_context(sentry_init, capture_events): @pytest.mark.asyncio -async def test_invoke_agent_span_with_callable_instruction(sentry_init, capture_events): +async def test_invoke_agent_span_with_callable_instruction(sentry_init, capture_items): """ Test that invoke_agent_span skips callable instructions correctly. 
""" @@ -2629,6 +2735,7 @@ async def test_invoke_agent_span_with_callable_instruction(sentry_init, capture_ integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2650,7 +2757,7 @@ async def test_invoke_agent_span_with_callable_instruction(sentry_init, capture_ @pytest.mark.asyncio -async def test_invoke_agent_span_with_string_instructions(sentry_init, capture_events): +async def test_invoke_agent_span_with_string_instructions(sentry_init, capture_items): """ Test that invoke_agent_span handles string instructions (not list). """ @@ -2662,6 +2769,7 @@ async def test_invoke_agent_span_with_string_instructions(sentry_init, capture_e integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2680,7 +2788,7 @@ async def test_invoke_agent_span_with_string_instructions(sentry_init, capture_e @pytest.mark.asyncio -async def test_ai_client_span_with_streaming_flag(sentry_init, capture_events): +async def test_ai_client_span_with_streaming_flag(sentry_init, capture_items): """ Test that ai_client_span reads streaming flag from scope. 
""" @@ -2690,6 +2798,7 @@ async def test_ai_client_span_with_streaming_flag(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2706,7 +2815,7 @@ async def test_ai_client_span_with_streaming_flag(sentry_init, capture_events): @pytest.mark.asyncio -async def test_ai_client_span_gets_agent_from_scope(sentry_init, capture_events): +async def test_ai_client_span_gets_agent_from_scope(sentry_init, capture_items): """ Test that ai_client_span gets agent from scope when not passed. """ @@ -2717,6 +2826,7 @@ async def test_ai_client_span_gets_agent_from_scope(sentry_init, capture_events) sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, ) with sentry_sdk.start_transaction(op="test", name="test") as transaction: @@ -2759,15 +2869,16 @@ def _find_binary_content(messages_data, expected_modality, expected_mime_type): @pytest.mark.asyncio -async def test_binary_content_encoding_image(sentry_init, capture_events): +async def test_binary_content_encoding_image(sentry_init, capture_items): """Test that BinaryContent with image data is properly encoded in messages.""" sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") with sentry_sdk.start_transaction(op="test", name="test"): span = sentry_sdk.start_span(op="test_span") @@ -2782,22 +2893,23 @@ async def test_binary_content_encoding_image(sentry_init, capture_events): _set_input_messages(span, [mock_msg]) span.finish() - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") span_data = event["spans"][0]["data"] messages_data = _get_messages_from_span(span_data) assert 
_find_binary_content(messages_data, "image", "image/png") @pytest.mark.asyncio -async def test_binary_content_encoding_mixed_content(sentry_init, capture_events): +async def test_binary_content_encoding_mixed_content(sentry_init, capture_items): """Test that BinaryContent mixed with text content is properly handled.""" sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") with sentry_sdk.start_transaction(op="test", name="test"): span = sentry_sdk.start_span(op="test_span") @@ -2814,7 +2926,7 @@ async def test_binary_content_encoding_mixed_content(sentry_init, capture_events _set_input_messages(span, [mock_msg]) span.finish() - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") span_data = event["spans"][0]["data"] messages_data = _get_messages_from_span(span_data) @@ -2830,7 +2942,7 @@ async def test_binary_content_encoding_mixed_content(sentry_init, capture_events @pytest.mark.asyncio -async def test_binary_content_in_agent_run(sentry_init, capture_events): +async def test_binary_content_in_agent_run(sentry_init, capture_items): """Test that BinaryContent in actual agent run is properly captured in spans.""" agent = Agent("test", name="test_binary_agent") @@ -2838,30 +2950,37 @@ async def test_binary_content_in_agent_run(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") binary_content = BinaryContent( data=b"fake_image_data_for_testing", media_type="image/png" ) await agent.run(["Analyze this image:", binary_content]) - (transaction,) = events - chat_spans = [s for s in transaction["spans"] if s["op"] == "gen_ai.chat"] + spans = [item.payload for item in items if 
item.type == "span"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] assert len(chat_spans) >= 1 chat_span = chat_spans[0] - if "gen_ai.request.messages" in chat_span["data"]: - messages_str = str(chat_span["data"]["gen_ai.request.messages"]) + if "gen_ai.request.messages" in chat_span["attributes"]: + messages_str = str(chat_span["attributes"]["gen_ai.request.messages"]) assert any(keyword in messages_str for keyword in ["blob", "image", "base64"]) @pytest.mark.asyncio -async def test_set_usage_data_with_cache_tokens(sentry_init, capture_events): +async def test_set_usage_data_with_cache_tokens(sentry_init, capture_items): """Test that cache_read_tokens and cache_write_tokens are tracked.""" - sentry_init(integrations=[PydanticAIIntegration()], traces_sample_rate=1.0) + sentry_init( + integrations=[PydanticAIIntegration()], + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) - events = capture_events() + items = capture_items("transaction", "span") with sentry_sdk.start_transaction(op="test", name="test"): span = sentry_sdk.start_span(op="test_span") @@ -2874,7 +2993,7 @@ async def test_set_usage_data_with_cache_tokens(sentry_init, capture_events): _set_usage_data(span, usage) span.finish() - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") (span_data,) = event["spans"] assert span_data["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 assert span_data["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 @@ -2922,7 +3041,7 @@ async def test_set_usage_data_with_cache_tokens(sentry_init, capture_events): ], ) def test_image_url_base64_content_in_span( - sentry_init, capture_events, url, image_url_kwargs, expected_content + sentry_init, capture_items, url, image_url_kwargs, expected_content ): from sentry_sdk.integrations.pydantic_ai.spans.ai_client import ai_client_span @@ -2930,9 +3049,10 @@ def test_image_url_base64_content_in_span( 
integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") with sentry_sdk.start_transaction(op="test", name="test"): image_url = ImageUrl(url=url, **image_url_kwargs) @@ -2944,10 +3064,12 @@ def test_image_url_base64_content_in_span( span = ai_client_span([mock_msg], None, None, None) span.finish() - (event,) = events - chat_spans = [s for s in event["spans"] if s["op"] == "gen_ai.chat"] + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] assert len(chat_spans) >= 1 - messages_data = _get_messages_from_span(chat_spans[0]["data"]) + messages_data = _get_messages_from_span(chat_spans[0]["attributes"]) found_image = False for msg in messages_data: @@ -2992,27 +3114,29 @@ def test_image_url_base64_content_in_span( ], ) async def test_invoke_agent_image_url( - sentry_init, capture_events, url, image_url_kwargs, expected_content + sentry_init, capture_items, url, image_url_kwargs, expected_content ): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) agent = Agent("test", name="test_image_url_agent") - events = capture_events() + items = capture_items("transaction", "span") image_url = ImageUrl(url=url, **image_url_kwargs) await agent.run([image_url, "Describe this image"]) - (transaction,) = events - found_image = False - chat_spans = [s for s in transaction["spans"] if s["op"] == "gen_ai.chat"] + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] for chat_span in chat_spans: - messages_data = _get_messages_from_span(chat_span["data"]) + messages_data = _get_messages_from_span(chat_span["attributes"]) for msg 
in messages_data: if "content" not in msg: continue @@ -3025,7 +3149,7 @@ async def test_invoke_agent_image_url( @pytest.mark.asyncio -async def test_tool_description_in_execute_tool_span(sentry_init, capture_events): +async def test_tool_description_in_execute_tool_span(sentry_init, capture_items): """ Test that tool description from the tool's docstring is included in execute_tool spans. """ @@ -3044,20 +3168,27 @@ def multiply_numbers(a: int, b: int) -> int: integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + _experiments={"gen_ai_as_v2_spans": True}, ) - events = capture_events() + items = capture_items("transaction", "span") result = await agent.run("What is 5 times 3?") assert result is not None - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] assert len(tool_spans) >= 1 tool_span = tool_spans[0] - assert tool_span["data"]["gen_ai.tool.name"] == "multiply_numbers" - assert SPANDATA.GEN_AI_TOOL_DESCRIPTION in tool_span["data"] - assert "Multiply two numbers" in tool_span["data"][SPANDATA.GEN_AI_TOOL_DESCRIPTION] + assert tool_span["attributes"]["gen_ai.tool.name"] == "multiply_numbers" + assert SPANDATA.GEN_AI_TOOL_DESCRIPTION in tool_span["attributes"] + assert ( + "Multiply two numbers" + in tool_span["attributes"][SPANDATA.GEN_AI_TOOL_DESCRIPTION] + ) diff --git a/tests/tracing/test_decorator.py b/tests/tracing/test_decorator.py index 15432f5862..d370b4bbc9 100644 --- a/tests/tracing/test_decorator.py +++ b/tests/tracing/test_decorator.py @@ -121,9 +121,12 @@ async def _some_function_traced(a, b, c): ) -def test_span_templates_ai_dicts(sentry_init, capture_events): - sentry_init(traces_sample_rate=1.0) - events = capture_events() +def 
test_span_templates_ai_dicts(sentry_init, capture_items): + sentry_init( + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("span") @sentry_sdk.trace(template=SPANTEMPLATE.AI_TOOL) def my_tool(arg1, arg2): @@ -166,40 +169,54 @@ def my_agent(): with sentry_sdk.start_transaction(name="test-transaction"): my_agent() - (event,) = events - (agent_span, tool_span, chat_span) = event["spans"] + (agent_span, tool_span, chat_span) = ( + item.payload for item in items if item.type == "span" + ) - assert agent_span["op"] == "gen_ai.invoke_agent" assert ( - agent_span["description"] + agent_span["name"] == "invoke_agent test_decorator.test_span_templates_ai_dicts..my_agent" ) - assert agent_span["data"] == { + assert agent_span["attributes"] == { "gen_ai.agent.name": "test_decorator.test_span_templates_ai_dicts..my_agent", "gen_ai.operation.name": "invoke_agent", + "sentry.environment": "production", + "sentry.op": "gen_ai.invoke_agent", + "sentry.origin": "manual", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test-transaction", "thread.id": mock.ANY, "thread.name": mock.ANY, } - assert tool_span["op"] == "gen_ai.execute_tool" assert ( - tool_span["description"] + tool_span["name"] == "execute_tool test_decorator.test_span_templates_ai_dicts..my_tool" ) - assert tool_span["data"] == { + assert tool_span["attributes"] == { "gen_ai.tool.name": "test_decorator.test_span_templates_ai_dicts..my_tool", "gen_ai.operation.name": "execute_tool", "gen_ai.usage.input_tokens": 10, "gen_ai.usage.output_tokens": 20, "gen_ai.usage.total_tokens": 30, + "sentry.environment": "production", + "sentry.op": "gen_ai.execute_tool", + "sentry.origin": "manual", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test-transaction", 
"thread.id": mock.ANY, "thread.name": mock.ANY, } - assert "gen_ai.tool.description" not in tool_span["data"] + assert "gen_ai.tool.description" not in tool_span["attributes"] - assert chat_span["op"] == "gen_ai.chat" - assert chat_span["description"] == "chat my-gpt-4o-mini" - assert chat_span["data"] == { + assert chat_span["name"] == "chat my-gpt-4o-mini" + assert chat_span["attributes"] == { "gen_ai.operation.name": "chat", "gen_ai.request.frequency_penalty": 1.0, "gen_ai.request.max_tokens": 100, @@ -213,14 +230,25 @@ def my_agent(): "gen_ai.usage.input_tokens": 11, "gen_ai.usage.output_tokens": 22, "gen_ai.usage.total_tokens": 33, + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "manual", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test-transaction", "thread.id": mock.ANY, "thread.name": mock.ANY, } -def test_span_templates_ai_objects(sentry_init, capture_events): - sentry_init(traces_sample_rate=1.0) - events = capture_events() +def test_span_templates_ai_objects(sentry_init, capture_items): + sentry_init( + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("span") @sentry_sdk.trace(template=SPANTEMPLATE.AI_TOOL) def my_tool(arg1, arg2): @@ -267,40 +295,54 @@ def my_agent(): with sentry_sdk.start_transaction(name="test-transaction"): my_agent() - (event,) = events - (agent_span, tool_span, chat_span) = event["spans"] + (agent_span, tool_span, chat_span) = ( + item.payload for item in items if item.type == "span" + ) - assert agent_span["op"] == "gen_ai.invoke_agent" assert ( - agent_span["description"] + agent_span["name"] == "invoke_agent test_decorator.test_span_templates_ai_objects..my_agent" ) - assert agent_span["data"] == { + assert agent_span["attributes"] == { "gen_ai.agent.name": "test_decorator.test_span_templates_ai_objects..my_agent", 
"gen_ai.operation.name": "invoke_agent", + "sentry.environment": "production", + "sentry.op": "gen_ai.invoke_agent", + "sentry.origin": "manual", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test-transaction", "thread.id": mock.ANY, "thread.name": mock.ANY, } - assert tool_span["op"] == "gen_ai.execute_tool" assert ( - tool_span["description"] + tool_span["name"] == "execute_tool test_decorator.test_span_templates_ai_objects..my_tool" ) - assert tool_span["data"] == { + assert tool_span["attributes"] == { "gen_ai.tool.name": "test_decorator.test_span_templates_ai_objects..my_tool", "gen_ai.tool.description": "This is a tool function.", "gen_ai.operation.name": "execute_tool", "gen_ai.usage.input_tokens": 10, "gen_ai.usage.output_tokens": 20, "gen_ai.usage.total_tokens": 30, + "sentry.environment": "production", + "sentry.op": "gen_ai.execute_tool", + "sentry.origin": "manual", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test-transaction", "thread.id": mock.ANY, "thread.name": mock.ANY, } - assert chat_span["op"] == "gen_ai.chat" - assert chat_span["description"] == "chat my-gpt-4o-mini" - assert chat_span["data"] == { + assert chat_span["name"] == "chat my-gpt-4o-mini" + assert chat_span["attributes"] == { "gen_ai.operation.name": "chat", "gen_ai.request.frequency_penalty": 1.0, "gen_ai.request.max_tokens": 100, @@ -314,15 +356,27 @@ def my_agent(): "gen_ai.usage.input_tokens": 11, "gen_ai.usage.output_tokens": 22, "gen_ai.usage.total_tokens": 33, + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "manual", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test-transaction", "thread.id": 
mock.ANY, "thread.name": mock.ANY, } @pytest.mark.parametrize("send_default_pii", [True, False]) -def test_span_templates_ai_pii(sentry_init, capture_events, send_default_pii): - sentry_init(traces_sample_rate=1.0, send_default_pii=send_default_pii) - events = capture_events() +def test_span_templates_ai_pii(sentry_init, capture_items, send_default_pii): + sentry_init( + traces_sample_rate=1.0, + send_default_pii=send_default_pii, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("span") @sentry_sdk.trace(template=SPANTEMPLATE.AI_TOOL) def my_tool(arg1, arg2, **kwargs): @@ -352,15 +406,14 @@ def my_agent(*args, **kwargs): with sentry_sdk.start_transaction(name="test-transaction"): my_agent(22, 33, arg1=44, arg2=55) - (event,) = events - (_, tool_span, _) = event["spans"] + (_, tool_span, _) = (item.payload for item in items if item.type == "span") if send_default_pii: assert ( - tool_span["data"]["gen_ai.tool.input"] + tool_span["attributes"]["gen_ai.tool.input"] == "{'args': (1, 2), 'kwargs': {'tool_arg1': '3', 'tool_arg2': '4'}}" ) - assert tool_span["data"]["gen_ai.tool.output"] == "'tool_output'" + assert tool_span["attributes"]["gen_ai.tool.output"] == "'tool_output'" else: - assert "gen_ai.tool.input" not in tool_span["data"] - assert "gen_ai.tool.output" not in tool_span["data"] + assert "gen_ai.tool.input" not in tool_span["attributes"] + assert "gen_ai.tool.output" not in tool_span["attributes"] diff --git a/tests/tracing/test_misc.py b/tests/tracing/test_misc.py index 8895c98dbc..4209a02b4b 100644 --- a/tests/tracing/test_misc.py +++ b/tests/tracing/test_misc.py @@ -647,11 +647,14 @@ def test_conversation_id_propagates_to_span_with_ai_op( assert span_data.get("gen_ai.conversation.id") == "conv-ai-op-test" def test_conversation_id_propagates_to_span_with_gen_ai_op( - self, sentry_init, capture_events + self, sentry_init, capture_items ): """Span with gen_ai.* op should get conversation_id.""" - sentry_init(traces_sample_rate=1.0) - 
events = capture_events() + sentry_init( + traces_sample_rate=1.0, + _experiments={"gen_ai_as_v2_spans": True}, + ) + items = capture_items("span") scope = sentry_sdk.get_current_scope() scope.set_conversation_id("conv-gen-ai-op-test") @@ -660,8 +663,8 @@ def test_conversation_id_propagates_to_span_with_gen_ai_op( with start_span(op="gen_ai.invoke_agent"): pass - (event,) = events - span_data = event["spans"][0]["data"] + spans = [item.payload for item in items if item.type == "span"] + span_data = spans[0]["attributes"] assert span_data.get("gen_ai.conversation.id") == "conv-gen-ai-op-test" def test_conversation_id_not_propagated_to_non_ai_span(