From f3074326632c6d849822a423aa9baa20962340f3 Mon Sep 17 00:00:00 2001 From: Erica Pisani Date: Tue, 30 Jun 2026 16:01:13 -0400 Subject: [PATCH] ref(data-collection): Rework DataCollection as a TypedDict, drop accessors Replace the class-based DataCollection/KeyValueCollectionBehavior/ GenAICollection/HttpHeadersCollection API with plain TypedDicts (DataCollection, DataCollectionUserOptions, and per-category *CollectionBehaviour/*UserOptions types) defined in _types.py. Collection mode strings move from camelCase (denyList/allowList) to snake_case (deny_list/allow_list) to match Python convention, since this is a Python-only deviation from the spec that is never serialized to Sentry. Drop the should_collect_user_info/should_collect_gen_ai_inputs/ should_collect_gen_ai_outputs accessor methods and their module-level shortcuts in favor of reading data_collection fields directly, and drop the public sentry_sdk.DataCollection/GenAICollection/ HttpHeadersCollection/KeyValueCollectionBehavior exports. Revert the Unreleased CHANGELOG entry and README example for the prior class-based API. --- CHANGELOG.md | 9 - README.md | 2 +- sentry_sdk/__init__.py | 10 - sentry_sdk/_types.py | 61 +++- sentry_sdk/client.py | 87 ++---- sentry_sdk/consts.py | 8 +- sentry_sdk/data_collection.py | 540 ++++++++++++++-------------------- sentry_sdk/scope.py | 17 +- tests/test_data_collection.py | 304 +++++++++++-------- 9 files changed, 476 insertions(+), 562 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bca5a38435..618b96dad7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,14 +1,5 @@ # Changelog -## Unreleased - -### Features - -- Add the `data_collection` option, a structured configuration that supersedes `send_default_pii` for controlling what data integrations collect automatically (user identity, cookies, HTTP headers, query params, HTTP bodies, generative AI inputs/outputs, stack frame variables, source context). See the [Data Collection spec](https://develop.sentry.dev/sdk/foundations/client/data-collection/). - - Adds `sentry_sdk.DataCollection`, `KeyValueCollectionBehavior`, `HttpHeadersCollection`, and `GenAICollection`. - - When `data_collection` is not set, behavior is derived from `send_default_pii` (now deprecated), so upgrading without configuring `data_collection` changes nothing. - - `frame_context_lines` is now configurable (previously hardcoded to 5); AI integrations' `include_prompts` becomes a per-integration override of `data_collection.gen_ai`. - ## 2.63.0 ### Bug Fixes 🐛 diff --git a/README.md b/README.md index 060e48e314..bc745ce904 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ sentry_sdk.init( # To disable sending user data and HTTP request/response bodies, uncomment # the line below. For more info visit: # https://docs.sentry.io/platforms/python/configuration/options/#data_collection - # data_collection=sentry_sdk.DataCollection(user_info=False, http_bodies=[]), + # data_collection={"user_info": False, "http_bodies": []}, ) ``` diff --git a/sentry_sdk/__init__.py b/sentry_sdk/__init__.py index ec27d153a3..8ce8d739c9 100644 --- a/sentry_sdk/__init__.py +++ b/sentry_sdk/__init__.py @@ -2,12 +2,6 @@ from sentry_sdk.scope import Scope # isort: skip from sentry_sdk.client import Client # isort: skip -from sentry_sdk.data_collection import ( # isort: skip - DataCollection, - GenAICollection, - HttpHeadersCollection, - KeyValueCollectionBehavior, -) from sentry_sdk.consts import VERSION from sentry_sdk.transport import HttpTransport, Transport @@ -17,10 +11,6 @@ "Hub", "Scope", "Client", - "DataCollection", - "GenAICollection", - "HttpHeadersCollection", - "KeyValueCollectionBehavior", "Transport", "HttpTransport", "VERSION", diff --git a/sentry_sdk/_types.py b/sentry_sdk/_types.py index 6c52f95a22..44c8ca4366 100644 --- a/sentry_sdk/_types.py +++ b/sentry_sdk/_types.py @@ -141,7 +141,7 @@ def substituted_because_contains_sensitive_data(cls) -> "AnnotatedValue": from collections.abc import Container, MutableMapping, Sequence from datetime import datetime from types import TracebackType - from typing import Any, Callable, Dict, Mapping, NotRequired, Optional, Type + from typing import Any, Callable, Dict, List, Mapping, NotRequired, Optional, Type from typing_extensions import Literal, TypedDict @@ -152,6 +152,65 @@ class SDKInfo(TypedDict): version: str packages: "Sequence[Mapping[str, str]]" + class KeyValueCollectionBehaviour(TypedDict): + mode: 'Literal["off", "deny_list", "allow_list"]' + terms: "NotRequired[List[str]]" + + class GenAICollectionUserOptions(TypedDict, total=False): + inputs: bool + outputs: bool + + class GenAICollectionBehaviour(TypedDict): + inputs: bool + outputs: bool + + class GraphQLCollectionUserOptions(TypedDict, total=False): + document: bool + variables: bool + + class GraphQLCollectionBehaviour(TypedDict): + document: bool + variables: bool + + class DatabaseCollectionUserOptions(TypedDict, total=False): + query_params: bool + + class DatabaseCollectionBehaviour(TypedDict): + query_params: bool + + class HttpHeadersCollectionUserOptions(TypedDict, total=False): + request: "KeyValueCollectionBehaviour" + response: "KeyValueCollectionBehaviour" + + class HttpHeadersCollectionBehaviour(TypedDict): + request: "KeyValueCollectionBehaviour" + response: "KeyValueCollectionBehaviour" + + class DataCollectionUserOptions(TypedDict, total=False): + user_info: bool + cookies: "KeyValueCollectionBehaviour" + http_headers: "HttpHeadersCollectionBehaviour" + http_bodies: "List[str]" + query_params: "KeyValueCollectionBehaviour" + graphql: "GraphQLCollectionBehaviour" + gen_ai: "GenAICollectionBehaviour" + database: "DatabaseCollectionBehaviour" + stack_frame_variables: bool + frame_context_lines: int + + class DataCollection(TypedDict): + provided_by_user: bool + user_info: bool + cookies: "KeyValueCollectionBehaviour" + http_headers: "HttpHeadersCollectionBehaviour" + http_bodies: "List[str]" + query_params: "KeyValueCollectionBehaviour" + graphql: "GraphQLCollectionBehaviour" + gen_ai: "GenAICollectionBehaviour" + database: "DatabaseCollectionBehaviour" + stack_frame_variables: bool + frame_context_lines: int + # "critical" is an alias of "fatal" recognized by Relay LogLevelStr = Literal["fatal", "critical", "error", "warning", "info", "debug"] diff --git a/sentry_sdk/client.py b/sentry_sdk/client.py index 32b1943d64..7600a5db39 100644 --- a/sentry_sdk/client.py +++ b/sentry_sdk/client.py @@ -24,8 +24,6 @@ ClientConstructor, ) from sentry_sdk.data_collection import ( - OFF_DATA_COLLECTION, - DataCollection, _map_from_send_default_pii, resolve_data_collection, ) @@ -76,6 +74,7 @@ from sentry_sdk._log_batcher import LogBatcher from sentry_sdk._metrics_batcher import MetricsBatcher from sentry_sdk._types import ( + DataCollection, Event, EventDataCategory, Hint, @@ -355,7 +354,7 @@ def _get_options(*args: "Optional[str]", **kwargs: "Any") -> "Dict[str, Any]": if rv["event_scrubber"] is None: rv["event_scrubber"] = EventScrubber( - send_default_pii=rv["data_collection"].user_info + send_default_pii=rv["data_collection"]["user_info"] ) if rv["socket_options"] and not isinstance(rv["socket_options"], list): @@ -392,6 +391,12 @@ def _get_options(*args: "Optional[str]", **kwargs: "Any") -> "Dict[str, Any]": # Older Python versions module_not_found_error = ImportError # type: ignore +_DISABLED_DATA_COLLECTION_CONFIG = _map_from_send_default_pii( + send_default_pii=False, + include_local_variables=True, + include_source_context=True, +) + class BaseClient: """ @@ -433,20 +438,7 @@ def should_send_default_pii(self) -> bool: @property def data_collection(self) -> "DataCollection": - return OFF_DATA_COLLECTION - - def should_collect_user_info(self) -> bool: - return False - - def should_collect_gen_ai_inputs( - self, include_prompts: "Optional[bool]" = None - ) -> bool: - return False - - def should_collect_gen_ai_outputs( - self, include_prompts: "Optional[bool]" = None - ) -> bool: - return False + return _DISABLED_DATA_COLLECTION_CONFIG def is_active(self) -> bool: """ @@ -639,14 +631,16 @@ def _record_lost_event( self.options["profiles_sampler"] = sample_all # data_collection was resolved in _get_options() before this # spotlight override flipped send_default_pii on. Re-derive it so - # the should_collect_* accessors agree with should_send_default_pii() - # in DSN-less spotlight mode (only when the user did not set + # data_collection agrees with should_send_default_pii() in + # DSN-less spotlight mode (only when the user did not set # data_collection explicitly). - if not self.options["data_collection"].explicit: + if not self.options["data_collection"]["provided_by_user"]: self.options["data_collection"] = _map_from_send_default_pii( - True, - self.options["include_local_variables"] is not False, - self.options["include_source_context"] is not False, + send_default_pii=True, + include_local_variables=self.options["include_local_variables"] + is not False, + include_source_context=self.options["include_source_context"] + is not False, ) self.session_flusher = SessionFlusher(capture_func=_capture_envelope) @@ -764,52 +758,7 @@ def data_collection(self) -> "DataCollection": Returns the resolved :class:`~sentry_sdk.data_collection.DataCollection` config for this client. """ - dc = self.options.get("data_collection") - return dc if dc is not None else OFF_DATA_COLLECTION - - def should_collect_user_info(self) -> bool: - """ - Returns whether the SDK should automatically populate ``user.*`` fields - (id, email, username, ip_address) from instrumentation. - """ - return bool(self.data_collection.user_info) - - def should_collect_gen_ai_inputs( - self, include_prompts: "Optional[bool]" = None - ) -> bool: - """ - Returns whether the SDK should collect generative AI input content. - - ``include_prompts`` is the integration-level override (if set, it takes - precedence over the global ``data_collection.gen_ai.inputs`` setting). - """ - return self._should_collect_gen_ai_content("inputs", include_prompts) - - def should_collect_gen_ai_outputs( - self, include_prompts: "Optional[bool]" = None - ) -> bool: - """ - Returns whether the SDK should collect generative AI output content. - - ``include_prompts`` is the integration-level override (if set, it takes - precedence over the global ``data_collection.gen_ai.outputs`` setting). - """ - return self._should_collect_gen_ai_content("outputs", include_prompts) - - def _should_collect_gen_ai_content( - self, direction: str, include_prompts: "Optional[bool]" - ) -> bool: - dc = self.data_collection - if dc.explicit: - # Integration-level override wins over the global gen_ai setting. - if include_prompts is not None: - return include_prompts - return bool(getattr(dc.gen_ai, direction)) - # Legacy (data_collection not set): preserve the historical gate - # `should_send_default_pii() and integration.include_prompts`. - # `include_prompts is None` means "no integration-level override", which - # falls back to the legacy default of True (collect when PII is on). - return self.should_send_default_pii() and (include_prompts is not False) + return self.options["data_collection"] @property def dsn(self) -> "Optional[str]": diff --git a/sentry_sdk/consts.py b/sentry_sdk/consts.py index 276ac70f23..31debe071b 100644 --- a/sentry_sdk/consts.py +++ b/sentry_sdk/consts.py @@ -45,6 +45,7 @@ class CompressionAlgo(Enum): from sentry_sdk._types import ( BreadcrumbProcessor, ContinuousProfilerMode, + DataCollectionUserOptions, Event, EventProcessor, Hint, @@ -56,7 +57,6 @@ class CompressionAlgo(Enum): TracesSampler, TransactionProcessor, ) - from sentry_sdk.data_collection import DataCollection # Experiments are feature flags to enable and disable certain unstable SDK # functionality. Changing them from the defaults (`None`) in production @@ -1273,7 +1273,7 @@ def __init__( transport_queue_size: int = DEFAULT_QUEUE_SIZE, sample_rate: float = 1.0, send_default_pii: "Optional[bool]" = None, - data_collection: "Optional[Union[DataCollection, Dict[str, Any]]]" = None, + data_collection: "Optional[DataCollectionUserOptions]" = None, http_proxy: "Optional[str]" = None, https_proxy: "Optional[str]" = None, ignore_errors: "Sequence[Union[type, str]]" = [], # noqa: B006 @@ -1432,12 +1432,12 @@ def __init__( Use `data_collection` instead. `send_default_pii` is still honored when `data_collection` is not set. :param data_collection: Structured configuration controlling what data integrations collect automatically, - superseding `send_default_pii`. Pass a dict or a :class:`sentry_sdk.DataCollection` instance to enable or + superseding `send_default_pii`. Pass a dict to enable or restrict collection per category (user identity, cookies, HTTP headers/bodies, query params, generative AI inputs/outputs, stack frame variables, source context). When `data_collection` is set, omitted fields use their defaults (most categories are collected, with the - sensitive denylist scrubbing values). When it is not set, the SDK derives behavior from `send_default_pii` + sensitive denylist scrubbing values). When it is not set, the SDK derives behaviour from `send_default_pii` so that upgrading without configuring `data_collection` changes nothing. If both are set, `data_collection` takes precedence. diff --git a/sentry_sdk/data_collection.py b/sentry_sdk/data_collection.py index 0a18320729..5701acd95d 100644 --- a/sentry_sdk/data_collection.py +++ b/sentry_sdk/data_collection.py @@ -16,7 +16,7 @@ * ``data_collection`` set, ``send_default_pii`` unset -> honor ``data_collection`` using the spec defaults for any omitted field. * ``send_default_pii`` set, ``data_collection`` unset -> derive a - ``DataCollection`` that mirrors what ``send_default_pii`` collects today. + resolved ``DataCollection`` that mirrors what ``send_default_pii`` collects today. * neither set -> treated as ``send_default_pii=False``. * both set -> ``data_collection`` wins (it is the single source of truth); a ``DeprecationWarning`` is emitted for ``send_default_pii``. @@ -24,12 +24,12 @@ The new collection-time filtering mechanisms (the partial-match sensitive denylist and allow/deny key-value modes) only become active when ``data_collection`` is provided explicitly. Otherwise the SDK keeps its existing -behavior so that upgrading without configuring ``data_collection`` changes +behaviour so that upgrading without configuring ``data_collection`` changes nothing. """ import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast from urllib.parse import parse_qsl, urlencode from sentry_sdk._types import SENSITIVE_DATA_SUBSTITUTE @@ -37,41 +37,34 @@ if TYPE_CHECKING: from typing import Any, Dict, List, Mapping, Optional + from typing_extensions import Literal -__all__ = [ - "DataCollection", - "KeyValueCollectionBehavior", - "GenAICollection", - "HttpHeadersCollection", - "SENSITIVE_DENYLIST", - "EXTENDED_GDPR_DENYLIST", -] - - -#: Body type identifiers accepted by ``DataCollection.http_bodies``. These match -#: the spec's camelCase string values so configuration is portable across SDKs. -BODY_TYPE_INCOMING_REQUEST = "incomingRequest" -BODY_TYPE_OUTGOING_REQUEST = "outgoingRequest" -BODY_TYPE_INCOMING_RESPONSE = "incomingResponse" -BODY_TYPE_OUTGOING_RESPONSE = "outgoingResponse" + from sentry_sdk._types import ( + DatabaseCollectionBehaviour, + DataCollection, + DataCollectionUserOptions, + GenAICollectionBehaviour, + GraphQLCollectionBehaviour, + HttpHeadersCollectionBehaviour, + KeyValueCollectionBehaviour, + ) #: All valid body types. ``http_bodies`` defaults to this (collect everything the #: platform supports); an empty list is the explicit opt-out. -ALL_BODY_TYPES = [ - BODY_TYPE_INCOMING_REQUEST, - BODY_TYPE_OUTGOING_REQUEST, - BODY_TYPE_INCOMING_RESPONSE, - BODY_TYPE_OUTGOING_RESPONSE, +ALL_HTTP_BODY_TYPES = [ + "incoming_request", + "outgoing_request", + "incoming_response", + "outgoing_response", ] #: Default number of source lines captured above and below a stack frame. DEFAULT_FRAME_CONTEXT_LINES = 5 #: Collection modes for key-value data (cookies, headers, query params). -COLLECTION_OFF = "off" -COLLECTION_DENYLIST = "denyList" -COLLECTION_ALLOWLIST = "allowList" -_VALID_MODES = (COLLECTION_OFF, COLLECTION_DENYLIST, COLLECTION_ALLOWLIST) +#: snake_case (Python-only deviation from the spec's camelCase); never +#: serialized to Sentry. +_VALID_KEY_VALUE_COLLECTION_BEHAVIOUR_MODES = ("off", "deny_list", "allow_list") #: Canonical sensitive denylist from the spec. Values of keys that contain any of #: these terms (partial, case-insensitive) are always replaced with @@ -96,173 +89,6 @@ "identity", ] -#: Additional GDPR-sensitive terms users may opt into via custom deny terms. -#: Not applied automatically; documented here for convenience. -EXTENDED_GDPR_DENYLIST = ["forwarded", "-ip", "remote-", "via", "-user"] - - -class KeyValueCollectionBehavior: - """ - Controls which *values* of key-value data (cookies, headers, query params) - are sent in plaintext versus replaced with ``"[Filtered]"``. Key names are - always retained. - - :param mode: one of ``"off"``, ``"denyList"`` (default), ``"allowList"``. - :param terms: deny or allow terms (depending on ``mode``) that extend the - built-in sensitive denylist. Matched as a partial, case-insensitive - substring of the key name. - """ - - __slots__ = ("mode", "terms") - - def __init__(self, mode: str = "denyList", terms: "Optional[List[str]]" = None): - if mode not in _VALID_MODES: - raise ValueError( - "Invalid KeyValueCollectionBehavior mode {!r}. Must be one of {}.".format( - mode, _VALID_MODES - ) - ) - self.mode = mode - self.terms: "List[str]" = list(terms) if terms else [] - - def __repr__(self) -> str: - return "KeyValueCollectionBehavior(mode={!r}, terms={!r})".format( - self.mode, self.terms - ) - - -class GenAICollection: - """ - Controls capture of generative AI input and output *content*. Metadata such - as model name and token counts is always collected regardless of these - settings. - """ - - __slots__ = ("inputs", "outputs") - - def __init__(self, inputs: bool = True, outputs: bool = True): - self.inputs = inputs - self.outputs = outputs - - def __repr__(self) -> str: - return "GenAICollection(inputs={!r}, outputs={!r})".format( - self.inputs, self.outputs - ) - - -class HttpHeadersCollection: - """ - Configures request and response header collection independently. Each - direction is a :class:`KeyValueCollectionBehavior`. - """ - - __slots__ = ("request", "response") - - def __init__( - self, - request: "Optional[KeyValueCollectionBehavior]" = None, - response: "Optional[KeyValueCollectionBehavior]" = None, - ): - self.request: "KeyValueCollectionBehavior" = ( - request if request is not None else KeyValueCollectionBehavior() - ) - self.response: "KeyValueCollectionBehavior" = ( - response if response is not None else KeyValueCollectionBehavior() - ) - - def __repr__(self) -> str: - return "HttpHeadersCollection(request={!r}, response={!r})".format( - self.request, self.response - ) - - -class DataCollection: - """ - The ``data_collection`` client option. - - Pass an instance to ``sentry_sdk.init(data_collection=...)``. Any field left - as ``None`` is filled in with its spec default during resolution (see - :func:`resolve_data_collection`). After resolution the instance stored on the - client has concrete values for every field. - - :param user_info: automatically populate ``user.*`` fields (id, email, - username, ip_address) from instrumentation. Default ``True``. - :param cookies: cookie collection behavior. Default ``denyList``. - :param http_headers: request/response header collection. Default - ``denyList`` for both directions. - :param http_bodies: list of body types to collect. ``None`` -> all valid - types; ``[]`` -> off. - :param query_params: URL query parameter collection. Default ``denyList``. - :param gen_ai: generative AI input/output content collection. Default both - ``True``. - :param stack_frame_variables: include local variable values in stack frames. - Default ``True`` (falls back to ``include_local_variables``). - :param frame_context_lines: number of source lines above/below each frame. - Default ``5`` (falls back to ``include_source_context``). - """ - - __slots__ = ( - "user_info", - "cookies", - "http_headers", - "http_bodies", - "query_params", - "gen_ai", - "stack_frame_variables", - "frame_context_lines", - "explicit", - ) - - def __init__( - self, - user_info: bool = True, - cookies: "Optional[KeyValueCollectionBehavior]" = None, - http_headers: "Optional[HttpHeadersCollection]" = None, - http_bodies: "Optional[List[str]]" = None, - query_params: "Optional[KeyValueCollectionBehavior]" = None, - gen_ai: "Optional[GenAICollection]" = None, - stack_frame_variables: "Optional[bool]" = None, - frame_context_lines: "Optional[int]" = None, - ): - # Fields with no legacy fallback default to their spec value, so they are - # always concrete (never None) on a constructed instance. - self.user_info = user_info - self.cookies = cookies if cookies is not None else KeyValueCollectionBehavior() - self.http_headers = ( - http_headers if http_headers is not None else HttpHeadersCollection() - ) - # http_bodies is None == "all valid types"; [] == off. - self.http_bodies = http_bodies - self.query_params = ( - query_params if query_params is not None else KeyValueCollectionBehavior() - ) - self.gen_ai = gen_ai if gen_ai is not None else GenAICollection() - # Frame fields keep None as "inherit from include_local_variables / - # include_source_context" so resolution can apply the legacy fallback. - self.stack_frame_variables = stack_frame_variables - self.frame_context_lines = frame_context_lines - # Whether the user supplied ``data_collection`` explicitly. Set during - # resolution. Collection-time filtering only changes from legacy behavior - # when this is True. - self.explicit: bool = False - - def __repr__(self) -> str: - return ( - "DataCollection(user_info={!r}, cookies={!r}, http_headers={!r}, " - "http_bodies={!r}, query_params={!r}, gen_ai={!r}, " - "stack_frame_variables={!r}, frame_context_lines={!r}, explicit={!r})" - ).format( - self.user_info, - self.cookies, - self.http_headers, - self.http_bodies, - self.query_params, - self.gen_ai, - self.stack_frame_variables, - self.frame_context_lines, - self.explicit, - ) - def is_sensitive_key(key: str, extra_terms: "Optional[List[str]]" = None) -> bool: """ @@ -285,41 +111,42 @@ def is_sensitive_key(key: str, extra_terms: "Optional[List[str]]" = None) -> boo def apply_key_value_collection( items: "Mapping[str, Any]", - behavior: "KeyValueCollectionBehavior", + behaviour: "KeyValueCollectionBehaviour", substitute: "Any" = SENSITIVE_DATA_SUBSTITUTE, ) -> "Dict[str, Any]": """ - Apply a :class:`KeyValueCollectionBehavior` to a mapping of key-value pairs. + Apply a :class:`KeyValueCollectionBehaviour` to a mapping of key-value pairs. Returns a new dict. Key names are always retained (except for ``off`` mode, which collects nothing). Sensitive keys (built-in denylist) are always - scrubbed, even under ``allowList`` mode. + scrubbed, even under ``allow_list`` mode. """ - if behavior.mode == COLLECTION_OFF: + mode = behaviour.get("mode", "deny_list") + terms = behaviour.get("terms") or [] + + if mode == "off": return {} result: "Dict[str, Any]" = {} - if behavior.mode == COLLECTION_ALLOWLIST: - # behavior.terms is the ALLOW list here (not deny terms). A key sends its + if mode == "allow_list": + # ``terms`` is the ALLOW list here (not deny terms). A key sends its # real value only if it matches an allow term AND is not sensitive (the # built-in sensitive denylist always wins, even for allow-listed keys). for key, value in items.items(): allowed = False if isinstance(key, str): lowered = key.lower() - allowed = any( - term and term.lower() in lowered for term in behavior.terms - ) + allowed = any(term and term.lower() in lowered for term in terms) if allowed and not is_sensitive_key(key): result[key] = value else: result[key] = substitute return result - # denyList (default): collect everything, scrub sensitive values. + # deny_list (default): collect everything, scrub sensitive values. for key, value in items.items(): - if isinstance(key, str) and is_sensitive_key(key, behavior.terms): + if isinstance(key, str) and is_sensitive_key(key, terms): result[key] = substitute else: result[key] = value @@ -335,14 +162,14 @@ def apply_key_value_collection( def filter_request_headers( headers: "Mapping[str, Any]", - behavior: "KeyValueCollectionBehavior", + behaviour: "KeyValueCollectionBehaviour", substitute: "Any" = SENSITIVE_DATA_SUBSTITUTE, ) -> "Dict[str, Any]": """ - Apply a header :class:`KeyValueCollectionBehavior`, additionally always + Apply a header :class:`KeyValueCollectionBehaviour`, additionally always filtering the raw Cookie/Set-Cookie header values. """ - filtered = apply_key_value_collection(headers, behavior, substitute=substitute) + filtered = apply_key_value_collection(headers, behaviour, substitute=substitute) for key in filtered: if isinstance(key, str) and key.lower() in _ALWAYS_FILTERED_HEADERS: filtered[key] = substitute @@ -351,17 +178,20 @@ def filter_request_headers( def scrub_query_string( query_string: str, - behavior: "KeyValueCollectionBehavior", + behaviour: "KeyValueCollectionBehaviour", ) -> "Optional[str]": """ - Apply a query-param :class:`KeyValueCollectionBehavior` to a raw query + Apply a query-param :class:`KeyValueCollectionBehaviour` to a raw query string. Returns ``None`` when the mode is ``off`` (do not collect the query string at all), the scrubbed query string otherwise. An unparseable query string is replaced entirely with ``"[Filtered]"``. """ - if behavior.mode == COLLECTION_OFF: + mode = behaviour.get("mode", "deny_list") + terms = behaviour.get("terms") or [] + + if mode == "off": return None try: @@ -374,10 +204,8 @@ def scrub_query_string( scrubbed = [] for key, value in pairs: - if behavior.mode == COLLECTION_ALLOWLIST: - allowed = any( - term and term.lower() in key.lower() for term in behavior.terms - ) + if mode == "allow_list": + allowed = any(term and term.lower() in key.lower() for term in terms) scrubbed.append( ( key, @@ -386,12 +214,12 @@ def scrub_query_string( else SENSITIVE_DATA_SUBSTITUTE, ) ) - else: # denyList + else: # deny_list scrubbed.append( ( key, SENSITIVE_DATA_SUBSTITUTE - if is_sensitive_key(key, behavior.terms) + if is_sensitive_key(key, terms) else value, ) ) @@ -403,7 +231,7 @@ def should_collect_body_type( body_type: str, ) -> bool: """Return whether the given body type should be collected.""" - bodies = data_collection.http_bodies + bodies = data_collection.get("http_bodies") if bodies is None: return True return body_type in bodies @@ -415,48 +243,52 @@ def _map_from_send_default_pii( include_source_context: bool, ) -> "DataCollection": """ - Build a fully-resolved :class:`DataCollection` that mirrors the data + Build a fully-resolved ``DataCollection`` dict that mirrors the data ``send_default_pii`` collects today. Used when ``data_collection`` is not - provided explicitly (resolution cases B and C). + provided explicitly. + + PII-bearing content gates on ``send_default_pii``: ``graphql.variables`` and + ``database.query_params`` follow it, while ``graphql.document`` stays ``True``. """ - resolved = DataCollection( - user_info=send_default_pii, - cookies=KeyValueCollectionBehavior( - COLLECTION_DENYLIST if send_default_pii else COLLECTION_OFF - ), + kv_mode = "deny_list" if send_default_pii else "off" # type: Literal["off", "deny_list", "allow_list"] + return { + "provided_by_user": False, + "user_info": send_default_pii, + "cookies": {"mode": kv_mode}, # Headers are collected in both PII modes today (sensitive ones filtered # when PII is off), so this never maps to "off". - http_headers=HttpHeadersCollection(), + "http_headers": { + "request": {"mode": "deny_list"}, + "response": {"mode": "deny_list"}, + }, # Bodies are collected regardless of PII today, bounded by # ``max_request_body_size``. - http_bodies=list(ALL_BODY_TYPES), - query_params=KeyValueCollectionBehavior( - COLLECTION_DENYLIST if send_default_pii else COLLECTION_OFF - ), - gen_ai=GenAICollection(inputs=send_default_pii, outputs=send_default_pii), - stack_frame_variables=include_local_variables, - frame_context_lines=( + "http_bodies": list(ALL_HTTP_BODY_TYPES), + "query_params": {"mode": kv_mode}, + "graphql": {"document": True, "variables": send_default_pii}, + "gen_ai": {"inputs": send_default_pii, "outputs": send_default_pii}, + "database": {"query_params": send_default_pii}, + "stack_frame_variables": include_local_variables, + "frame_context_lines": ( DEFAULT_FRAME_CONTEXT_LINES if include_source_context else 0 ), - ) - resolved.explicit = False - return resolved + } def _resolve_explicit( - user_dc: "DataCollection", + user_dc: "DataCollectionUserOptions", include_local_variables: bool, include_source_context: bool, ) -> "DataCollection": """ - Fill in any omitted fields of a user-supplied ``DataCollection`` with their - spec defaults (resolution case A). Frame fields fall back to the legacy + Fill in any omitted fields of a user-supplied ``DataCollection`` dict with + their spec defaults. Frame fields fall back to the legacy ``include_local_variables`` / ``include_source_context`` options when unset. """ # frame_context_lines accepts an integer or a boolean fallback (spec: True # -> platform default of 5, False -> 0). bool is a subclass of int, so # coerce explicitly before treating it as a line count. - frame_context_lines = user_dc.frame_context_lines + frame_context_lines = user_dc.get("frame_context_lines") if frame_context_lines is None: frame_context_lines = ( DEFAULT_FRAME_CONTEXT_LINES if include_source_context else 0 @@ -464,107 +296,170 @@ def _resolve_explicit( elif isinstance(frame_context_lines, bool): frame_context_lines = DEFAULT_FRAME_CONTEXT_LINES if frame_context_lines else 0 - resolved = DataCollection( - # These fields are always concrete on a constructed DataCollection. - user_info=user_dc.user_info, - cookies=user_dc.cookies, - http_headers=user_dc.http_headers, - query_params=user_dc.query_params, - gen_ai=user_dc.gen_ai, - # http_bodies: None means "all valid types"; materialize for clarity. - http_bodies=( - list(user_dc.http_bodies) - if user_dc.http_bodies is not None - else list(ALL_BODY_TYPES) - ), - # Frame fields fall back to the legacy options when unset. - stack_frame_variables=( - user_dc.stack_frame_variables - if user_dc.stack_frame_variables is not None - else include_local_variables - ), - frame_context_lines=frame_context_lines, - ) - resolved.explicit = True - return resolved + stack_frame_variables = user_dc.get("stack_frame_variables") + if stack_frame_variables is None: + stack_frame_variables = include_local_variables + # http_bodies: omitted means "all valid types"; [] is the explicit opt-out. + http_bodies = user_dc.get("http_bodies") + http_bodies = ( + list(http_bodies) if http_bodies is not None else list(ALL_HTTP_BODY_TYPES) + ) -def _data_collection_from_dict(d: "Dict[str, Any]") -> "DataCollection": - """Convert a plain dict into a :class:`DataCollection`.""" - kwargs: "Dict[str, Any]" = {} + return { + "provided_by_user": True, + "user_info": user_dc.get("user_info", True), + "cookies": user_dc.get("cookies") or _kvcb_from_value("deny_list"), + "http_headers": user_dc.get("http_headers") + or _http_headers_from_value("deny_list"), + "http_bodies": http_bodies, + "query_params": user_dc.get("query_params") or _kvcb_from_value("deny_list"), + "graphql": user_dc.get("graphql") or _graphql_from_value({}), + "gen_ai": user_dc.get("gen_ai") or _gen_ai_from_value({}), + "database": user_dc.get("database") or _database_from_value({}), + "stack_frame_variables": stack_frame_variables, + "frame_context_lines": frame_context_lines, + } + + +def _data_collection_from_dict(d: "Dict[str, Any]") -> "DataCollectionUserOptions": + """ + Normalize only the keys the user supplied into a partial + ``DataCollectionUserOptions`` dict. Nested config values are coerced (and + their own defaults filled) by the per-field helpers. + """ + result: "DataCollectionUserOptions" = {} if "user_info" in d: - kwargs["user_info"] = d["user_info"] + result["user_info"] = d["user_info"] if "cookies" in d: - kwargs["cookies"] = _kvcb_from_value(d["cookies"]) + result["cookies"] = _kvcb_from_value(d["cookies"]) if "http_headers" in d: - kwargs["http_headers"] = _http_headers_from_value(d["http_headers"]) + result["http_headers"] = _http_headers_from_value(d["http_headers"]) if "http_bodies" in d: - kwargs["http_bodies"] = d["http_bodies"] + result["http_bodies"] = d["http_bodies"] if "query_params" in d: - kwargs["query_params"] = _kvcb_from_value(d["query_params"]) + result["query_params"] = _kvcb_from_value(d["query_params"]) + if "graphql" in d: + result["graphql"] = _graphql_from_value(d["graphql"]) if "gen_ai" in d: - kwargs["gen_ai"] = _gen_ai_from_value(d["gen_ai"]) + result["gen_ai"] = _gen_ai_from_value(d["gen_ai"]) + if "database" in d: + result["database"] = _database_from_value(d["database"]) if "stack_frame_variables" in d: - kwargs["stack_frame_variables"] = d["stack_frame_variables"] + result["stack_frame_variables"] = d["stack_frame_variables"] if "frame_context_lines" in d: - kwargs["frame_context_lines"] = d["frame_context_lines"] + result["frame_context_lines"] = d["frame_context_lines"] - return DataCollection(**kwargs) + return result -def _kvcb_from_value(val: "Any") -> "KeyValueCollectionBehavior": - """Coerce a string or dict to :class:`KeyValueCollectionBehavior`.""" - if isinstance(val, KeyValueCollectionBehavior): - return val +def _kvcb_from_value(val: "Any") -> "KeyValueCollectionBehaviour": + """ + Coerce a string or dict to a ``KeyValueCollectionBehaviour`` dict, defaulting + ``mode`` to ``deny_list`` and validating it against the known modes. + """ if isinstance(val, str): - return KeyValueCollectionBehavior(mode=val) - if isinstance(val, dict): - return KeyValueCollectionBehavior(**val) + mode = val + terms = None + elif isinstance(val, dict): + mode = val.get("mode", "deny_list") + terms = val.get("terms") + else: + raise TypeError( + "Expected a string or dict for key-value collection behaviour, " + "got {!r}".format(type(val).__name__) + ) + + if mode not in _VALID_KEY_VALUE_COLLECTION_BEHAVIOUR_MODES: + raise ValueError( + "Invalid collection mode {!r}. Must be one of {}.".format( + mode, _VALID_KEY_VALUE_COLLECTION_BEHAVIOUR_MODES + ) + ) + + behaviour = {"mode": mode} # type: Dict[str, Any] + if terms is not None: + behaviour["terms"] = list(terms) + return cast("KeyValueCollectionBehaviour", behaviour) + + +def _http_headers_from_value(val: "Any") -> "HttpHeadersCollectionBehaviour": + """ + Coerce a value to an ``HttpHeadersCollectionBehaviour`` dict. + + Accepts ``{"request": ..., "response": ...}`` (each direction defaulting to + ``deny_list``) or a shorthand — a string or single key-value behaviour dict — + applied to both directions. + """ + if isinstance(val, dict) and ("request" in val or "response" in val): + return { + "request": ( + _kvcb_from_value(val["request"]) + if "request" in val + else _kvcb_from_value("deny_list") + ), + "response": ( + _kvcb_from_value(val["response"]) + if "response" in val + else _kvcb_from_value("deny_list") + ), + } + if isinstance(val, (str, dict)): + # Shorthand: a single behaviour applies to both directions. + return { + "request": _kvcb_from_value(val), + "response": _kvcb_from_value(val), + } raise TypeError( - "Expected a KeyValueCollectionBehavior, string, or dict, got {!r}".format( + "Expected a dict or string for http_headers, got {!r}".format( type(val).__name__ ) ) -def _http_headers_from_value(val: "Any") -> "HttpHeadersCollection": - """Coerce a dict to :class:`HttpHeadersCollection`.""" - if isinstance(val, HttpHeadersCollection): - return val - if isinstance(val, dict): - kwargs: "Dict[str, Any]" = {} - if "request" in val: - kwargs["request"] = _kvcb_from_value(val["request"]) - if "response" in val: - kwargs["response"] = _kvcb_from_value(val["response"]) - return HttpHeadersCollection(**kwargs) - raise TypeError( - "Expected an HttpHeadersCollection or dict, got {!r}".format(type(val).__name__) - ) +def _gen_ai_from_value(val: "Any") -> "GenAICollectionBehaviour": + """Coerce a dict to a ``GenAICollectionBehaviour`` dict; ``inputs``/``outputs`` default to ``True``.""" + if not isinstance(val, dict): + raise TypeError( + "Expected a dict for gen_ai, got {!r}".format(type(val).__name__) + ) + return { + "inputs": val.get("inputs", True), + "outputs": val.get("outputs", True), + } -def _gen_ai_from_value(val: "Any") -> "GenAICollection": - """Coerce a dict to :class:`GenAICollection`.""" - if isinstance(val, GenAICollection): - return val - if isinstance(val, dict): - return GenAICollection(**val) - raise TypeError( - "Expected a GenAICollection or dict, got {!r}".format(type(val).__name__) - ) +def _graphql_from_value(val: "Any") -> "GraphQLCollectionBehaviour": + """Coerce a dict to a ``GraphQLCollectionBehaviour`` dict; ``document``/``variables`` default to ``True``.""" + if not isinstance(val, dict): + raise TypeError( + "Expected a dict for graphql, got {!r}".format(type(val).__name__) + ) + return { + "document": val.get("document", True), + "variables": val.get("variables", True), + } + + +def _database_from_value(val: "Any") -> "DatabaseCollectionBehaviour": + """Coerce a dict to a ``DatabaseCollectionBehaviour`` dict; ``query_params`` defaults to ``True``.""" + if not isinstance(val, dict): + raise TypeError( + "Expected a dict for database, got {!r}".format(type(val).__name__) + ) + return {"query_params": val.get("query_params", True)} def resolve_data_collection(options: "Dict[str, Any]") -> "DataCollection": """ - Resolve the effective :class:`DataCollection` from client ``options``. + Resolve the effective ``DataCollection`` dict from client ``options``. Reads ``data_collection``, ``send_default_pii``, ``include_local_variables`` - and ``include_source_context`` and returns a fully-resolved instance with + and ``include_source_context`` and returns a fully-resolved dict with concrete values for every field. - ``data_collection`` may be a :class:`DataCollection` instance or a plain - ``dict`` (which is converted automatically). + ``data_collection`` must be a plain ``dict``. """ user_dc = options.get("data_collection") send_default_pii = options.get("send_default_pii") @@ -576,12 +471,11 @@ def resolve_data_collection(options: "Dict[str, Any]") -> "DataCollection": include_source_context = True if user_dc is not None: - if isinstance(user_dc, dict): - user_dc = _data_collection_from_dict(user_dc) - elif not isinstance(user_dc, DataCollection): + if not isinstance(user_dc, dict): raise TypeError( - "`data_collection` must be a dict or sentry_sdk.DataCollection " - "instance, got {!r}.".format(type(user_dc).__name__) + "`data_collection` must be a dict, got {!r}.".format( + type(user_dc).__name__ + ) ) if send_default_pii is not None: warnings.warn( @@ -592,15 +486,13 @@ def resolve_data_collection(options: "Dict[str, Any]") -> "DataCollection": stacklevel=2, ) return _resolve_explicit( - user_dc, include_local_variables, include_source_context + _data_collection_from_dict(user_dc), + include_local_variables, + include_source_context, ) return _map_from_send_default_pii( - bool(send_default_pii), include_local_variables, include_source_context + send_default_pii=bool(send_default_pii), + include_local_variables=include_local_variables, + include_source_context=include_source_context, ) - - -#: Safe default used by non-recording clients: collect nothing PII-gated. -#: This is a shared, process-wide singleton. Treat it as read-only — do not -#: mutate the returned ``DataCollection`` or its nested config objects. -OFF_DATA_COLLECTION = _map_from_send_default_pii(False, True, True) diff --git a/sentry_sdk/scope.py b/sentry_sdk/scope.py index 6392f5d2ce..07dd218faf 100644 --- a/sentry_sdk/scope.py +++ b/sentry_sdk/scope.py @@ -87,6 +87,7 @@ AttributeValue, Breadcrumb, BreadcrumbHint, + DataCollection, ErrorProcessor, Event, EventProcessor, @@ -98,7 +99,6 @@ SamplingContext, Type, ) - from sentry_sdk.data_collection import DataCollection from sentry_sdk.tracing import TransactionKwargs P = ParamSpec("P") @@ -2174,21 +2174,6 @@ def should_send_default_pii() -> bool: return Scope.get_client().should_send_default_pii() -def should_collect_user_info() -> bool: - """Shortcut for `Scope.get_client().should_collect_user_info()`.""" - return Scope.get_client().should_collect_user_info() - - -def should_collect_gen_ai_inputs(include_prompts: "Optional[bool]" = None) -> bool: - """Shortcut for `Scope.get_client().should_collect_gen_ai_inputs(...)`.""" - return Scope.get_client().should_collect_gen_ai_inputs(include_prompts) - - -def should_collect_gen_ai_outputs(include_prompts: "Optional[bool]" = None) -> bool: - """Shortcut for `Scope.get_client().should_collect_gen_ai_outputs(...)`.""" - return Scope.get_client().should_collect_gen_ai_outputs(include_prompts) - - def get_data_collection() -> "DataCollection": """Return the resolved DataCollection config of the active client.""" return Scope.get_client().data_collection diff --git a/tests/test_data_collection.py b/tests/test_data_collection.py index 4f9aed3dd0..e4813d7dec 100644 --- a/tests/test_data_collection.py +++ b/tests/test_data_collection.py @@ -3,15 +3,11 @@ import pytest import sentry_sdk -from sentry_sdk import ( - DataCollection, - GenAICollection, - HttpHeadersCollection, - KeyValueCollectionBehavior, -) from sentry_sdk.data_collection import ( - ALL_BODY_TYPES, + ALL_HTTP_BODY_TYPES, SENSITIVE_DENYLIST, + _http_headers_from_value, + _kvcb_from_value, apply_key_value_collection, filter_request_headers, is_sensitive_key, @@ -71,31 +67,37 @@ def test_is_sensitive_key_extra_terms(): # --------------------------------------------------------------------------- -# Key-value collection behavior +# Key-value collection behaviour # --------------------------------------------------------------------------- def test_kvcb_invalid_mode(): with pytest.raises(ValueError): - KeyValueCollectionBehavior(mode="nope") + _kvcb_from_value({"mode": "nope"}) + + +def test_kvcb_from_string_defaults_terms(): + assert _kvcb_from_value("allow_list") == {"mode": "allow_list"} + + +def test_kvcb_from_dict_defaults_mode(): + assert _kvcb_from_value({"terms": ["x"]}) == {"mode": "deny_list", "terms": ["x"]} def test_apply_off(): - assert ( - apply_key_value_collection({"a": "1"}, KeyValueCollectionBehavior("off")) == {} - ) + assert apply_key_value_collection({"a": "1"}, {"mode": "off"}) == {} def test_apply_denylist_scrubs_sensitive_keeps_rest(): items = {"Authorization": "secret", "Accept": "json", "X-Id": "1"} - out = apply_key_value_collection(items, KeyValueCollectionBehavior("denyList")) + out = apply_key_value_collection(items, {"mode": "deny_list"}) assert out == {"Authorization": "[Filtered]", "Accept": "json", "X-Id": "1"} def test_apply_denylist_extra_terms(): items = {"X-Custom": "v", "Accept": "json"} out = apply_key_value_collection( - items, KeyValueCollectionBehavior("denyList", ["x-custom"]) + items, {"mode": "deny_list", "terms": ["x-custom"]} ) assert out == {"X-Custom": "[Filtered]", "Accept": "json"} @@ -103,7 +105,7 @@ def test_apply_denylist_extra_terms(): def test_apply_allowlist_only_allowed_real(): items = {"X-Request-Id": "r1", "Accept": "json", "Authorization": "x"} out = apply_key_value_collection( - items, KeyValueCollectionBehavior("allowList", ["x-request-id"]) + items, {"mode": "allow_list", "terms": ["x-request-id"]} ) assert out == { "X-Request-Id": "r1", @@ -116,14 +118,14 @@ def test_apply_allowlist_sensitive_always_scrubbed(): # Even if a sensitive key is allow-listed, it is still scrubbed. items = {"Authorization": "x"} out = apply_key_value_collection( - items, KeyValueCollectionBehavior("allowList", ["authorization"]) + items, {"mode": "allow_list", "terms": ["authorization"]} ) assert out == {"Authorization": "[Filtered]"} def test_filter_request_headers_always_filters_cookie(): items = {"Cookie": "a=b", "Set-Cookie": "c=d", "Accept": "json"} - out = filter_request_headers(items, KeyValueCollectionBehavior("denyList")) + out = filter_request_headers(items, {"mode": "deny_list"}) assert out == { "Cookie": "[Filtered]", "Set-Cookie": "[Filtered]", @@ -137,11 +139,11 @@ def test_filter_request_headers_always_filters_cookie(): def test_scrub_query_off(): - assert scrub_query_string("a=1&token=x", KeyValueCollectionBehavior("off")) is None + assert scrub_query_string("a=1&token=x", {"mode": "off"}) is None def test_scrub_query_denylist(): - out = scrub_query_string("token=abc&page=5", KeyValueCollectionBehavior("denyList")) + out = scrub_query_string("token=abc&page=5", {"mode": "deny_list"}) assert "page=5" in out assert "token=" in out assert "abc" not in out @@ -149,7 +151,7 @@ def test_scrub_query_denylist(): def test_scrub_query_allowlist(): out = scrub_query_string( - "token=abc&page=5", KeyValueCollectionBehavior("allowList", ["page"]) + "token=abc&page=5", {"mode": "allow_list", "terms": ["page"]} ) assert "page=5" in out assert "abc" not in out @@ -161,24 +163,51 @@ def test_scrub_query_allowlist(): def test_body_type_default_all(): - dc = DataCollection() - # None means all valid types - assert should_collect_body_type(dc, "incomingRequest") is True + # An omitted http_bodies means all valid types. + assert should_collect_body_type({}, "incoming_request") is True def test_body_type_explicit_list(): - dc = DataCollection(http_bodies=["incomingRequest"]) - assert should_collect_body_type(dc, "incomingRequest") is True - assert should_collect_body_type(dc, "outgoingRequest") is False + dc = {"http_bodies": ["incoming_request"]} + assert should_collect_body_type(dc, "incoming_request") is True + assert should_collect_body_type(dc, "outgoing_request") is False def test_body_type_empty_off(): - dc = DataCollection(http_bodies=[]) - assert should_collect_body_type(dc, "incomingRequest") is False + assert should_collect_body_type({"http_bodies": []}, "incoming_request") is False + + +# --------------------------------------------------------------------------- +# http_headers coercion (shorthand + per-direction) +# --------------------------------------------------------------------------- + + +def test_http_headers_collection_defaults(): + hh = _http_headers_from_value({}) + assert hh["request"] == {"mode": "deny_list"} + assert hh["response"] == {"mode": "deny_list"} + + +def test_http_headers_shorthand_string_applies_to_both(): + hh = _http_headers_from_value("off") + assert hh["request"]["mode"] == "off" + assert hh["response"]["mode"] == "off" + + +def test_http_headers_shorthand_single_kvcb_applies_to_both(): + hh = _http_headers_from_value({"mode": "allow_list", "terms": ["x-id"]}) + assert hh["request"] == {"mode": "allow_list", "terms": ["x-id"]} + assert hh["response"] == {"mode": "allow_list", "terms": ["x-id"]} + + +def test_http_headers_per_direction_defaults_missing_to_denylist(): + hh = _http_headers_from_value({"request": "off"}) + assert hh["request"]["mode"] == "off" + assert hh["response"] == {"mode": "deny_list"} # --------------------------------------------------------------------------- -# Resolution: cases A / B / C / D +# Resolution of data_collection against the legacy send_default_pii option # --------------------------------------------------------------------------- @@ -193,105 +222,134 @@ def _resolve(**options): return resolve_data_collection(base) -def test_resolve_case_c_neither(): +def test_resolve_no_options_collects_no_pii(): dc = _resolve() - assert dc.explicit is False - assert dc.user_info is False - assert dc.gen_ai.inputs is False and dc.gen_ai.outputs is False - assert dc.cookies.mode == "off" - assert dc.query_params.mode == "off" - assert dc.http_headers.request.mode == "denyList" - assert dc.http_bodies == ALL_BODY_TYPES - assert dc.frame_context_lines == 5 + assert dc["user_info"] is False + assert dc["gen_ai"]["inputs"] is False and dc["gen_ai"]["outputs"] is False + assert dc["cookies"]["mode"] == "off" + assert dc["query_params"]["mode"] == "off" + assert dc["http_headers"]["request"]["mode"] == "deny_list" + assert dc["http_bodies"] == ALL_HTTP_BODY_TYPES + assert dc["frame_context_lines"] == 5 -def test_resolve_case_b_pii_true(): +def test_resolve_send_default_pii_true_collects_pii(): dc = _resolve(send_default_pii=True) - assert dc.explicit is False - assert dc.user_info is True - assert dc.gen_ai.inputs is True and dc.gen_ai.outputs is True - assert dc.cookies.mode == "denyList" - assert dc.query_params.mode == "denyList" + assert dc["user_info"] is True + assert dc["gen_ai"]["inputs"] is True and dc["gen_ai"]["outputs"] is True + assert dc["cookies"]["mode"] == "deny_list" + assert dc["query_params"]["mode"] == "deny_list" -def test_resolve_case_b_pii_false(): +def test_resolve_send_default_pii_false_collects_no_pii(): dc = _resolve(send_default_pii=False) - assert dc.explicit is False - assert dc.user_info is False - assert dc.cookies.mode == "off" + assert dc["user_info"] is False + assert dc["cookies"]["mode"] == "off" -def test_resolve_case_a_defaults(): - dc = _resolve(data_collection=DataCollection()) - assert dc.explicit is True +def test_resolve_explicit_data_collection_uses_spec_defaults(): + dc = _resolve(data_collection={}) # spec defaults: collect more - assert dc.user_info is True - assert dc.gen_ai.inputs is True and dc.gen_ai.outputs is True - assert dc.cookies.mode == "denyList" - assert dc.query_params.mode == "denyList" - assert dc.http_bodies == ALL_BODY_TYPES - - -def test_resolve_case_a_partial_uses_spec_defaults_for_omitted(): - dc = _resolve(data_collection=DataCollection(user_info=False, http_bodies=[])) - assert dc.explicit is True - assert dc.user_info is False - assert dc.http_bodies == [] + assert dc["user_info"] is True + assert dc["gen_ai"]["inputs"] is True and dc["gen_ai"]["outputs"] is True + assert dc["cookies"]["mode"] == "deny_list" + assert dc["query_params"]["mode"] == "deny_list" + assert dc["http_bodies"] == ALL_HTTP_BODY_TYPES + + +def test_resolve_explicit_partial_fills_omitted_with_spec_defaults(): + dc = _resolve(data_collection={"user_info": False, "http_bodies": []}) + assert dc["user_info"] is False + assert dc["http_bodies"] == [] # omitted fields use spec defaults - assert dc.gen_ai.inputs is True - assert dc.cookies.mode == "denyList" + assert dc["gen_ai"]["inputs"] is True + assert dc["cookies"]["mode"] == "deny_list" -def test_resolve_case_a_frame_fallback_to_legacy_options(): +def test_resolve_explicit_frame_fields_fall_back_to_legacy_options(): dc = _resolve( - data_collection=DataCollection(), + data_collection={}, include_local_variables=False, include_source_context=False, ) - assert dc.stack_frame_variables is False - assert dc.frame_context_lines == 0 + assert dc["stack_frame_variables"] is False + assert dc["frame_context_lines"] == 0 -def test_resolve_case_d_both_data_collection_wins_and_warns(): +def test_resolve_data_collection_overrides_send_default_pii_and_warns(): with warnings.catch_warnings(record=True) as caught: warnings.simplefilter("always") - dc = _resolve( - send_default_pii=True, data_collection=DataCollection(user_info=False) - ) - assert dc.explicit is True - assert dc.user_info is False # data_collection wins + dc = _resolve(send_default_pii=True, data_collection={"user_info": False}) + assert dc["user_info"] is False # data_collection wins assert any(issubclass(w.category, DeprecationWarning) for w in caught) -def test_resolve_accepts_dict(): - dc = _resolve(data_collection={"user_info": False, "http_bodies": []}) - assert dc.explicit is True - assert dc.user_info is False - assert dc.http_bodies == [] - assert dc.gen_ai.inputs is True - - def test_resolve_accepts_dict_with_nested_dicts(): dc = _resolve( data_collection={ "cookies": "off", - "query_params": {"mode": "allowList", "terms": ["page"]}, + "query_params": {"mode": "allow_list", "terms": ["page"]}, "http_headers": {"request": "off"}, "gen_ai": {"inputs": False, "outputs": True}, } ) - assert dc.cookies.mode == "off" - assert dc.query_params.mode == "allowList" - assert dc.query_params.terms == ["page"] - assert dc.http_headers.request.mode == "off" - assert dc.http_headers.response.mode == "denyList" - assert dc.gen_ai.inputs is False - assert dc.gen_ai.outputs is True + assert dc["cookies"]["mode"] == "off" + assert dc["query_params"]["mode"] == "allow_list" + assert dc["query_params"]["terms"] == ["page"] + assert dc["http_headers"]["request"]["mode"] == "off" + assert dc["http_headers"]["response"]["mode"] == "deny_list" + assert dc["gen_ai"]["inputs"] is False + assert dc["gen_ai"]["outputs"] is True + + +def test_resolve_http_headers_shorthand_off_applies_to_both(): + dc = _resolve(data_collection={"http_headers": "off"}) + assert dc["http_headers"]["request"]["mode"] == "off" + assert dc["http_headers"]["response"]["mode"] == "off" -def test_resolve_rejects_non_datacollection(): +def test_resolve_http_headers_shorthand_single_kvcb_applies_to_both(): + dc = _resolve(data_collection={"http_headers": {"mode": "off"}}) + assert dc["http_headers"]["request"]["mode"] == "off" + assert dc["http_headers"]["response"]["mode"] == "off" + + +@pytest.mark.parametrize("bad", [42, "oops", ["a"]]) +def test_resolve_rejects_non_dict(bad): with pytest.raises(TypeError): - _resolve(data_collection=42) + _resolve(data_collection=bad) + + +# --------------------------------------------------------------------------- +# graphql / database defaults and legacy PII gating +# --------------------------------------------------------------------------- + + +def test_resolve_explicit_graphql_database_defaults(): + dc = _resolve(data_collection={}) + assert dc["graphql"] == {"document": True, "variables": True} + assert dc["database"] == {"query_params": True} + + +def test_resolve_legacy_pii_off_gates_graphql_and_database(): + dc = _resolve(send_default_pii=False) + # document is always collected; variables/query_params follow send_default_pii + assert dc["graphql"]["document"] is True + assert dc["graphql"]["variables"] is False + assert dc["database"]["query_params"] is False + + +def test_resolve_legacy_pii_on_collects_graphql_and_database(): + dc = _resolve(send_default_pii=True) + assert dc["graphql"]["document"] is True + assert dc["graphql"]["variables"] is True + assert dc["database"]["query_params"] is True + + +def test_resolve_explicit_partial_graphql_fills_omitted(): + dc = _resolve(data_collection={"graphql": {"variables": False}}) + assert dc["graphql"]["document"] is True + assert dc["graphql"]["variables"] is False # --------------------------------------------------------------------------- @@ -304,8 +362,8 @@ def test_resolve_rejects_non_datacollection(): [(True, 5), (False, 0), (3, 3), (0, 0)], ) def test_frame_context_lines_bool_fallback(value, expected): - dc = _resolve(data_collection=DataCollection(frame_context_lines=value)) - assert dc.frame_context_lines == expected + dc = _resolve(data_collection={"frame_context_lines": value}) + assert dc["frame_context_lines"] == expected # --------------------------------------------------------------------------- @@ -313,49 +371,39 @@ def test_frame_context_lines_bool_fallback(value, expected): # --------------------------------------------------------------------------- -def test_client_accessors_case_c(): +def test_client_data_collection_defaults_to_no_pii(): sentry_sdk.init() client = sentry_sdk.get_client() - assert client.should_collect_user_info() is False + assert client.data_collection["user_info"] is False assert client.should_send_default_pii() is False - assert client.should_collect_gen_ai_inputs() is False - assert client.should_collect_gen_ai_outputs() is False + assert client.data_collection["provided_by_user"] is False -def test_client_accessors_case_b_pii(): +def test_client_send_default_pii_enables_user_info(): sentry_sdk.init(send_default_pii=True) client = sentry_sdk.get_client() - assert client.should_collect_user_info() is True - assert client.should_collect_gen_ai_inputs() is True - # include_prompts=False override still disables (legacy AND semantics) - assert client.should_collect_gen_ai_inputs(False) is False - assert client.should_collect_gen_ai_inputs(True) is True + assert client.data_collection["user_info"] is True + assert client.data_collection["provided_by_user"] is False -def test_client_accessors_case_a(): - sentry_sdk.init(data_collection=DataCollection(user_info=False)) +def test_client_explicit_data_collection_overrides_user_info(): + sentry_sdk.init(data_collection={"user_info": False}) client = sentry_sdk.get_client() - assert client.should_collect_user_info() is False - # gen_ai defaults to True in explicit mode - assert client.should_collect_gen_ai_inputs() is True - # explicit integration override wins - assert client.should_collect_gen_ai_inputs(False) is False - - -def test_client_accessors_gen_ai_explicit_override(): - sentry_sdk.init( - data_collection=DataCollection( - gen_ai=GenAICollection(inputs=False, outputs=True) - ) - ) + assert client.data_collection["user_info"] is False + assert client.data_collection["provided_by_user"] is True + + +def test_client_dsnless_spotlight_rederives_data_collection(): + # DSN-less spotlight flips send_default_pii on; non-explicit data_collection + # is re-derived to agree. + sentry_sdk.init(spotlight=True) client = sentry_sdk.get_client() - assert client.should_collect_gen_ai_inputs() is False - assert client.should_collect_gen_ai_outputs() is True - # integration override beats the global gen_ai setting - assert client.should_collect_gen_ai_inputs(True) is True + assert client.data_collection["provided_by_user"] is False + assert client.data_collection["user_info"] is True -def test_http_headers_collection_defaults(): - hh = HttpHeadersCollection() - assert hh.request.mode == "denyList" - assert hh.response.mode == "denyList" +def test_client_dsnless_spotlight_respects_explicit_data_collection(): + sentry_sdk.init(spotlight=True, data_collection={"user_info": False}) + client = sentry_sdk.get_client() + assert client.data_collection["provided_by_user"] is True + assert client.data_collection["user_info"] is False