Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,5 @@
# Changelog

## Unreleased

### Features

- Add the `data_collection` option, a structured configuration that supersedes `send_default_pii` for controlling what data integrations collect automatically (user identity, cookies, HTTP headers, query params, HTTP bodies, generative AI inputs/outputs, stack frame variables, source context). See the [Data Collection spec](https://develop.sentry.dev/sdk/foundations/client/data-collection/).
- Adds `sentry_sdk.DataCollection`, `KeyValueCollectionBehavior`, `HttpHeadersCollection`, and `GenAICollection`.
- When `data_collection` is not set, behavior is derived from `send_default_pii` (now deprecated), so upgrading without configuring `data_collection` changes nothing.
- `frame_context_lines` is now configurable (previously hardcoded to 5); AI integrations' `include_prompts` becomes a per-integration override of `data_collection.gen_ai`.

## 2.63.0

### Bug Fixes 🐛
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ sentry_sdk.init(
# To disable sending user data and HTTP request/response bodies, uncomment
# the line below. For more info visit:
# https://docs.sentry.io/platforms/python/configuration/options/#data_collection
# data_collection=sentry_sdk.DataCollection(user_info=False, http_bodies=[]),
# data_collection={"user_info": False, "http_bodies": []},
)
```

Expand Down
10 changes: 0 additions & 10 deletions sentry_sdk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,6 @@

from sentry_sdk.scope import Scope # isort: skip
from sentry_sdk.client import Client # isort: skip
from sentry_sdk.data_collection import ( # isort: skip
DataCollection,
GenAICollection,
HttpHeadersCollection,
KeyValueCollectionBehavior,
)
from sentry_sdk.consts import VERSION
from sentry_sdk.transport import HttpTransport, Transport

Expand All @@ -17,10 +11,6 @@
"Hub",
"Scope",
"Client",
"DataCollection",
"GenAICollection",
"HttpHeadersCollection",
"KeyValueCollectionBehavior",
"Transport",
"HttpTransport",
"VERSION",
Expand Down
61 changes: 60 additions & 1 deletion sentry_sdk/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def substituted_because_contains_sensitive_data(cls) -> "AnnotatedValue":
from collections.abc import Container, MutableMapping, Sequence
from datetime import datetime
from types import TracebackType
from typing import Any, Callable, Dict, Mapping, NotRequired, Optional, Type
from typing import Any, Callable, Dict, List, Mapping, NotRequired, Optional, Type

from typing_extensions import Literal, TypedDict

Expand All @@ -152,6 +152,65 @@ class SDKInfo(TypedDict):
version: str
packages: "Sequence[Mapping[str, str]]"

# Collection behaviour for a key/value category. The data-collection types in
# this module use it for cookies, query params and HTTP request/response headers.
class KeyValueCollectionBehaviour(TypedDict):
    # How the category is handled: "off", "deny_list" or "allow_list".
    mode: 'Literal["off", "deny_list", "allow_list"]'
    # Optional list of strings that accompanies the mode.
    # NOTE(review): presumably the key names the deny/allow list is matched
    # against -- confirm against the resolver in sentry_sdk.data_collection.
    terms: "NotRequired[List[str]]"

# Partial form of the generative AI collection settings: declared with
# total=False, so either key may be omitted.
class GenAICollectionUserOptions(TypedDict, total=False):
    # Whether generative AI input content is collected.
    inputs: bool
    # Whether generative AI output content is collected.
    outputs: bool

# Fully specified generative AI collection settings: both keys are required.
class GenAICollectionBehaviour(TypedDict):
    # Whether generative AI input content is collected.
    inputs: bool
    # Whether generative AI output content is collected.
    outputs: bool

# Partial form of the GraphQL collection settings: declared with total=False,
# so either key may be omitted.
class GraphQLCollectionUserOptions(TypedDict, total=False):
    # NOTE(review): presumably toggles collection of the GraphQL document
    # (query text) -- confirm against the integration that reads it.
    document: bool
    # NOTE(review): presumably toggles collection of GraphQL variables -- confirm.
    variables: bool

# Fully specified GraphQL collection settings: both keys are required.
class GraphQLCollectionBehaviour(TypedDict):
    # NOTE(review): presumably toggles collection of the GraphQL document
    # (query text) -- confirm against the integration that reads it.
    document: bool
    # NOTE(review): presumably toggles collection of GraphQL variables -- confirm.
    variables: bool

# Partial form of the database collection settings: declared with total=False,
# so the key may be omitted.
class DatabaseCollectionUserOptions(TypedDict, total=False):
    # NOTE(review): presumably toggles collection of database query
    # parameters -- confirm against the database integrations.
    query_params: bool

# Fully specified database collection settings: the key is required.
class DatabaseCollectionBehaviour(TypedDict):
    # NOTE(review): presumably toggles collection of database query
    # parameters -- confirm against the database integrations.
    query_params: bool

# Partial form of the HTTP header collection settings: declared with
# total=False, so either direction may be omitted.
class HttpHeadersCollectionUserOptions(TypedDict, total=False):
    # Key/value collection behaviour applied to request headers.
    request: "KeyValueCollectionBehaviour"
    # Key/value collection behaviour applied to response headers.
    response: "KeyValueCollectionBehaviour"

# Fully specified HTTP header collection settings: both directions are required.
class HttpHeadersCollectionBehaviour(TypedDict):
    # Key/value collection behaviour applied to request headers.
    request: "KeyValueCollectionBehaviour"
    # Key/value collection behaviour applied to response headers.
    response: "KeyValueCollectionBehaviour"

class DataCollectionUserOptions(TypedDict, total=False):
    """
    Shape of the ``data_collection`` option as supplied by the user.

    Every key is optional (``total=False``); omitted fields fall back to
    their defaults. Nested categories are typed with the partial
    ``*UserOptions`` variants so that a single nested setting can be
    overridden (e.g. ``{"gen_ai": {"inputs": False}}``) without spelling
    out the whole category.
    """

    # Whether user identity is collected automatically.
    user_info: bool
    cookies: "KeyValueCollectionBehaviour"
    # Partial: ``request`` and ``response`` may be given independently.
    http_headers: "HttpHeadersCollectionUserOptions"
    # The README example passes an empty list to disable HTTP body collection.
    http_bodies: "List[str]"
    query_params: "KeyValueCollectionBehaviour"
    # Partial: ``document`` and ``variables`` may be given independently.
    graphql: "GraphQLCollectionUserOptions"
    # Partial: ``inputs`` and ``outputs`` may be given independently.
    gen_ai: "GenAICollectionUserOptions"
    database: "DatabaseCollectionUserOptions"
    stack_frame_variables: bool
    # Number of source context lines; configurable, previously hardcoded to 5.
    frame_context_lines: int

# Resolved data-collection configuration as stored in the client options
# (``options["data_collection"]``). Unlike the user-facing options type, every
# key is required here.
class DataCollection(TypedDict):
    # True when the user set ``data_collection`` explicitly. The client checks
    # this flag before re-deriving the config from ``send_default_pii``.
    provided_by_user: bool
    # Passed to the event scrubber as its ``send_default_pii`` value.
    user_info: bool
    cookies: "KeyValueCollectionBehaviour"
    http_headers: "HttpHeadersCollectionBehaviour"
    http_bodies: "List[str]"
    query_params: "KeyValueCollectionBehaviour"
    graphql: "GraphQLCollectionBehaviour"
    gen_ai: "GenAICollectionBehaviour"
    database: "DatabaseCollectionBehaviour"
    stack_frame_variables: bool
    # Number of source context lines; configurable, previously hardcoded to 5.
    frame_context_lines: int

# "critical" is an alias of "fatal" recognized by Relay
LogLevelStr = Literal["fatal", "critical", "error", "warning", "info", "debug"]

Expand Down
87 changes: 18 additions & 69 deletions sentry_sdk/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
ClientConstructor,
)
from sentry_sdk.data_collection import (
OFF_DATA_COLLECTION,
DataCollection,
_map_from_send_default_pii,
resolve_data_collection,
)
Expand Down Expand Up @@ -76,6 +74,7 @@
from sentry_sdk._log_batcher import LogBatcher
from sentry_sdk._metrics_batcher import MetricsBatcher
from sentry_sdk._types import (
DataCollection,
Event,
EventDataCategory,
Hint,
Expand Down Expand Up @@ -355,7 +354,7 @@ def _get_options(*args: "Optional[str]", **kwargs: "Any") -> "Dict[str, Any]":

if rv["event_scrubber"] is None:
rv["event_scrubber"] = EventScrubber(
send_default_pii=rv["data_collection"].user_info
send_default_pii=rv["data_collection"]["user_info"]
)

if rv["socket_options"] and not isinstance(rv["socket_options"], list):
Expand Down Expand Up @@ -392,6 +391,12 @@ def _get_options(*args: "Optional[str]", **kwargs: "Any") -> "Dict[str, Any]":
# Older Python versions
module_not_found_error = ImportError # type: ignore

# Data-collection config returned by ``BaseClient.data_collection``: the mapping
# derived from ``send_default_pii=False`` with local variables and source
# context left enabled.
# NOTE(review): this single module-level object is handed out as-is;
# presumably callers treat it as read-only -- confirm.
_DISABLED_DATA_COLLECTION_CONFIG = _map_from_send_default_pii(
    send_default_pii=False,
    include_local_variables=True,
    include_source_context=True,
)


class BaseClient:
"""
Expand Down Expand Up @@ -433,20 +438,7 @@ def should_send_default_pii(self) -> bool:

@property
def data_collection(self) -> "DataCollection":
return OFF_DATA_COLLECTION

def should_collect_user_info(self) -> bool:
return False

def should_collect_gen_ai_inputs(
self, include_prompts: "Optional[bool]" = None
) -> bool:
return False

def should_collect_gen_ai_outputs(
self, include_prompts: "Optional[bool]" = None
) -> bool:
return False
return _DISABLED_DATA_COLLECTION_CONFIG

def is_active(self) -> bool:
"""
Expand Down Expand Up @@ -639,14 +631,16 @@ def _record_lost_event(
self.options["profiles_sampler"] = sample_all
# data_collection was resolved in _get_options() before this
# spotlight override flipped send_default_pii on. Re-derive it so
# the should_collect_* accessors agree with should_send_default_pii()
# in DSN-less spotlight mode (only when the user did not set
# data_collection agrees with should_send_default_pii() in
# DSN-less spotlight mode (only when the user did not set
# data_collection explicitly).
if not self.options["data_collection"].explicit:
if not self.options["data_collection"]["provided_by_user"]:
self.options["data_collection"] = _map_from_send_default_pii(
True,
self.options["include_local_variables"] is not False,
self.options["include_source_context"] is not False,
send_default_pii=True,
include_local_variables=self.options["include_local_variables"]
is not False,
include_source_context=self.options["include_source_context"]
is not False,
)

self.session_flusher = SessionFlusher(capture_func=_capture_envelope)
Expand Down Expand Up @@ -764,52 +758,7 @@ def data_collection(self) -> "DataCollection":
Returns the resolved :class:`~sentry_sdk._types.DataCollection`
config for this client.
"""
dc = self.options.get("data_collection")
return dc if dc is not None else OFF_DATA_COLLECTION

def should_collect_user_info(self) -> bool:
"""
Returns whether the SDK should automatically populate ``user.*`` fields
(id, email, username, ip_address) from instrumentation.
"""
return bool(self.data_collection.user_info)

def should_collect_gen_ai_inputs(
self, include_prompts: "Optional[bool]" = None
) -> bool:
"""
Returns whether the SDK should collect generative AI input content.

``include_prompts`` is the integration-level override (if set, it takes
precedence over the global ``data_collection.gen_ai.inputs`` setting).
"""
return self._should_collect_gen_ai_content("inputs", include_prompts)

def should_collect_gen_ai_outputs(
self, include_prompts: "Optional[bool]" = None
) -> bool:
"""
Returns whether the SDK should collect generative AI output content.

``include_prompts`` is the integration-level override (if set, it takes
precedence over the global ``data_collection.gen_ai.outputs`` setting).
"""
return self._should_collect_gen_ai_content("outputs", include_prompts)

def _should_collect_gen_ai_content(
self, direction: str, include_prompts: "Optional[bool]"
) -> bool:
dc = self.data_collection
if dc.explicit:
# Integration-level override wins over the global gen_ai setting.
if include_prompts is not None:
return include_prompts
return bool(getattr(dc.gen_ai, direction))
# Legacy (data_collection not set): preserve the historical gate
# `should_send_default_pii() and integration.include_prompts`.
# `include_prompts is None` means "no integration-level override", which
# falls back to the legacy default of True (collect when PII is on).
return self.should_send_default_pii() and (include_prompts is not False)
return self.options["data_collection"]

@property
def dsn(self) -> "Optional[str]":
Expand Down
8 changes: 4 additions & 4 deletions sentry_sdk/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class CompressionAlgo(Enum):
from sentry_sdk._types import (
BreadcrumbProcessor,
ContinuousProfilerMode,
DataCollectionUserOptions,
Event,
EventProcessor,
Hint,
Expand All @@ -56,7 +57,6 @@ class CompressionAlgo(Enum):
TracesSampler,
TransactionProcessor,
)
from sentry_sdk.data_collection import DataCollection

# Experiments are feature flags to enable and disable certain unstable SDK
# functionality. Changing them from the defaults (`None`) in production
Expand Down Expand Up @@ -1273,7 +1273,7 @@ def __init__(
transport_queue_size: int = DEFAULT_QUEUE_SIZE,
sample_rate: float = 1.0,
send_default_pii: "Optional[bool]" = None,
data_collection: "Optional[Union[DataCollection, Dict[str, Any]]]" = None,
data_collection: "Optional[DataCollectionUserOptions]" = None,
http_proxy: "Optional[str]" = None,
https_proxy: "Optional[str]" = None,
ignore_errors: "Sequence[Union[type, str]]" = [], # noqa: B006
Expand Down Expand Up @@ -1432,12 +1432,12 @@ def __init__(
Use `data_collection` instead. `send_default_pii` is still honored when `data_collection` is not set.

:param data_collection: Structured configuration controlling what data integrations collect automatically,
superseding `send_default_pii`. Pass a dict or a :class:`sentry_sdk.DataCollection` instance to enable or
superseding `send_default_pii`. Pass a dict to enable or
restrict collection per category (user identity, cookies, HTTP headers/bodies, query params, generative AI
inputs/outputs, stack frame variables, source context).

When `data_collection` is set, omitted fields use their defaults (most categories are collected, with the
sensitive denylist scrubbing values). When it is not set, the SDK derives behavior from `send_default_pii`
sensitive denylist scrubbing values). When it is not set, the SDK derives behaviour from `send_default_pii`
so that upgrading without configuring `data_collection` changes nothing. If both are set, `data_collection`
takes precedence.

Expand Down
Loading
Loading