Skip to content

Commit 51f3bac

Browse files
mabdinur, Yun-Kim, majorgreys, P403n1x87, emmettbutler
committed
chore(telemetry): ensure instrumentation telemetry is compatible with python 3.12 (#6859)
## Motivation

- Make the instrumentation telemetry client compatible with Python 3.12: python/cpython#104826

## Description

- Start the telemetry worker thread as early as possible.
- Delay sending all telemetry events until `app-started` is queued.
- Refactor tests to align with this new logic.

## Risk

- Telemetry events (metrics/logs/integrations) are queued as early as possible, but these events are only sent once the trace agent writer is started. This **may** result in a memory leak if high-cardinality telemetry metrics and logs are added in the future. This is not a concern right now.

## Checklist

- [x] Change(s) are motivated and described in the PR description.
- [x] Testing strategy is described if automated tests are not included in the PR.
- [x] Risk is outlined (performance impact, potential for breakage, maintainability, etc).
- [x] Change is maintainable (easy to change, telemetry, documentation).
- [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed. If no release note is required, add label `changelog/no-changelog`.
- [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)).
- [x] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist

- [x] Title is accurate.
- [x] No unnecessary changes are introduced.
- [x] Description motivates each change.
- [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary.
- [x] Testing strategy adequately addresses listed risk(s).
- [x] Change is maintainable (easy to change, telemetry, documentation).
- [x] Release note makes sense to a user of the library.
- [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment.
- [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)
- [x] If this PR touches code that signs or publishes builds or packages, or handles credentials of any kind, I've requested a review from `@DataDog/security-design-and-guidance`.
- [x] This PR doesn't touch any of that.

---------

Co-authored-by: Yun Kim <[email protected]>
Co-authored-by: Tahir H. Butt <[email protected]>
Co-authored-by: Gabriele N. Tornetta <[email protected]>
Co-authored-by: Yun Kim <[email protected]>
Co-authored-by: Emmett Butler <[email protected]>
Co-authored-by: ZStriker19 <[email protected]>
1 parent 4575165 commit 51f3bac

File tree

9 files changed

+77
-71
lines changed

9 files changed

+77
-71
lines changed

.circleci/config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@ jobs:
3838
workflows:
3939
setup:
4040
jobs:
41-
- setup
41+
- setup

ddtrace/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@
2525

2626

2727
telemetry.install_excepthook()
28+
# In order to support 3.12, we start the writer upon initialization.
29+
# See https://github.com/python/cpython/pull/104826.
30+
# Telemetry events will only be sent after the `app-started` event is queued.
31+
# This will occur when the agent writer starts.
32+
telemetry.telemetry_writer.enable()
2833

2934
from ._monkey import patch # noqa: E402
3035
from ._monkey import patch_all # noqa: E402

ddtrace/internal/telemetry/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ def _excepthook(tp, value, root_traceback):
4949
error_msg = "{}:{} {}".format(filename, lineno, str(value))
5050
telemetry_writer.add_integration(integration_name, True, error_msg=error_msg)
5151

52+
if telemetry_writer.started is False:
53+
telemetry_writer._app_started_event(False)
54+
telemetry_writer._app_dependencies_loaded_event()
55+
56+
telemetry_writer.app_shutdown()
57+
telemetry_writer.disable()
58+
5259
return _ORIGINAL_EXCEPTHOOK(tp, value, root_traceback)
5360

5461

ddtrace/internal/telemetry/data.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44
from typing import List
55
from typing import Tuple
66

7-
import ddtrace
87
from ddtrace.internal.compat import PY3
98
from ddtrace.internal.constants import DEFAULT_SERVICE_NAME
109
from ddtrace.internal.packages import get_distributions
1110
from ddtrace.internal.runtime.container import get_container_info
1211
from ddtrace.internal.utils.cache import cached
12+
from ddtrace.version import get_version
1313

1414
from ...settings import _config as config
1515
from ..hostname import get_hostname
@@ -63,7 +63,7 @@ def _get_application(key):
6363
"env": env or "",
6464
"language_name": "python",
6565
"language_version": _format_version_info(sys.version_info),
66-
"tracer_version": ddtrace.__version__,
66+
"tracer_version": get_version(),
6767
"runtime_name": platform.python_implementation(),
6868
"runtime_version": _format_version_info(sys.implementation.version) if PY3 else "",
6969
"products": _get_products(),
@@ -88,7 +88,7 @@ def get_application(service, version, env):
8888
def _get_products():
8989
# type: () -> Dict
9090
return {
91-
"appsec": {"version": ddtrace.__version__, "enabled": config._appsec_enabled},
91+
"appsec": {"version": get_version(), "enabled": config._appsec_enabled},
9292
}
9393

9494

ddtrace/internal/telemetry/writer.py

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,6 @@ def enable(self):
216216

217217
if self._is_periodic:
218218
self.start()
219-
atexit.register(self.app_shutdown)
220219
return True
221220

222221
self.status = ServiceStatus.RUNNING
@@ -290,14 +289,18 @@ def add_error(self, code, msg, filename, line_number):
290289
msg = "%s:%s: %s" % (filename, line_number, msg)
291290
self._error = (code, msg)
292291

293-
def _app_started_event(self):
294-
# type: () -> None
292+
def _app_started_event(self, register_app_shutdown=True):
293+
# type: (bool) -> None
295294
"""Sent when TelemetryWriter is enabled or forks"""
296295
if self._forked:
297296
# app-started events should only be sent by the main process
298297
return
299298
# List of configurations to be collected
300299

300+
self.started = True
301+
if register_app_shutdown:
302+
atexit.register(self.app_shutdown)
303+
301304
self.add_configurations(
302305
[
303306
(TELEMETRY_TRACING_ENABLED, config._tracing_enabled, "unknown"),
@@ -593,15 +596,6 @@ def periodic(self, force_flush=False):
593596
for telemetry_event in telemetry_events:
594597
self._client.send_event(telemetry_event)
595598

596-
def start(self, *args, **kwargs):
597-
# type: (...) -> None
598-
super(TelemetryWriter, self).start(*args, **kwargs)
599-
# Queue app-started event after the telemetry worker thread is running
600-
if self.started is False:
601-
self._app_started_event()
602-
self._app_dependencies_loaded_event()
603-
self.started = True
604-
605599
def app_shutdown(self):
606600
self._app_closing_event()
607601
self.periodic(force_flush=True)
@@ -634,8 +628,7 @@ def _fork_writer(self):
634628

635629
# Enable writer service in child process to avoid interpreter shutdown
636630
# error in Python 3.12
637-
if sys.version_info >= (3, 12):
638-
self.enable()
631+
self.enable()
639632

640633
def _restart_sequence(self):
641634
self._sequence = itertools.count(1)

ddtrace/internal/writer/writer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,8 @@ def _send_payload(self, payload, count, client):
627627
def start(self):
628628
super(AgentWriter, self).start()
629629
try:
630-
telemetry_writer.enable()
630+
telemetry_writer._app_started_event()
631+
telemetry_writer._app_dependencies_loaded_event()
631632

632633
# appsec remote config should be enabled/started after the global tracer and configs
633634
# are initialized

tests/telemetry/test_telemetry.py

Lines changed: 45 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,12 @@
88
def test_enable(test_agent_session, run_python_code_in_subprocess):
99
code = """
1010
from ddtrace.internal.telemetry import telemetry_writer
11+
from ddtrace.internal.service import ServiceStatus
12+
1113
telemetry_writer.enable()
14+
15+
assert telemetry_writer.status == ServiceStatus.RUNNING
16+
assert telemetry_writer._worker is not None
1217
"""
1318

1419
stdout, stderr, status, _ = run_python_code_in_subprocess(code)
@@ -17,26 +22,10 @@ def test_enable(test_agent_session, run_python_code_in_subprocess):
1722
assert stdout == b"", stderr
1823
assert stderr == b""
1924

20-
events = test_agent_session.get_events()
21-
assert len(events) == 3
22-
23-
# Same runtime id is used
24-
assert events[0]["runtime_id"] == events[1]["runtime_id"]
25-
assert events[0]["request_type"] == "app-closing"
26-
assert events[1]["request_type"] == "app-dependencies-loaded"
27-
assert events[2]["request_type"] == "app-started"
28-
assert events[2]["payload"]["error"] == {"code": 0, "message": ""}
29-
3025

3126
@pytest.mark.snapshot
3227
def test_telemetry_enabled_on_first_tracer_flush(test_agent_session, ddtrace_run_python_code_in_subprocess):
3328
"""assert telemetry events are generated after the first trace is flushed to the agent"""
34-
# Using ddtrace-run and/or importing ddtrace alone should not enable telemetry
35-
# Telemetry data should only be sent after the first trace to the agent
36-
_, stderr, status, _ = ddtrace_run_python_code_in_subprocess("import ddtrace")
37-
assert status == 0, stderr
38-
# No trace and No Telemetry
39-
assert len(test_agent_session.get_events()) == 0
4029

4130
# Submit a trace to the agent in a subprocess
4231
code = 'from ddtrace import tracer; span = tracer.trace("test-telemetry"); span.finish()'
@@ -58,13 +47,19 @@ def test_telemetry_enabled_on_first_tracer_flush(test_agent_session, ddtrace_run
5847
def test_enable_fork(test_agent_session, run_python_code_in_subprocess):
5948
"""assert app-started/app-closing events are only sent in parent process"""
6049
code = """
50+
import warnings
51+
# This test logs the following warning in py3.12:
52+
# This process (pid=402) is multi-threaded, use of fork() may lead to deadlocks in the child
53+
warnings.filterwarnings("ignore", category=DeprecationWarning)
54+
6155
import os
6256
6357
from ddtrace.internal.runtime import get_runtime_id
6458
from ddtrace.internal.telemetry import telemetry_writer
6559
6660
# We have to start before forking since fork hooks are not enabled until after enabling
6761
telemetry_writer.enable()
62+
telemetry_writer._app_started_event()
6863
6964
if os.fork() == 0:
7065
# Send multiple started events to confirm none get sent
@@ -78,27 +73,29 @@ def test_enable_fork(test_agent_session, run_python_code_in_subprocess):
7873

7974
stdout, stderr, status, _ = run_python_code_in_subprocess(code)
8075
assert status == 0, stderr
81-
assert stderr == b""
76+
assert stderr == b"", stderr
8277

8378
runtime_id = stdout.strip().decode("utf-8")
8479

8580
requests = test_agent_session.get_requests()
8681

8782
# We expect 2 events from the parent process to get sent, but none from the child process
88-
assert len(requests) == 3
83+
assert len(requests) == 2
8984
# Validate that the runtime id sent for every event is the parent process's runtime id
9085
assert requests[0]["body"]["runtime_id"] == runtime_id
9186
assert requests[0]["body"]["request_type"] == "app-closing"
9287
assert requests[1]["body"]["runtime_id"] == runtime_id
93-
assert requests[1]["body"]["request_type"] == "app-dependencies-loaded"
94-
assert requests[1]["body"]["runtime_id"] == runtime_id
95-
assert requests[2]["body"]["request_type"] == "app-started"
96-
assert requests[2]["body"]["runtime_id"] == runtime_id
88+
assert requests[1]["body"]["request_type"] == "app-started"
9789

9890

9991
def test_enable_fork_heartbeat(test_agent_session, run_python_code_in_subprocess):
10092
"""assert app-heartbeat events are only sent in parent process when no other events are queued"""
10193
code = """
94+
import warnings
95+
# This test logs the following warning in py3.12:
96+
# This process (pid=402) is multi-threaded, use of fork() may lead to deadlocks in the child
97+
warnings.filterwarnings("ignore", category=DeprecationWarning)
98+
10299
import os
103100
104101
from ddtrace.internal.runtime import get_runtime_id
@@ -120,7 +117,7 @@ def test_enable_fork_heartbeat(test_agent_session, run_python_code_in_subprocess
120117

121118
stdout, stderr, status, _ = run_python_code_in_subprocess(code)
122119
assert status == 0, stderr
123-
assert stderr == b""
120+
assert stderr == b"", stderr
124121

125122
runtime_id = stdout.strip().decode("utf-8")
126123

@@ -138,6 +135,11 @@ def test_heartbeat_interval_configuration(run_python_code_in_subprocess):
138135
env = os.environ.copy()
139136
env["DD_TELEMETRY_HEARTBEAT_INTERVAL"] = "61"
140137
code = """
138+
import warnings
139+
# This test logs the following warning in py3.12:
140+
# This process (pid=402) is multi-threaded, use of fork() may lead to deadlocks in the child
141+
warnings.filterwarnings("ignore", category=DeprecationWarning)
142+
141143
from ddtrace import config
142144
assert config._telemetry_heartbeat_interval == 61
143145
@@ -156,6 +158,11 @@ def test_logs_after_fork(run_python_code_in_subprocess):
156158
# Regression test: telemetry writer should not log an error when a process forks
157159
_, err, status, _ = run_python_code_in_subprocess(
158160
"""
161+
import warnings
162+
# This test logs the following warning in py3.12:
163+
# This process (pid=402) is multi-threaded, use of fork() may lead to deadlocks in the child
164+
warnings.filterwarnings("ignore", category=DeprecationWarning)
165+
159166
import ddtrace
160167
import logging
161168
import os
@@ -167,7 +174,7 @@ def test_logs_after_fork(run_python_code_in_subprocess):
167174
)
168175

169176
assert status == 0, err
170-
assert err == b""
177+
assert err == b"", err
171178

172179

173180
def test_app_started_error_handled_exception(test_agent_session, run_python_code_in_subprocess):
@@ -250,6 +257,9 @@ def test_handled_integration_error(test_agent_session, run_python_code_in_subpro
250257
251258
from ddtrace import patch, tracer
252259
patch(raise_errors=False, sqlite3=True)
260+
261+
# Create a span to start the telemetry writer
262+
tracer.trace("hi").finish()
253263
"""
254264

255265
_, stderr, status, _ = run_python_code_in_subprocess(code)
@@ -260,15 +270,11 @@ def test_handled_integration_error(test_agent_session, run_python_code_in_subpro
260270

261271
events = test_agent_session.get_events()
262272

263-
assert len(events) == 5
264-
# Same runtime id is used
265-
assert (
266-
events[0]["runtime_id"]
267-
== events[1]["runtime_id"]
268-
== events[2]["runtime_id"]
269-
== events[3]["runtime_id"]
270-
== events[4]["runtime_id"]
271-
)
273+
assert len(events) > 1
274+
for event in events:
275+
# Same runtime id is used
276+
assert event["runtime_id"] == events[0]["runtime_id"]
277+
272278
integrations_events = [event for event in events if event["request_type"] == "app-integrations-change"]
273279

274280
assert len(integrations_events) == 1
@@ -277,12 +283,14 @@ def test_handled_integration_error(test_agent_session, run_python_code_in_subpro
277283
== "failed to import ddtrace module 'ddtrace.contrib.sqlite3' when patching on import"
278284
)
279285

280-
metric_events = [event for event in events if event["request_type"] == "generate-metrics"]
281-
286+
metric_events = [
287+
event
288+
for event in events
289+
if event["request_type"] == "generate-metrics"
290+
and event["payload"]["series"][0]["metric"] == "integration_errors"
291+
]
282292
assert len(metric_events) == 1
283-
assert metric_events[0]["payload"]["namespace"] == "tracers"
284293
assert len(metric_events[0]["payload"]["series"]) == 1
285-
assert metric_events[0]["payload"]["series"][0]["metric"] == "integration_errors"
286294
assert metric_events[0]["payload"]["series"][0]["type"] == "count"
287295
assert len(metric_events[0]["payload"]["series"][0]["points"]) == 1
288296
assert metric_events[0]["payload"]["series"][0]["points"][0][1] == 1

tests/telemetry/test_telemetry_metrics_e2e.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import os
44
import subprocess
55
import sys
6-
import time
76

87
import pytest
98

@@ -28,8 +27,6 @@ def _build_env():
2827
def gunicorn_server(telemetry_metrics_enabled="true", token=None):
2928
cmd = ["ddtrace-run", "gunicorn", "-w", "1", "-b", "0.0.0.0:8000", "tests.telemetry.app:app"]
3029
env = _build_env()
31-
env["DD_TELEMETRY_METRICS_ENABLED"] = telemetry_metrics_enabled
32-
env["DD_TELEMETRY_HEARTBEAT_INTERVAL"] = "1.0"
3330
env["_DD_TRACE_WRITER_ADDITIONAL_HEADERS"] = "X-Datadog-Test-Session-Token:{}".format(token)
3431
env["DD_TRACE_AGENT_URL"] = os.environ.get("DD_TRACE_AGENT_URL", "")
3532
env["DD_TRACE_DEBUG"] = "true"
@@ -90,19 +87,15 @@ def test_telemetry_metrics_enabled_on_gunicorn_child_process(test_agent_session)
9087
gunicorn_client.get("/count_metric")
9188
response = gunicorn_client.get("/count_metric")
9289
assert response.status_code == 200
93-
# DD_TELEMETRY_HEARTBEAT_INTERVAL is set to 1 second
94-
time.sleep(1)
9590
gunicorn_client.get("/count_metric")
9691
response = gunicorn_client.get("/count_metric")
9792
assert response.status_code == 200
9893

9994
events = test_agent_session.get_events()
10095
metrics = list(filter(lambda event: event["request_type"] == "generate-metrics", events))
101-
assert len(metrics) == 2
96+
assert len(metrics) == 1
10297
assert metrics[0]["payload"]["series"][0]["metric"] == "test_metric"
103-
assert metrics[0]["payload"]["series"][0]["points"][0][1] == 2.0
104-
assert metrics[1]["payload"]["series"][0]["metric"] == "test_metric"
105-
assert metrics[1]["payload"]["series"][0]["points"][0][1] == 3.0
98+
assert metrics[0]["payload"]["series"][0]["points"][0][1] == 5
10699

107100

108101
def test_span_creation_and_finished_metrics_datadog(test_agent_session, ddtrace_run_python_code_in_subprocess):

tests/telemetry/test_writer.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -372,8 +372,8 @@ def test_send_failing_request(mock_status, telemetry_writer):
372372
with httpretty.enabled():
373373
httpretty.register_uri(httpretty.POST, telemetry_writer._client.url, status=mock_status)
374374
with mock.patch("ddtrace.internal.telemetry.writer.log") as log:
375-
# sends failing app-closing event
376-
telemetry_writer.app_shutdown()
375+
# sends failing app-heartbeat event
376+
telemetry_writer.periodic()
377377
# asserts unsuccessful status code was logged
378378
log.debug.assert_called_with(
379379
"failed to send telemetry to the Datadog Agent at %s. response: %s",
@@ -392,13 +392,11 @@ def test_telemetry_graceful_shutdown(telemetry_writer, test_agent_session, mock_
392392
telemetry_writer.app_shutdown()
393393

394394
events = test_agent_session.get_events()
395-
assert len(events) == 3
395+
assert len(events) == 1
396396

397397
# Reverse chronological order
398398
assert events[0]["request_type"] == "app-closing"
399-
assert events[0] == _get_request_body({}, "app-closing", 3)
400-
assert events[1]["request_type"] == "app-dependencies-loaded"
401-
assert events[2]["request_type"] == "app-started"
399+
assert events[0] == _get_request_body({}, "app-closing", 1)
402400

403401

404402
def test_app_heartbeat_event_periodic(mock_time, telemetry_writer, test_agent_session):
@@ -407,6 +405,7 @@ def test_app_heartbeat_event_periodic(mock_time, telemetry_writer, test_agent_se
407405

408406
# Ensure telemetry writer is initialized to send periodic events
409407
telemetry_writer._is_periodic = True
408+
telemetry_writer.started = True
410409
# Assert default telemetry interval is 10 seconds and the expected periodic threshold and counts are set
411410
assert telemetry_writer.interval == 10
412411
assert telemetry_writer._periodic_threshold == 5

0 commit comments

Comments
 (0)