Commit 15c32da

tracks helpers usage and source names (#497)
* tracks helper usage
* tracks source names
* adds source names to extract info
* bumps version to 0.3.4
* uses typed dict for sources info
1 parent fb9dabf commit 15c32da

File tree

14 files changed: +174 −74 lines

dlt/cli/utils.py

Lines changed: 3 additions & 39 deletions
@@ -1,19 +1,15 @@
 import ast
-import inspect
 import os
 import tempfile
-import time
-import contextlib
-from typing import Any, Callable, Tuple
+from typing import Callable

 from dlt.common import git
 from dlt.common.reflection.utils import set_ast_parents
 from dlt.common.storages import FileStorage
 from dlt.common.typing import TFun
-from dlt.common.runtime.telemetry import start_telemetry
-from dlt.common.runtime.segment import track
 from dlt.common.configuration import resolve_configuration
 from dlt.common.configuration.specs import RunConfiguration
+from dlt.common.runtime.telemetry import with_telemetry

 from dlt.reflection.script_visitor import PipelineScriptVisitor

@@ -62,39 +58,7 @@ def ensure_git_command(command: str) -> None:


 def track_command(command: str, track_before: bool, *args: str) -> Callable[[TFun], TFun]:
-    """Adds telemetry to f: TFun and add optional f *args values to `properties` of telemetry event"""
-    def decorator(f: TFun) -> TFun:
-        sig: inspect.Signature = inspect.signature(f)
-        def _wrap(*f_args: Any, **f_kwargs: Any) -> Any:
-            # look for additional arguments
-            bound_args = sig.bind(*f_args, **f_kwargs)
-            props = {p:bound_args.arguments[p] for p in args if p in bound_args.arguments}
-            start_ts = time.time()
-
-            def _track(success: bool) -> None:
-                with contextlib.suppress(Exception):
-                    props["elapsed"] = time.time() - start_ts
-                    props["success"] = success
-                    # resolve runtime config and init telemetry
-                    c = resolve_configuration(RunConfiguration())
-                    start_telemetry(c)
-                    track("command", command, props)
-
-            # some commands should be tracked before execution
-            if track_before:
-                _track(True)
-                return f(*f_args, **f_kwargs)
-            # some commands we track after, where we can pass the success
-            try:
-                rv = f(*f_args, **f_kwargs)
-                _track(rv == 0)
-                return rv
-            except Exception:
-                _track(False)
-                raise
-
-        return _wrap # type: ignore
-    return decorator
+    return with_telemetry("command", command, track_before, *args)


 def get_telemetry_status() -> bool:
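With this change `track_command` keeps its public signature but simply delegates to the shared `with_telemetry` decorator. A minimal sketch of how a CLI entry point would use it (the command name and arguments below are hypothetical):

from dlt.cli.utils import track_command

@track_command("deploy", False, "deployment_method")
def deploy_command(deployment_method: str, schedule: str = None) -> int:
    # "deployment_method" is bound from the call and copied into the event
    # properties; with track_before=False the event fires after execution
    # and an int return value of 0 is reported as success
    return 0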

dlt/common/pipeline.py

Lines changed: 10 additions & 2 deletions
@@ -1,7 +1,6 @@
 import os
 import datetime  # noqa: 251
 import humanize
-import inspect
 import contextlib
 from typing import Any, Callable, ClassVar, Dict, List, NamedTuple, Optional, Protocol, Sequence, TYPE_CHECKING, Tuple, TypedDict

@@ -25,8 +24,16 @@
 from dlt.common.data_writers.writers import TLoaderFileFormat


+class ExtractDataInfo(TypedDict):
+    name: str
+    data_type: str
+
+
 class ExtractInfo(NamedTuple):
     """A tuple holding information on extracted data items. Returned by pipeline `extract` method."""
+
+    extract_data_info: List[ExtractDataInfo]
+
     def asdict(self) -> DictStrAny:
         return {}

@@ -209,7 +216,8 @@ def __call__(
         table_name: str = None,
         write_disposition: TWriteDisposition = None,
         columns: Sequence[TColumnSchema] = None,
-        schema: Schema = None
+        schema: Schema = None,
+        loader_file_format: TLoaderFileFormat = None
     ) -> LoadInfo:
         ...
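`ExtractDataInfo` is a `TypedDict`, so at runtime it is a plain dict; `ExtractInfo` now carries a list of them instead of being empty. A small sketch of the new shape (the values are made up):

from dlt.common.pipeline import ExtractDataInfo, ExtractInfo

players: ExtractDataInfo = {"name": "players", "data_type": "resource"}
info = ExtractInfo(extract_data_info=[players])
assert info.extract_data_info[0]["data_type"] == "resource"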

dlt/common/runtime/segment.py

Lines changed: 4 additions & 2 deletions
@@ -13,13 +13,14 @@
 from dlt.common.configuration.paths import get_dlt_data_dir

 from dlt.common.runtime import logger
+
 from dlt.common.configuration.specs import RunConfiguration
 from dlt.common.runtime.exec_info import exec_info_names, in_continuous_integration
 from dlt.common.typing import DictStrAny, StrAny
 from dlt.common.utils import uniq_id
 from dlt.version import __version__, DLT_PKG_NAME

-TEventCategory = Literal["pipeline", "command"]
+TEventCategory = Literal["pipeline", "command", "helper"]

 _THREAD_POOL: ThreadPoolExecutor = None
 _SESSION: requests.Session = None
@@ -202,9 +203,10 @@ def _send_event(
     headers = _segment_request_header(_WRITE_KEY)

     def _future_send() -> None:
+        # import time
         # start_ts = time.time()
         resp = _SESSION.post(_SEGMENT_ENDPOINT, headers=headers, json=payload, timeout=_SEGMENT_REQUEST_TIMEOUT)
-        # print(f"sending to Segment done {resp.status_code} {time.time() - start_ts}")
+        # print(f"SENDING TO Segment done {resp.status_code} {time.time() - start_ts} {base64.b64decode(_WRITE_KEY)}")
         # handle different failure cases
         if resp.status_code != 200:
             logger.debug(
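Widening `TEventCategory` lets helper integrations emit their own events through the same `track` call used for pipelines and commands. A hedged sketch, assuming telemetry has already been started (the property values are made up):

from dlt.common.runtime.segment import track

# "helper" now type-checks as a TEventCategory alongside "pipeline" and "command"
track("helper", "airflow_add_run", {"decompose": "none", "success": True, "elapsed": 1.2})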

dlt/common/runtime/telemetry.py

Lines changed: 61 additions & 8 deletions
@@ -1,17 +1,24 @@
+import time
+import contextlib
+import inspect
+from typing import Any, Callable
+
 from dlt.common.configuration.specs import RunConfiguration
-from dlt.common.runtime.segment import init_segment, disable_segment
+from dlt.common.typing import TFun
+from dlt.common.configuration import resolve_configuration
+from dlt.common.runtime.segment import TEventCategory, init_segment, disable_segment, track

 from dlt.common.runtime.sentry import init_sentry, disable_sentry


-_TELEMETRY_ENABLED = False
+_TELEMETRY_STARTED = False


 def start_telemetry(config: RunConfiguration) -> None:
     # enable telemetry only once

-    global _TELEMETRY_ENABLED
-    if _TELEMETRY_ENABLED:
+    global _TELEMETRY_STARTED
+    if _TELEMETRY_STARTED:
         return

     if config.sentry_dsn:
@@ -20,15 +27,61 @@ def start_telemetry(config: RunConfiguration) -> None:
     if config.dlthub_telemetry:
         init_segment(config)

-    _TELEMETRY_ENABLED = True
+    _TELEMETRY_STARTED = True


 def stop_telemetry() -> None:
-    global _TELEMETRY_ENABLED
-    if not _TELEMETRY_ENABLED:
+    global _TELEMETRY_STARTED
+    if not _TELEMETRY_STARTED:
         return

     disable_sentry()
     disable_segment()

-    _TELEMETRY_ENABLED = False
+    _TELEMETRY_STARTED = False
+
+
+def is_telemetry_started() -> bool:
+    return _TELEMETRY_STARTED
+
+
+def with_telemetry(category: TEventCategory, command: str, track_before: bool, *args: str) -> Callable[[TFun], TFun]:
+    """Adds telemetry to f: TFun and add optional f *args values to `properties` of telemetry event"""
+    def decorator(f: TFun) -> TFun:
+        sig: inspect.Signature = inspect.signature(f)
+        def _wrap(*f_args: Any, **f_kwargs: Any) -> Any:
+            # look for additional arguments
+            bound_args = sig.bind(*f_args, **f_kwargs)
+            props = {p:bound_args.arguments[p] for p in args if p in bound_args.arguments}
+            start_ts = time.time()
+
+            def _track(success: bool) -> None:
+                with contextlib.suppress(Exception):
+                    props["elapsed"] = time.time() - start_ts
+                    props["success"] = success
+                    # resolve runtime config and init telemetry
+                    if not _TELEMETRY_STARTED:
+                        c = resolve_configuration(RunConfiguration())
+                        start_telemetry(c)
+                    track(category, command, props)
+
+            # some commands should be tracked before execution
+            if track_before:
+                _track(True)
+                return f(*f_args, **f_kwargs)
+            # some commands we track after, where we can pass the success
+            try:
+                rv = f(*f_args, **f_kwargs)
+                # if decorated function returns int, 0 is a success - used to track dlt commands
+                if isinstance(rv, int):
+                    success = rv == 0
+                else:
+                    success = True
+                _track(success)
+                return rv
+            except Exception:
+                _track(False)
+                raise
+
+        return _wrap # type: ignore
+    return decorator
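The decorator generalizes the old CLI-only version in two ways: telemetry is started lazily on the first tracked call, and success is inferred from the return type, so non-CLI helpers that return `None` still report success. A sketch with a hypothetical helper function:

from dlt.common.runtime.telemetry import with_telemetry

@with_telemetry("helper", "sync_tables", False, "table_name")
def sync_tables(table_name: str, dry_run: bool = False) -> None:
    # non-int return values count as success; exceptions are tracked
    # with success=False and re-raised to the caller
    ...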

dlt/helpers/airflow_helper.py

Lines changed: 2 additions & 1 deletion
@@ -4,11 +4,11 @@
 from tenacity import retry_if_exception, wait_exponential, stop_after_attempt, Retrying, RetryCallState

 from dlt.common.exceptions import MissingDependencyException
+from dlt.common.runtime.telemetry import with_telemetry

 try:
     from airflow.configuration import conf
     from airflow.utils.task_group import TaskGroup
-    #from airflow.decorators import task
     from airflow.operators.python import PythonOperator
     from airflow.operators.python import get_current_context
 except ImportError:
@@ -118,6 +118,7 @@ def __init__(
         if ConfigProvidersContext in Container():
             del Container()[ConfigProvidersContext]

+    @with_telemetry("helper", "airflow_add_run", False, "decompose")
     def add_run(
         self,
         pipeline: Pipeline,
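With the decorator in place, every `add_run` call also emits a `helper` event named `airflow_add_run` carrying the bound `decompose` argument. A rough sketch of a call site (the group, pipeline, and data arguments are made up, and `PipelineTasksGroup` must be instantiated inside an Airflow DAG context):

import dlt
from dlt.helpers.airflow_helper import PipelineTasksGroup

tasks = PipelineTasksGroup("chess_group")  # inside a DAG definition
pipeline = dlt.pipeline(pipeline_name="chess", destination="duckdb", dataset_name="games")
# the "decompose" value lands in the telemetry event's properties
tasks.add_run(pipeline, [1, 2, 3], decompose="none")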

dlt/helpers/dbt/runner.py

Lines changed: 3 additions & 0 deletions
@@ -19,6 +19,8 @@
 from dlt.helpers.dbt.configuration import DBTRunnerConfiguration
 from dlt.helpers.dbt.exceptions import IncrementalSchemaOutOfSyncError, PrerequisitesException, DBTNodeResult, DBTProcessingError

+from dlt.common.runtime.telemetry import with_telemetry
+

 class DBTPackageRunner:
     """A Python wrapper over a dbt package
@@ -256,6 +258,7 @@ def run_all(self,
         raise


+@with_telemetry("helper", "dbt_create_runner", False, "package_profile_name")
 @with_config(spec=DBTRunnerConfiguration, sections=(known_sections.DBT_PACKAGE_RUNNER,))
 def create_runner(
     venv: Venv,

dlt/pipeline/helpers.py

Lines changed: 4 additions & 5 deletions
@@ -1,19 +1,18 @@
 import contextlib
-from typing import Callable, Sequence, Iterable, Optional, Any, List, Iterator, Dict, Union, TypedDict
+from typing import Callable, Sequence, Iterable, Optional, Any, List, Dict, Tuple, Union, TypedDict
 from itertools import chain

 from dlt.common.jsonpath import resolve_paths, TAnyJsonPath, compile_paths
-
 from dlt.common.exceptions import TerminalException
-from dlt.common.schema.utils import get_child_tables, group_tables_by_resource, compile_simple_regexes, compile_simple_regex
+from dlt.common.schema.utils import group_tables_by_resource, compile_simple_regexes, compile_simple_regex
 from dlt.common.schema.typing import TSimpleRegex
 from dlt.common.typing import REPattern
-from dlt.destinations.exceptions import DatabaseUndefinedRelation
+from dlt.common.pipeline import TSourceState, _reset_resource_state, _sources_state, _delete_source_state_keys, _get_matching_resources

+from dlt.destinations.exceptions import DatabaseUndefinedRelation
 from dlt.pipeline.exceptions import PipelineStepFailed, PipelineHasPendingDataException
 from dlt.pipeline.typing import TPipelineStep
 from dlt.pipeline import Pipeline
-from dlt.common.pipeline import TSourceState, _reset_resource_state, _sources_state, _delete_source_state_keys, _get_matching_resources


 def retry_load(retry_on_pipeline_steps: Sequence[TPipelineStep] = ("load",)) -> Callable[[BaseException], bool]:

dlt/pipeline/pipeline.py

Lines changed: 4 additions & 5 deletions
@@ -24,6 +24,7 @@
 from dlt.common.storages import LiveSchemaStorage, NormalizeStorage, LoadStorage, SchemaStorage, FileStorage, NormalizeStorageConfiguration, SchemaStorageConfiguration, LoadStorageConfiguration
 from dlt.common.destination import DestinationCapabilitiesContext
 from dlt.common.destination.reference import DestinationReference, JobClientBase, DestinationClientConfiguration, DestinationClientDwhConfiguration, TDestinationReferenceArg, DestinationClientStagingConfiguration, DestinationClientDwhConfiguration
+from dlt.common.destination.capabilities import INTERNAL_LOADER_FILE_FORMATS
 from dlt.common.pipeline import ExtractInfo, LoadInfo, NormalizeInfo, PipelineContext, SupportsPipeline, TPipelineLocalState, TPipelineState, StateInjectableContext
 from dlt.common.schema import Schema
 from dlt.common.utils import is_interactive
@@ -44,12 +45,10 @@
 from dlt.pipeline.configuration import PipelineConfiguration
 from dlt.pipeline.progress import _Collector, _NULL_COLLECTOR
 from dlt.pipeline.exceptions import CannotRestorePipelineException, InvalidPipelineName, PipelineConfigMissing, PipelineNotActive, PipelineStepFailed, SqlClientNotAvailable
-from dlt.pipeline.trace import PipelineTrace, PipelineStepTrace, load_trace, merge_traces, start_trace, start_trace_step, end_trace_step, end_trace
+from dlt.pipeline.trace import PipelineTrace, PipelineStepTrace, load_trace, merge_traces, start_trace, start_trace_step, end_trace_step, end_trace, describe_extract_data
 from dlt.pipeline.typing import TPipelineStep
 from dlt.pipeline.state_sync import STATE_ENGINE_VERSION, load_state_from_destination, merge_state_if_changed, migrate_state, state_resource, json_encode_state, json_decode_state

-from dlt.common.destination.capabilities import INTERNAL_LOADER_FILE_FORMATS
-

 def with_state_sync(may_extract_state: bool = False) -> Callable[[TFun], TFun]:
@@ -285,10 +284,10 @@ def extract(
             # TODO: if we fail here we should probably wipe out the whole extract folder
             for extract_id in extract_ids:
                 storage.commit_extract_files(extract_id)
-            return ExtractInfo()
+            return ExtractInfo(describe_extract_data(data))
         except Exception as exc:
             # TODO: provide metrics from extractor
-            raise PipelineStepFailed(self, "extract", exc, ExtractInfo()) from exc
+            raise PipelineStepFailed(self, "extract", exc, ExtractInfo(describe_extract_data(data))) from exc

     @with_runtime_trace
     @with_schemas_sync

dlt/pipeline/trace.py

Lines changed: 36 additions & 3 deletions
@@ -2,18 +2,19 @@
 import pickle
 import datetime  # noqa: 251
 import dataclasses
-from typing import Any, List, NamedTuple, Optional, Protocol, Sequence
-
+from collections.abc import Sequence as C_Sequence
+from typing import Any, List, Tuple, NamedTuple, Optional, Protocol, Sequence
 import humanize

 from dlt.common import pendulum
 from dlt.common.runtime.logger import suppress_and_warn
 from dlt.common.configuration import is_secret_hint
 from dlt.common.configuration.utils import _RESOLVED_TRACES
-from dlt.common.pipeline import SupportsPipeline
+from dlt.common.pipeline import ExtractDataInfo, SupportsPipeline
 from dlt.common.typing import StrAny
 from dlt.common.utils import uniq_id

+from dlt.extract.source import DltResource, DltSource
 from dlt.pipeline.typing import TPipelineStep
 from dlt.pipeline.exceptions import PipelineStepFailed

@@ -212,3 +213,35 @@ def load_trace(trace_path: str) -> PipelineTrace:
     except (AttributeError, FileNotFoundError):
         # on incompatible pickling / file not found return no trace
         return None
+
+
+def describe_extract_data(data: Any) -> List[ExtractDataInfo]:
+    """Extract source and resource names from data passed to extract"""
+    data_info: List[ExtractDataInfo] = []
+
+    def add_item(item: Any) -> bool:
+        if isinstance(item, (DltResource, DltSource)):
+            # record names of sources/resources
+            data_info.append({
+                "name": item.name,
+                "data_type": "resource" if isinstance(item, DltResource) else "source"
+            })
+            return False
+        else:
+            # anything else
+            data_info.append({
+                "name": "",
+                "data_type": type(item).__name__
+            })
+            return True
+
+    item: Any = data
+    if isinstance(data, C_Sequence) and len(data) > 0:
+        for item in data:
+            # add_item returns True if non named item was returned. in that case we break
+            if add_item(item):
+                break
+        return data_info
+
+    add_item(item)
+    return data_info
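`describe_extract_data` only inspects what was passed to `extract`: named sources and resources are reported by name, while the first unnamed item short-circuits the scan with just its type. A quick sketch of both paths, assuming dlt 0.3.4 is installed:

import dlt
from dlt.pipeline.trace import describe_extract_data

@dlt.resource(name="players")
def players():
    yield [{"id": 1}]

print(describe_extract_data(players()))
# [{'name': 'players', 'data_type': 'resource'}]
print(describe_extract_data([{"id": 1}, {"id": 2}]))
# [{'name': '', 'data_type': 'dict'}] - one entry, then the scan stops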
