-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathingestion.py
More file actions
391 lines (321 loc) · 14.8 KB
/
ingestion.py
File metadata and controls
391 lines (321 loc) · 14.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
"""Low-level wrapper for the IngestionAPI.
This module provides thin wrappers around the autogenerated bindings for the IngestionAPI.
It handles common concerns like error handling and retries.
It provides an asynchronous client for the IngestionAPI.
"""
from __future__ import annotations
import hashlib
import logging
from collections import namedtuple
from typing import TYPE_CHECKING, Any, Iterable, cast
from sift.ingestion_configs.v2.ingestion_configs_pb2 import (
ListIngestionConfigFlowsRequest,
ListIngestionConfigFlowsResponse,
ListIngestionConfigsRequest,
ListIngestionConfigsResponse,
)
from sift.ingestion_configs.v2.ingestion_configs_pb2_grpc import (
IngestionConfigServiceStub,
)
from sift_client._internal.low_level_wrappers.base import (
LowLevelClientBase,
)
from sift_client.sift_types.ingestion import FlowConfig, IngestionConfig
from sift_client.transport import GrpcClient, WithGrpcClient
from sift_client.util import cel_utils as cel
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Used as the default `page_size` argument of the list/get helpers defined in this module.
DEFAULT_INGESTION_CONFIG_PAGE_SIZE = 100
"""Default page size for ingestion config and flow list calls (flow configs can be large)."""
if TYPE_CHECKING:
from datetime import datetime
from sift_stream_bindings import (
DiskBackupPolicyPy,
FlowConfigPy,
FlowDescriptorPy,
FlowPy,
IngestionConfigFormPy,
IngestWithConfigDataStreamRequestPy,
IngestWithConfigDataStreamRequestWrapperPy,
MetadataPy,
RetryPolicyPy,
RunFormPy,
RunSelectorPy,
SiftStreamBuilderPy,
SiftStreamMetricsSnapshotPy,
SiftStreamPy,
TimeValuePy,
)
from sift_client.resources.ingestion import StreamingMode, TracingConfig
def _to_rust_py_timestamp(time: datetime) -> TimeValuePy:
    """Convert a Python datetime to a Rust TimeValuePy.

    The conversion uses exact integer arithmetic on the offset from the Unix
    epoch instead of the float returned by ``datetime.timestamp()``. This keeps
    the full microsecond precision of ``time`` and guarantees that the
    nanosecond component is always in ``[0, 1_000_000_000)``, including for
    datetimes before 1970 (where truncating a negative float toward zero would
    otherwise yield a negative nanosecond value).

    Naive datetimes are interpreted as local time, matching the behavior of
    ``datetime.timestamp()``.

    Args:
        time: The datetime to convert.

    Returns:
        A TimeValuePy built from whole seconds since the Unix epoch (floored)
        and the remaining non-negative nanoseconds.
    """
    # `datetime` is only imported under TYPE_CHECKING at module level, so pull in the
    # runtime names locally.
    from datetime import datetime as _datetime
    from datetime import timezone as _timezone

    # Importing here to allow sift_stream_bindings to be an optional dependency for non-ingestion users
    from sift_stream_bindings import TimeValuePy

    epoch = _datetime(1970, 1, 1, tzinfo=_timezone.utc)
    # astimezone() treats a naive datetime as local time and leaves aware ones equivalent.
    delta = time.astimezone(_timezone.utc) - epoch
    # timedelta normalizes to 0 <= seconds < 86400 and 0 <= microseconds < 1_000_000,
    # so `secs` is the floor of the offset and `nsecs` is never negative.
    secs = delta.days * 86_400 + delta.seconds
    nsecs = delta.microseconds * 1_000
    return TimeValuePy.from_timestamp(secs, nsecs)
class IngestionLowLevelClient(LowLevelClientBase, WithGrpcClient):
    """Low-level async client for the ingestion config endpoints.

    Wraps the autogenerated ``IngestionConfigServiceStub`` bindings: builds the
    request protos, awaits the gRPC calls, and converts the responses into
    ``IngestionConfig`` / ``FlowConfig`` objects. Multi-page listing is delegated
    to ``_handle_pagination``.
    """

    CacheEntry = namedtuple("CacheEntry", ["data_queue", "ingestion_config", "thread"])

    _sift_stream_builder: SiftStreamBuilderPy | None
    stream_cache: dict[str, CacheEntry]

    def __init__(self, grpc_client: GrpcClient):
        """Create the client.

        Args:
            grpc_client: The gRPC client used to obtain service stubs.
        """
        super().__init__(grpc_client=grpc_client)

    async def list_ingestion_configs(
        self,
        filter_query: str | None = None,
        page_size: int | None = DEFAULT_INGESTION_CONFIG_PAGE_SIZE,
        page_token: str | None = None,
        order_by: str | None = None,
    ) -> tuple[list[IngestionConfig], str]:
        """Fetch a single page of ingestion configs.

        Only arguments that are not ``None`` are set on the request.

        Args:
            filter_query: CEL filter expression, sent as the request's ``filter``.
            page_size: Number of results to request for this page.
            page_token: Token identifying the page to fetch.
            order_by: Ordering expression; forwarded on the request when provided.

        Returns:
            A tuple of (IngestionConfigs on this page, next_page_token).
        """
        candidate_fields: dict[str, Any] = {
            "page_size": page_size,
            "page_token": page_token,
            "filter": filter_query,
            "order_by": order_by,
        }
        request = ListIngestionConfigsRequest(
            **{name: value for name, value in candidate_fields.items() if value is not None}
        )
        stub = self._grpc_client.get_stub(IngestionConfigServiceStub)
        response = cast("ListIngestionConfigsResponse", await stub.ListIngestionConfigs(request))
        return (
            [IngestionConfig._from_proto(proto) for proto in response.ingestion_configs],
            response.next_page_token,
        )

    async def list_all_ingestion_configs(
        self,
        filter_query: str | None = None,
        page_size: int | None = DEFAULT_INGESTION_CONFIG_PAGE_SIZE,
        max_results: int | None = None,
    ) -> list[IngestionConfig]:
        """Collect ingestion configs across pages via ``_handle_pagination``.

        Args:
            filter_query: CEL filter expression passed to each page request.
            page_size: Number of results to request per page.
            max_results: Upper bound on the total number of results; None for no limit.

        Returns:
            The matching IngestionConfigs.
        """
        per_call_kwargs = {"filter_query": filter_query}
        return await self._handle_pagination(
            self.list_ingestion_configs,
            kwargs=per_call_kwargs,
            page_size=page_size,
            max_results=max_results,
        )

    async def list_ingestion_config_flows(
        self,
        ingestion_config_id: str,
        page_size: int | None = DEFAULT_INGESTION_CONFIG_PAGE_SIZE,
        page_token: str | None = None,
        order_by: str | None = None,
        filter_query: str | None = None,
    ) -> tuple[list[FlowConfig], str]:
        """Fetch a single page of flows belonging to an ingestion config.

        ``ingestion_config_id`` is always set on the request; the remaining
        arguments are set only when they are not ``None``.

        Args:
            ingestion_config_id: ID of the ingestion config whose flows are listed.
            page_size: Number of results to request for this page.
            page_token: Token identifying the page to fetch.
            order_by: Ordering expression; forwarded on the request when provided.
            filter_query: CEL filter expression, sent as the request's ``filter``.

        Returns:
            A tuple of (FlowConfigs on this page, next_page_token).
        """
        candidate_fields: dict[str, Any] = {
            "page_size": page_size,
            "page_token": page_token,
            "filter": filter_query,
            "order_by": order_by,
        }
        request_fields: dict[str, Any] = {"ingestion_config_id": ingestion_config_id}
        request_fields.update(
            (name, value) for name, value in candidate_fields.items() if value is not None
        )
        request = ListIngestionConfigFlowsRequest(**request_fields)
        stub = self._grpc_client.get_stub(IngestionConfigServiceStub)
        response = cast(
            "ListIngestionConfigFlowsResponse", await stub.ListIngestionConfigFlows(request)
        )
        return (
            [FlowConfig._from_proto(proto) for proto in response.flows],
            response.next_page_token,
        )

    async def get_ingestion_config_flows(
        self,
        ingestion_config_id: str,
        page_size: int | None = DEFAULT_INGESTION_CONFIG_PAGE_SIZE,
        max_results: int | None = None,
    ) -> list[FlowConfig]:
        """Collect the flows of an ingestion config across pages via ``_handle_pagination``.

        Args:
            ingestion_config_id: ID of the ingestion config whose flows are fetched.
            page_size: Number of results to request per page.
            max_results: Upper bound on the total number of results; None for no limit.

        Returns:
            The FlowConfigs of the ingestion config.
        """
        per_call_kwargs = {"ingestion_config_id": ingestion_config_id}
        return await self._handle_pagination(
            self.list_ingestion_config_flows,
            kwargs=per_call_kwargs,
            page_size=page_size,
            max_results=max_results,
        )

    async def get_ingestion_config_id_from_client_key(self, client_key: str) -> str | None:
        """Look up the ID of the ingestion config registered under ``client_key``.

        Args:
            client_key: The client key to match exactly.

        Returns:
            The ingestion config ID, or None when no config has that client key.

        Raises:
            ValueError: If more than one ingestion config matches the client key.
        """
        ingestion_configs = await self.list_all_ingestion_configs(
            filter_query=cel.equals("client_key", client_key)
        )
        if len(ingestion_configs) > 1:
            raise ValueError(
                f"Expected 1 ingestion config for client key {client_key}, got {len(ingestion_configs)}"
            )
        return ingestion_configs[0].id_ if ingestion_configs else None

    def _hash_flows(self, asset_name: str, flows: list[FlowConfig]) -> str:
        """Return the deterministic client key for an asset and its flow configuration.

        Delegates to the module-level ``_hash_flows`` function.
        """
        client_key = _hash_flows(asset_name=asset_name, flows=flows)
        return client_key
class IngestionConfigStreamingLowLevelClient(LowLevelClientBase):
    """Low-level wrapper around a built ``SiftStreamPy`` instance.

    Use ``create_sift_stream_instance`` to build the underlying stream; the
    remaining methods forward directly to that stream instance.
    """

    DEFAULT_MAX_LOG_FILES = 7  # Equal to 1 week of logs
    DEFAULT_LOGFILE_PREFIX = "sift_stream_bindings.log"

    _sift_stream_instance: SiftStreamPy

    def __init__(self, sift_stream_instance: SiftStreamPy):
        """Wrap an already-built stream instance.

        Args:
            sift_stream_instance: The stream every method of this client forwards to.
        """
        super().__init__()
        self._sift_stream_instance = sift_stream_instance

    @classmethod
    async def create_sift_stream_instance(
        cls,
        api_key: str,
        grpc_uri: str,
        ingestion_config_form: IngestionConfigFormPy,
        run_form: RunFormPy | None = None,
        run_id: str | None = None,
        asset_tags: list[str] | None = None,
        asset_metadata: list[MetadataPy] | None = None,
        streaming_mode: StreamingMode = ...,  # type: ignore[assignment]
        retry_policy: RetryPolicyPy | None = None,
        disk_backup_policy: DiskBackupPolicyPy | None = None,
        checkpoint_interval_seconds: int | None = None,
        enable_tls: bool = True,
        tracing_config: TracingConfig | None = None,
    ) -> IngestionConfigStreamingLowLevelClient:
        """Build a stream for the requested streaming mode and wrap it in this client.

        If tracing has not been initialized yet, it is initialized first: to a
        file when the tracing config has a ``log_dir``, otherwise through
        ``init_tracing`` with only the level. When no tracing config is given,
        ``TracingConfig.with_file()`` is used.

        Args:
            api_key: API key passed to the stream builder.
            grpc_uri: gRPC URI passed to the stream builder.
            ingestion_config_form: Ingestion config used to create the config builder.
            run_form: Assigned to the config builder's ``run``.
            run_id: Assigned to the config builder's ``run_id``.
            asset_tags: Assigned to the config builder's ``asset_tags``.
            asset_metadata: Assigned to the config builder's ``metadata``.
            streaming_mode: Streaming mode; when left as the ``...`` sentinel,
                ``StreamingMode.LIVE_WITH_BACKUPS`` is used.
            retry_policy: Applied only in the live-with-backups mode.
            disk_backup_policy: Applied in the file-backup and live-with-backups modes.
            checkpoint_interval_seconds: Applied only in the live-with-backups mode.
            enable_tls: Assigned to the stream builder's ``enable_tls``.
            tracing_config: Tracing settings; consulted only if tracing is not yet initialized.

        Returns:
            A client wrapping the newly built stream instance.
        """
        # Importing here to allow sift_stream_bindings to be an optional dependency for non-ingestion users
        # TODO(nathan): Fix bindings to fix mypy issues with tracing functions
        from sift_stream_bindings import (  # type: ignore[attr-defined]
            DurationPy,
            SiftStreamBuilderPy,
            init_tracing,  # type: ignore[attr-defined]
            init_tracing_with_file,  # type: ignore[attr-defined]
            is_tracing_initialized,  # type: ignore[attr-defined]
        )

        from sift_client.resources.ingestion import StreamingMode, TracingConfig

        if streaming_mode is ...:  # type: ignore[comparison-overlap]
            streaming_mode = StreamingMode.LIVE_WITH_BACKUPS

        if not is_tracing_initialized():
            tracing = TracingConfig.with_file() if tracing_config is None else tracing_config
            if tracing.log_dir is None:
                # No log directory configured: stdout/stderr only.
                init_tracing(tracing.level)
            else:
                # Log directory configured: file logging.
                init_tracing_with_file(
                    tracing.level,
                    tracing.log_dir,
                    tracing.filename_prefix or cls.DEFAULT_LOGFILE_PREFIX,
                    tracing.max_log_files or cls.DEFAULT_MAX_LOG_FILES,
                )

        stream_builder = SiftStreamBuilderPy(uri=grpc_uri, apikey=api_key)
        stream_builder.enable_tls = enable_tls

        ingestion_builder = stream_builder.ingestion_config(ingestion_config_form)
        ingestion_builder.run = run_form
        ingestion_builder.run_id = run_id
        ingestion_builder.asset_tags = asset_tags
        ingestion_builder.metadata = asset_metadata

        if streaming_mode == StreamingMode.LIVE_ONLY:
            return cls(await ingestion_builder.live_only().build())

        if streaming_mode == StreamingMode.FILE_BACKUP:
            backup_builder = ingestion_builder.file_backup()
            if disk_backup_policy is not None:
                backup_builder.disk_backup_policy = disk_backup_policy
            return cls(await backup_builder.build())

        # Any other mode is handled as LIVE_WITH_BACKUPS (the default).
        live_backup_builder = ingestion_builder.live_with_backups()
        if retry_policy is not None:
            live_backup_builder.retry_policy = retry_policy
        if disk_backup_policy is not None:
            live_backup_builder.disk_backup_policy = disk_backup_policy
        if checkpoint_interval_seconds is not None:
            interval = DurationPy(secs=checkpoint_interval_seconds, nanos=0)
            live_backup_builder.checkpoint_interval = interval
        return cls(await live_backup_builder.build())

    async def send(self, flow: FlowPy):
        """Await the stream's ``send`` with ``flow``."""
        stream = self._sift_stream_instance
        await stream.send(flow)

    async def batch_send(self, flows: Iterable[FlowPy]):
        """Await the stream's ``batch_send`` with ``flows``."""
        stream = self._sift_stream_instance
        await stream.batch_send(flows)

    async def send_requests(self, requests: list[IngestWithConfigDataStreamRequestPy]):
        """Await the stream's ``send_requests`` with ``requests``."""
        stream = self._sift_stream_instance
        await stream.send_requests(requests)

    def try_send_requests(
        self, requests: Iterable[IngestWithConfigDataStreamRequestWrapperPy]
    ) -> None:
        """Call the stream's synchronous ``try_send_requests`` with ``requests``."""
        stream = self._sift_stream_instance
        stream.try_send_requests(requests)

    def try_send(self, flow: FlowPy) -> None:
        """Call the stream's synchronous ``try_send`` with ``flow``."""
        stream = self._sift_stream_instance
        stream.try_send(flow)

    def get_flow_descriptor(self, flow_name: str) -> FlowDescriptorPy:
        """Return the stream's flow descriptor for ``flow_name``."""
        stream = self._sift_stream_instance
        return stream.get_flow_descriptor(flow_name)

    async def add_new_flows(self, flow_configs: list[FlowConfigPy]):
        """Await the stream's ``add_new_flows`` with ``flow_configs``."""
        stream = self._sift_stream_instance
        await stream.add_new_flows(flow_configs)

    async def attach_run(self, run_selector: RunSelectorPy):
        """Await the stream's ``attach_run`` with ``run_selector``."""
        stream = self._sift_stream_instance
        await stream.attach_run(run_selector)

    def detach_run(self):
        """Call the stream's ``detach_run``."""
        stream = self._sift_stream_instance
        stream.detach_run()

    def get_run_id(self) -> str | None:
        """Return the value of the stream's ``run()``."""
        stream = self._sift_stream_instance
        return stream.run()

    async def finish(self):
        """Await the stream's ``finish``."""
        stream = self._sift_stream_instance
        await stream.finish()

    def get_metrics_snapshot(self) -> SiftStreamMetricsSnapshotPy:
        """Return the stream's current metrics snapshot."""
        stream = self._sift_stream_instance
        return stream.get_metrics_snapshot()
def _hash_flows(asset_name: str, flows: list[FlowConfig]) -> str:
    """Generate a client key that should be unique but deterministic for the given asset and flow configuration.

    The key is the SHA-256 hex digest of the UTF-8 encoded asset name followed by,
    for each flow in name order, the flow name and every channel's name, data type,
    description, unit, bit field elements and enum types.

    Args:
        asset_name: Name of the asset the flows belong to.
        flows: Flow configurations to fold into the key; input order does not matter.

    Returns:
        The hexadecimal SHA-256 digest.
    """
    # TODO: Taken from sift_py/ingestion/config/telemetry.py. Confirm intent from Marc.
    digest = hashlib.sha256()

    def absorb(text: str) -> None:
        # Feed one text fragment into the running digest.
        digest.update(text.encode())

    absorb(asset_name)
    for flow_config in sorted(flows, key=lambda item: item.name):
        absorb(flow_config.name)
        # Channels are deliberately NOT sorted: their order matters.
        for chan in flow_config.channels:
            absorb(chan.name)
            # The api_format representation of the data type should be consistent between languages.
            absorb(chan.data_type.hash_str(api_format=True))
            absorb(chan.description or "")
            absorb(chan.unit or "")
            # Bit field elements are folded in by ascending index.
            for element in sorted(chan.bit_field_elements or (), key=lambda elem: elem.index):
                absorb(element.name)
                absorb(str(element.index))
                absorb(str(element.bit_count))
            # Enum entries are folded in by ascending key, key first and then name.
            enum_pairs = sorted((chan.enum_types or {}).items(), key=lambda pair: pair[1])
            for label, key in enum_pairs:
                absorb(str(key))
                absorb(label)
    return digest.hexdigest()