|
| 1 | +# SPDX-FileCopyrightText: 2024-present MTS PJSC |
| 2 | +# SPDX-License-Identifier: Apache-2.0 |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +from typing import cast |
| 6 | + |
| 7 | +from data_rentgen.consumer.extractors.generic import GenericExtractor |
| 8 | +from data_rentgen.dto import ( |
| 9 | + JobDTO, |
| 10 | + JobTypeDTO, |
| 11 | + OperationDTO, |
| 12 | + OutputTypeDTO, |
| 13 | + RunDTO, |
| 14 | + RunStatusDTO, |
| 15 | + UserDTO, |
| 16 | +) |
| 17 | +from data_rentgen.dto.sql_query import SQLQueryDTO |
| 18 | +from data_rentgen.openlineage.dataset import OpenLineageOutputDataset |
| 19 | +from data_rentgen.openlineage.run_event import OpenLineageRunEvent |
| 20 | +from data_rentgen.openlineage.run_facets import ( |
| 21 | + OpenLineageStarRocksSessionInfoRunFacet, |
| 22 | +) |
| 23 | +from data_rentgen.utils.uuid import extract_timestamp_from_uuid |
| 24 | + |
| 25 | + |
| 26 | +class StarRocksExtractor(GenericExtractor): |
| 27 | + def match(self, event: OpenLineageRunEvent) -> bool: |
| 28 | + return bool(event.job.facets.jobType and event.job.facets.jobType.integration == "STARROCKS") |
| 29 | + |
| 30 | + def is_operation(self, event: OpenLineageRunEvent) -> bool: |
| 31 | + return bool(event.job.facets.jobType and event.job.facets.jobType.jobType == "QUERY") |
| 32 | + |
| 33 | + def extract_pure_run(self, event: OpenLineageRunEvent) -> RunDTO: |
| 34 | + # We treat queries as operations, and operations should be bound to run (session) for grouping. |
| 35 | + # So we create run artificially using starrocks_session facet |
| 36 | + starrocks_session = cast("OpenLineageStarRocksSessionInfoRunFacet", event.run.facets.starrocks_session) |
| 37 | + return RunDTO( |
| 38 | + id=starrocks_session.sessionId, |
| 39 | + job=JobDTO( |
| 40 | + name=f"{starrocks_session.user}@{starrocks_session.clientIp}", |
| 41 | + location=self._extract_job_location(event.job), |
| 42 | + type=JobTypeDTO(type="STARROCKS_SESSION"), |
| 43 | + ), |
| 44 | + parent_run=self.extract_parent_run(event.run.facets.parent) if event.run.facets.parent else None, |
| 45 | + started_at=extract_timestamp_from_uuid(starrocks_session.sessionId), |
| 46 | + user=UserDTO(name=starrocks_session.user), |
| 47 | + ) |
| 48 | + |
| 49 | + def _enrich_run_status(self, run: RunDTO, event: OpenLineageRunEvent): |
| 50 | + if self.is_operation(event): |
| 51 | + # for query events we don't know session start time |
| 52 | + run.status = RunStatusDTO.STARTED |
| 53 | + return run |
| 54 | + |
| 55 | + return super()._enrich_run_status(run, event) |
| 56 | + |
| 57 | + def extract_operation(self, event: OpenLineageRunEvent) -> OperationDTO: |
| 58 | + run = self.extract_run(event) |
| 59 | + |
| 60 | + operation = OperationDTO( |
| 61 | + id=event.run.runId, |
| 62 | + run=run, |
| 63 | + name=event.job.name, |
| 64 | + # no started_at == run.started_at |
| 65 | + type=self._extract_operation_type(event), |
| 66 | + sql_query=self._extract_sql_query(event), |
| 67 | + ) |
| 68 | + self._enrich_operation_status(operation, event) |
| 69 | + return operation |
| 70 | + |
| 71 | + def _extract_output_type( # noqa: PLR0911 |
| 72 | + self, |
| 73 | + operation: OperationDTO, |
| 74 | + dataset: OpenLineageOutputDataset, |
| 75 | + ) -> OutputTypeDTO: |
| 76 | + match operation.sql_query: |
| 77 | + case None: |
| 78 | + return OutputTypeDTO.UNKNOWN |
| 79 | + case SQLQueryDTO(query=query) if query.startswith("INSERT"): |
| 80 | + return OutputTypeDTO.APPEND |
| 81 | + case SQLQueryDTO(query=query) if query.startswith("CREATE"): |
| 82 | + return OutputTypeDTO.CREATE |
| 83 | + case SQLQueryDTO(query=query) if query.startswith("ALTER"): |
| 84 | + return OutputTypeDTO.ALTER |
| 85 | + case SQLQueryDTO(query=query) if query.startswith("DROP"): |
| 86 | + return OutputTypeDTO.DROP |
| 87 | + case SQLQueryDTO(query=query) if query.startswith("TRUNCATE"): |
| 88 | + return OutputTypeDTO.TRUNCATE |
| 89 | + return OutputTypeDTO.UNKNOWN |
0 commit comments