MTSWebServices
diff --git a/‎data_rentgen/consumer/extractors/batch_extractor.py‎
Lines changed: 16 additions & 3 deletions b/‎data_rentgen/consumer/extractors/batch_extractor.py‎
Lines changed: 16 additions & 3 deletions
diff --git a/‎data_rentgen/consumer/extractors/job.py‎
Lines changed: 3 additions & 3 deletions b/‎data_rentgen/consumer/extractors/job.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎data_rentgen/consumer/extractors/operation.py‎
Lines changed: 11 additions & 5 deletions b/‎data_rentgen/consumer/extractors/operation.py‎
Lines changed: 11 additions & 5 deletions
diff --git a/‎data_rentgen/consumer/openlineage/job_facets/__init__.py‎
Lines changed: 0 additions & 4 deletions b/‎data_rentgen/consumer/openlineage/job_facets/__init__.py‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎data_rentgen/consumer/openlineage/job_facets/job_type.py‎
Lines changed: 4 additions & 44 deletions b/‎data_rentgen/consumer/openlineage/job_facets/job_type.py‎
Lines changed: 4 additions & 44 deletions
diff --git a/‎data_rentgen/consumer/subscribers.py‎
Lines changed: 1 addition & 1 deletion b/‎data_rentgen/consumer/subscribers.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎data_rentgen/db/migrations/versions/2025-04-25_2d2fe3f2f348_add_job_type.py‎
Lines changed: 92 additions & 0 deletions b/‎data_rentgen/db/migrations/versions/2025-04-25_2d2fe3f2f348_add_job_type.py‎
Lines changed: 92 additions & 0 deletions
diff --git a/‎data_rentgen/db/models/__init__.py‎
Lines changed: 2 additions & 1 deletion b/‎data_rentgen/db/models/__init__.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎data_rentgen/db/models/job.py‎
Lines changed: 10 additions & 19 deletions b/‎data_rentgen/db/models/job.py‎
Lines changed: 10 additions & 19 deletions
diff --git a/‎data_rentgen/db/models/job_type.py‎
Lines changed: 23 additions & 0 deletions b/‎data_rentgen/db/models/job_type.py‎
Lines changed: 23 additions & 0 deletions
@@ -8,7 +8,6 @@
 from data_rentgen.consumer.extractors.operation import extract_operation
 from data_rentgen.consumer.extractors.output import extract_output
 from data_rentgen.consumer.extractors.run import extract_run
-from data_rentgen.consumer.openlineage.job_facets.job_type import OpenLineageJobType
 from data_rentgen.consumer.openlineage.run_event import OpenLineageRunEvent
 from data_rentgen.dto import (
     DatasetDTO,
@@ -22,13 +21,27 @@ def __init__(self) -> None:
 
     def add_events(self, events: list[OpenLineageRunEvent]) -> BatchExtractionResult:
+        """Feed each event into operation or run extraction and return ``self.result``."""
         for event in events:
-            if event.job.facets.jobType and event.job.facets.jobType.jobType == OpenLineageJobType.JOB:
+            # classification lives in is_operation() instead of an inline jobType comparison
+            if self.is_operation(event):
                 self.extract_operation(event)
             else:
                 self.extract_run(event)
-
         return self.result
 
+    def is_operation(self, event: OpenLineageRunEvent) -> bool:
+        """Tell whether ``event`` should be extracted as an operation instead of a run.
+
+        - no jobType facet, or an integration other than SPARK/AIRFLOW:
+          operation only if the event has inputs or outputs;
+        - SPARK: operation unless jobType is "APPLICATION";
+        - AIRFLOW: operation only if jobType is "TASK".
+        """
+        # fallback signal used whenever the facet cannot decide
+        has_lineage = bool(event.inputs or event.outputs)
+
+        job_type_facet = event.job.facets.jobType
+        if not job_type_facet:
+            return has_lineage
+
+        # NOTE(review): integration/jobType are compared case-sensitively here - confirm
+        # producers always send upper-case values
+        if job_type_facet.integration == "SPARK":
+            return job_type_facet.jobType != "APPLICATION"
+
+        if job_type_facet.integration == "AIRFLOW":
+            return job_type_facet.jobType == "TASK"
+
+        return has_lineage
+
     def extract_run(self, event: OpenLineageRunEvent) -> None:
         run = extract_run(event)
         self.result.add_run(run)
 
@@ -5,7 +5,7 @@
 
 from data_rentgen.consumer.openlineage.job import OpenLineageJob
 from data_rentgen.consumer.openlineage.run_facets import OpenLineageParentJob
-from data_rentgen.dto import JobDTO, JobTypeDTO, LocationDTO
+from data_rentgen.dto import JobDTO, LocationDTO
 
 
 def extract_parent_job(job: OpenLineageParentJob) -> JobDTO:
@@ -34,10 +34,10 @@ def extract_job_location(job: OpenLineageJob | OpenLineageParentJob) -> Location
     )
 
 
-def extract_job_type(job: OpenLineageJob) -> JobTypeDTO | None:
+def extract_job_type(job: OpenLineageJob) -> str | None:
+    """Build the upper-cased job type name from the job's jobType facet.
+
+    Returns ``"<INTEGRATION>_<JOBTYPE>"``, only ``"<INTEGRATION>"`` when the facet
+    carries no jobType, or ``None`` when the job has no jobType facet at all.
+    """
     if job.facets.jobType:
         job_type = job.facets.jobType.jobType
         integration_type = job.facets.jobType.integration
-        return JobTypeDTO(f"{integration_type}_{job_type}")
+        # jobType is optional in the facet; formatting None would produce "<INTEGRATION>_NONE"
+        if not job_type:
+            return integration_type.upper()
+        return f"{integration_type}_{job_type}".upper()
 
     return None
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2024-2025 MTS PJSC
 # SPDX-License-Identifier: Apache-2.0
 
-from data_rentgen.consumer.extractors.run import extract_parent_run
+from data_rentgen.consumer.extractors.run import extract_parent_run, extract_run
 from data_rentgen.consumer.openlineage.run_event import (
     OpenLineageRunEvent,
     OpenLineageRunEventType,
@@ -10,21 +10,27 @@
 
 
 def extract_operation(event: OpenLineageRunEvent) -> OperationDTO:
-    # operation always has parent
-    run = extract_parent_run(event.run.facets.parent)  # type: ignore[arg-type]
+    if event.run.facets.parent and event.job.facets.jobType and event.job.facets.jobType.integration == "SPARK":
+        run = extract_parent_run(event.run.facets.parent)
+    else:
+        run = extract_run(event)
 
     # in some cases, operation name may contain raw SELECT query with newlines
     operation_name = " ".join(line.strip() for line in event.job.name.splitlines()).strip()
     # remove parent job name from operation name
-    if operation_name.startswith(run.job.name):
+    if operation_name != run.job.name and operation_name.startswith(run.job.name):
         prefix = len(run.job.name) + 1
         operation_name = operation_name[prefix:]
 
+    type_: OperationTypeDTO = OperationTypeDTO.BATCH
+    if event.job.facets.jobType:
+        type_ = OperationTypeDTO(event.job.facets.jobType.processingType)
+
     operation = OperationDTO(
         id=event.run.runId,  # type: ignore [arg-type]
         run=run,
         name=operation_name,
-        type=OperationTypeDTO(event.job.facets.jobType.processingType) if event.job.facets.jobType else None,
+        type=type_,
     )
     enrich_operation_status(operation, event)
     enrich_operation_description(operation, event)
 
@@ -7,19 +7,15 @@
     OpenLineageDocumentationJobFacet,
 )
 from data_rentgen.consumer.openlineage.job_facets.job_type import (
-    OpenLineageJobIntegrationType,
     OpenLineageJobProcessingType,
-    OpenLineageJobType,
     OpenLineageJobTypeJobFacet,
 )
 
 __all__ = [
     "OpenLineageDocumentationJobFacet",
     "OpenLineageJobFacet",
     "OpenLineageJobFacets",
-    "OpenLineageJobIntegrationType",
     "OpenLineageJobProcessingType",
-    "OpenLineageJobType",
     "OpenLineageJobTypeJobFacet",
 ]
 
 
@@ -3,64 +3,24 @@
 
 from enum import Enum
 
-from pydantic import field_validator
-
 from data_rentgen.consumer.openlineage.job_facets.base import OpenLineageJobFacet
 
 
-class OpenLineageJobIntegrationType(str, Enum):
-    """Integration where job is running.
-    See [JobTypeJobFacet](https://github.com/OpenLineage/OpenLineage/blob/main/spec/facets/JobTypeJobFacet.json).
-    """
-
-    SPARK = "SPARK"
-    AIRFLOW = "AIRFLOW"
-
-    def __str__(self) -> str:
-        return self.value
-
-
-class OpenLineageJobType(str, Enum):
-    """Job type.
-    See [JobTypeJobFacet](https://github.com/OpenLineage/OpenLineage/blob/main/spec/facets/JobTypeJobFacet.json).
-    """
-
-    APPLICATION = "APPLICATION"
-    JOB = "JOB"
-    DAG = "DAG"
-    TASK = "TASK"
-
-    def __str__(self) -> str:
-        return self.value
-
-    @classmethod
-    def _missing_(cls, value):
-        if value in {"SQL_JOB", "RDD_JOB"}:
-            return cls.JOB
-        return None
-
-
 class OpenLineageJobProcessingType(str, Enum):
     """Job processing type.
     See [JobTypeJobFacet](https://github.com/OpenLineage/OpenLineage/blob/main/spec/facets/JobTypeJobFacet.json).
     """
 
     BATCH = "BATCH"
     STREAMING = "STREAMING"
+    # the literal "NONE" is kept as its own member rather than being converted to None
+    NONE = "NONE"
 
 
 class OpenLineageJobTypeJobFacet(OpenLineageJobFacet):
     """Job facet describing job type.
     See [JobTypeJobFacet](https://github.com/OpenLineage/OpenLineage/blob/main/spec/facets/JobTypeJobFacet.json).
     """
 
-    integration: OpenLineageJobIntegrationType
-    jobType: OpenLineageJobType
-    processingType: OpenLineageJobProcessingType | None = None
-
-    @field_validator("processingType", mode="before")
-    @classmethod
-    def _validate_processing_type(cls, processing_type: str):
-        if processing_type == "NONE":
-            return None
-        return processing_type
+    # required; must be one of the OpenLineageJobProcessingType members
+    processingType: OpenLineageJobProcessingType
+    # required; plain string, no longer limited to a fixed enum of integrations
+    integration: str
+    # optional; plain string, may be absent from the facet
+    jobType: str | None = None
@@ -65,7 +65,7 @@ async def extract_events(
             event = OpenLineageRunEventAdapter.validate_json(message.value)
             extractor.add_events([event])
         except (ValueError, TypeError):
-            logger.error(  # noqa: TRY400
+            logger.exception(
                 "Failed to parse message: ConsumerRecord(topic=%r, partition=%d, offset=%d)",
                 message.topic,
                 message.partition,
 
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: 2024-2025 MTS PJSC
+# SPDX-License-Identifier: Apache-2.0
+"""Add job_type
+
+Revision ID: 2d2fe3f2f348
+Revises: 976168ee4f16
+Create Date: 2025-04-25 15:09:17.556969
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "2d2fe3f2f348"
+down_revision = "976168ee4f16"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Create the ``job_type`` lookup table and turn ``job.type`` (string) into ``job.type_id`` (bigint)."""
+    op.create_table(
+        "job_type",
+        sa.Column("id", sa.BigInteger(), nullable=False),
+        sa.Column("type", sa.String(), nullable=False),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk__job_type")),
+        sa.UniqueConstraint("type", name=op.f("uq__job_type__type")),
+    )
+    op.create_index(op.f("ix__job_type__type"), "job_type", ["type"], unique=False)
+
+    # seed the previously known types with fixed ids; the CASE expression below relies on them
+    op.execute(
+        sa.text(
+            """
+        INSERT INTO
+            job_type (id, type)
+        VALUES
+            (0, 'UNKNOWN'),
+            (1, 'SPARK_APPLICATION'),
+            (2, 'AIRFLOW_DAG'),
+            (3, 'AIRFLOW_TASK');
+        """,
+        ),
+    )
+    # ids above were inserted explicitly, so move the sequence past the last seeded id
+    op.execute(sa.text("ALTER SEQUENCE job_type_id_seq RESTART WITH 4;"))
+
+    # presumably keeps other sessions away from job while the column is rewritten - TODO confirm
+    op.execute(sa.text("LOCK TABLE job IN ACCESS EXCLUSIVE MODE;"))
+    op.drop_index("ix__job__type", table_name="job")
+    # rename + retype in one step; any value not listed in the CASE maps to 0 (UNKNOWN)
+    op.alter_column(
+        "job",
+        "type",
+        new_column_name="type_id",
+        existing_type=sa.String(length=32),
+        type_=sa.BigInteger(),
+        nullable=False,
+        postgresql_using="""
+            CASE
+                WHEN type = 'SPARK_APPLICATION'
+                    THEN 1
+                WHEN type = 'AIRFLOW_DAG'
+                    THEN 2
+                WHEN type = 'AIRFLOW_TASK'
+                    THEN 3
+                ELSE 0
+            END
+        """,
+    )
+    op.create_index(op.f("ix__job__type_id"), "job", ["type_id"], unique=False)
+    # NOTE(review): the Job model declares ForeignKey("job_type.id", ondelete="CASCADE") on type_id,
+    # but no foreign key is created here - confirm whether that is intentional
+
+
+def downgrade() -> None:
+    """Turn ``job.type_id`` (bigint) back into ``job.type`` (string) and drop the ``job_type`` table."""
+    op.execute(sa.text("LOCK TABLE job IN ACCESS EXCLUSIVE MODE;"))
+    op.drop_index(op.f("ix__job__type_id"), table_name="job")
+    # after this step job.type holds the former numeric id rendered as text
+    op.alter_column(
+        "job",
+        "type_id",
+        new_column_name="type",
+        existing_type=sa.BigInteger(),
+        type_=sa.String(length=32),
+        nullable=False,
+    )
+    # job.type is a varchar now while job_type.id is bigint; PostgreSQL has no
+    # "bigint = character varying" operator, so cast the stored id text back explicitly
+    op.execute(
+        sa.text(
+            """
+        UPDATE job
+        SET type = (SELECT job_type.type FROM job_type WHERE job_type.id = CAST(job.type AS bigint));
+        """,
+        ),
+    )
+    op.create_index("ix__job__type", "job", ["type"], unique=False)
+
+    op.drop_index(op.f("ix__job_type__type"), table_name="job_type")
+    op.drop_table("job_type")
@@ -13,7 +13,8 @@
 )
 from data_rentgen.db.models.dataset_symlink import DatasetSymlink, DatasetSymlinkType
 from data_rentgen.db.models.input import Input
-from data_rentgen.db.models.job import Job, JobType
+from data_rentgen.db.models.job import Job
+from data_rentgen.db.models.job_type import JobType
 from data_rentgen.db.models.location import Location
 from data_rentgen.db.models.operation import Operation, OperationStatus, OperationType
 from data_rentgen.db.models.output import Output, OutputType
 
@@ -3,27 +3,15 @@
 
 from __future__ import annotations
 
-from enum import Enum
-
-from sqlalchemy import BigInteger, Computed, ForeignKey, Index, String, UniqueConstraint
+from sqlalchemy import BigInteger, Computed, ForeignKey, Index, String, UniqueConstraint, select
 from sqlalchemy.dialects.postgresql import TSVECTOR
-from sqlalchemy.orm import Mapped, mapped_column, relationship
-from sqlalchemy_utils import ChoiceType
+from sqlalchemy.orm import Mapped, column_property, mapped_column, relationship
 
 from data_rentgen.db.models.base import Base
+from data_rentgen.db.models.job_type import JobType
 from data_rentgen.db.models.location import Location
 
 
-class JobType(str, Enum):
-    AIRFLOW_DAG = "AIRFLOW_DAG"
-    AIRFLOW_TASK = "AIRFLOW_TASK"
-    SPARK_APPLICATION = "SPARK_APPLICATION"
-    UNKNOWN = "UNKNOWN"
-
-    def __str__(self) -> str:
-        return self.value
-
-
 class Job(Base):
     __tablename__ = "job"
     __table_args__ = (
@@ -49,12 +37,12 @@ class Job(Base):
         doc="Job name, e.g. Airflow DAG name + task name, or Spark applicationName",
     )
 
-    type: Mapped[JobType] = mapped_column(
-        ChoiceType(JobType, impl=String(32)),
+    type_id: Mapped[int] = mapped_column(
+        BigInteger,
+        ForeignKey("job_type.id", ondelete="CASCADE"),
         index=True,
         nullable=False,
-        default=JobType.UNKNOWN,
-        doc="Job type, e.g. AIRFLOW_DAG, AIRFLOW_TASK, SPARK_APPLICATION",
+        doc="Job type",
     )
 
     search_vector: Mapped[str] = mapped_column(
@@ -76,3 +64,6 @@ class Job(Base):
         deferred=True,
         doc="Full-text search vector",
     )
+
+
+# Expose the type name on Job: a scalar subquery selecting JobType.type for the row whose id
+# equals Job.type_id. Assigned after the class body because the expression refers to Job itself.
+Job.type = column_property(select(JobType.type).where(Job.type_id == JobType.id).scalar_subquery())
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: 2024-2025 MTS PJSC
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from sqlalchemy import BigInteger, String, UniqueConstraint
+from sqlalchemy.orm import Mapped, mapped_column
+
+from data_rentgen.db.models.base import Base
+
+
+class JobType(Base):
+    """Lookup table of job type names; ``Job.type_id`` references ``job_type.id``."""
+
+    __tablename__ = "job_type"
+    # NOTE(review): "type" also has index=True below; a unique constraint is normally backed by
+    # its own index in PostgreSQL, so the extra index may be redundant - confirm
+    __table_args__ = (UniqueConstraint("type"),)
+
+    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
+
+    type: Mapped[str] = mapped_column(
+        String,
+        index=True,
+        nullable=False,
+        doc="Job type, e.g. SPARK_APPLICATION, AIRFLOW_DAG",
+    )
Original file line number	Diff line number	Diff line change
`@@ -7,19 +7,15 @@`
`7`	`7`	`OpenLineageDocumentationJobFacet,`
`8`	`8`	`)`
`9`	`9`	`from data_rentgen.consumer.openlineage.job_facets.job_type import (`
`10`		`- OpenLineageJobIntegrationType,`
`11`	`10`	`OpenLineageJobProcessingType,`
`12`		`- OpenLineageJobType,`
`13`	`11`	`OpenLineageJobTypeJobFacet,`
`14`	`12`	`)`
`15`	`13`
`16`	`14`	`__all__ = [`
`17`	`15`	`"OpenLineageDocumentationJobFacet",`
`18`	`16`	`"OpenLineageJobFacet",`
`19`	`17`	`"OpenLineageJobFacets",`
`20`		`- "OpenLineageJobIntegrationType",`
`21`	`18`	`"OpenLineageJobProcessingType",`
`22`		`- "OpenLineageJobType",`
`23`	`19`	`"OpenLineageJobTypeJobFacet",`
`24`	`20`	`]`
`25`	`21`