Skip to content

Commit 219c568

Browse files
authored
ISSUE #3032 (#27912)
* feat: move flat sampling to sampling config + dynamic sampling option * feat: move flat sampling on the backend to sample profile conifg object * feat: fix circular import * feat: align UI with new profiler config * feat: fix json schema * feat: align python imports with new schema path * feat: update migration to look at extension * feat: remove enable * feat: remove enable * feat: added titles to sample config * feat: generated ts classes * feat: addressed comments * feat: change sample config instantiation to match new structure * feat: removed backward compatible fields * feat: ran java linting * feat: updated imports to point to generated files * feat: added dynamic sampler resolution logic * feat: ran python linting * feat: remove duplicate migration * chore: merge upstream and clean conflicts * feat: update logic to support dynamic and static sampling * feat: adjust sample config call * feat: test for statis, dynamic, row count and tier methods * feat: more sample config unit tests * feat: added tests for metric and sampling * feat: added tests to validate fallback is not called i nmetric computers * feat: strengthen profiler validation tests * feat: fix sampling config * feat: fix sampling config * feat: fix sampling config * feat: generated typescript models * feat: fixed missing dq pipeline migration * feat: fixed static check * feat: fixed ci failures * feat: fixed ci failures * feat: fixed unit tests faioure and linting * feat: fixed integration tests failures * chore: fixe burstiq refactor * chore: fix trino ci failures * chore: revert baseline.json file * chore: fix sampler availabl burst iq changes * feat: added smart sampling radio button * feat: ignore static checks errors * feat: ran ts linting * feat burstiq infinite recursion issue with dynamic as default * feat: translate i8n keys * feat: fix failing tests
1 parent b42c9ad commit 219c568

94 files changed

Lines changed: 3513 additions & 544 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

bootstrap/sql/migrations/native/1.13.0/mysql/schemaChanges.sql

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,42 @@ WHERE pipelineType = 'profiler'
216216
OR JSON_CONTAINS_PATH(json, 'one', '$.sourceConfig.config.profileSampleType')
217217
OR JSON_CONTAINS_PATH(json, 'one', '$.sourceConfig.config.samplingMethodType'));
218218

219+
-- ingestion_pipeline_entity (testSuite pipelines): build profileSampleConfig (skip if already migrated)
UPDATE ingestion_pipeline_entity
SET json = JSON_SET(
        json,
        '$.sourceConfig.config.profileSampleConfig',
        JSON_OBJECT(
            'sampleConfigType', 'STATIC',
            'config', JSON_OBJECT(
                -- carry the legacy flat value over unchanged
                'profileSample', JSON_EXTRACT(json, '$.sourceConfig.config.profileSample'),
                -- default the sample type to PERCENTAGE when the legacy field is absent
                'profileSampleType', COALESCE(
                    JSON_EXTRACT(json, '$.sourceConfig.config.profileSampleType'),
                    CAST('"PERCENTAGE"' AS JSON)
                ),
                'samplingMethodType', JSON_EXTRACT(json, '$.sourceConfig.config.samplingMethodType')
            )
        )
    )
WHERE pipelineType = 'testSuite'
  -- only rows that still carry a non-null legacy value and were not migrated yet
  AND JSON_EXTRACT(json, '$.sourceConfig.config.profileSample') IS NOT NULL
  AND JSON_TYPE(JSON_EXTRACT(json, '$.sourceConfig.config.profileSample')) != 'NULL'
  AND NOT JSON_CONTAINS_PATH(json, 'one', '$.sourceConfig.config.profileSampleConfig');

-- ingestion_pipeline_entity (testSuite pipelines): remove old flat fields
UPDATE ingestion_pipeline_entity
SET json = JSON_REMOVE(
        JSON_REMOVE(
            JSON_REMOVE(json, '$.sourceConfig.config.samplingMethodType'),
            '$.sourceConfig.config.profileSampleType'
        ),
        '$.sourceConfig.config.profileSample'
    )
WHERE pipelineType = 'testSuite'
  AND (JSON_CONTAINS_PATH(json, 'one', '$.sourceConfig.config.profileSample')
       OR JSON_CONTAINS_PATH(json, 'one', '$.sourceConfig.config.profileSampleType')
       OR JSON_CONTAINS_PATH(json, 'one', '$.sourceConfig.config.samplingMethodType'));
254+
219255
-- RDF distributed indexing state tables
220256
CREATE TABLE IF NOT EXISTS rdf_index_job (
221257
id VARCHAR(36) NOT NULL,

bootstrap/sql/migrations/native/1.13.0/postgres/schemaChanges.sql

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,37 @@ WHERE json #>> '{pipelineType}' = 'profiler'
235235
OR json::jsonb #>> '{sourceConfig,config,profileSampleType}' IS NOT NULL
236236
OR json::jsonb #>> '{sourceConfig,config,samplingMethodType}' IS NOT NULL);
237237

238+
-- ingestion_pipeline_entity (testSuite pipelines): build profileSampleConfig (skip if already migrated)
UPDATE ingestion_pipeline_entity
SET json = jsonb_set(
        json::jsonb,
        '{sourceConfig,config,profileSampleConfig}',
        jsonb_build_object(
            'sampleConfigType', 'STATIC',
            'config', jsonb_build_object(
                -- carry the legacy flat value over unchanged
                'profileSample', json::jsonb #> '{sourceConfig,config,profileSample}',
                -- default the sample type to PERCENTAGE when the legacy field is absent
                'profileSampleType', COALESCE(
                    json::jsonb #> '{sourceConfig,config,profileSampleType}',
                    '"PERCENTAGE"'::jsonb
                ),
                'samplingMethodType', json::jsonb #> '{sourceConfig,config,samplingMethodType}'
            )
        )
    )::json
WHERE json #>> '{pipelineType}' = 'testSuite'
  -- #>> yields SQL NULL for both a missing key and a JSON null, so one guard covers both
  AND json::jsonb #>> '{sourceConfig,config,profileSample}' IS NOT NULL
  AND json::jsonb #> '{sourceConfig,config,profileSampleConfig}' IS NULL;

-- ingestion_pipeline_entity (testSuite pipelines): remove old flat fields
UPDATE ingestion_pipeline_entity
SET json = (json::jsonb #- '{sourceConfig,config,profileSample}'
                        #- '{sourceConfig,config,profileSampleType}'
                        #- '{sourceConfig,config,samplingMethodType}')::json
WHERE json #>> '{pipelineType}' = 'testSuite'
  AND (json::jsonb #>> '{sourceConfig,config,profileSample}' IS NOT NULL
       OR json::jsonb #>> '{sourceConfig,config,profileSampleType}' IS NOT NULL
       OR json::jsonb #>> '{sourceConfig,config,samplingMethodType}' IS NOT NULL);
268+
238269
-- RDF distributed indexing state tables
239270
CREATE TABLE IF NOT EXISTS rdf_index_job (
240271
id VARCHAR(36) NOT NULL,

ingestion/setup.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,8 @@
422422
# TODO: Remove once no unit test requires testcontainers
423423
"testcontainers",
424424
VERSIONS["factory-boy"],
425+
*plugins["exasol"],
426+
*plugins["teradata"],
425427
}
426428

427429
test = {
@@ -490,6 +492,8 @@
490492
VERSIONS["kafka-connect"],
491493
VERSIONS["factory-boy"],
492494
"locust~=2.32.0",
495+
*plugins["exasol"],
496+
*plugins["teradata"],
493497
}
494498

495499
docs = {

ingestion/src/metadata/data_quality/runner/base_test_suite_source.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,7 @@
3434
from metadata.generated.schema.type.entityReference import EntityReference
3535
from metadata.ingestion.ometa.ometa_api import OpenMetadata
3636
from metadata.sampler.models import (
37-
ProfileSampleConfig,
38-
ProfileSampleConfigType,
3937
SampleConfig,
40-
StaticSamplingConfig,
4138
)
4239
from metadata.sampler.sampler_interface import SamplerInterface # noqa: TC001
4340
from metadata.utils.bigquery_utils import copy_service_config
@@ -126,15 +123,8 @@ def create_data_quality_interface(self) -> TestSuiteInterface:
126123
schema_entity=schema_entity,
127124
database_entity=database_entity,
128125
default_sample_config=SampleConfig(
129-
profileSampleConfig=ProfileSampleConfig(
130-
sampleConfigType=ProfileSampleConfigType.STATIC,
131-
config=StaticSamplingConfig(
132-
profileSample=self.source_config.profileSample,
133-
profileSampleType=self.source_config.profileSampleType,
134-
samplingMethodType=self.source_config.samplingMethodType,
135-
),
136-
)
137-
if self.source_config.profileSample
126+
profileSampleConfig=self.source_config.profileSampleConfig
127+
if self.source_config.profileSampleConfig
138128
else None,
139129
),
140130
)

ingestion/src/metadata/data_quality/validations/column/sqlalchemy/columnValuesToBeUnique.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def _run_results(self, metric: Metrics, column: Column) -> Optional[int]: # noq
7575
"""
7676
count = Metrics.valuesCount.value(column).fn()
7777
grouped_cte = (
78-
select(count.label(column.name)).select_from(self.runner.dataset).group_by(column).cte("grouped_cte")
78+
select(count.label(column.name)).select_from(self.runner.dataset).group_by(column).cte("grouped_cte") # type: ignore
7979
)
8080
unique_count = Metrics.uniqueCount.value(column).query(
8181
sample=self.runner.dataset,

ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableDiff.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
from metadata.profiler.orm.functions.md5 import MD5
5757
from metadata.profiler.orm.functions.substr import Substr
5858
from metadata.profiler.orm.registry import Dialects, PythonDialects
59+
from metadata.sampler.config import resolve_static_sampling_config
5960
from metadata.utils.collections import CaseInsensitiveList
6061
from metadata.utils.credentials import normalize_pem_string
6162
from metadata.utils.logger import test_suite_logger
@@ -443,9 +444,12 @@ def sample_where_clause(self) -> Tuple[Optional[str], Optional[str]]: # noqa: U
443444
return None, None
444445
profile_sample_config = config.profileSampleConfig if config else None
445446
sample_config = profile_sample_config.root if profile_sample_config else None
446-
static = sample_config.config if sample_config else None
447-
profile_sample = getattr(static, "profileSample", None) if static else None
448-
profile_sample_type = getattr(static, "profileSampleType", None) if static else None
447+
static = resolve_static_sampling_config(
448+
sample_config=sample_config,
449+
row_count=self.get_total_row_count(),
450+
)
451+
profile_sample = static.profileSample if static else None
452+
profile_sample_type = static.profileSampleType if static else None
449453
if profile_sample is None or (profile_sample_type == ProfileSampleType.PERCENTAGE and profile_sample == 100):
450454
return None, None
451455
if DatabaseServiceType.Mssql in [
@@ -490,16 +494,19 @@ def calculate_nounce(self, max_nounce=2**32 - 1) -> int:
490494
config = self.runtime_params.table_profile_config
491495
profile_sample_config = config.profileSampleConfig if config else None
492496
sample_config = profile_sample_config.root if profile_sample_config else None
493-
static = sample_config.config if sample_config else None
494-
profile_sample = getattr(static, "profileSample", 100)
495-
profile_sample_type = getattr(static, "profileSampleType", None)
497+
row_count = self.get_total_row_count()
498+
static = resolve_static_sampling_config(
499+
sample_config=sample_config,
500+
row_count=row_count,
501+
)
502+
profile_sample = static.profileSample if static else None
503+
profile_sample_type = static.profileSampleType if static else None
496504
if profile_sample_type == ProfileSampleType.PERCENTAGE:
497-
return int(max_nounce * profile_sample / 100)
505+
return int(max_nounce * ((profile_sample or 100) / 100))
498506
if profile_sample_type == ProfileSampleType.ROWS:
499-
row_count = self.get_total_row_count()
500507
if row_count is None:
501508
raise ValueError("Row count is required for ROWS profile sample type")
502-
return int(max_nounce * (profile_sample / row_count))
509+
return int(max_nounce * ((profile_sample or row_count) / row_count))
503510
raise ValueError("Invalid profile sample type")
504511

505512
def get_row_diff_test_case_result(

ingestion/src/metadata/great_expectations/action.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from typing import Dict, List, Optional, Union, cast # noqa: UP035
2323

2424
from great_expectations.checkpoint.actions import ValidationAction
25-
from great_expectations.core import ExpectationConfiguration
25+
from great_expectations.core import ExpectationConfiguration # type: ignore
2626
from great_expectations.core.batch import Batch
2727
from great_expectations.core.batch_spec import (
2828
RuntimeDataBatchSpec,
@@ -32,8 +32,8 @@
3232
from great_expectations.core.expectation_validation_result import (
3333
ExpectationSuiteValidationResult,
3434
)
35-
from great_expectations.data_asset.data_asset import DataAsset
36-
from great_expectations.data_context.data_context import DataContext
35+
from great_expectations.data_asset.data_asset import DataAsset # type: ignore
36+
from great_expectations.data_context.data_context import DataContext # type: ignore
3737

3838
from metadata.generated.schema.type.basic import Timestamp
3939

@@ -115,7 +115,7 @@ def __init__(
115115
table_name: Optional[str] = None, # noqa: UP045
116116
expectation_suite_table_config_map: Optional[Dict[str, Dict[str, str]]] = None, # noqa: UP006, UP045
117117
):
118-
super().__init__(data_context, name=name)
118+
super().__init__(data_context, name=name) # type: ignore
119119
self.database_service_name = database_service_name
120120
self.database_name = database_name
121121
self.table_name = table_name
@@ -153,7 +153,7 @@ def _run( # pylint: disable=unused-argument
153153
expectation_suite_name = None
154154

155155
if expectation_suite_identifier:
156-
expectation_suite_name = expectation_suite_identifier.expectation_suite_name
156+
expectation_suite_name = expectation_suite_identifier.expectation_suite_name # type: ignore
157157
self.expectation_suite = self.data_context.get_expectation_suite(expectation_suite_name)
158158

159159
check_point_spec = self._get_checkpoint_batch_spec(data_asset)
@@ -182,7 +182,7 @@ def _run( # pylint: disable=unused-argument
182182

183183
if table_entity:
184184
for result in validation_result_suite.results:
185-
self._handle_test_case(result, table_entity)
185+
self._handle_test_case(result, table_entity) # type: ignore
186186

187187
@staticmethod
188188
def _get_checkpoint_batch_spec(

ingestion/src/metadata/great_expectations/action1xx.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,15 @@
2222
from typing import Dict, List, Literal, Optional, Union, cast # noqa: UP035
2323

2424
from great_expectations.checkpoint import (
25-
ActionContext,
26-
CheckpointResult,
27-
ValidationAction,
25+
ActionContext, # type: ignore
26+
CheckpointResult, # type: ignore
27+
ValidationAction, # type: ignore
2828
)
2929
from great_expectations.core.batch import Batch
3030
from great_expectations.core.expectation_validation_result import (
3131
ExpectationSuiteValidationResultMeta,
3232
)
33-
from great_expectations.datasource.fluent import DataAsset
33+
from great_expectations.datasource.fluent import DataAsset # type: ignore
3434
from great_expectations.validator.validator import Validator
3535
from sqlalchemy.engine.base import Connection, Engine
3636
from sqlalchemy.engine.url import URL
@@ -83,7 +83,7 @@ class OpenMetadataValidationAction1xx(ValidationAction):
8383
Format: {"suite_name": {"database_name": "db", "schema_name": "schema", "table_name": "table"}}
8484
"""
8585

86-
type: Literal["open_metadata_validation_action"] = "open_metadata_validation_action"
86+
type: Literal["open_metadata_validation_action"] = "open_metadata_validation_action" # type: ignore
8787
name: str = "OpenMetadataValidationAction"
8888
config_file_path: Optional[str] = None # noqa: UP045
8989
database_service_name: Optional[str] = None # noqa: UP045
@@ -153,7 +153,7 @@ def run(
153153

154154
if table_entity:
155155
for result in v.results:
156-
self._handle_test_case(result, table_entity)
156+
self._handle_test_case(result, table_entity) # type: ignore
157157

158158
@staticmethod
159159
def _get_checkpoint_batch_spec(meta: Union[ExpectationSuiteValidationResultMeta, dict]): # noqa: UP007

ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ def __read_manifest(
314314
# For remote repositories, clone the dependency as before
315315
url_parsed = giturlparse.parse(remote_git_url)
316316
_clone_repo(
317-
f"{url_parsed.owner}/{url_parsed.repo}", # pylint: disable=E1101
317+
f"{url_parsed.owner}/{url_parsed.repo}", # type: ignore
318318
f"{repo.path}/{IMPORTED_PROJECTS_DIR}/{remote_name}",
319319
credentials,
320320
)
Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from metadata.ingestion.source.database.presto.metadata import PrestoSource
22
from metadata.utils.service_spec.default import DefaultDatabaseSpec
33

4-
ServiceSpec = DefaultDatabaseSpec(metadata_source_class=PrestoSource)
4+
ServiceSpec = DefaultDatabaseSpec(
5+
metadata_source_class=PrestoSource, # type: ignore
6+
)

0 commit comments

Comments
 (0)