Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
Table Column Count Metric definition
"""

# pylint: disable=duplicate-code

from typing import TYPE_CHECKING, Optional
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
Table Column Count Metric definition
"""

# pylint: disable=duplicate-code

from typing import TYPE_CHECKING, Optional
Expand Down
1 change: 1 addition & 0 deletions ingestion/src/metadata/profiler/metrics/static/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
Count Metric definition
"""

# pylint: disable=duplicate-code

import traceback
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
CountInSet Metric definition
"""

# pylint: disable=duplicate-code
import traceback
from typing import TYPE_CHECKING, List, Optional
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
Distinct Count Metric definition
"""

# pylint: disable=duplicate-code

import json
Expand All @@ -25,6 +26,7 @@
from metadata.generated.schema.configuration.profilerConfiguration import MetricType
from metadata.profiler.metrics.core import StaticMetric, _label
from metadata.profiler.orm.functions.count import CountFn
from metadata.profiler.orm.registry import is_complex_type
from metadata.utils.logger import profiler_logger

logger = profiler_logger()
Expand Down Expand Up @@ -52,6 +54,8 @@ def fn(self):
"""
Distinct Count metric for Sqlalchemy connectors
"""
if is_complex_type(self.col.type):
return None
return func.count(distinct(CountFn(column(self.col.name, self.col.type))))

def df_fn(self, dfs: Optional["PandasRunner"] = None):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@
"""
ILIKE Count Metric definition
"""

# pylint: disable=duplicate-code

from sqlalchemy import case, column

from metadata.generated.schema.configuration.profilerConfiguration import MetricType
from metadata.profiler.metrics.core import StaticMetric, _label
from metadata.profiler.orm.functions.sum import SumFn
from metadata.profiler.orm.registry import is_complex_type


class ILikeCount(StaticMetric):
Expand Down Expand Up @@ -46,6 +48,8 @@ def metric_type(self):

@_label
def fn(self):
if is_complex_type(self.col.type):
return None
if not hasattr(self, "expression"):
raise AttributeError(
"ILike Count requires an expression to be set: add_props(expression=...)(Metrics.iLikeCount)"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@
"""
Like Count Metric definition
"""

# pylint: disable=duplicate-code

from sqlalchemy import case, column

from metadata.generated.schema.configuration.profilerConfiguration import MetricType
from metadata.profiler.metrics.core import StaticMetric, _label
from metadata.profiler.orm.functions.sum import SumFn
from metadata.profiler.orm.registry import is_complex_type


class LikeCount(StaticMetric):
Expand Down Expand Up @@ -46,6 +48,8 @@ def metric_type(self):

@_label
def fn(self):
if is_complex_type(self.col.type):
return None
if not hasattr(self, "expression"):
raise AttributeError(
"Like Count requires an expression to be set: add_props(expression=...)(Metrics.likeCount)"
Expand Down
1 change: 1 addition & 0 deletions ingestion/src/metadata/profiler/metrics/static/max.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
Max Metric definition
"""

from functools import partial
from typing import TYPE_CHECKING, Callable, Optional

Expand Down
10 changes: 5 additions & 5 deletions ingestion/src/metadata/profiler/metrics/static/max_length.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
MAX_LENGTH Metric definition
"""

# pylint: disable=duplicate-code


Expand All @@ -23,7 +24,7 @@
from metadata.profiler.metrics.core import StaticMetric, _label
from metadata.profiler.metrics.pandas_metric_protocol import PandasComputation
from metadata.profiler.orm.functions.length import LenFn
from metadata.profiler.orm.registry import is_concatenable
from metadata.profiler.orm.registry import is_complex_type, is_concatenable
from metadata.utils.logger import profiler_logger

if TYPE_CHECKING:
Expand Down Expand Up @@ -100,14 +101,13 @@ def update_accumulator(
current_max: Optional[int], df: "pd.DataFrame", column
) -> Optional[int]:
"""Computes one DataFrame chunk and updates the running maximum"""
# pylint: disable=import-outside-toplevel
import pandas as pd
from numpy import vectorize

length_vectorize_func = vectorize(len)
chunk_max = None

if is_concatenable(column.type):
max_val = length_vectorize_func(df[column.name].dropna().astype(str)).max()
if is_concatenable(column.type) or is_complex_type(column.type):
max_val = df[column.name].dropna().astype(str).str.len().max()
if not pd.isnull(max_val):
chunk_max = max_val

Expand Down
1 change: 1 addition & 0 deletions ingestion/src/metadata/profiler/metrics/static/mean.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
AVG Metric definition
"""

from functools import partial
from typing import TYPE_CHECKING, Callable, NamedTuple, Optional

Expand Down
1 change: 1 addition & 0 deletions ingestion/src/metadata/profiler/metrics/static/min.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
Min Metric definition
"""

from functools import partial
from typing import TYPE_CHECKING, Callable, Optional

Expand Down
10 changes: 5 additions & 5 deletions ingestion/src/metadata/profiler/metrics/static/min_length.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
MIN_LENGTH Metric definition
"""

# pylint: disable=duplicate-code


Expand All @@ -23,7 +24,7 @@
from metadata.profiler.metrics.core import StaticMetric, _label
from metadata.profiler.metrics.pandas_metric_protocol import PandasComputation
from metadata.profiler.orm.functions.length import LenFn
from metadata.profiler.orm.registry import is_concatenable
from metadata.profiler.orm.registry import is_complex_type, is_concatenable
from metadata.utils.logger import profiler_logger

if TYPE_CHECKING:
Expand Down Expand Up @@ -100,14 +101,13 @@ def update_accumulator(
current_min: Optional[int], df: "pd.DataFrame", column
) -> Optional[int]:
"""Computes one DataFrame chunk and updates the running minimum"""
# pylint: disable=import-outside-toplevel
import pandas as pd
from numpy import vectorize

length_vectorize_func = vectorize(len)
chunk_min = None

if is_concatenable(column.type):
min_val = length_vectorize_func(df[column.name].dropna().astype(str)).min()
if is_concatenable(column.type) or is_complex_type(column.type):
min_val = df[column.name].dropna().astype(str).str.len().min()
if not pd.isnull(min_val):
chunk_min = min_val

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@
"""
Not Like Count Metric definition
"""

# pylint: disable=duplicate-code

from sqlalchemy import case, column

from metadata.generated.schema.configuration.profilerConfiguration import MetricType
from metadata.profiler.metrics.core import StaticMetric, _label
from metadata.profiler.orm.functions.sum import SumFn
from metadata.profiler.orm.registry import is_complex_type


class NotLikeCount(StaticMetric):
Expand Down Expand Up @@ -46,6 +48,8 @@ def metric_type(self):

@_label
def fn(self):
if is_complex_type(self.col.type):
return None
if not hasattr(self, "expression"):
raise AttributeError(
"Not Like Count requires an expression to be set: add_props(expression=...)(Metrics.notLikeCount)"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
Not Regex Count Metric definition
"""

# pylint: disable=duplicate-code

import traceback
Expand All @@ -24,7 +25,7 @@
from metadata.profiler.metrics.pandas_metric_protocol import PandasComputation
from metadata.profiler.orm.functions.regexp import RegexpMatchFn
from metadata.profiler.orm.functions.sum import SumFn
from metadata.profiler.orm.registry import is_concatenable
from metadata.profiler.orm.registry import is_complex_type, is_concatenable
from metadata.utils.logger import profiler_logger

if TYPE_CHECKING:
Expand Down Expand Up @@ -64,6 +65,8 @@ def _is_concatenable(self):
@_label
def fn(self):
"""sqlalchemy function"""
if is_complex_type(self.col.type):
return None
if not hasattr(self, "expression"):
raise AttributeError(
"Not Regex Count requires an expression to be set: add_props(expression=...)(Metrics.notRegexCount)"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
Null Count Metric definition
"""

# pylint: disable=duplicate-code

from typing import TYPE_CHECKING, Optional
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
Null Count Metric definition
"""

# pylint: disable=duplicate-code

from typing import TYPE_CHECKING, Optional
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
Regex Count Metric definition
"""

# pylint: disable=duplicate-code

import traceback
Expand All @@ -24,7 +25,7 @@
from metadata.profiler.metrics.pandas_metric_protocol import PandasComputation
from metadata.profiler.orm.functions.regexp import RegexpMatchFn
from metadata.profiler.orm.functions.sum import SumFn
from metadata.profiler.orm.registry import is_concatenable
from metadata.profiler.orm.registry import is_complex_type, is_concatenable
from metadata.utils.logger import profiler_logger

if TYPE_CHECKING:
Expand Down Expand Up @@ -64,6 +65,8 @@ def _is_concatenable(self):
@_label
def fn(self):
"""sqlalchemy function"""
if is_complex_type(self.col.type):
return None
if not hasattr(self, "expression"):
raise AttributeError(
"Regex Count requires an expression to be set: add_props(expression=...)(Metrics.regexCount)"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
Table Count Metric definition
"""

from typing import TYPE_CHECKING, Callable, Optional

from sqlalchemy import func
Expand Down
1 change: 1 addition & 0 deletions ingestion/src/metadata/profiler/metrics/static/sum.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
SUM Metric definition
"""

from functools import partial
from typing import TYPE_CHECKING, Callable, Optional

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
Unique Count Metric definition
"""

import json
from collections import Counter
from typing import TYPE_CHECKING, Optional
Expand All @@ -23,7 +24,7 @@
from metadata.profiler.metrics.core import QueryMetric
from metadata.profiler.metrics.pandas_metric_protocol import PandasComputation
from metadata.profiler.orm.functions.unique_count import _unique_count_query_mapper
from metadata.profiler.orm.registry import NOT_COMPUTE, Dialects
from metadata.profiler.orm.registry import NOT_COMPUTE, Dialects, is_complex_type
from metadata.utils.logger import profiler_logger

if TYPE_CHECKING:
Expand Down Expand Up @@ -60,7 +61,9 @@ def query(self, sample: Optional[type], session: Optional[Session] = None):
"We are missing the session attribute to compute the UniqueCount."
)

if self.col.type.__class__.__name__ in NOT_COMPUTE:
if self.col.type.__class__.__name__ in NOT_COMPUTE or is_complex_type(
self.col.type
):
return None

# Run all queries on top of the sampled data
Expand Down
Loading
Loading