Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ repos:
# Flake8: complexity and style checking
# https://flake8.pycqa.org/en/latest/user/using-hooks.html
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
rev: 7.3.0
hooks:
- id: flake8
additional_dependencies: [flake8-docstrings]
Expand Down Expand Up @@ -51,14 +51,14 @@ repos:
h5py>=2.10.0,
wheel>=0.33.1,
numpy<2.0.0,
pandas>=1.1.2,
'pandas>=1.1.2,<3.0.0',
python-dateutil>=2.7.5,
pytz>=2020.1,
pyarrow>=1.0.1,
chardet>=3.0.4,
fastavro>=1.0.0.post1,
python-snappy>=0.7.1,
charset-normalizer>=1.3.6,
'charset-normalizer>=1.3.6,<7.0.0',
psutil>=4.0.0,
scipy>=1.4.1,
requests>=2.28.1,
Expand All @@ -82,11 +82,9 @@ repos:

# requirements-ml.txt
scikit-learn>=0.23.2,
'keras>=2.4.3,<=3.4.0',
'keras>=3.11.0',
rapidfuzz>=2.6.1,
"tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
"tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
"tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
"tensorflow>=2.16.0",
tqdm>=4.0.0,

# requirements-reports.txt
Expand All @@ -101,7 +99,7 @@ repos:
pytest-xdist>=2.1.0,
pytest-forked>=1.3.0,
toolz>=0.10.0,
'memray>=1.7.0,<1.12.0',
'memray>=1.18.0',
]
# Check-manifest: ensures required non-Python files are included in MANIFEST.in
# https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml
Expand Down
7 changes: 4 additions & 3 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ recursive-include dataprofiler *.parquet
recursive-include dataprofiler *.py
recursive-include dataprofiler *.txt

recursive-include resources *.json
recursive-include resources *.pb
recursive-include resources *.py
recursive-include dataprofiler/resources *.json
recursive-include dataprofiler/resources *.pb
recursive-include dataprofiler/resources *.py
recursive-include dataprofiler/resources *.keras

recursive-include dataprofiler/labelers/embeddings *.txt
include versioneer.py
Expand Down
19 changes: 12 additions & 7 deletions dataprofiler/labelers/base_data_labeler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains abstract classes from which labeler classes will inherit."""

from __future__ import annotations

import json
Expand All @@ -9,15 +10,14 @@

import numpy as np
import pandas as pd
import pkg_resources

from dataprofiler._typing import DataArray

from .. import data_readers
from . import data_processing
from . import data_processing, utils
from .base_model import BaseModel

default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
default_labeler_dir = utils.find_resources_dir("labelers")


class BaseDataLabeler:
Expand Down Expand Up @@ -246,7 +246,8 @@ def set_params(self, params: dict) -> None:
self._postprocessor.set_params(**params["postprocessor"])

self.check_pipeline(
skip_postprocessor=self._postprocessor is None, error_on_mismatch=False
skip_postprocessor=self._postprocessor is None,
error_on_mismatch=False,
)

def add_label(self, label: str, same_as: str = None) -> None:
Expand Down Expand Up @@ -438,7 +439,9 @@ def get_parameter_overlap_mismatches(
messages.append(
"Preprocessor and postprocessor value for `{}` do not "
"match. {} != {}".format(
param, preprocessor_params[param], postprocessor_params[param]
param,
preprocessor_params[param],
postprocessor_params[param],
)
)
if messages:
Expand Down Expand Up @@ -490,7 +493,8 @@ def _load_parameters(dirpath: str, load_options: dict = None) -> dict[str, dict]
"The load_options preprocessor class does not "
"match the required DataLabeler preprocessor."
"\n {} != {}".format(
processor_class.__class__.__name__, param_processor_class
processor_class.__class__.__name__,
param_processor_class,
)
)
params["preprocessor"]["class"] = load_options.get("preprocessor_class")
Expand All @@ -505,7 +509,8 @@ def _load_parameters(dirpath: str, load_options: dict = None) -> dict[str, dict]
raise ValueError(
"The load_options postprocessor class does not match "
"the required DataLabeler postprocessor.\n {} != {}".format(
processor_class.__class__.__name__, param_processor_class
processor_class.__class__.__name__,
param_processor_class,
)
)
params["postprocessor"]["class"] = load_options.get("postprocessor_class")
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/labelers/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, BaseModel)
or self._parameters != other._parameters
or self._label_mapping != other._label_mapping
Expand Down
5 changes: 2 additions & 3 deletions dataprofiler/labelers/char_load_tf_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,8 +262,7 @@ def _construct_model(self) -> None:

# Compile the model w/ metrics
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}

losses = ["categorical_crossentropy", None, None]
# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
Expand Down Expand Up @@ -316,7 +315,7 @@ def _reconstruct_model(self) -> None:

# Compile the model
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}
losses = ["categorical_crossentropy", None, None]

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
Expand Down
66 changes: 26 additions & 40 deletions dataprofiler/labelers/character_level_cnn_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,7 @@ def load_from_disk(cls, dirpath: str) -> CharacterLevelCnnModel:
loaded_model._model_default_ind = loaded_model.label_mapping[
loaded_model._parameters["default_label"]
]
loaded_model._compile_loss(loaded_model._model, loaded_model.num_labels)
return loaded_model

@staticmethod
Expand All @@ -475,6 +476,28 @@ def _argmax_threshold_layer(
# matrix.
return ThreshArgMaxLayer(threshold, num_labels, default_ind)

@staticmethod
def _compile_loss(model: tf.keras.Model, num_labels: int) -> None:
    """Compile the model in place with loss, optimizer, and metrics.

    Only the first model output (the softmax layer) is trained; the
    remaining outputs appear to be inference-only, hence their ``None``
    loss entries — NOTE(review): confirm the model always exposes
    exactly three outputs in softmax-first order.

    :param model: keras model to compile (modified in place)
    :param num_labels: number of labels, used to size the F1 metric
    :return: None
    """
    softmax_output_layer_name = model.output_names[0]
    # Keras expects one loss per output; non-trained outputs get None.
    losses = ["categorical_crossentropy", None, None]

    # Track micro-averaged F1 alongside accuracy and raw loss on the
    # softmax output.
    f1_score_training = labeler_utils.F1Score(
        num_classes=num_labels, average="micro"
    )
    metrics = {
        softmax_output_layer_name: [
            "categorical_crossentropy",
            "acc",
            f1_score_training,
        ]
    }

    model.compile(loss=losses, optimizer="adam", metrics=metrics)

def _construct_model(self) -> None:
"""
Construct model for the data labeler.
Expand Down Expand Up @@ -570,24 +593,7 @@ def _construct_model(self) -> None:
final_predicted_layer(argmax_layer, self._model.outputs[0]),
]
self._model = tf.keras.Model(self._model.inputs, argmax_outputs)

# Compile the model
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
)
metrics = {
softmax_output_layer_name: [
"categorical_crossentropy",
"acc",
f1_score_training,
]
}

self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
self._compile_loss(self._model, num_labels)

self._epoch_id = 0
self._model_num_labels = num_labels
Expand Down Expand Up @@ -632,24 +638,7 @@ def _reconstruct_model(self) -> None:
final_predicted_layer(argmax_layer, final_softmax_layer),
]
self._model = tf.keras.Model(self._model.inputs, argmax_outputs)

# Compile the model
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
)
metrics = {
softmax_output_layer_name: [
"categorical_crossentropy",
"acc",
f1_score_training,
]
}

self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
self._compile_loss(self._model, num_labels)
self._epoch_id = 0
self._model_num_labels = num_labels
self._model_default_ind = default_ind
Expand Down Expand Up @@ -699,14 +688,11 @@ def fit(
f1_report: dict = {}

self._model.reset_metrics()
softmax_output_layer_name = self._model.output_names[0]

start_time = time.time()
batch_id = 0
for x_train, y_train in train_data:
model_results = self._model.train_on_batch(
x_train, {softmax_output_layer_name: y_train}
)
model_results = self._model.train_on_batch(x_train, y_train)
sys.stdout.flush()
if verbose:
sys.stdout.write(
Expand Down
5 changes: 3 additions & 2 deletions dataprofiler/labelers/data_labelers.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
"""Module to train and choose between structured and unstructured data labelers."""

from __future__ import annotations

import os

import pandas as pd
import pkg_resources

from .. import data_readers
from . import utils
from .base_data_labeler import BaseDataLabeler, TrainableDataLabeler
from .base_model import BaseModel
from .data_processing import BaseDataPostprocessor, BaseDataPreprocessor

default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
default_labeler_dir = utils.find_resources_dir("labelers")


def train_structured_labeler(
Expand Down
11 changes: 7 additions & 4 deletions dataprofiler/labelers/data_processing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains pre-built processors for data labeling/processing."""

from __future__ import annotations

import abc
Expand All @@ -15,9 +16,11 @@

import numpy as np
import numpy.typing as npt
import pkg_resources

default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
from . import utils

default_labeler_dir = utils.find_resources_dir("labelers")


Processor = TypeVar("Processor", bound="BaseDataProcessor")

Expand Down Expand Up @@ -70,7 +73,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, BaseDataProcessor)
or self._parameters != other._parameters
):
Expand Down Expand Up @@ -1586,7 +1589,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, StructCharPostprocessor)
or self._parameters["default_label"] != other._parameters["default_label"]
or self._parameters["pad_label"] != other._parameters["pad_label"]
Expand Down
22 changes: 22 additions & 0 deletions dataprofiler/labelers/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
"""Contains functions for checking for installations/dependencies."""

import importlib.resources
import sys
import warnings
from pathlib import Path
from typing import Any, Callable, List

try:
# Newer Pythons / newer typeshed
from importlib.resources.abc import Traversable
except ModuleNotFoundError:
# Older Pythons
from importlib.abc import Traversable


def warn_missing_module(labeler_function: str, module_name: str) -> None:
"""
Expand Down Expand Up @@ -50,3 +60,15 @@ def new_f(*args: Any, **kwds: Any) -> Any:
return new_f

return check_module


def find_resources_dir(resource_path: str | Path | None = None) -> Traversable:
"""Return the path to the package resources."""
resource = importlib.resources.files("dataprofiler") / "resources"
if resource_path:
resource /= resource_path

if not (resource.is_file() or resource.is_dir()):
raise FileNotFoundError(f"Resource not found: {resource_path}")

return resource
2 changes: 1 addition & 1 deletion dataprofiler/plugins/decorators.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains function for generating plugins data."""

from collections import defaultdict
from typing import Any, DefaultDict, Dict

Expand All @@ -21,7 +22,6 @@ def __inner_factory_function(fn):
:param fn: Plugin function
:return: function
"""
global plugins_dict
plugins_dict[typ][name] = fn
return fn

Expand Down
9 changes: 5 additions & 4 deletions dataprofiler/tests/labelers/test_char_tf_load_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,13 @@

import numpy as np
import pandas as pd
import pkg_resources
import tensorflow as tf

from dataprofiler.labelers import utils as labeler_utils
from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel

_file_dir = os.path.dirname(os.path.abspath(__file__))
_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers")

default_labeler_dir = labeler_utils.find_resources_dir("labelers")

mock_model_parameters = {
"model_path": "project/example/path/fake_model.h5",
Expand Down Expand Up @@ -303,7 +302,9 @@ def test_param_validation(self, *mocks):
"fake_extra_param": "fails",
}
model = CharLoadTFModel(
self.model_path, label_mapping=self.label_mapping, parameters=parameters
self.model_path,
label_mapping=self.label_mapping,
parameters=parameters,
)
model._construct_model()
self.assertDictEqual(parameters, model._parameters)
Expand Down
Loading
Loading