Skip to content

Commit 651ad6a

Browse files
committed
refactor: losses for keras and tests
1 parent b4f3201 commit 651ad6a

8 files changed

Lines changed: 38 additions & 56 deletions

.pre-commit-config.yaml

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ repos:
2121
# Flake8: complexity and style checking
2222
# https://flake8.pycqa.org/en/latest/user/using-hooks.html
2323
- repo: https://github.com/pycqa/flake8
24-
rev: 4.0.1
24+
rev: 7.3.0
2525
hooks:
2626
- id: flake8
2727
additional_dependencies: [flake8-docstrings]
@@ -58,7 +58,7 @@ repos:
5858
chardet>=3.0.4,
5959
fastavro>=1.0.0.post1,
6060
python-snappy>=0.7.1,
61-
charset-normalizer>=1.3.6,
61+
'charset-normalizer>=1.3.6,<7.0.0',
6262
psutil>=4.0.0,
6363
scipy>=1.4.1,
6464
requests>=2.28.1,
@@ -82,11 +82,9 @@ repos:
8282

8383
# requirements-ml.txt
8484
scikit-learn>=0.23.2,
85-
'keras>=2.4.3,<=3.4.0',
85+
'keras<=3.11.0',
8686
rapidfuzz>=2.6.1,
87-
"tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
88-
"tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
89-
"tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
87+
"tensorflow>=2.16.0",
9088
tqdm>=4.0.0,
9189

9290
# requirements-reports.txt
@@ -101,7 +99,7 @@ repos:
10199
pytest-xdist>=2.1.0,
102100
pytest-forked>=1.3.0,
103101
toolz>=0.10.0,
104-
'memray>=1.7.0,<1.12.0',
102+
'memray>=1.18.0',
105103
]
106104
# Check-manifest: ensures required non-Python files are included in MANIFEST.in
107105
# https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml

dataprofiler/labelers/base_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def __eq__(self, other: object) -> bool:
7878
:rtype: bool
7979
"""
8080
if (
81-
type(self) != type(other)
81+
type(self) is not type(other)
8282
or not isinstance(other, BaseModel)
8383
or self._parameters != other._parameters
8484
or self._label_mapping != other._label_mapping

dataprofiler/labelers/char_load_tf_model.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -262,8 +262,7 @@ def _construct_model(self) -> None:
262262

263263
# Compile the model w/ metrics
264264
softmax_output_layer_name = self._model.output_names[0]
265-
losses = {softmax_output_layer_name: "categorical_crossentropy"}
266-
265+
losses = ["categorical_crossentropy", None, None]
267266
# use f1 score metric
268267
f1_score_training = labeler_utils.F1Score(
269268
num_classes=num_labels, average="micro"
@@ -316,7 +315,7 @@ def _reconstruct_model(self) -> None:
316315

317316
# Compile the model
318317
softmax_output_layer_name = self._model.output_names[0]
319-
losses = {softmax_output_layer_name: "categorical_crossentropy"}
318+
losses = ["categorical_crossentropy", None, None]
320319

321320
# use f1 score metric
322321
f1_score_training = labeler_utils.F1Score(

dataprofiler/labelers/character_level_cnn_model.py

Lines changed: 26 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,7 @@ def load_from_disk(cls, dirpath: str) -> CharacterLevelCnnModel:
450450
loaded_model._model_default_ind = loaded_model.label_mapping[
451451
loaded_model._parameters["default_label"]
452452
]
453+
loaded_model._compile_loss(loaded_model._model, loaded_model.num_labels)
453454
return loaded_model
454455

455456
@staticmethod
@@ -475,6 +476,28 @@ def _argmax_threshold_layer(
475476
# matrix.
476477
return ThreshArgMaxLayer(threshold, num_labels, default_ind)
477478

479+
@staticmethod
def _compile_loss(model: tf.keras.Model, num_labels: int) -> None:
    """
    Compile the given model with its loss, optimizer, and metrics.

    The first model output is compiled with the categorical
    cross-entropy loss; every other output receives a ``None`` loss
    entry. Metrics are attached to the first output only. The model is
    compiled in place with the "adam" optimizer.

    :param model: model to compile in place
    :type model: tf.keras.Model
    :param num_labels: number of classes passed to the F1 score metric
    :type num_labels: int
    :return: None
    """
    output_names = model.output_names
    softmax_output_layer_name = output_names[0]

    # One loss entry per model output, in output order. The list is sized
    # from the model itself so it stays in step with the outputs instead of
    # assuming exactly three; for a three-output model this is
    # ["categorical_crossentropy", None, None].
    num_unsupervised_outputs = max(len(output_names) - 1, 0)
    losses = ["categorical_crossentropy"] + [None] * num_unsupervised_outputs

    # use f1 score metric
    f1_score_training = labeler_utils.F1Score(
        num_classes=num_labels, average="micro"
    )
    metrics = {
        softmax_output_layer_name: [
            "categorical_crossentropy",
            "acc",
            f1_score_training,
        ]
    }

    model.compile(loss=losses, optimizer="adam", metrics=metrics)
500+
478501
def _construct_model(self) -> None:
479502
"""
480503
Construct model for the data labeler.
@@ -570,24 +593,7 @@ def _construct_model(self) -> None:
570593
final_predicted_layer(argmax_layer, self._model.outputs[0]),
571594
]
572595
self._model = tf.keras.Model(self._model.inputs, argmax_outputs)
573-
574-
# Compile the model
575-
softmax_output_layer_name = self._model.output_names[0]
576-
losses = {softmax_output_layer_name: "categorical_crossentropy"}
577-
578-
# use f1 score metric
579-
f1_score_training = labeler_utils.F1Score(
580-
num_classes=num_labels, average="micro"
581-
)
582-
metrics = {
583-
softmax_output_layer_name: [
584-
"categorical_crossentropy",
585-
"acc",
586-
f1_score_training,
587-
]
588-
}
589-
590-
self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
596+
self._compile_loss(self._model, num_labels)
591597

592598
self._epoch_id = 0
593599
self._model_num_labels = num_labels
@@ -632,24 +638,7 @@ def _reconstruct_model(self) -> None:
632638
final_predicted_layer(argmax_layer, final_softmax_layer),
633639
]
634640
self._model = tf.keras.Model(self._model.inputs, argmax_outputs)
635-
636-
# Compile the model
637-
softmax_output_layer_name = self._model.output_names[0]
638-
losses = {softmax_output_layer_name: "categorical_crossentropy"}
639-
640-
# use f1 score metric
641-
f1_score_training = labeler_utils.F1Score(
642-
num_classes=num_labels, average="micro"
643-
)
644-
metrics = {
645-
softmax_output_layer_name: [
646-
"categorical_crossentropy",
647-
"acc",
648-
f1_score_training,
649-
]
650-
}
651-
652-
self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
641+
self._compile_loss(self._model, num_labels)
653642
self._epoch_id = 0
654643
self._model_num_labels = num_labels
655644
self._model_default_ind = default_ind
@@ -699,14 +688,11 @@ def fit(
699688
f1_report: dict = {}
700689

701690
self._model.reset_metrics()
702-
softmax_output_layer_name = self._model.output_names[0]
703691

704692
start_time = time.time()
705693
batch_id = 0
706694
for x_train, y_train in train_data:
707-
model_results = self._model.train_on_batch(
708-
x_train, {softmax_output_layer_name: y_train}
709-
)
695+
model_results = self._model.train_on_batch(x_train, y_train)
710696
sys.stdout.flush()
711697
if verbose:
712698
sys.stdout.write(

dataprofiler/labelers/data_processing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def __eq__(self, other: object) -> bool:
7373
:rtype: bool
7474
"""
7575
if (
76-
type(self) != type(other)
76+
type(self) is not type(other)
7777
or not isinstance(other, BaseDataProcessor)
7878
or self._parameters != other._parameters
7979
):
@@ -1589,7 +1589,7 @@ def __eq__(self, other: object) -> bool:
15891589
:rtype: bool
15901590
"""
15911591
if (
1592-
type(self) != type(other)
1592+
type(self) is not type(other)
15931593
or not isinstance(other, StructCharPostprocessor)
15941594
or self._parameters["default_label"] != other._parameters["default_label"]
15951595
or self._parameters["pad_label"] != other._parameters["pad_label"]

dataprofiler/tests/labelers/test_character_level_cnn_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,7 @@ def test_save(self, mock_open, *mocks):
430430
StringIO.close(mock_file)
431431

432432
@mock.patch("tensorflow.keras.Model.save", return_value=None)
433-
@mock.patch("tensorflow.keras.models.load_model", return_value=mock.Mock())
433+
@mock.patch("tensorflow.keras.models.load_model", return_value=mock.MagicMock())
434434
@mock.patch("builtins.open", side_effect=mock_open)
435435
def test_load(self, *mocks):
436436
dir = os.path.join(_resource_labeler_dir, "unstructured_model/")

dataprofiler/tests/labelers/test_data_labelers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ def test_has_public_functions(self, *args):
399399

400400
@staticmethod
401401
def _setup_mock_load_model(mock_load_model):
402-
mock_load_model.return_value = mock.Mock()
402+
mock_load_model.return_value = mock.MagicMock()
403403

404404
def test_load_labeler(self, mock_open, mock_load_model):
405405

requirements-ml.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,4 @@ scikit-learn>=0.23.2
22
keras<=3.11.0
33
rapidfuzz>=2.6.1
44
tensorflow>=2.16.0
5-
tensorflow-metal; sys_platform == 'darwin' and platform_machine == 'arm64'
65
tqdm>=4.0.0

0 commit comments

Comments
 (0)