refactor: losses for keras and tests

JGSweets · JGSweets · commit 651ad6a264c7 · 2026-03-13T17:11:47.000-05:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
   # Flake8: complexity and style checking
   # https://flake8.pycqa.org/en/latest/user/using-hooks.html
   - repo: https://github.com/pycqa/flake8
-    rev: 4.0.1
+    rev: 7.3.0
     hooks:
       - id: flake8
         additional_dependencies: [flake8-docstrings]
@@ -58,7 +58,7 @@ repos:
             chardet>=3.0.4,
             fastavro>=1.0.0.post1,
             python-snappy>=0.7.1,
-            charset-normalizer>=1.3.6,
+            'charset-normalizer>=1.3.6,<7.0.0',
             psutil>=4.0.0,
             scipy>=1.4.1,
             requests>=2.28.1,
@@ -82,11 +82,9 @@ repos:
 
             # requirements-ml.txt
             scikit-learn>=0.23.2,
-            'keras>=2.4.3,<=3.4.0',
+            'keras>=3.11.0',
             rapidfuzz>=2.6.1,
-            "tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
-            "tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
-            "tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
+            "tensorflow>=2.16.0",
             tqdm>=4.0.0,
 
             # requirements-reports.txt
@@ -101,7 +99,7 @@ repos:
             pytest-xdist>=2.1.0,
             pytest-forked>=1.3.0,
             toolz>=0.10.0,
-            'memray>=1.7.0,<1.12.0',
+            'memray>=1.18.0',
           ]
   # Check-manifest: ensures required non-Python files are included in MANIFEST.in
   # https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml
diff --git a/dataprofiler/labelers/base_model.py b/dataprofiler/labelers/base_model.py
@@ -78,7 +78,7 @@ def __eq__(self, other: object) -> bool:
         :rtype: bool
         """
         if (
-            type(self) != type(other)
+            type(self) is not type(other)
             or not isinstance(other, BaseModel)
             or self._parameters != other._parameters
             or self._label_mapping != other._label_mapping
diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py
@@ -262,8 +262,7 @@ def _construct_model(self) -> None:
 
         # Compile the model w/ metrics
         softmax_output_layer_name = self._model.output_names[0]
-        losses = {softmax_output_layer_name: "categorical_crossentropy"}
-
+        losses = ["categorical_crossentropy", None, None]
         # use f1 score metric
         f1_score_training = labeler_utils.F1Score(
             num_classes=num_labels, average="micro"
@@ -316,7 +315,7 @@ def _reconstruct_model(self) -> None:
 
         # Compile the model
         softmax_output_layer_name = self._model.output_names[0]
-        losses = {softmax_output_layer_name: "categorical_crossentropy"}
+        losses = ["categorical_crossentropy", None, None]
 
         # use f1 score metric
         f1_score_training = labeler_utils.F1Score(
diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py
@@ -450,6 +450,7 @@ def load_from_disk(cls, dirpath: str) -> CharacterLevelCnnModel:
         loaded_model._model_default_ind = loaded_model.label_mapping[
             loaded_model._parameters["default_label"]
         ]
+        loaded_model._compile_loss(loaded_model._model, loaded_model.num_labels)
         return loaded_model
 
     @staticmethod
@@ -475,6 +476,28 @@ def _argmax_threshold_layer(
         # matrix.
         return ThreshArgMaxLayer(threshold, num_labels, default_ind)
 
+    @staticmethod
+    def _compile_loss(model: tf.keras.Model, num_labels: int) -> None:
+        """Compile model with cross-entropy loss, adam, and accuracy/F1 metrics."""
+        # Metrics below are keyed by the name of the model's first output
+        softmax_output_layer_name = model.output_names[0]
+        # Positional losses: only the first output has a loss; the rest are None
+        losses = ["categorical_crossentropy", None, None]
+
+        # micro-averaged F1 metric over num_labels classes
+        f1_score_training = labeler_utils.F1Score(
+            num_classes=num_labels, average="micro"
+        )
+        metrics = {
+            softmax_output_layer_name: [
+                "categorical_crossentropy",
+                "acc",
+                f1_score_training,
+            ]
+        }
+
+        model.compile(loss=losses, optimizer="adam", metrics=metrics)
+
     def _construct_model(self) -> None:
         """
         Construct model for the data labeler.
@@ -570,24 +593,7 @@ def _construct_model(self) -> None:
             final_predicted_layer(argmax_layer, self._model.outputs[0]),
         ]
         self._model = tf.keras.Model(self._model.inputs, argmax_outputs)
-
-        # Compile the model
-        softmax_output_layer_name = self._model.output_names[0]
-        losses = {softmax_output_layer_name: "categorical_crossentropy"}
-
-        # use f1 score metric
-        f1_score_training = labeler_utils.F1Score(
-            num_classes=num_labels, average="micro"
-        )
-        metrics = {
-            softmax_output_layer_name: [
-                "categorical_crossentropy",
-                "acc",
-                f1_score_training,
-            ]
-        }
-
-        self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
+        self._compile_loss(self._model, num_labels)
 
         self._epoch_id = 0
         self._model_num_labels = num_labels
@@ -632,24 +638,7 @@ def _reconstruct_model(self) -> None:
             final_predicted_layer(argmax_layer, final_softmax_layer),
         ]
         self._model = tf.keras.Model(self._model.inputs, argmax_outputs)
-
-        # Compile the model
-        softmax_output_layer_name = self._model.output_names[0]
-        losses = {softmax_output_layer_name: "categorical_crossentropy"}
-
-        # use f1 score metric
-        f1_score_training = labeler_utils.F1Score(
-            num_classes=num_labels, average="micro"
-        )
-        metrics = {
-            softmax_output_layer_name: [
-                "categorical_crossentropy",
-                "acc",
-                f1_score_training,
-            ]
-        }
-
-        self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
+        self._compile_loss(self._model, num_labels)
         self._epoch_id = 0
         self._model_num_labels = num_labels
         self._model_default_ind = default_ind
@@ -699,14 +688,11 @@ def fit(
         f1_report: dict = {}
 
         self._model.reset_metrics()
-        softmax_output_layer_name = self._model.output_names[0]
 
         start_time = time.time()
         batch_id = 0
         for x_train, y_train in train_data:
-            model_results = self._model.train_on_batch(
-                x_train, {softmax_output_layer_name: y_train}
-            )
+            model_results = self._model.train_on_batch(x_train, y_train)
             sys.stdout.flush()
             if verbose:
                 sys.stdout.write(
diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py
@@ -73,7 +73,7 @@ def __eq__(self, other: object) -> bool:
         :rtype: bool
         """
         if (
-            type(self) != type(other)
+            type(self) is not type(other)
             or not isinstance(other, BaseDataProcessor)
             or self._parameters != other._parameters
         ):
@@ -1589,7 +1589,7 @@ def __eq__(self, other: object) -> bool:
         :rtype: bool
         """
         if (
-            type(self) != type(other)
+            type(self) is not type(other)
             or not isinstance(other, StructCharPostprocessor)
             or self._parameters["default_label"] != other._parameters["default_label"]
             or self._parameters["pad_label"] != other._parameters["pad_label"]
diff --git a/dataprofiler/tests/labelers/test_character_level_cnn_model.py b/dataprofiler/tests/labelers/test_character_level_cnn_model.py
@@ -430,7 +430,7 @@ def test_save(self, mock_open, *mocks):
         StringIO.close(mock_file)
 
     @mock.patch("tensorflow.keras.Model.save", return_value=None)
-    @mock.patch("tensorflow.keras.models.load_model", return_value=mock.Mock())
+    @mock.patch("tensorflow.keras.models.load_model", return_value=mock.MagicMock())
     @mock.patch("builtins.open", side_effect=mock_open)
     def test_load(self, *mocks):
         dir = os.path.join(_resource_labeler_dir, "unstructured_model/")
diff --git a/dataprofiler/tests/labelers/test_data_labelers.py b/dataprofiler/tests/labelers/test_data_labelers.py
@@ -399,7 +399,7 @@ def test_has_public_functions(self, *args):
 
     @staticmethod
     def _setup_mock_load_model(mock_load_model):
-        mock_load_model.return_value = mock.Mock()
+        mock_load_model.return_value = mock.MagicMock()
 
     def test_load_labeler(self, mock_open, mock_load_model):
 
diff --git a/requirements-ml.txt b/requirements-ml.txt
@@ -2,5 +2,4 @@ scikit-learn>=0.23.2
 keras<=3.11.0
 rapidfuzz>=2.6.1
 tensorflow>=2.16.0
-tensorflow-metal; sys_platform == 'darwin' and platform_machine == 'arm64'
 tqdm>=4.0.0