diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index a9f3082b..9290a61c 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -18,9 +18,9 @@ Steps to reproduce the behavior:
 4. See error
 
 **Operating environment(运行环境):**
- - python version [e.g. 3.6, 3.7]
- - torch version [e.g. 1.9.0, 1.10.0]
- - deepctr-torch version [e.g. 0.2.9,]
+ - python version [e.g. 3.10]
+ - pytorch/torch version [e.g. 2.5.1]
+ - deepctr-torch version [e.g. 0.3.0]
 
 **Additional context**
 Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md
index 801d66e3..1ef78471 100644
--- a/.github/ISSUE_TEMPLATE/question.md
+++ b/.github/ISSUE_TEMPLATE/question.md
@@ -15,6 +15,6 @@ A clear and concise description of what the question is.
 Add any other context about the problem here.
 
 **Operating environment(运行环境):**
- - python version [e.g. 3.6]
- - torch version [e.g. 1.10.0,]
- - deepctr-torch version [e.g. 0.2.9,]
+ - python version [e.g. 3.10]
+ - pytorch/torch version [e.g. 2.5.1]
+ - deepctr-torch version [e.g. 0.3.0]
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9b913a32..82046a0d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,90 +1,87 @@
 name: CI
 
-on: 
+on:
   push:
-    path:
-      - 'deepctr_torch/*'
-      - 'tests/*'
+    paths:
+      - "deepctr_torch/**"
+      - "tests/**"
+      - "examples/**"
+      - "setup.py"
+      - ".github/workflows/**"
   pull_request:
-    path:
-      - 'deepctr_torch/*'
-      - 'tests/*'
-      
+    paths:
+      - "deepctr_torch/**"
+      - "tests/**"
+      - "examples/**"
+      - "setup.py"
+      - ".github/workflows/**"
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }}
+  cancel-in-progress: true
+
 jobs:
   build:
-
-    runs-on: ubuntu-latest
-    timeout-minutes: 120
+    runs-on: ubuntu-22.04
+    timeout-minutes: 180
     strategy:
+      fail-fast: false
       matrix:
-        python-version: [3.6,3.7,3.8,3.9,3.10.7]
-        torch-version: [1.2.0,1.3.0,1.4.0,1.5.0,1.6.0,1.7.1,1.8.1,1.9.0,1.10.2,1.11.0,1.12.1]
-        
-        exclude:
-          - python-version: 3.6
-            torch-version: 1.11.0
-          - python-version: 3.6
-            torch-version: 1.12.1
-          - python-version: 3.8
-            torch-version: 1.2.0
-          - python-version: 3.8
-            torch-version: 1.3.0
-          - python-version: 3.9
-            torch-version: 1.2.0
-          - python-version: 3.9
-            torch-version: 1.3.0
-          - python-version: 3.9
-            torch-version: 1.4.0
-          - python-version: 3.9
-            torch-version: 1.5.0
-          - python-version: 3.9
-            torch-version: 1.6.0
-          - python-version: 3.9
-            torch-version: 1.7.1
-          - python-version: 3.10.7
-            torch-version: 1.2.0
-          - python-version: 3.10.7
-            torch-version: 1.3.0
-          - python-version: 3.10.7
-            torch-version: 1.4.0
-          - python-version: 3.10.7
-            torch-version: 1.5.0
-          - python-version: 3.10.7
-            torch-version: 1.6.0
-          - python-version: 3.10.7
-            torch-version: 1.7.1
-          - python-version: 3.10.7
-            torch-version: 1.8.1
-          - python-version: 3.10.7
-            torch-version: 1.9.0
-          - python-version: 3.10.7
-            torch-version: 1.10.2
+        include:
+          # The official PyTorch index has no torch 2.4/2.5 wheels for Python 3.7,
+          # so keep one legacy-torch smoke job to cover Python 3.7 runtime compatibility.
+          - python-version: "3.7"
+            torch-version: "1.13.1"
+          - python-version: "3.8"
+            torch-version: "2.4.1"
+          - python-version: "3.9"
+            torch-version: "2.4.1"
+          - python-version: "3.9"
+            torch-version: "2.5.1"
+          - python-version: "3.10"
+            torch-version: "2.4.1"
+          - python-version: "3.10"
+            torch-version: "2.5.1"
+            run-examples: "1"
+          - python-version: "3.11"
+            torch-version: "2.4.1"
+          - python-version: "3.11"
+            torch-version: "2.5.1"
+          - python-version: "3.12"
+            torch-version: "2.4.1"
+          - python-version: "3.12"
+            torch-version: "2.5.1"
+
+    env:
+      TORCH_VERSION: ${{ matrix.torch-version }}
+      TORCH_INDEX_URL: https://download.pytorch.org/whl/cpu
+      RUN_EXAMPLES: ${{ matrix.run-examples || '0' }}
+
     steps:
-    
-    - uses: actions/checkout@v3
-    
-    - name: Setup python environment
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ matrix.python-version }}
+      - uses: actions/checkout@v5
+
+      - name: Setup python environment
+        uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: bash tests/ci/install.sh
+
+      - name: Test with pytest
+        timeout-minutes: 180
+        run: bash tests/ci/test.sh
+
+      - name: Run examples smoke tests
+        if: ${{ env.RUN_EXAMPLES == '1' }}
+        timeout-minutes: 60
+        run: bash tests/ci/examples.sh
 
-    - name: Install dependencies
-      run: |
-        pip3 install -q torch==${{ matrix.torch-version }}
-        pip install -q requests
-        pip install -e .
-    - name: Test with pytest
-      timeout-minutes: 120
-      run: |
-        pip install -q pytest
-        pip install -q pytest-cov
-        pip install -q python-coveralls
-        pip install -q sklearn
-        pytest --cov=deepctr_torch --cov-report=xml
-    - name: Upload coverage to Codecov  
-      uses: codecov/codecov-action@v3.1.0
-      with:
-        token: ${{secrets.CODECOV_TOKEN}}
-        file: ./coverage.xml
-        flags: pytest
-        name: py${{ matrix.python-version }}-torch${{ matrix.torch-version }}
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v6
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./coverage.xml
+          flags: pytest
+          name: py${{ matrix.python-version }}-torch${{ matrix.torch-version }}
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 00000000..5d4acc27
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,15 @@
+version: 2  # Read the Docs configuration schema version
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"  # quoted so YAML keeps it a string instead of a float
+
+sphinx:
+  configuration: docs/source/conf.py  # Sphinx conf.py used for the docs build
+
+python:
+  install:
+    - requirements: docs/requirements.readthedocs.txt  # docs-only dependencies
+
+formats: []  # explicit empty list; presumably disables extra output formats - confirm against RTD docs
diff --git a/deepctr_torch/__init__.py b/deepctr_torch/__init__.py
index 6c3af45c..564eb289 100644
--- a/deepctr_torch/__init__.py
+++ b/deepctr_torch/__init__.py
@@ -2,5 +2,5 @@
 from . import models
 from .utils import check_version
 
-__version__ = '0.2.9'
-check_version(__version__)
\ No newline at end of file
+__version__ = '0.3.0'
+check_version(__version__)
diff --git a/deepctr_torch/callbacks.py b/deepctr_torch/callbacks.py
index d1a69fe5..978a306b 100644
--- a/deepctr_torch/callbacks.py
+++ b/deepctr_torch/callbacks.py
@@ -1,73 +1,234 @@
+import copy
+import os
+
+import numpy as np
 import torch
-from tensorflow.python.keras.callbacks import EarlyStopping
-from tensorflow.python.keras.callbacks import ModelCheckpoint
-from tensorflow.python.keras.callbacks import History
-
-EarlyStopping = EarlyStopping
-History = History
-
-class ModelCheckpoint(ModelCheckpoint):
-    """Save the model after every epoch.
-
-    `filepath` can contain named formatting options,
-    which will be filled the value of `epoch` and
-    keys in `logs` (passed in `on_epoch_end`).
-
-    For example: if `filepath` is `weights.{epoch:02d}-{val_loss:.2f}.hdf5`,
-    then the model checkpoints will be saved with the epoch number and
-    the validation loss in the filename.
-
-    Arguments:
-        filepath: string, path to save the model file.
-        monitor: quantity to monitor.
-        verbose: verbosity mode, 0 or 1.
-        save_best_only: if `save_best_only=True`,
-            the latest best model according to
-            the quantity monitored will not be overwritten.
-        mode: one of {auto, min, max}.
-            If `save_best_only=True`, the decision
-            to overwrite the current save file is made
-            based on either the maximization or the
-            minimization of the monitored quantity. For `val_acc`,
-            this should be `max`, for `val_loss` this should
-            be `min`, etc. In `auto` mode, the direction is
-            automatically inferred from the name of the monitored quantity.
-        save_weights_only: if True, then only the model's weights will be
-            saved (`model.save_weights(filepath)`), else the full model
-            is saved (`model.save(filepath)`).
-        period: Interval (number of epochs) between checkpoints.
-    """
+
+
+class Callback(object):  # Base class: every hook below is a no-op that subclasses may override.
+    def __init__(self):
+        """Initialize callback state."""
+        self.model = None  # filled in later by set_model()
+        self.params = {}  # filled in later by set_params()
+
+    def set_model(self, model):
+        self.model = model
+
+    def set_params(self, params):
+        self.params = params or {}  # None (or any falsy value) is stored as an empty dict
+
+    def on_train_begin(self, logs=None):
+        pass
+
+    def on_train_end(self, logs=None):
+        pass
+
+    def on_epoch_begin(self, epoch, logs=None):
+        pass
+
+    def on_epoch_end(self, epoch, logs=None):
+        pass
+
+
+class CallbackList(object):  # Container that forwards each call to every callback, in list order.
+    def __init__(self, callbacks=None):
+        """Create a callback container."""
+        self.callbacks = list(callbacks or [])  # new list, so append() never mutates the caller's list
+        self.model = None
+        self.params = {}
+
+    def append(self, callback):
+        self.callbacks.append(callback)  # NOTE: does not call set_model/set_params on the new callback
+
+    def set_model(self, model):
+        self.model = model
+        for callback in self.callbacks:
+            callback.set_model(model)
+
+    def set_params(self, params):
+        self.params = params or {}  # None (or any falsy value) is stored as an empty dict
+        for callback in self.callbacks:
+            callback.set_params(self.params)  # the same dict object is handed to every callback
+
+    def on_train_begin(self, logs=None):
+        for callback in self.callbacks:
+            callback.on_train_begin(logs=logs)
+
+    def on_train_end(self, logs=None):
+        for callback in self.callbacks:
+            callback.on_train_end(logs=logs)
+
+    def on_epoch_begin(self, epoch, logs=None):
+        for callback in self.callbacks:
+            callback.on_epoch_begin(epoch, logs=logs)
+
+    def on_epoch_end(self, epoch, logs=None):
+        for callback in self.callbacks:
+            callback.on_epoch_end(epoch, logs=logs)
+
+
+class History(Callback):
+    def on_train_begin(self, logs=None):
+        self.epoch = []
+        self.history = {}
+        if self.model is not None:
+            self.model.history = self
 
     def on_epoch_end(self, epoch, logs=None):
         logs = logs or {}
-        self.epochs_since_last_save += 1
-        if self.epochs_since_last_save >= self.period:
-            self.epochs_since_last_save = 0
-            filepath = self.filepath.format(epoch=epoch + 1, **logs)
-            if self.save_best_only:
-                current = logs.get(self.monitor)
-                if current is None:
-                    print('Can save best model only with %s available, skipping.' % self.monitor)
-                else:
-                    if self.monitor_op(current, self.best):
-                        if self.verbose > 0:
-                            print('Epoch %05d: %s improved from %0.5f to %0.5f,'
-                                  ' saving model to %s' % (epoch + 1, self.monitor, self.best,
-                                                           current, filepath))
-                        self.best = current
-                        if self.save_weights_only:
-                            torch.save(self.model.state_dict(), filepath)
-                        else:
-                            torch.save(self.model, filepath)
-                    else:
-                        if self.verbose > 0:
-                            print('Epoch %05d: %s did not improve from %0.5f' %
-                                  (epoch + 1, self.monitor, self.best))
+        self.epoch.append(epoch)
+        for key, value in logs.items():
+            self.history.setdefault(key, []).append(value)
+
+
+class EarlyStopping(Callback):  # Sets model.stop_training once `monitor` has not improved for `patience` epochs.
+    def __init__(
+        self,
+        monitor="val_loss",
+        min_delta=0,
+        patience=0,
+        verbose=0,
+        mode="auto",
+        baseline=None,
+        restore_best_weights=False,
+    ):
+        """Create an early-stopping callback."""
+        super(EarlyStopping, self).__init__()
+        self.monitor = monitor
+        self.min_delta = abs(min_delta)  # sign is ignored; the direction comes from monitor_op
+        self.patience = patience
+        self.verbose = verbose
+        self.mode = mode
+        self.baseline = baseline
+        self.restore_best_weights = restore_best_weights
+
+        if mode not in {"auto", "min", "max"}:
+            raise ValueError("mode should be one of {'auto', 'min', 'max'}")
+
+        if mode == "min":
+            self.monitor_op = np.less
+        elif mode == "max":
+            self.monitor_op = np.greater
+        else:  # "auto": maximize acc/auc/fmeasure-style names, minimize everything else
+            if "acc" in self.monitor or self.monitor.endswith("auc") or self.monitor.startswith("fmeasure"):
+                self.monitor_op = np.greater
+            else:
+                self.monitor_op = np.less
+
+    def on_train_begin(self, logs=None):
+        self.wait = 0  # epochs seen since the last improvement
+        self.stopped_epoch = 0  # stays 0 unless the patience limit is reached
+        self.best_weights = None
+        if self.baseline is not None:
+            self.best = self.baseline  # a baseline replaces the +/-inf starting point
+        else:
+            self.best = np.inf if self.monitor_op == np.less else -np.inf
+
+    def _is_improvement(self, current, best):
+        if self.monitor_op == np.less:
+            return current < (best - self.min_delta)  # must beat best by more than min_delta
+        return current > (best + self.min_delta)
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        current = logs.get(self.monitor)
+        if current is None:  # monitored key missing from logs: skip this epoch silently
+            return
+
+        if self._is_improvement(current, self.best):
+            self.best = current
+            self.wait = 0
+            if self.restore_best_weights and self.model is not None:
+                self.best_weights = copy.deepcopy(self.model.state_dict())  # snapshot of the best epoch so far
+            return
+
+        self.wait += 1
+        if self.wait >= self.patience:
+            self.stopped_epoch = epoch + 1  # stored 1-based for the message printed in on_train_end
+            if self.model is not None:
+                self.model.stop_training = True
+                if self.restore_best_weights and self.best_weights is not None:
+                    self.model.load_state_dict(self.best_weights)
+
+    def on_train_end(self, logs=None):
+        if self.stopped_epoch > 0 and self.verbose > 0:
+            print("Epoch %05d: early stopping" % self.stopped_epoch)
+
+
+class ModelCheckpoint(Callback):
+    def __init__(
+        self,
+        filepath,
+        monitor="val_loss",
+        verbose=0,
+        save_best_only=False,
+        save_weights_only=False,
+        mode="auto",
+        period=1,
+    ):
+        """Create a model-checkpoint callback."""
+        super(ModelCheckpoint, self).__init__()
+        self.filepath = filepath
+        self.monitor = monitor
+        self.verbose = verbose
+        self.save_best_only = save_best_only
+        self.save_weights_only = save_weights_only
+        self.period = period
+        self.epochs_since_last_save = 0
+
+        if mode not in {"auto", "min", "max"}:
+            raise ValueError("mode should be one of {'auto', 'min', 'max'}")
+
+        if mode == "min":
+            self.monitor_op = np.less
+            self.best = np.inf
+        elif mode == "max":
+            self.monitor_op = np.greater
+            self.best = -np.inf
+        else:
+            if "acc" in self.monitor or self.monitor.endswith("auc") or self.monitor.startswith("fmeasure"):
+                self.monitor_op = np.greater
+                self.best = -np.inf
             else:
+                self.monitor_op = np.less
+                self.best = np.inf
+
+    def _save(self, filepath):
+        output_dir = os.path.dirname(filepath)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+        if self.save_weights_only:
+            torch.save(self.model.state_dict(), filepath)
+        else:
+            torch.save(self.model, filepath)
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        self.epochs_since_last_save += 1
+        if self.epochs_since_last_save < self.period:
+            return
+
+        self.epochs_since_last_save = 0
+        filepath = self.filepath.format(epoch=epoch + 1, **logs)
+
+        if self.save_best_only:
+            current = logs.get(self.monitor)
+            if current is None:
+                if self.verbose > 0:
+                    print("Can save best model only with %s available, skipping." % self.monitor)
+                return
+            if self.monitor_op(current, self.best):
                 if self.verbose > 0:
-                    print('Epoch %05d: saving model to %s' %
-                          (epoch + 1, filepath))
-                if self.save_weights_only:
-                    torch.save(self.model.state_dict(), filepath)
-                else:
-                    torch.save(self.model, filepath)
+                    print(
+                        "Epoch %05d: %s improved from %0.5f to %0.5f, saving model to %s"
+                        % (epoch + 1, self.monitor, self.best, current, filepath)
+                    )
+                self.best = current
+                self._save(filepath)
+            elif self.verbose > 0:
+                print("Epoch %05d: %s did not improve from %0.5f" % (epoch + 1, self.monitor, self.best))
+            return
+
+        if self.verbose > 0:
+            print("Epoch %05d: saving model to %s" % (epoch + 1, filepath))
+        self._save(filepath)
diff --git a/deepctr_torch/models/basemodel.py b/deepctr_torch/models/basemodel.py
index cd36340a..3e01d32b 100644
--- a/deepctr_torch/models/basemodel.py
+++ b/deepctr_torch/models/basemodel.py
@@ -15,20 +15,15 @@
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.utils.data as Data
-from sklearn.metrics import *
+from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, roc_auc_score
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 
-try:
-    from tensorflow.python.keras.callbacks import CallbackList
-except ImportError:
-    from tensorflow.python.keras._impl.keras.callbacks import CallbackList
-
 from ..inputs import build_input_features, SparseFeat, DenseFeat, VarLenSparseFeat, get_varlen_pooling_list, \
     create_embedding_matrix, varlen_embedding_lookup
 from ..layers import PredictionLayer
 from ..layers.utils import slice_arrays
-from ..callbacks import History
+from ..callbacks import CallbackList, History
 
 
 class Linear(nn.Module):
@@ -76,7 +71,8 @@ def forward(self, X, sparse_feat_refine_weight=None):
 
         sparse_embedding_list += varlen_embedding_list
 
-        linear_logit = torch.zeros([X.shape[0], 1]).to(self.device)
+        # Keep accumulator on the same device as current input tensor.
+        linear_logit = X.new_zeros((X.shape[0], 1))
         if len(sparse_embedding_list) > 0:
             sparse_embedding_cat = torch.cat(sparse_embedding_list, dim=-1)
             if sparse_feat_refine_weight is not None:
@@ -148,7 +144,7 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, initial_epoc
         :param validation_split: Float between 0 and 1. Fraction of the training data to be used as validation data. The model will set apart this fraction of the training data, will not train on it, and will evaluate the loss and any model metrics on this data at the end of each epoch. The validation data is selected from the last samples in the `x` and `y` data provided, before shuffling.
         :param validation_data: tuple `(x_val, y_val)` or tuple `(x_val, y_val, val_sample_weights)` on which to evaluate the loss and any model metrics at the end of each epoch. The model will not be trained on this data. `validation_data` will override `validation_split`.
         :param shuffle: Boolean. Whether to shuffle the order of the batches at the beginning of each epoch.
-        :param callbacks: List of `deepctr_torch.callbacks.Callback` instances. List of callbacks to apply during training and validation (if ). See [callbacks](https://tensorflow.google.cn/api_docs/python/tf/keras/callbacks). Now available: `EarlyStopping` , `ModelCheckpoint`
+        :param callbacks: List of `deepctr_torch.callbacks.Callback` instances. List of callbacks to apply during training and validation. Now available: `EarlyStopping` , `ModelCheckpoint`
 
         :return: A `History` object. Its `History.history` attribute is a record of training loss values and metrics values at successive epochs, as well as validation loss values and validation metrics values (if applicable).
         """
@@ -229,6 +225,7 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, initial_epoc
         # Train
         print("Train on {0} samples, validate on {1} samples, {2} steps per epoch".format(
             len(train_tensor_data), len(val_y), steps_per_epoch))
+        num_tasks = getattr(self, "num_tasks", 1)
         for epoch in range(initial_epoch, epochs):
             callbacks.on_epoch_begin(epoch)
             epoch_logs = {}
@@ -242,16 +239,21 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, initial_epoc
                         x = x_train.to(self.device).float()
                         y = y_train.to(self.device).float()
 
-                        y_pred = model(x).squeeze()
+                        y_pred = model(x)
+                        if num_tasks == 1 and y_pred.ndim > 1 and y_pred.shape[-1] == 1:
+                            y_pred = y_pred.squeeze(-1)
 
                         optim.zero_grad()
                         if isinstance(loss_func, list):
-                            assert len(loss_func) == self.num_tasks,\
-                                "the length of `loss_func` should be equal with `self.num_tasks`"
+                            assert len(loss_func) == num_tasks,\
+                                "the length of `loss_func` should be equal with `num_tasks`"
                             loss = sum(
-                                [loss_func[i](y_pred[:, i], y[:, i], reduction='sum') for i in range(self.num_tasks)])
+                                [loss_func[i](y_pred[:, i], y[:, i], reduction='sum') for i in range(num_tasks)])
                         else:
-                            loss = loss_func(y_pred, y.squeeze(), reduction='sum')
+                            y_for_loss = y
+                            if y_for_loss.ndim > 1 and y_for_loss.shape[-1] == 1:
+                                y_for_loss = y_for_loss.squeeze(-1)
+                            loss = loss_func(y_pred, y_for_loss, reduction='sum')
                         reg_loss = self.get_regularization_loss()
 
                         total_loss = loss + reg_loss + self.aux_loss
@@ -265,8 +267,11 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, initial_epoc
                             for name, metric_fun in self.metrics.items():
                                 if name not in train_result:
                                     train_result[name] = []
-                                train_result[name].append(metric_fun(
-                                    y.cpu().data.numpy(), y_pred.cpu().data.numpy().astype("float64")))
+                                y_true_metric, y_pred_metric = self._prepare_metric_inputs(
+                                    y.cpu().data.numpy(),
+                                    y_pred.cpu().data.numpy().astype("float64")
+                                )
+                                train_result[name].append(metric_fun(y_true_metric, y_pred_metric))
 
 
             except KeyboardInterrupt:
@@ -319,7 +324,8 @@ def evaluate(self, x, y, batch_size=256):
         pred_ans = self.predict(x, batch_size)
         eval_result = {}
         for name, metric_fun in self.metrics.items():
-            eval_result[name] = metric_fun(y, pred_ans)
+            y_true_metric, y_pred_metric = self._prepare_metric_inputs(y, pred_ans)
+            eval_result[name] = metric_fun(y_true_metric, y_pred_metric)
         return eval_result
 
     def predict(self, x, batch_size=256):
@@ -481,13 +487,22 @@ def _get_loss_func_single(self, loss):
         return loss_func
 
     def _log_loss(self, y_true, y_pred, eps=1e-7, normalize=True, sample_weight=None, labels=None):
-        # change eps to improve calculation accuracy
-        return log_loss(y_true,
-                        y_pred,
-                        eps,
-                        normalize,
-                        sample_weight,
-                        labels)
+        # sklearn>=1.5 removed `eps` from log_loss signature. We clip manually
+        # and fallback to the old signature for backward compatibility.
+        y_pred = np.clip(y_pred, eps, 1 - eps)
+        try:
+            return log_loss(y_true,
+                            y_pred,
+                            normalize=normalize,
+                            sample_weight=sample_weight,
+                            labels=labels)
+        except TypeError:
+            return log_loss(y_true,
+                            y_pred,
+                            eps=eps,
+                            normalize=normalize,
+                            sample_weight=sample_weight,
+                            labels=labels)
 
     @staticmethod
     def _accuracy_score(y_true, y_pred):
@@ -498,10 +513,7 @@ def _get_metrics(self, metrics, set_eps=False):
         if metrics:
             for metric in metrics:
                 if metric == "binary_crossentropy" or metric == "logloss":
-                    if set_eps:
-                        metrics_[metric] = self._log_loss
-                    else:
-                        metrics_[metric] = log_loss
+                    metrics_[metric] = self._log_loss
                 if metric == "auc":
                     metrics_[metric] = roc_auc_score
                 if metric == "mse":
@@ -511,6 +523,16 @@ def _get_metrics(self, metrics, set_eps=False):
                 self.metrics_names.append(metric)
         return metrics_
 
+    @staticmethod
+    def _prepare_metric_inputs(y_true, y_pred):
+        y_true = np.asarray(y_true)
+        y_pred = np.asarray(y_pred)
+        if y_true.ndim > 1:
+            y_true = y_true.reshape(-1)
+        if y_pred.ndim > 1:
+            y_pred = y_pred.reshape(-1)
+        return y_true, y_pred
+
     def _in_multi_worker_mode(self):
         # used for EarlyStopping in tf1.15
         return None
diff --git a/deepctr_torch/models/multitask/mmoe.py b/deepctr_torch/models/multitask/mmoe.py
index c0401eb7..df9f7ca6 100644
--- a/deepctr_torch/models/multitask/mmoe.py
+++ b/deepctr_torch/models/multitask/mmoe.py
@@ -127,7 +127,7 @@ def forward(self, X):
             else:
                 gate_dnn_out = self.gate_dnn_final_layer[i](dnn_input)
             gate_mul_expert = torch.matmul(gate_dnn_out.softmax(1).unsqueeze(1), expert_outs)  # (bs, 1, dim)
-            mmoe_outs.append(gate_mul_expert.squeeze())
+            mmoe_outs.append(gate_mul_expert.squeeze(1))
 
         # tower dnn (task-specific)
         task_outs = []
diff --git a/deepctr_torch/models/multitask/ple.py b/deepctr_torch/models/multitask/ple.py
index bc8a06fb..c056aefa 100644
--- a/deepctr_torch/models/multitask/ple.py
+++ b/deepctr_torch/models/multitask/ple.py
@@ -177,7 +177,7 @@ def cgc_net(self, inputs, level_num):
             else:
                 gate_dnn_out = self.specific_gate_dnn_final_layer[level_num][i](inputs[i])
             gate_mul_expert = torch.matmul(gate_dnn_out.softmax(1).unsqueeze(1), cur_experts_outputs)  # (bs, 1, dim)
-            cgc_outs.append(gate_mul_expert.squeeze())
+            cgc_outs.append(gate_mul_expert.squeeze(1))
 
         # gates for shared experts
         cur_experts_outputs = specific_expert_outputs + shared_expert_outputs
@@ -189,7 +189,7 @@ def cgc_net(self, inputs, level_num):
         else:
             gate_dnn_out = self.shared_gate_dnn_final_layer[level_num](inputs[-1])
         gate_mul_expert = torch.matmul(gate_dnn_out.softmax(1).unsqueeze(1), cur_experts_outputs)  # (bs, 1, dim)
-        cgc_outs.append(gate_mul_expert.squeeze())
+        cgc_outs.append(gate_mul_expert.squeeze(1))
 
         return cgc_outs
 
diff --git a/docs/requirements.readthedocs.txt b/docs/requirements.readthedocs.txt
index 793412bd..1d3d429e 100644
--- a/docs/requirements.readthedocs.txt
+++ b/docs/requirements.readthedocs.txt
@@ -1,3 +1,4 @@
 Cython>=0.28.5
-tensorflow==2.7.2
-scikit-learn==1.0
+scikit-learn>=1.3.2
+sphinx_rtd_theme>=3.0.0
+myst-parser>=3.0.0
diff --git a/docs/source/Examples.md b/docs/source/Examples.md
index 628a719b..6d91d72e 100644
--- a/docs/source/Examples.md
+++ b/docs/source/Examples.md
@@ -171,7 +171,25 @@ import numpy as np
 import pandas as pd
 import torch
 from sklearn.preprocessing import LabelEncoder
-from tensorflow.python.keras.preprocessing.sequence import pad_sequences
+
+try:
+    from tensorflow.keras.preprocessing.sequence import pad_sequences
+except Exception:
+    def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0):
+        if maxlen is None:
+            maxlen = max(len(seq) for seq in sequences)
+        x = np.full((len(sequences), maxlen), value, dtype=dtype)
+        for idx, seq in enumerate(sequences):
+            if truncating == 'pre':
+                trunc = seq[-maxlen:]
+            else:
+                trunc = seq[:maxlen]
+            trunc = np.asarray(trunc, dtype=dtype)
+            if padding == 'post':
+                x[idx, :len(trunc)] = trunc
+            else:
+                x[idx, -len(trunc):] = trunc
+        return x
 
 from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
 from deepctr_torch.models import DeepFM
@@ -308,4 +326,4 @@ if __name__ == "__main__":
     for i, target_name in enumerate(target):
         print("%s test LogLoss" % target_name, round(log_loss(test[target[i]].values, pred_ans[:, i]), 4))
         print("%s test AUC" % target_name, round(roc_auc_score(test[target[i]].values, pred_ans[:, i]), 4))
-```
\ No newline at end of file
+```
diff --git a/docs/source/History.md b/docs/source/History.md
index 4144109f..808b4b6a 100644
--- a/docs/source/History.md
+++ b/docs/source/History.md
@@ -1,4 +1,5 @@
 # History
+- 04/18/2026 : [v0.3.0](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.3.0) released. Improve compatibility for newer environments. Support Python `3.7` ~ `3.13` and modern PyTorch versions (`2.4+`). Remove hard dependency on TensorFlow private callback APIs and add examples smoke tests to CI.
 - 10/22/2022 : [v0.2.9](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.9) released.Add multi-task models: SharedBottom, ESMM, MMOE, PLE.
 - 06/19/2022 : [v0.2.8](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.8) released.Fix some bugs.
 - 06/14/2021 : [v0.2.7](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.7) released.Add [AFN](./Features.html#afn-adaptive-factorization-network-learning-adaptive-order-feature-interactions) and fix some bugs.
@@ -12,4 +13,4 @@
 - 10/03/2019 : [v0.1.3](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.3) released.Simplify the input logic.
 - 09/28/2019 : [v0.1.2](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.2) released.Add [sequence(multi-value) input support](./Examples.html#multi-value-input-movielens).
 - 09/24/2019 : [v0.1.1](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.1) released. Add [CCPM](./Features.html#ccpm-convolutional-click-prediction-model).
-- 09/22/2019 : DeepCTR-Torch first version v0.1.0  is released on [PyPi](https://pypi.org/project/deepctr-torch/)
\ No newline at end of file
+- 09/22/2019 : DeepCTR-Torch first version v0.1.0  is released on [PyPi](https://pypi.org/project/deepctr-torch/)
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 132de990..01dbcbf4 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,7 @@
 # The short X.Y version
 version = ''
 # The full version, including alpha/beta/rc tags
-release = '0.2.9'
+release = '0.3.0'
 
 
 # -- General configuration ---------------------------------------------------
@@ -44,6 +44,7 @@
     'sphinx.ext.ifconfig',
     'sphinx.ext.viewcode',
     'sphinx.ext.githubpages',
+    'myst_parser',
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -52,8 +53,10 @@
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
 #
-source_suffix = ['.rst', '.md']
-#source_suffix = '.rst'
+source_suffix = {
+    '.rst': 'restructuredtext',
+    '.md': 'markdown',
+}
 
 # The master toctree document.
 master_doc = 'index'
@@ -63,7 +66,7 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = 'en'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -90,7 +93,8 @@
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+_static_dir = os.path.join(os.path.dirname(__file__), '_static')
+html_static_path = ['_static'] if os.path.isdir(_static_dir) else []
 
 # Custom sidebar templates, must be a dictionary that maps document names
 # to template names.
@@ -163,10 +167,9 @@
 # -- Extension configuration -------------------------------------------------
 todo_include_todos = False
 html_theme = 'sphinx_rtd_theme'
-
-source_parsers = {
-    '.md': 'recommonmark.parser.CommonMarkParser',
-}
+# Match ReadTheDocs' older navigation behavior by hiding autodoc object
+# entries (class/function anchors) from global toctrees.
+toc_object_entries = False
 
 autodoc_mock_imports = [
-]
\ No newline at end of file
+]
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 564b887f..dfa5b3f1 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -34,6 +34,8 @@ You can read the latest code at https://github.com/shenweichen/DeepCTR-Torch and
 
 News
 -----
+04/18/2026 : Release ``v0.3.0`` with improved compatibility for Python ``3.7`` ~ ``3.13`` and PyTorch ``2.4+`` (``1.13+`` on Python ``3.7``). CI now includes example smoke tests. `Changelog <https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.3.0>`_
+
 10/22/2022 : Add multi-task models: SharedBottom, ESMM, MMOE, PLE. `Changelog <https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.9>`_
 
 06/19/2022 : Fix some bugs.  `Changelog <https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.8>`_
diff --git a/examples/run_classification_criteo.py b/examples/run_classification_criteo.py
index 67fb3d9a..54e94cf3 100644
--- a/examples/run_classification_criteo.py
+++ b/examples/run_classification_criteo.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import os
+
 import pandas as pd
 import torch
 from sklearn.metrics import log_loss, roc_auc_score
@@ -58,7 +60,8 @@
     model.compile("adagrad", "binary_crossentropy",
                   metrics=["binary_crossentropy", "auc"], )
 
-    history = model.fit(train_model_input, train[target].values, batch_size=32, epochs=10, verbose=2,
+    epochs = int(os.getenv("DEEPCTR_EXAMPLE_EPOCHS", "10"))
+    history = model.fit(train_model_input, train[target].values, batch_size=32, epochs=epochs, verbose=2,
                         validation_split=0.2)
     pred_ans = model.predict(test_model_input, 256)
     print("")
diff --git a/examples/run_dien.py b/examples/run_dien.py
index 7d45583d..d21aa339 100644
--- a/examples/run_dien.py
+++ b/examples/run_dien.py
@@ -1,3 +1,4 @@
+import os
 import numpy as np
 import torch
 
@@ -65,4 +66,5 @@ def get_xy_fd(use_neg=False, hash_flag=False):
 
     model.compile('adam', 'binary_crossentropy',
                   metrics=['binary_crossentropy', 'auc'])
-    history = model.fit(x, y, batch_size=2, epochs=10, verbose=1, validation_split=0, shuffle=False)
+    epochs = int(os.getenv("DEEPCTR_EXAMPLE_EPOCHS", "10"))
+    history = model.fit(x, y, batch_size=2, epochs=epochs, verbose=1, validation_split=0, shuffle=False)
diff --git a/examples/run_din.py b/examples/run_din.py
index de716e16..4225182c 100644
--- a/examples/run_din.py
+++ b/examples/run_din.py
@@ -2,6 +2,7 @@
 
 sys.path.insert(0, '..')
 
+import os
 import numpy as np
 import torch
 from deepctr_torch.inputs import (DenseFeat, SparseFeat, VarLenSparseFeat,
@@ -47,4 +48,5 @@ def get_xy_fd():
     model = DIN(feature_columns, behavior_feature_list, device=device, att_weight_normalization=True)
     model.compile('adagrad', 'binary_crossentropy',
                   metrics=['binary_crossentropy'])
-    history = model.fit(x, y, batch_size=3, epochs=10, verbose=2, validation_split=0.0)
+    epochs = int(os.getenv("DEEPCTR_EXAMPLE_EPOCHS", "10"))
+    history = model.fit(x, y, batch_size=3, epochs=epochs, verbose=2, validation_split=0.0)
diff --git a/examples/run_multitask_learning.py b/examples/run_multitask_learning.py
index 567037a5..a93786e6 100644
--- a/examples/run_multitask_learning.py
+++ b/examples/run_multitask_learning.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import os
+
 import pandas as pd
 import torch
 from sklearn.metrics import log_loss, roc_auc_score
@@ -56,7 +58,8 @@
     model.compile("adagrad", loss=["binary_crossentropy", "binary_crossentropy"],
                   metrics=['binary_crossentropy'], )
 
-    history = model.fit(train_model_input, train[target].values, batch_size=32, epochs=10, verbose=2)
+    epochs = int(os.getenv("DEEPCTR_EXAMPLE_EPOCHS", "10"))
+    history = model.fit(train_model_input, train[target].values, batch_size=32, epochs=epochs, verbose=2)
     pred_ans = model.predict(test_model_input, 256)
     print("")
     for i, target_name in enumerate(target):
diff --git a/examples/run_multivalue_movielens.py b/examples/run_multivalue_movielens.py
index 4a892a77..c0edfb7c 100644
--- a/examples/run_multivalue_movielens.py
+++ b/examples/run_multivalue_movielens.py
@@ -1,8 +1,28 @@
+import os
+
 import numpy as np
 import pandas as pd
 import torch
 from sklearn.preprocessing import LabelEncoder
-from tensorflow.python.keras.preprocessing.sequence import pad_sequences
+
+try:
+    from tensorflow.keras.preprocessing.sequence import pad_sequences
+except Exception:
+    def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0):
+        """NumPy stand-in for Keras ``pad_sequences``: returns a ``(len(sequences), maxlen)`` array
+        filled with ``value``; empty sequences (and ``maxlen=0``) yield fully padded rows."""
+        if maxlen is None:
+            maxlen = max((len(seq) for seq in sequences), default=0)
+        x = np.full((len(sequences), maxlen), value, dtype=dtype)
+        for idx, seq in enumerate(sequences):
+            # Use non-negative bounds: ``seq[-0:]`` and ``x[idx, -0:]`` would select everything.
+            trunc = seq[max(len(seq) - maxlen, 0):] if truncating == 'pre' else seq[:maxlen]
+            trunc = np.asarray(trunc, dtype=dtype)
+            if padding == 'post':
+                x[idx, :len(trunc)] = trunc
+            else:
+                x[idx, maxlen - len(trunc):] = trunc
+        return x
 
 from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
 from deepctr_torch.models import DeepFM
@@ -64,4 +84,6 @@ def split(x):
     model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device)
 
     model.compile("adam", "mse", metrics=['mse'], )
-    history = model.fit(model_input, data[target].values, batch_size=256, epochs=10, verbose=2, validation_split=0.2)
+    epochs = int(os.getenv("DEEPCTR_EXAMPLE_EPOCHS", "10"))
+    history = model.fit(model_input, data[target].values, batch_size=256, epochs=epochs, verbose=2,
+                        validation_split=0.2)
diff --git a/examples/run_regression_movielens.py b/examples/run_regression_movielens.py
index f1583a0f..9c3803a4 100644
--- a/examples/run_regression_movielens.py
+++ b/examples/run_regression_movielens.py
@@ -1,3 +1,5 @@
+import os
+
 import pandas as pd
 import torch
 from sklearn.metrics import mean_squared_error
@@ -40,7 +42,8 @@
     model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device)
     model.compile("adam", "mse", metrics=['mse'], )
 
-    history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=10, verbose=2,
+    epochs = int(os.getenv("DEEPCTR_EXAMPLE_EPOCHS", "10"))
+    history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=epochs, verbose=2,
                         validation_split=0.2)
     pred_ans = model.predict(test_model_input, batch_size=256)
     print("test MSE", round(mean_squared_error(
diff --git a/setup.py b/setup.py
index 51d0102b..ac077b47 100644
--- a/setup.py
+++ b/setup.py
@@ -4,12 +4,15 @@
     long_description = fh.read()
 
 REQUIRED_PACKAGES = [
-    'torch>=1.2.0', 'tqdm', 'scikit-learn', 'tensorflow'
+    'torch>=1.13.0; python_version < "3.8"',
+    'torch>=2.4.0; python_version >= "3.8"',
+    'tqdm',
+    'scikit-learn'
 ]
 
 setuptools.setup(
     name="deepctr-torch",
-    version="0.2.9",
+    version="0.3.0",
     author="Weichen Shen",
     author_email="weichenswc@163.com",
     description="Easy-to-use,Modular and Extendible package of deep learning based CTR(Click Through Rate) prediction models with PyTorch",
@@ -19,7 +22,7 @@
     download_url='https://github.com/shenweichen/deepctr-torch/tags',
     packages=setuptools.find_packages(
         exclude=["tests", "tests.models", "tests.layers"]),
-    python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*",  # '>=3.4',  # 3.4.6
+    python_requires=">=3.7",
     install_requires=REQUIRED_PACKAGES,
     extras_require={
 
@@ -33,12 +36,14 @@
         'Intended Audience :: Education',
         'Intended Audience :: Science/Research',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 2.7',
-        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3 :: Only',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
+        'Programming Language :: Python :: 3.13',
         'Topic :: Scientific/Engineering',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
         'Topic :: Software Development',
diff --git a/tests/callbacks_test.py b/tests/callbacks_test.py
new file mode 100644
index 00000000..edbf50bb
--- /dev/null
+++ b/tests/callbacks_test.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+import torch
+import pytest
+
+from deepctr_torch.callbacks import Callback, CallbackList, EarlyStopping, History, ModelCheckpoint
+
+
+class ProbeCallback(Callback):
+    def __init__(self):
+        """Start with an empty ``events`` list; each hook below appends a tuple describing its call."""
+        super(ProbeCallback, self).__init__()
+        self.events = []
+
+    def on_train_begin(self, logs=None):
+        self.events.append(("train_begin", logs))
+
+    def on_train_end(self, logs=None):
+        self.events.append(("train_end", logs))
+
+    def on_epoch_begin(self, epoch, logs=None):
+        self.events.append(("epoch_begin", epoch, logs))
+
+    def on_epoch_end(self, epoch, logs=None):
+        self.events.append(("epoch_end", epoch, logs))
+
+
+class TinyModel(torch.nn.Module):
+    def __init__(self):
+        """Build a single ``Linear(1, 1)`` layer plus the ``stop_training`` flag the tests inspect."""
+        super(TinyModel, self).__init__()
+        self.linear = torch.nn.Linear(1, 1)
+        self.stop_training = False
+
+
+def test_callback_and_callback_list_flow():
+    """Base hooks must be callable, and CallbackList must pass model, params and hooks to each member."""
+    cb = Callback()
+    cb.set_model(TinyModel())
+    cb.set_params(None)
+    cb.on_train_begin()
+    cb.on_epoch_begin(0)
+    cb.on_epoch_end(0)
+    cb.on_train_end()
+    probe_1 = ProbeCallback()
+    probe_2 = ProbeCallback()
+    cb_list = CallbackList([probe_1])
+    cb_list.append(probe_2)
+    model = TinyModel()
+    cb_list.set_model(model)
+    cb_list.set_params(None)
+    cb_list.on_train_begin(logs={"phase": "train"})
+    cb_list.on_epoch_begin(1, logs={"loss": 0.2})
+    cb_list.on_epoch_end(1, logs={"loss": 0.1})
+    cb_list.on_train_end(logs={"done": True})
+
+    assert probe_1.model is model and probe_2.model is model
+    assert probe_1.params == {} and probe_2.params == {}
+    assert ("train_begin", {"phase": "train"}) in probe_1.events
+    assert ("train_end", {"done": True}) in probe_2.events
+
+
+def test_history_records_logs():
+    """History should attach itself to the model and collect one value per epoch for each metric."""
+    history = History()
+    model = TinyModel()
+    history.set_model(model)
+    history.on_train_begin()
+    history.on_epoch_end(0, {"loss": 0.3, "acc": 0.8})
+    history.on_epoch_end(1, {"loss": 0.2, "acc": 0.9})
+    assert model.history is history
+    assert history.epoch == [0, 1]
+    assert history.history["loss"] == [0.3, 0.2]
+    assert history.history["acc"] == [0.8, 0.9]
+
+
+def test_early_stopping_paths(capsys):
+    """EarlyStopping must reject unknown modes, honour ``baseline`` and restore the best weights on stop."""
+    with pytest.raises(ValueError):
+        EarlyStopping(mode="unsupported")
+
+    # mode="min" with a baseline: ``best`` starts at the baseline and a lower value is an improvement.
+    es_min = EarlyStopping(monitor="val_loss", mode="min", baseline=0.5)
+    es_min.on_train_begin()
+    assert es_min.best == 0.5
+    assert es_min._is_improvement(0.4, es_min.best)
+
+    # mode="auto" on an "auc" monitor is expected to maximise; also exercises restore_best_weights.
+    model = TinyModel()
+    with torch.no_grad():
+        model.linear.weight.fill_(1.0)
+    es = EarlyStopping(
+        monitor="val_auc",
+        mode="auto",
+        patience=1,
+        verbose=1,
+        restore_best_weights=True,
+    )
+    es.set_model(model)
+    es.on_train_begin()
+
+    # Logs without the monitored key must not raise.
+    es.on_epoch_end(0, {})
+
+    # First real value counts as an improvement; the weights (1.0) are changed to 2.0 afterwards.
+    es.on_epoch_end(0, {"val_auc": 0.9})
+    with torch.no_grad():
+        model.linear.weight.fill_(2.0)
+
+    # A lower AUC with patience=1 must stop training and bring the 1.0 weights back.
+    es.on_epoch_end(1, {"val_auc": 0.8})
+    es.on_train_end()
+    out = capsys.readouterr().out
+
+    assert model.stop_training is True
+    assert torch.allclose(model.linear.weight, torch.tensor([[1.0]]))
+    assert "early stopping" in out
+
+
+def test_model_checkpoint_paths(tmp_path, capsys):
+    """ModelCheckpoint must reject unknown modes and write files for best-only and periodic saving."""
+    with pytest.raises(ValueError):
+        ModelCheckpoint(filepath=str(tmp_path / "bad.ckpt"), mode="unsupported")
+
+    model = TinyModel()
+    # mode="auto" with an "auc" monitor: only construction and set_model run here, nothing is asserted.
+    ckpt_auto = ModelCheckpoint(filepath=str(tmp_path / "auto.ckpt"), monitor="val_auc", mode="auto")
+    ckpt_auto.set_model(model)
+
+    # save_best_only: logs without the monitored key must not raise; the file exists after later epochs.
+    best_path = tmp_path / "best" / "model.pt"
+    ckpt_best = ModelCheckpoint(
+        filepath=str(best_path),
+        monitor="val_loss",
+        mode="min",
+        verbose=1,
+        save_best_only=True,
+        save_weights_only=True,
+    )
+    ckpt_best.set_model(model)
+    ckpt_best.on_epoch_end(0, {})
+    ckpt_best.on_epoch_end(1, {"val_loss": 0.2})
+    ckpt_best.on_epoch_end(2, {"val_loss": 0.3})
+    assert best_path.exists()
+
+    # period=2 with save_weights_only=False: no file after the first epoch, a file after the second.
+    regular_path = tmp_path / "regular" / "full_model.pt"
+    ckpt_regular = ModelCheckpoint(
+        filepath=str(regular_path),
+        verbose=1,
+        save_best_only=False,
+        save_weights_only=False,
+        period=2,
+    )
+    ckpt_regular.set_model(model)
+    ckpt_regular.on_epoch_end(0, {"loss": 0.2})
+    assert not regular_path.exists()
+    ckpt_regular.on_epoch_end(1, {"loss": 0.1})
+    assert regular_path.exists()
+
+    output = capsys.readouterr().out
+    assert "saving model" in output
diff --git a/tests/ci/examples.sh b/tests/ci/examples.sh
new file mode 100755
index 00000000..eb5a85cf
--- /dev/null
+++ b/tests/ci/examples.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Smoke-test every example script with a reduced epoch count.
+set -euo pipefail
+
+# One epoch is enough for a smoke test; callers can override through the environment.
+export DEEPCTR_EXAMPLE_EPOCHS="${DEEPCTR_EXAMPLE_EPOCHS:-1}"
+
+# Resolve examples/ from this script's location (tests/ci/) so the caller's cwd does not matter.
+examples_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../examples" && pwd)"
+
+scripts=(
+  "run_classification_criteo.py"
+  "run_regression_movielens.py"
+  "run_multitask_learning.py"
+  "run_multivalue_movielens.py"
+  "run_din.py"
+  "run_dien.py"
+)
+
+# Run from inside examples/, as before, since the scripts are invoked by bare file name.
+pushd "${examples_dir}" >/dev/null
+for script in "${scripts[@]}"; do
+  echo "Running example smoke test: ${script} (epochs=${DEEPCTR_EXAMPLE_EPOCHS})"
+  python "${script}"
+done
+popd >/dev/null
diff --git a/tests/ci/install.sh b/tests/ci/install.sh
new file mode 100755
index 00000000..16c29c5f
--- /dev/null
+++ b/tests/ci/install.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Install the CI test dependencies and an editable copy of the package.
+#
+# Environment:
+#   TORCH_VERSION    exact torch version to install (required)
+#   TORCH_INDEX_URL  optional package index used only for the torch install
+set -euo pipefail
+
+# Fail up front, before anything is installed, if the required version is unset or empty.
+: "${TORCH_VERSION:?TORCH_VERSION must be set, e.g. TORCH_VERSION=2.5.1}"
+
+python -m pip install -q --upgrade pip setuptools wheel
+python -m pip install -q "numpy<2"
+
+if [[ -n "${TORCH_INDEX_URL:-}" ]]; then
+  python -m pip install -q --index-url "${TORCH_INDEX_URL}" "torch==${TORCH_VERSION}"
+else
+  python -m pip install -q "torch==${TORCH_VERSION}"
+fi
+
+python -m pip install -q requests pytest pytest-cov python-coveralls pandas
+python -m pip install -e .
+# Verify that the installed packages have compatible, satisfied requirements.
+python -m pip check
diff --git a/tests/ci/test.sh b/tests/ci/test.sh
new file mode 100755
index 00000000..92e1f877
--- /dev/null
+++ b/tests/ci/test.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Run pytest with coverage for deepctr_torch: XML report plus a terminal list of uncovered lines.
+pytest --cov=deepctr_torch --cov-report=xml --cov-report=term-missing:skip-covered
diff --git a/tests/models/DeepFM_test.py b/tests/models/DeepFM_test.py
index a11dc3bd..270c82c0 100644
--- a/tests/models/DeepFM_test.py
+++ b/tests/models/DeepFM_test.py
@@ -34,5 +34,18 @@ def test_DeepFM(use_fm, hidden_size, sparse_feature_num, dense_feature_num):
                    dnn_hidden_units=hidden_size, dnn_dropout=0.5, device=get_device())
     check_model(model, model_name + '_no_linear', x, y)
 
+
+def test_DeepFM_fit_with_column_vector_target():
+    """Fit DeepFM on a target reshaped to ``(n, 1)`` and check that a loss is logged."""
+    x, y, feature_columns = get_test_data(
+        SAMPLE_SIZE, sparse_feature_num=2, dense_feature_num=2)
+
+    model = DeepFM(feature_columns, feature_columns, dnn_hidden_units=(8,), dnn_dropout=0.5, device=get_device())
+    model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
+
+    history = model.fit(x, y.reshape(-1, 1), batch_size=32, epochs=1, verbose=0, validation_split=0.2)
+    assert "loss" in history.history
+
+
 if __name__ == "__main__":
     pass
diff --git a/tests/models/multitask/MMOE_test.py b/tests/models/multitask/MMOE_test.py
index a37fe29c..14206ed3 100644
--- a/tests/models/multitask/MMOE_test.py
+++ b/tests/models/multitask/MMOE_test.py
@@ -29,5 +29,20 @@ def test_MMOE(num_experts, expert_dnn_hidden_units, gate_dnn_hidden_units, tower
     check_mtl_model(model, model_name, x, y_list, task_types)
 
 
+def test_MMOE_batch_size_one_multitask_fit():
+    """Train and predict MMOE on two binary tasks with ``batch_size=1``."""
+    n_samples = 8
+    x, y_list, feature_columns = get_mtl_test_data(
+        n_samples, sparse_feature_num=2, dense_feature_num=1, task_types=['binary', 'binary'])
+
+    model = MMOE(feature_columns, task_types=['binary', 'binary'], device=get_device(use_cuda=False))
+    model.compile('adam', ['binary_crossentropy', 'binary_crossentropy'], metrics=['binary_crossentropy'])
+
+    history = model.fit(x, y_list, batch_size=1, epochs=1, verbose=0)
+    assert "loss" in history.history
+
+    assert model.predict(x, batch_size=1).shape == (n_samples, 2)
+
+
 if __name__ == "__main__":
     pass
diff --git a/tests/models/multitask/PLE_test.py b/tests/models/multitask/PLE_test.py
index ca8561f1..85620985 100644
--- a/tests/models/multitask/PLE_test.py
+++ b/tests/models/multitask/PLE_test.py
@@ -30,5 +30,20 @@ def test_PLE(shared_expert_num, specific_expert_num, num_levels, expert_dnn_hidd
     check_mtl_model(model, model_name, x, y_list, task_types)
 
 
+def test_PLE_batch_size_one_multitask_fit():
+    """Train and predict PLE on two binary tasks with ``batch_size=1``."""
+    n_samples = 8
+    x, y_list, feature_columns = get_mtl_test_data(
+        n_samples, sparse_feature_num=2, dense_feature_num=1, task_types=['binary', 'binary'])
+
+    model = PLE(feature_columns, task_types=['binary', 'binary'], device=get_device(use_cuda=False))
+    model.compile('adam', ['binary_crossentropy', 'binary_crossentropy'], metrics=['binary_crossentropy'])
+
+    history = model.fit(x, y_list, batch_size=1, epochs=1, verbose=0)
+    assert "loss" in history.history
+
+    assert model.predict(x, batch_size=1).shape == (n_samples, 2)
+
+
 if __name__ == "__main__":
     pass
diff --git a/tests/utils.py b/tests/utils.py
index 28f3010b..9665897a 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -10,6 +10,14 @@
 SAMPLE_SIZE = 64
 
 
+def _torch_load_compat(filepath):
+    """Load ``filepath`` onto the CPU with ``weights_only=False`` whenever ``torch.load`` accepts it."""
+    try:
+        return torch.load(filepath, map_location="cpu", weights_only=False)
+    except TypeError:  # presumably a torch.load without the ``weights_only`` keyword
+        return torch.load(filepath, map_location="cpu")
+
+
 def gen_sequence(dim, max_len, sample_size):
     return np.array([np.random.randint(0, dim, max_len) for _ in range(sample_size)]), np.random.randint(1, max_len + 1,
                                                                                                          sample_size)
@@ -160,12 +168,12 @@ def check_model(model, model_name, x, y, check_model_io=True):
 
     print(model_name + 'test, train valid pass!')
     torch.save(model.state_dict(), model_name + '_weights.h5')
-    model.load_state_dict(torch.load(model_name + '_weights.h5'))
+    model.load_state_dict(_torch_load_compat(model_name + '_weights.h5'))
     os.remove(model_name + '_weights.h5')
     print(model_name + 'test save load weight pass!')
     if check_model_io:
         torch.save(model, model_name + '.h5')
-        model = torch.load(model_name + '.h5')
+        model = _torch_load_compat(model_name + '.h5')
         os.remove(model_name + '.h5')
         print(model_name + 'test save load model pass!')
     print(model_name + 'test pass!')
diff --git a/tests/utils_mtl.py b/tests/utils_mtl.py
index 61020cf1..3b03a548 100644
--- a/tests/utils_mtl.py
+++ b/tests/utils_mtl.py
@@ -10,6 +10,14 @@
 SAMPLE_SIZE = 64
 
 
+def _torch_load_compat(filepath):
+    """Load ``filepath`` onto the CPU with ``weights_only=False`` whenever ``torch.load`` accepts it."""
+    try:
+        return torch.load(filepath, map_location="cpu", weights_only=False)
+    except TypeError:  # presumably a torch.load without the ``weights_only`` keyword
+        return torch.load(filepath, map_location="cpu")
+
+
 def gen_sequence(dim, max_len, sample_size):
     return np.array([np.random.randint(0, dim, max_len) for _ in range(sample_size)]), np.random.randint(1, max_len + 1,
                                                                                                          sample_size)
@@ -101,12 +109,12 @@ def check_mtl_model(model, model_name, x, y_list, task_types, check_model_io=Tru
 
     print(model_name + 'test, train valid pass!')
     torch.save(model.state_dict(), model_name + '_weights.h5')
-    model.load_state_dict(torch.load(model_name + '_weights.h5'))
+    model.load_state_dict(_torch_load_compat(model_name + '_weights.h5'))
     os.remove(model_name + '_weights.h5')
     print(model_name + 'test save load weight pass!')
     if check_model_io:
         torch.save(model, model_name + '.h5')
-        model = torch.load(model_name + '.h5')
+        model = _torch_load_compat(model_name + '.h5')
         os.remove(model_name + '.h5')
         print(model_name + 'test save load model pass!')
     print(model_name + 'test pass!')