fix(multitask): handle batch_size=1 safely in fit/gating

shenweichen · shenweichen · commit e1026c1e5777 · 2026-04-18T17:57:17.000+08:00
- allocate the Linear logit accumulator with X.new_zeros so it is created on the input tensor's device, avoiding cross-device errors

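- replace the unconditional squeeze() in BaseModel.fit with a guarded squeeze(-1) that only runs for single-task models, so multi-task outputs keep their (batch, num_tasks) shape when batch_size=1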

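- use squeeze(1) instead of squeeze() on the MMOE/PLE gate-weighted expert outputs, so only the singleton dim 1 of (bs, 1, dim) is dropped and the batch dim survives when batch_size=1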

- add batch_size=1 regression tests for MMOE and PLE
diff --git a/deepctr_torch/models/basemodel.py b/deepctr_torch/models/basemodel.py
@@ -71,7 +71,8 @@ def forward(self, X, sparse_feat_refine_weight=None):
 
         sparse_embedding_list += varlen_embedding_list
 
-        linear_logit = torch.zeros([X.shape[0], 1]).to(self.device)
+        # new_zeros allocates on X's device (and with X's dtype), so the accumulator never crosses devices.
+        linear_logit = X.new_zeros((X.shape[0], 1))
         if len(sparse_embedding_list) > 0:
             sparse_embedding_cat = torch.cat(sparse_embedding_list, dim=-1)
             if sparse_feat_refine_weight is not None:
@@ -237,7 +238,9 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, initial_epoc
                         x = x_train.to(self.device).float()
                         y = y_train.to(self.device).float()
 
-                        y_pred = model(x).squeeze()
+                        y_pred = model(x)
+                        if self.num_tasks == 1 and y_pred.ndim > 1 and y_pred.shape[-1] == 1:
+                            y_pred = y_pred.squeeze(-1)
 
                         optim.zero_grad()
                         if isinstance(loss_func, list):
@@ -246,7 +249,10 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, initial_epoc
                             loss = sum(
                                 [loss_func[i](y_pred[:, i], y[:, i], reduction='sum') for i in range(self.num_tasks)])
                         else:
-                            loss = loss_func(y_pred, y.squeeze(), reduction='sum')
+                            y_for_loss = y
+                            if y_for_loss.ndim > 1 and y_for_loss.shape[-1] == 1:
+                                y_for_loss = y_for_loss.squeeze(-1)
+                            loss = loss_func(y_pred, y_for_loss, reduction='sum')
                         reg_loss = self.get_regularization_loss()
 
                         total_loss = loss + reg_loss + self.aux_loss
diff --git a/deepctr_torch/models/multitask/mmoe.py b/deepctr_torch/models/multitask/mmoe.py
@@ -127,7 +127,7 @@ def forward(self, X):
             else:
                 gate_dnn_out = self.gate_dnn_final_layer[i](dnn_input)
             gate_mul_expert = torch.matmul(gate_dnn_out.softmax(1).unsqueeze(1), expert_outs)  # (bs, 1, dim)
-            mmoe_outs.append(gate_mul_expert.squeeze())
+            mmoe_outs.append(gate_mul_expert.squeeze(1))
 
         # tower dnn (task-specific)
         task_outs = []
diff --git a/deepctr_torch/models/multitask/ple.py b/deepctr_torch/models/multitask/ple.py
@@ -177,7 +177,7 @@ def cgc_net(self, inputs, level_num):
             else:
                 gate_dnn_out = self.specific_gate_dnn_final_layer[level_num][i](inputs[i])
             gate_mul_expert = torch.matmul(gate_dnn_out.softmax(1).unsqueeze(1), cur_experts_outputs)  # (bs, 1, dim)
-            cgc_outs.append(gate_mul_expert.squeeze())
+            cgc_outs.append(gate_mul_expert.squeeze(1))
 
         # gates for shared experts
         cur_experts_outputs = specific_expert_outputs + shared_expert_outputs
@@ -189,7 +189,7 @@ def cgc_net(self, inputs, level_num):
         else:
             gate_dnn_out = self.shared_gate_dnn_final_layer[level_num](inputs[-1])
         gate_mul_expert = torch.matmul(gate_dnn_out.softmax(1).unsqueeze(1), cur_experts_outputs)  # (bs, 1, dim)
-        cgc_outs.append(gate_mul_expert.squeeze())
+        cgc_outs.append(gate_mul_expert.squeeze(1))
 
         return cgc_outs
 
diff --git a/tests/models/multitask/MMOE_test.py b/tests/models/multitask/MMOE_test.py
@@ -29,5 +29,20 @@ def test_MMOE(num_experts, expert_dnn_hidden_units, gate_dnn_hidden_units, tower
     check_mtl_model(model, model_name, x, y_list, task_types)
 
 
+def test_MMOE_batch_size_one_multitask_fit():
+    sample_size = 8
+    x, y_list, feature_columns = get_mtl_test_data(
+        sample_size, sparse_feature_num=2, dense_feature_num=1, task_types=['binary', 'binary'])
+
+    model = MMOE(feature_columns, task_types=['binary', 'binary'], device=get_device(use_cuda=False))
+    model.compile('adam', ['binary_crossentropy', 'binary_crossentropy'], metrics=['binary_crossentropy'])
+
+    history = model.fit(x, y_list, batch_size=1, epochs=1, verbose=0)
+    assert "loss" in history.history
+
+    pred = model.predict(x, batch_size=1)
+    assert pred.shape == (sample_size, 2)
+
+
 if __name__ == "__main__":
     pass
diff --git a/tests/models/multitask/PLE_test.py b/tests/models/multitask/PLE_test.py
@@ -30,5 +30,20 @@ def test_PLE(shared_expert_num, specific_expert_num, num_levels, expert_dnn_hidd
     check_mtl_model(model, model_name, x, y_list, task_types)
 
 
+def test_PLE_batch_size_one_multitask_fit():
+    sample_size = 8
+    x, y_list, feature_columns = get_mtl_test_data(
+        sample_size, sparse_feature_num=2, dense_feature_num=1, task_types=['binary', 'binary'])
+
+    model = PLE(feature_columns, task_types=['binary', 'binary'], device=get_device(use_cuda=False))
+    model.compile('adam', ['binary_crossentropy', 'binary_crossentropy'], metrics=['binary_crossentropy'])
+
+    history = model.fit(x, y_list, batch_size=1, epochs=1, verbose=0)
+    assert "loss" in history.history
+
+    pred = model.predict(x, batch_size=1)
+    assert pred.shape == (sample_size, 2)
+
+
 if __name__ == "__main__":
     pass