Skip to content

Commit fe20993

Browse files
authored
Disable qdq to mnb fusion in test_mnb_to_qdq (microsoft#2429)
## Describe your changes

Latest ORT has QDQ to MatMulNBits rules for more cases now. We want to disable this fusion in the `test_mnb_to_qdq` test since we are trying to compare the original MNB model with the replacement QDQ model.

## Checklist before requesting a review

- [ ] Add unit tests for this change.
- [ ] Make sure all tests can pass.
- [ ] Update documents if necessary.
- [ ] Lint and apply fixes to your code by running `lintrunner -a`
- [ ] Is this a user-facing change? If yes, give a description of this change to be included in the release notes.

## (Optional) Issue link
1 parent 39d795f commit fe20993

1 file changed

Lines changed: 6 additions & 10 deletions

File tree

test/passes/onnx/test_mnb_to_qdq.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -148,25 +148,21 @@ def test_mnb_to_qdq(create_mnb_model, nodes_to_exclude, add_zero_point, use_sign
148148
# validate
149149
original_session = onnxruntime.InferenceSession(str(mnb_path))
150150
original_session.disable_fallback()
151+
# disable qdq to mnb fusion so we can test the output of the DQ nodes directly
152+
disabled_optimizers = ["QDQSelectorActionTransformer"]
151153
if is_symmetric and use_signed_int and not add_zero_point and use_transpose_op:
152154
# there seems to be a bug in ORT graph optimization which changes the int4 DQ to uint8 DQ
153155
with pytest.raises(Exception, match="uint8"):
154-
onnxruntime.InferenceSession(str(qdq_model.model_path))
156+
onnxruntime.InferenceSession(str(qdq_model.model_path), disabled_optimizers=disabled_optimizers)
155157
return
156158
else:
157-
qdq_session = onnxruntime.InferenceSession(str(qdq_model.model_path))
159+
qdq_session = onnxruntime.InferenceSession(str(qdq_model.model_path), disabled_optimizers=disabled_optimizers)
158160
qdq_session.disable_fallback()
159161

160162
input_data = {"input": np.random.randn(1, 1, in_dim).astype(np.float32)}
161163
original_output = original_session.run(None, input_data)[0]
162164
qdq_output = qdq_session.run(None, input_data)[0]
163165
assert original_output.shape == qdq_output.shape
164166
assert original_output.dtype == qdq_output.dtype
165-
if bits == 4 and not use_transpose_op:
166-
# Pre transposed DQ model does not match the expected output on x64 CPU
167-
# check for assertion failure so we know when the test is fixed
168-
with pytest.raises(AssertionError):
169-
np.testing.assert_allclose(original_output, qdq_output, atol=1e-4)
170-
else:
171-
# acc level 4 is used for 8 bit, so the tolerance is higher
172-
np.testing.assert_allclose(original_output, qdq_output, atol=1e-2 if bits == 8 else 1e-4)
167+
# acc level 4 is used for 8 bit, so the tolerance is higher
168+
np.testing.assert_allclose(original_output, qdq_output, atol=1e-2 if bits == 8 else 1e-4)

0 commit comments

Comments (0)