From f2ba5a64afd1746a937b0e803d146524bd4591a5 Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe@gmail.com>
Date: Wed, 11 Mar 2026 18:41:44 +0300
Subject: [PATCH 01/12] [Matrix] Add tests for fma

---
 test/Feature/HLSLLib/fma.fp64.test | 162 +++++++++++++++++++++++++++++
 1 file changed, 162 insertions(+)
 create mode 100644 test/Feature/HLSLLib/fma.fp64.test
diff --git a/test/Feature/HLSLLib/fma.fp64.test b/test/Feature/HLSLLib/fma.fp64.test
new file mode 100644
index 000000000..23dad6dae
--- /dev/null
+++ b/test/Feature/HLSLLib/fma.fp64.test
@@ -0,0 +1,162 @@
+#--- source.hlsl
+
+StructuredBuffer<double4> A : register(t0);
+StructuredBuffer<double4> B : register(t1);
+StructuredBuffer<double4> C : register(t2);
+
+RWStructuredBuffer<double> Out : register(u3);
+
+[numthreads(1,1,1)]
+void main() {
+  Out[0] = fma(A[0].x, B[0].x, C[0].x);
+
+  double2 v2 = fma(A[1].zw, B[1].zw, C[1].zw);
+  Out[8] = v2.x;
+  Out[9] = v2.y;
+
+  double3 v3 = fma(A[1].xyz, B[1].xyz, C[1].xyz);
+  Out[5] = v3.x;
+  Out[6] = v3.y;
+  Out[7] = v3.z;
+
+  double4 v4 = fma(A[0], B[0], C[0]);
+  Out[1] = v4.x;
+  Out[2] = v4.y;
+  Out[3] = v4.z;
+  Out[4] = v4.w;
+
+  double2x2 m22a = double2x2(A[2].x, A[2].y, A[2].z, A[2].w);
+  double2x2 m22b = double2x2(B[2].x, B[2].y, B[2].z, B[2].w);
+  double2x2 m22c = double2x2(C[2].x, C[2].y, C[2].z, C[2].w);
+  double2x2 r22 = fma(m22a, m22b, m22c);
+  Out[10] = r22[0][0];
+  Out[11] = r22[0][1];
+  Out[12] = r22[1][0];
+  Out[13] = r22[1][1];
+
+  double4x4 m44a = double4x4(A[3].x, A[3].y, A[3].z, A[3].w,
+                             A[4].x, A[4].y, A[4].z, A[4].w,
+                             A[5].x, A[5].y, A[5].z, A[5].w,
+                             A[6].x, A[6].y, A[6].z, A[6].w);
+  double4x4 m44b = double4x4(B[3].x, B[3].y, B[3].z, B[3].w,
+                             B[4].x, B[4].y, B[4].z, B[4].w,
+                             B[5].x, B[5].y, B[5].z, B[5].w,
+                             B[6].x, B[6].y, B[6].z, B[6].w);
+  double4x4 m44c = double4x4(C[3].x, C[3].y, C[3].z, C[3].w,
+                             C[4].x, C[4].y, C[4].z, C[4].w,
+                             C[5].x, C[5].y, C[5].z, C[5].w,
+                             C[6].x, C[6].y, C[6].z, C[6].w);
+  double4x4 r44 = fma(m44a, m44b, m44c);
+  Out[14] = r44[0][0];
+  Out[15] = r44[0][1];
+  Out[16] = r44[0][2];
+  Out[17] = r44[0][3];
+  Out[18] = r44[1][0];
+  Out[19] = r44[1][1];
+  Out[20] = r44[1][2];
+  Out[21] = r44[1][3];
+  Out[22] = r44[2][0];
+  Out[23] = r44[2][1];
+  Out[24] = r44[2][2];
+  Out[25] = r44[2][3];
+  Out[26] = r44[3][0];
+  Out[27] = r44[3][1];
+  Out[28] = r44[3][2];
+  Out[29] = r44[3][3];
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: A
+    Format: Float64
+    Stride: 32
+    Data: [ 2, 3, 4, 5,
+            1, 2, 3, 4,
+            1, 2, 3, 4,
+            1, 2, 3, 4,
+            5, 6, 7, 8,
+            9, 10, 11, 12,
+            13, 14, 15, 16 ]
+  - Name: B
+    Format: Float64
+    Stride: 32
+    Data: [ 10, 20, 30, 40,
+            2, 3, 4, 5,
+            10, 20, 30, 40,
+            2, 2, 2, 2,
+            2, 2, 2, 2,
+            2, 2, 2, 2,
+            2, 2, 2, 2 ]
+  - Name: C
+    Format: Float64
+    Stride: 32
+    Data: [ 1, 2, 3, 4,
+            -1, -2, -3, -4,
+            -1, -2, -3, -4,
+            1, 1, 1, 1,
+            1, 1, 1, 1,
+            1, 1, 1, 1,
+            1, 1, 1, 1 ]
+  - Name: Out
+    Format: Float64
+    Stride: 8
+    FillSize: 240
+  - Name: Expected
+    Format: Float64
+    Stride: 8
+    Data: [ 21,
+            21, 62, 123, 204,
+            1, 4, 9,
+            9, 16,
+            9, 38, 87, 156,
+            3, 5, 7, 9,
+            11, 13, 15, 17,
+            19, 21, 23, 25,
+            27, 29, 31, 33 ]
+Results:
+  - Result: Result
+    Rule: BufferExact
+    Actual: Out
+    Expected: Expected
+DescriptorSets:
+  - Resources:
+    - Name: A
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: B
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: C
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+#--- end
+
+# REQUIRES: Double
+# RUN: split-file %s %t
+# RUN: %dxc_target -Gis -HV 202x -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o

From 3983163ea3bf6d89d0aa638735df65a95d6cf724 Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe@gmail.com>
Date: Wed, 11 Mar 2026 19:04:16 +0300
Subject: [PATCH 02/12] use non-integer double inputs in fma tests + cover
 edge cases (NaN, Inf, -0.0, single-rounding precision)

---
 test/Feature/HLSLLib/fma.fp64.test | 135 ++++++++++++++++-------------
 1 file changed, 74 insertions(+), 61 deletions(-)

diff --git a/test/Feature/HLSLLib/fma.fp64.test b/test/Feature/HLSLLib/fma.fp64.test
index 23dad6dae..25c74ffa8 100644
--- a/test/Feature/HLSLLib/fma.fp64.test
+++ b/test/Feature/HLSLLib/fma.fp64.test
@@ -8,31 +8,18 @@ RWStructuredBuffer<double> Out : register(u3);
 
 [numthreads(1,1,1)]
 void main() {
-  Out[0] = fma(A[0].x, B[0].x, C[0].x);
+  double s = fma(A[0].x, B[0].x, C[0].x);
 
   double2 v2 = fma(A[1].zw, B[1].zw, C[1].zw);
-  Out[8] = v2.x;
-  Out[9] = v2.y;
 
   double3 v3 = fma(A[1].xyz, B[1].xyz, C[1].xyz);
-  Out[5] = v3.x;
-  Out[6] = v3.y;
-  Out[7] = v3.z;
 
   double4 v4 = fma(A[0], B[0], C[0]);
-  Out[1] = v4.x;
-  Out[2] = v4.y;
-  Out[3] = v4.z;
-  Out[4] = v4.w;
 
   double2x2 m22a = double2x2(A[2].x, A[2].y, A[2].z, A[2].w);
   double2x2 m22b = double2x2(B[2].x, B[2].y, B[2].z, B[2].w);
   double2x2 m22c = double2x2(C[2].x, C[2].y, C[2].z, C[2].w);
   double2x2 r22 = fma(m22a, m22b, m22c);
-  Out[10] = r22[0][0];
-  Out[11] = r22[0][1];
-  Out[12] = r22[1][0];
-  Out[13] = r22[1][1];
 
   double4x4 m44a = double4x4(A[3].x, A[3].y, A[3].z, A[3].w,
                              A[4].x, A[4].y, A[4].z, A[4].w,
@@ -47,22 +34,43 @@ void main() {
                              C[5].x, C[5].y, C[5].z, C[5].w,
                              C[6].x, C[6].y, C[6].z, C[6].w);
   double4x4 r44 = fma(m44a, m44b, m44c);
-  Out[14] = r44[0][0];
-  Out[15] = r44[0][1];
-  Out[16] = r44[0][2];
-  Out[17] = r44[0][3];
-  Out[18] = r44[1][0];
-  Out[19] = r44[1][1];
-  Out[20] = r44[1][2];
-  Out[21] = r44[1][3];
-  Out[22] = r44[2][0];
-  Out[23] = r44[2][1];
-  Out[24] = r44[2][2];
-  Out[25] = r44[2][3];
-  Out[26] = r44[3][0];
-  Out[27] = r44[3][1];
-  Out[28] = r44[3][2];
-  Out[29] = r44[3][3];
+  double4 precise = fma(A[7], B[7], C[7]);
+  uint idx = 0;
+
+  Out[idx++] = s;
+  Out[idx++] = v4.x;
+  Out[idx++] = v4.y;
+  Out[idx++] = v4.z;
+  Out[idx++] = v4.w;
+  Out[idx++] = v3.x;
+  Out[idx++] = v3.y;
+  Out[idx++] = v3.z;
+  Out[idx++] = v2.x;
+  Out[idx++] = v2.y;
+  Out[idx++] = r22[0][0];
+  Out[idx++] = r22[0][1];
+  Out[idx++] = r22[1][0];
+  Out[idx++] = r22[1][1];
+  Out[idx++] = r44[0][0];
+  Out[idx++] = r44[0][1];
+  Out[idx++] = r44[0][2];
+  Out[idx++] = r44[0][3];
+  Out[idx++] = r44[1][0];
+  Out[idx++] = r44[1][1];
+  Out[idx++] = r44[1][2];
+  Out[idx++] = r44[1][3];
+  Out[idx++] = r44[2][0];
+  Out[idx++] = r44[2][1];
+  Out[idx++] = r44[2][2];
+  Out[idx++] = r44[2][3];
+  Out[idx++] = r44[3][0];
+  Out[idx++] = r44[3][1];
+  Out[idx++] = r44[3][2];
+  Out[idx++] = r44[3][3];
+  Out[idx++] = precise.x;
+  Out[idx++] = precise.y;
+  Out[idx++] = precise.z;
+  Out[idx++] = precise.w;
 }
 
 //--- pipeline.yaml
@@ -76,49 +84,54 @@ Buffers:
   - Name: A
     Format: Float64
     Stride: 32
-    Data: [ 2, 3, 4, 5,
-            1, 2, 3, 4,
-            1, 2, 3, 4,
-            1, 2, 3, 4,
-            5, 6, 7, 8,
-            9, 10, 11, 12,
-            13, 14, 15, 16 ]
+    Data: [ NaN, -Inf, Inf, -0.0,
+            0.25, -0.25, 10.4, -10.6,
+            1.5, -2.5, 0.5, -0.5,
+            NaN, -Inf, Inf, -0.0,
+            0.0, 10.0, -10.0, 10.5,
+            -10.5, 0.25, -0.25, 42.5,
+            1e+200, -1e+200, 1e-200, -1e-200,
+            0x1.0000000000001p+0, 0x1p+500, 0x1.8p+500, 0x1p-500 ]
   - Name: B
     Format: Float64
     Stride: 32
-    Data: [ 10, 20, 30, 40,
-            2, 3, 4, 5,
-            10, 20, 30, 40,
-            2, 2, 2, 2,
-            2, 2, 2, 2,
-            2, 2, 2, 2,
-            2, 2, 2, 2 ]
+    Data: [ 1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            2.0, -1.0, 4.0, 4.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 2.0, -1.0, 1.0,
+            1.0, 4.0, 4.0, 2.0,
+            1.0, 1.0, 1.0, 1.0,
+            0x1.ffffffffffffep-1, 0x1p-500, 0x1p-500, 1.0 ]
   - Name: C
     Format: Float64
     Stride: 32
-    Data: [ 1, 2, 3, 4,
-            -1, -2, -3, -4,
-            -1, -2, -3, -4,
-            1, 1, 1, 1,
-            1, 1, 1, 1,
-            1, 1, 1, 1,
-            1, 1, 1, 1 ]
+    Data: [ 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0,
+            0.25, 0.75, -1.5, 1.5,
+            0.0, 0.0, 0.0, 0.0,
+            0.0, 0.5, 0.5, 0.0,
+            0.0, 0.5, -0.5, -0.5,
+            0.0, 0.0, 0.0, 0.0,
+            -1.0, 0x1p-52, -0x1p-52, 0x1p-500 ]
   - Name: Out
     Format: Float64
     Stride: 8
-    FillSize: 240
+    FillSize: 272
   - Name: Expected
     Format: Float64
     Stride: 8
-    Data: [ 21,
-            21, 62, 123, 204,
-            1, 4, 9,
-            9, 16,
-            9, 38, 87, 156,
-            3, 5, 7, 9,
-            11, 13, 15, 17,
-            19, 21, 23, 25,
-            27, 29, 31, 33 ]
+    Data: [ NaN,
+            NaN, -Inf, Inf, 0.0,
+            0.25, -0.25, 10.4,
+            10.4, -10.6,
+            3.25, 3.25, 0.5, -0.5,
+            NaN, -Inf, Inf, 0.0,
+            0.0, 20.5, 10.5, 10.5,
+            -10.5, 1.5, -1.5, 84.5,
+            1e+200, -1e+200, 1e-200, -1e-200,
+            -0x1p-104, 0x1.0000000000001p+0,
+            0x1.7ffffffffffffp+0, 0x1p-499 ]
 Results:
   - Result: Result
     Rule: BufferExact

From d1b8197db5d0d0ba3b95f3c15885c4696c3f0826 Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe@gmail.com>
Date: Thu, 12 Mar 2026 18:46:39 +0300
Subject: [PATCH 03/12] rename fma result variable `precise` to `v4p`

---
 test/Feature/HLSLLib/fma.fp64.test | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/Feature/HLSLLib/fma.fp64.test b/test/Feature/HLSLLib/fma.fp64.test
index 25c74ffa8..3af6d8f84 100644
--- a/test/Feature/HLSLLib/fma.fp64.test
+++ b/test/Feature/HLSLLib/fma.fp64.test
@@ -34,7 +34,7 @@ void main() {
                              C[5].x, C[5].y, C[5].z, C[5].w,
                              C[6].x, C[6].y, C[6].z, C[6].w);
   double4x4 r44 = fma(m44a, m44b, m44c);
-  double4 precise = fma(A[7], B[7], C[7]);
+  double4 v4p = fma(A[7], B[7], C[7]);
   uint idx = 0;
 
   Out[idx++] = s;
@@ -67,10 +67,10 @@ void main() {
   Out[idx++] = r44[3][1];
   Out[idx++] = r44[3][2];
   Out[idx++] = r44[3][3];
-  Out[idx++] = precise.x;
-  Out[idx++] = precise.y;
-  Out[idx++] = precise.z;
-  Out[idx++] = precise.w;
+  Out[idx++] = v4p.x;
+  Out[idx++] = v4p.y;
+  Out[idx++] = v4p.z;
+  Out[idx++] = v4p.w;
 }
 
 //--- pipeline.yaml

From 8440721f6ab4fa190bf861228b923b5d9f535ef4 Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe@gmail.com>
Date: Wed, 18 Mar 2026 22:55:06 +0300
Subject: [PATCH 04/12] implement suggestions: split fma.fp64.test into
 fma.test and fma.matrix.test

---
 test/Feature/HLSLLib/fma.fp64.test   | 175 ---------------------------
 test/Feature/HLSLLib/fma.matrix.test | 142 ++++++++++++++++++++++
 test/Feature/HLSLLib/fma.test        |  98 +++++++++++++++
 3 files changed, 240 insertions(+), 175 deletions(-)
 delete mode 100644 test/Feature/HLSLLib/fma.fp64.test
 create mode 100644 test/Feature/HLSLLib/fma.matrix.test
 create mode 100644 test/Feature/HLSLLib/fma.test

diff --git a/test/Feature/HLSLLib/fma.fp64.test b/test/Feature/HLSLLib/fma.fp64.test
deleted file mode 100644
index 3af6d8f84..000000000
--- a/test/Feature/HLSLLib/fma.fp64.test
+++ /dev/null
@@ -1,175 +0,0 @@
-#--- source.hlsl
-
-StructuredBuffer<double4> A : register(t0);
-StructuredBuffer<double4> B : register(t1);
-StructuredBuffer<double4> C : register(t2);
-
-RWStructuredBuffer<double> Out : register(u3);
-
-[numthreads(1,1,1)]
-void main() {
-  double s = fma(A[0].x, B[0].x, C[0].x);
-
-  double2 v2 = fma(A[1].zw, B[1].zw, C[1].zw);
-
-  double3 v3 = fma(A[1].xyz, B[1].xyz, C[1].xyz);
-
-  double4 v4 = fma(A[0], B[0], C[0]);
-
-  double2x2 m22a = double2x2(A[2].x, A[2].y, A[2].z, A[2].w);
-  double2x2 m22b = double2x2(B[2].x, B[2].y, B[2].z, B[2].w);
-  double2x2 m22c = double2x2(C[2].x, C[2].y, C[2].z, C[2].w);
-  double2x2 r22 = fma(m22a, m22b, m22c);
-
-  double4x4 m44a = double4x4(A[3].x, A[3].y, A[3].z, A[3].w,
-                             A[4].x, A[4].y, A[4].z, A[4].w,
-                             A[5].x, A[5].y, A[5].z, A[5].w,
-                             A[6].x, A[6].y, A[6].z, A[6].w);
-  double4x4 m44b = double4x4(B[3].x, B[3].y, B[3].z, B[3].w,
-                             B[4].x, B[4].y, B[4].z, B[4].w,
-                             B[5].x, B[5].y, B[5].z, B[5].w,
-                             B[6].x, B[6].y, B[6].z, B[6].w);
-  double4x4 m44c = double4x4(C[3].x, C[3].y, C[3].z, C[3].w,
-                             C[4].x, C[4].y, C[4].z, C[4].w,
-                             C[5].x, C[5].y, C[5].z, C[5].w,
-                             C[6].x, C[6].y, C[6].z, C[6].w);
-  double4x4 r44 = fma(m44a, m44b, m44c);
-  double4 v4p = fma(A[7], B[7], C[7]);
-  uint idx = 0;
-
-  Out[idx++] = s;
-  Out[idx++] = v4.x;
-  Out[idx++] = v4.y;
-  Out[idx++] = v4.z;
-  Out[idx++] = v4.w;
-  Out[idx++] = v3.x;
-  Out[idx++] = v3.y;
-  Out[idx++] = v3.z;
-  Out[idx++] = v2.x;
-  Out[idx++] = v2.y;
-  Out[idx++] = r22[0][0];
-  Out[idx++] = r22[0][1];
-  Out[idx++] = r22[1][0];
-  Out[idx++] = r22[1][1];
-  Out[idx++] = r44[0][0];
-  Out[idx++] = r44[0][1];
-  Out[idx++] = r44[0][2];
-  Out[idx++] = r44[0][3];
-  Out[idx++] = r44[1][0];
-  Out[idx++] = r44[1][1];
-  Out[idx++] = r44[1][2];
-  Out[idx++] = r44[1][3];
-  Out[idx++] = r44[2][0];
-  Out[idx++] = r44[2][1];
-  Out[idx++] = r44[2][2];
-  Out[idx++] = r44[2][3];
-  Out[idx++] = r44[3][0];
-  Out[idx++] = r44[3][1];
-  Out[idx++] = r44[3][2];
-  Out[idx++] = r44[3][3];
-  Out[idx++] = v4p.x;
-  Out[idx++] = v4p.y;
-  Out[idx++] = v4p.z;
-  Out[idx++] = v4p.w;
-}
-
-//--- pipeline.yaml
-
----
-Shaders:
-  - Stage: Compute
-    Entry: main
-    DispatchSize: [1, 1, 1]
-Buffers:
-  - Name: A
-    Format: Float64
-    Stride: 32
-    Data: [ NaN, -Inf, Inf, -0.0,
-            0.25, -0.25, 10.4, -10.6,
-            1.5, -2.5, 0.5, -0.5,
-            NaN, -Inf, Inf, -0.0,
-            0.0, 10.0, -10.0, 10.5,
-            -10.5, 0.25, -0.25, 42.5,
-            1e+200, -1e+200, 1e-200, -1e-200,
-            0x1.0000000000001p+0, 0x1p+500, 0x1.8p+500, 0x1p-500 ]
-  - Name: B
-    Format: Float64
-    Stride: 32
-    Data: [ 1.0, 1.0, 1.0, 1.0,
-            1.0, 1.0, 1.0, 1.0,
-            2.0, -1.0, 4.0, 4.0,
-            1.0, 1.0, 1.0, 1.0,
-            1.0, 2.0, -1.0, 1.0,
-            1.0, 4.0, 4.0, 2.0,
-            1.0, 1.0, 1.0, 1.0,
-            0x1.ffffffffffffep-1, 0x1p-500, 0x1p-500, 1.0 ]
-  - Name: C
-    Format: Float64
-    Stride: 32
-    Data: [ 0.0, 0.0, 0.0, 0.0,
-            0.0, 0.0, 0.0, 0.0,
-            0.25, 0.75, -1.5, 1.5,
-            0.0, 0.0, 0.0, 0.0,
-            0.0, 0.5, 0.5, 0.0,
-            0.0, 0.5, -0.5, -0.5,
-            0.0, 0.0, 0.0, 0.0,
-            -1.0, 0x1p-52, -0x1p-52, 0x1p-500 ]
-  - Name: Out
-    Format: Float64
-    Stride: 8
-    FillSize: 272
-  - Name: Expected
-    Format: Float64
-    Stride: 8
-    Data: [ NaN,
-            NaN, -Inf, Inf, 0.0,
-            0.25, -0.25, 10.4,
-            10.4, -10.6,
-            3.25, 3.25, 0.5, -0.5,
-            NaN, -Inf, Inf, 0.0,
-            0.0, 20.5, 10.5, 10.5,
-            -10.5, 1.5, -1.5, 84.5,
-            1e+200, -1e+200, 1e-200, -1e-200,
-            -0x1p-104, 0x1.0000000000001p+0,
-            0x1.7ffffffffffffp+0, 0x1p-499 ]
-Results:
-  - Result: Result
-    Rule: BufferExact
-    Actual: Out
-    Expected: Expected
-DescriptorSets:
-  - Resources:
-    - Name: A
-      Kind: StructuredBuffer
-      DirectXBinding:
-        Register: 0
-        Space: 0
-      VulkanBinding:
-        Binding: 0
-    - Name: B
-      Kind: StructuredBuffer
-      DirectXBinding:
-        Register: 1
-        Space: 0
-      VulkanBinding:
-        Binding: 1
-    - Name: C
-      Kind: StructuredBuffer
-      DirectXBinding:
-        Register: 2
-        Space: 0
-      VulkanBinding:
-        Binding: 2
-    - Name: Out
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 3
-        Space: 0
-      VulkanBinding:
-        Binding: 3
-#--- end
-
-# REQUIRES: Double
-# RUN: split-file %s %t
-# RUN: %dxc_target -Gis -HV 202x -T cs_6_5 -Fo %t.o %t/source.hlsl
-# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/Feature/HLSLLib/fma.matrix.test b/test/Feature/HLSLLib/fma.matrix.test
new file mode 100644
index 000000000..0ccd779ff
--- /dev/null
+++ b/test/Feature/HLSLLib/fma.matrix.test
@@ -0,0 +1,142 @@
+#--- source.hlsl
+
+StructuredBuffer<double4> A : register(t0);
+StructuredBuffer<double4> B : register(t1);
+StructuredBuffer<double4> C : register(t2);
+
+RWStructuredBuffer<double4> Out : register(u3);
+
+[numthreads(1,1,1)]
+void main() {
+  double3x2 r32 = fma(double3x2(A[0].xyz, A[1].xyz),
+                      double3x2(B[0].xyz, B[1].xyz),
+                      double3x2(C[0].xyz, C[1].xyz));
+
+  double2x4 r24 = fma(double2x4(A[2], A[3]),
+                      double2x4(B[2], B[3]),
+                      double2x4(C[2], C[3]));
+
+  double3x1 r31 = fma(double3x1(A[4].xyz),
+                      double3x1(B[4].xyz),
+                      double3x1(C[4].xyz));
+
+  double4x4 r44 = fma(double4x4(A[5], A[6], A[7], A[8]),
+                      double4x4(B[5], B[6], B[7], B[8]),
+                      double4x4(C[5], C[6], C[7], C[8]));
+
+  Out[0] = double4(r32[0], r32[1]);
+  Out[1] = double4(r32[2], 0.0, 0.0);
+  Out[2] = r24[0];
+  Out[3] = r24[1];
+  Out[4] = double4(r31[0][0], r31[1][0], r31[2][0], 0.0);
+  Out[5] = r44[0];
+  Out[6] = r44[1];
+  Out[7] = r44[2];
+  Out[8] = r44[3];
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: A
+    Format: Float64
+    Stride: 32
+    Data: [ 1.5, -2.5, 0.5, 0.0,
+            -0.5, 2.0, 3.0, 0.0,
+            0.25, -0.25, 10.4, -10.6,
+            0.0, 10.0, -10.0, 10.5,
+            1.25, -4.0, 2.5, 0.0,
+            1.0, 2.0, 3.0, 4.0,
+            -1.0, -2.0, -3.0, -4.0,
+            0.5, 1.5, -0.5, -1.5,
+            10.0, -10.0, 2.25, -2.25 ]
+  - Name: B
+    Format: Float64
+    Stride: 32
+    Data: [ 2.0, -1.0, 4.0, 0.0,
+            4.0, -0.5, 1.5, 0.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 2.0, -1.0, 1.0,
+            2.0, -0.5, 3.0, 0.0,
+            2.0, -1.0, 0.5, 3.0,
+            4.0, 0.5, -2.0, -1.0,
+            1.0, 2.0, 3.0, 4.0,
+            -0.5, 1.5, -1.0, 2.0 ]
+  - Name: C
+    Format: Float64
+    Stride: 32
+    Data: [ 0.25, 0.75, -1.5, 0.0,
+            1.5, 1.0, -2.0, 0.0,
+            0.0, 0.0, 0.0, 0.0,
+            0.0, 0.5, 0.5, 0.0,
+            0.5, 1.0, -1.5, 0.0,
+            0.5, 1.0, -1.5, 0.0,
+            -0.5, 2.0, 1.0, 3.0,
+            1.5, -2.0, 0.5, -4.0,
+            5.0, -5.0, 0.25, -0.25 ]
+  - Name: Out
+    Format: Float64
+    Stride: 32
+    FillSize: 288
+  - Name: Expected
+    Format: Float64
+    Stride: 32
+    Data: [ 3.25, 3.25, 0.5, -0.5,
+            0.0, 2.5, 0.0, 0.0,
+            0.25, -0.25, 10.4, -10.6,
+            0.0, 20.5, 10.5, 10.5,
+            3.0, 3.0, 6.0, 0.0,
+            2.5, -1.0, 0.0, 12.0,
+            -4.5, 1.0, 7.0, 7.0,
+            2.0, 1.0, -1.0, -10.0,
+            0.0, -20.0, -2.0, -4.75 ]
+Results:
+  - Result: Result
+    Rule: BufferFloatULP
+    ULPT: 0
+    Actual: Out
+    Expected: Expected
+DescriptorSets:
+  - Resources:
+    - Name: A
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: B
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: C
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+#--- end
+
+# Unimplemented https://github.com/llvm/llvm-project/issues/99117
+# XFAIL: Clang
+
+# REQUIRES: Double
+# RUN: split-file %s %t
+# RUN: %dxc_target -Gis -HV 202x -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/Feature/HLSLLib/fma.test b/test/Feature/HLSLLib/fma.test
new file mode 100644
index 000000000..6d2ba8945
--- /dev/null
+++ b/test/Feature/HLSLLib/fma.test
@@ -0,0 +1,98 @@
+#--- source.hlsl
+
+StructuredBuffer<double4> A : register(t0);
+StructuredBuffer<double4> B : register(t1);
+StructuredBuffer<double4> C : register(t2);
+
+RWStructuredBuffer<double4> Out : register(u3);
+
+[numthreads(1,1,1)]
+void main() {
+  Out[0] = fma(A[0], B[0], C[0]);
+  Out[1] = double4(fma(A[1].xyz, B[1].xyz, C[1].xyz), fma(A[1].w, B[1].w, C[1].w));
+  Out[2] = double4(fma(A[2].xy, B[2].xy, C[2].xy), fma(A[2].zw, B[2].zw, C[2].zw));
+  Out[3] = fma(double4(0.25, -0.25, 10.4, -10.6), double4(1.0, 1.0, 1.0, 1.0), double4(0.0, 0.0, 0.0, 0.0));
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: A
+    Format: Float64
+    Stride: 32
+    Data: [ 0.25, -0.25, 10.4, -10.6,
+            1.5, -2.5, 0.5, -0.5,
+            0.0, 10.0, -10.0, 10.5 ]
+  - Name: B
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 1.0, 1.0, 1.0,
+            2.0, -1.0, 4.0, 4.0,
+            1.0, 2.0, -1.0, 1.0 ]
+  - Name: C
+    Format: Float64
+    Stride: 32
+    Data: [ 0.0, 0.0, 0.0, 0.0,
+            0.25, 0.75, -1.5, 1.5,
+            0.0, 0.5, 0.5, 0.0 ]
+  - Name: Out
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: Expected
+    Format: Float64
+    Stride: 32
+    Data: [ 0.25, -0.25, 10.4, -10.6,
+            3.25, 3.25, 0.5, -0.5,
+            0.0, 20.5, 10.5, 10.5,
+            0.25, -0.25, 10.399999618530273, -10.600000381469727 ]
+Results:
+  - Result: Result
+    Rule: BufferFloatULP
+    ULPT: 0
+    Actual: Out
+    Expected: Expected
+DescriptorSets:
+  - Resources:
+    - Name: A
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: B
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: C
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+#--- end
+
+# Unimplemented https://github.com/llvm/llvm-project/issues/99117
+# XFAIL: Clang
+
+# REQUIRES: Double
+# RUN: split-file %s %t
+# RUN: %dxc_target -Gis -HV 202x -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o

From 3d2df24c7447bc73d37a0b253546ba4942931cb0 Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe@gmail.com>
Date: Wed, 18 Mar 2026 23:09:28 +0300
Subject: [PATCH 05/12] replace 10.4/-10.6 constants with 10.0/-10.0 in fma.test

---
 test/Feature/HLSLLib/fma.test | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/Feature/HLSLLib/fma.test b/test/Feature/HLSLLib/fma.test
index 6d2ba8945..20559e9a7 100644
--- a/test/Feature/HLSLLib/fma.test
+++ b/test/Feature/HLSLLib/fma.test
@@ -11,7 +11,7 @@ void main() {
   Out[0] = fma(A[0], B[0], C[0]);
   Out[1] = double4(fma(A[1].xyz, B[1].xyz, C[1].xyz), fma(A[1].w, B[1].w, C[1].w));
   Out[2] = double4(fma(A[2].xy, B[2].xy, C[2].xy), fma(A[2].zw, B[2].zw, C[2].zw));
-  Out[3] = fma(double4(0.25, -0.25, 10.4, -10.6), double4(1.0, 1.0, 1.0, 1.0), double4(0.0, 0.0, 0.0, 0.0));
+  Out[3] = fma(double4(0.25, -0.25, 10.0, -10.0), double4(1.0, 1.0, 1.0, 1.0), double4(0.0, 0.0, 0.0, 0.0));
 }
 
 //--- pipeline.yaml
@@ -50,7 +50,7 @@ Buffers:
     Data: [ 0.25, -0.25, 10.4, -10.6,
             3.25, 3.25, 0.5, -0.5,
             0.0, 20.5, 10.5, 10.5,
-            0.25, -0.25, 10.399999618530273, -10.600000381469727 ]
+            0.25, -0.25, 10.0, -10.0 ]
 Results:
   - Result: Result
     Rule: BufferFloatULP

From 383306eeb59bfac3ca0c43b3e0c5817d2904cf7e Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe@gmail.com>
Date: Sat, 21 Mar 2026 12:51:58 +0300
Subject: [PATCH 06/12] add XFAILS for one of the targets, add tests for more
 precise values

---
 test/Feature/HLSLLib/fma.matrix.test |  3 +++
 test/Feature/HLSLLib/fma.test        | 39 ++++++++++++++++++----------
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/test/Feature/HLSLLib/fma.matrix.test b/test/Feature/HLSLLib/fma.matrix.test
index 0ccd779ff..0fa550d55 100644
--- a/test/Feature/HLSLLib/fma.matrix.test
+++ b/test/Feature/HLSLLib/fma.matrix.test
@@ -136,6 +136,9 @@ DescriptorSets:
 # Unimplemented https://github.com/llvm/llvm-project/issues/99117
 # XFAIL: Clang
 
+# Bug: https://github.com/llvm/offload-test-suite/issues/1000
+# XFAIL: WARP && DXC && QC
+
 # REQUIRES: Double
 # RUN: split-file %s %t
 # RUN: %dxc_target -Gis -HV 202x -T cs_6_5 -Fo %t.o %t/source.hlsl
diff --git a/test/Feature/HLSLLib/fma.test b/test/Feature/HLSLLib/fma.test
index 20559e9a7..c108782c7 100644
--- a/test/Feature/HLSLLib/fma.test
+++ b/test/Feature/HLSLLib/fma.test
@@ -11,7 +11,11 @@ void main() {
   Out[0] = fma(A[0], B[0], C[0]);
   Out[1] = double4(fma(A[1].xyz, B[1].xyz, C[1].xyz), fma(A[1].w, B[1].w, C[1].w));
   Out[2] = double4(fma(A[2].xy, B[2].xy, C[2].xy), fma(A[2].zw, B[2].zw, C[2].zw));
-  Out[3] = fma(double4(0.25, -0.25, 10.0, -10.0), double4(1.0, 1.0, 1.0, 1.0), double4(0.0, 0.0, 0.0, 0.0));
+  // Had to add `l` literals at the end because otherwise we lose precision before numbers are converted to `double4`
+  Out[3] = fma(double4(1.0000000149011612l, -1.0000000149011612l, 1.0000000000000002l, 0.5l), double4(67108865.0l, 67108865.0l, -0.9999999999999998l, -8.0l), double4(-67108866.0l, 67108866.0l, 1.0l, 1.0l));
+  // (1 + 2^-26), -(1 + 2^-26), (1 + 2^-52), 0.5
+  // (2^26 + 1), (2^26 + 1), -(1 - 2^-52), -8
+  // -(2^26 + 2), (2^26 + 2), 1, 1
 }
 
 //--- pipeline.yaml
@@ -25,21 +29,24 @@ Buffers:
   - Name: A
     Format: Float64
     Stride: 32
-    Data: [ 0.25, -0.25, 10.4, -10.6,
-            1.5, -2.5, 0.5, -0.5,
-            0.0, 10.0, -10.0, 10.5 ]
+    Data: [ 1.5, -2.0, 0.75, -3.25,
+            2.5, -4.0, 1.25, 3.5,
+            0x1.0000000000001p+0, -0x1.0000000000001p+0, 0x1.0000002000000p+0, -0x1.0000002000000p+0 ]
+    # last row: (1 + 2^-52), -(1 + 2^-52), (1 + 2^-27), -(1 + 2^-27)
   - Name: B
     Format: Float64
     Stride: 32
-    Data: [ 1.0, 1.0, 1.0, 1.0,
-            2.0, -1.0, 4.0, 4.0,
-            1.0, 2.0, -1.0, 1.0 ]
+    Data: [ 2.25, 4.5, -8.0, -2.0,
+            -1.5, -2.0, 8.0, -2.0,
+            0x1.ffffffffffffep-1, 0x1.ffffffffffffep-1, 134217729.0, 134217729.0 ]
+    # last row: (1 - 2^-52), (1 - 2^-52), (2^27 + 1), (2^27 + 1)
   - Name: C
     Format: Float64
     Stride: 32
-    Data: [ 0.0, 0.0, 0.0, 0.0,
-            0.25, 0.75, -1.5, 1.5,
-            0.0, 0.5, 0.5, 0.0 ]
+    Data: [ 0.125, 1.25, 2.5, -0.5,
+            0.25, -1.5, -0.75, 0.5,
+            -1.0, 1.0, -134217730.0, 134217730.0 ]
+    # last row: -1, 1, -(2^27 + 2), (2^27 + 2)
   - Name: Out
     Format: Float64
     Stride: 32
@@ -47,10 +54,11 @@ Buffers:
   - Name: Expected
     Format: Float64
     Stride: 32
-    Data: [ 0.25, -0.25, 10.4, -10.6,
-            3.25, 3.25, 0.5, -0.5,
-            0.0, 20.5, 10.5, 10.5,
-            0.25, -0.25, 10.0, -10.0 ]
+    Data: [ 3.5, -7.75, -3.5, 6.0,
+            -3.5, 6.5, 9.25, -6.5,
+            -0x1.0000000000000p-104, 0x1.0000000000000p-104, 0x1.0000000000000p-27, -0x1.0000000000000p-27,
+            0x1.0000000000000p-26, -0x1.0000000000000p-26, 0x1.0000000000000p-104, -3.0 ]
+    # last two rows: -(2^-104), 2^-104, 2^-27, -(2^-27), 2^-26, -(2^-26), 2^-104, -3
 Results:
   - Result: Result
     Rule: BufferFloatULP
@@ -92,6 +100,9 @@ DescriptorSets:
 # Unimplemented https://github.com/llvm/llvm-project/issues/99117
 # XFAIL: Clang
 
+# Bug: https://github.com/llvm/offload-test-suite/issues/1000
+# XFAIL: WARP && DXC && QC
+
 # REQUIRES: Double
 # RUN: split-file %s %t
 # RUN: %dxc_target -Gis -HV 202x -T cs_6_5 -Fo %t.o %t/source.hlsl

From 4ceba74cde76e0272962520ffde4d44a5bd618ee Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe@gmail.com>
Date: Wed, 25 Mar 2026 18:56:54 +0300
Subject: [PATCH 07/12] update fma tests: use BufferExact rule, XFAIL on
 arm64 && WARP && DXC

---
 test/Feature/HLSLLib/fma.matrix.test | 5 ++---
 test/Feature/HLSLLib/fma.test        | 6 ++----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/test/Feature/HLSLLib/fma.matrix.test b/test/Feature/HLSLLib/fma.matrix.test
index 0fa550d55..4c7deaac0 100644
--- a/test/Feature/HLSLLib/fma.matrix.test
+++ b/test/Feature/HLSLLib/fma.matrix.test
@@ -97,8 +97,7 @@ Buffers:
             0.0, -20.0, -2.0, -4.75 ]
 Results:
   - Result: Result
-    Rule: BufferFloatULP
-    ULPT: 0
+    Rule: BufferExact
     Actual: Out
     Expected: Expected
 DescriptorSets:
@@ -137,7 +136,7 @@ DescriptorSets:
 # XFAIL: Clang
 
 # Bug: https://github.com/llvm/offload-test-suite/issues/1000
-# XFAIL: WARP && DXC && QC
+# XFAIL: arm64 && WARP && DXC
 
 # REQUIRES: Double
 # RUN: split-file %s %t
diff --git a/test/Feature/HLSLLib/fma.test b/test/Feature/HLSLLib/fma.test
index c108782c7..a0cee6404 100644
--- a/test/Feature/HLSLLib/fma.test
+++ b/test/Feature/HLSLLib/fma.test
@@ -11,7 +11,6 @@ void main() {
   Out[0] = fma(A[0], B[0], C[0]);
   Out[1] = double4(fma(A[1].xyz, B[1].xyz, C[1].xyz), fma(A[1].w, B[1].w, C[1].w));
   Out[2] = double4(fma(A[2].xy, B[2].xy, C[2].xy), fma(A[2].zw, B[2].zw, C[2].zw));
-  // Had to add `l` literals at the end because otherwise we lose precision before numbers are converted to `double4`
   Out[3] = fma(double4(1.0000000149011612l, -1.0000000149011612l, 1.0000000000000002l, 0.5l), double4(67108865.0l, 67108865.0l, -0.9999999999999998l, -8.0l), double4(-67108866.0l, 67108866.0l, 1.0l, 1.0l));
   // (1 + 2^-26), -(1 + 2^-26), (1 + 2^-52), 0.5
   // (2^26 + 1), (2^26 + 1), -(1 - 2^-52), -8
@@ -61,8 +60,7 @@ Buffers:
     # last two rows: -(2^-104), 2^-104, 2^-27, -(2^-27), 2^-26, -(2^-26), 2^-104, -3
 Results:
   - Result: Result
-    Rule: BufferFloatULP
-    ULPT: 0
+    Rule: BufferExact
     Actual: Out
     Expected: Expected
 DescriptorSets:
@@ -101,7 +99,7 @@ DescriptorSets:
 # XFAIL: Clang
 
 # Bug: https://github.com/llvm/offload-test-suite/issues/1000
-# XFAIL: WARP && DXC && QC
+# XFAIL: arm64 && WARP && DXC
 
 # REQUIRES: Double
 # RUN: split-file %s %t

From a482934581e3f09af82c769dabd65b714eda4aec Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe@gmail.com>
Date: Wed, 25 Mar 2026 21:41:10 +0300
Subject: [PATCH 08/12] delete the comment in main, update buffer values to use
 the same values as in constant folding

---
 test/Feature/HLSLLib/fma.test | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/test/Feature/HLSLLib/fma.test b/test/Feature/HLSLLib/fma.test
index a0cee6404..40c77ed0a 100644
--- a/test/Feature/HLSLLib/fma.test
+++ b/test/Feature/HLSLLib/fma.test
@@ -12,9 +12,6 @@ void main() {
   Out[1] = double4(fma(A[1].xyz, B[1].xyz, C[1].xyz), fma(A[1].w, B[1].w, C[1].w));
   Out[2] = double4(fma(A[2].xy, B[2].xy, C[2].xy), fma(A[2].zw, B[2].zw, C[2].zw));
   Out[3] = fma(double4(1.0000000149011612l, -1.0000000149011612l, 1.0000000000000002l, 0.5l), double4(67108865.0l, 67108865.0l, -0.9999999999999998l, -8.0l), double4(-67108866.0l, 67108866.0l, 1.0l, 1.0l));
-  // (1 + 2^-26), -(1 + 2^-26), (1 + 2^-52), 0.5
-  // (2^26 + 1), (2^26 + 1), -(1 - 2^-52), -8
-  // -(2^26 + 2), (2^26 + 2), 1, 1
 }
 
 //--- pipeline.yaml
@@ -30,22 +27,22 @@ Buffers:
     Stride: 32
     Data: [ 1.5, -2.0, 0.75, -3.25,
             2.5, -4.0, 1.25, 3.5,
-            0x1.0000000000001p+0, -0x1.0000000000001p+0, 0x1.0000002000000p+0, -0x1.0000002000000p+0 ]
-    # last row: (1 + 2^-52), -(1 + 2^-52), (1 + 2^-27), -(1 + 2^-27)
+            1.0000000149011612, -1.0000000149011612, 1.0000000000000002, 0.5 ]
+    # last row: (1 + 2^-26), -(1 + 2^-26), (1 + 2^-52), 0.5
   - Name: B
     Format: Float64
     Stride: 32
     Data: [ 2.25, 4.5, -8.0, -2.0,
             -1.5, -2.0, 8.0, -2.0,
-            0x1.ffffffffffffep-1, 0x1.ffffffffffffep-1, 134217729.0, 134217729.0 ]
-    # last row: (1 - 2^-52), (1 - 2^-52), (2^27 + 1), (2^27 + 1)
+            67108865.0, 67108865.0, -0.9999999999999998, -8.0 ]
+    # last row: (2^26 + 1), (2^26 + 1), -(1 - 2^-52), -8
   - Name: C
     Format: Float64
     Stride: 32
     Data: [ 0.125, 1.25, 2.5, -0.5,
             0.25, -1.5, -0.75, 0.5,
-            -1.0, 1.0, -134217730.0, 134217730.0 ]
-    # last row: -1, 1, -(2^27 + 2), (2^27 + 2)
+            -67108866.0, 67108866.0, 1.0, 1.0 ]
+    # last row: -(2^26 + 2), (2^26 + 2), 1, 1
   - Name: Out
     Format: Float64
     Stride: 32
@@ -55,9 +52,9 @@ Buffers:
     Stride: 32
     Data: [ 3.5, -7.75, -3.5, 6.0,
             -3.5, 6.5, 9.25, -6.5,
-            -0x1.0000000000000p-104, 0x1.0000000000000p-104, 0x1.0000000000000p-27, -0x1.0000000000000p-27,
+            0x1.0000000000000p-26, -0x1.0000000000000p-26, 0x1.0000000000000p-104, -3.0,
             0x1.0000000000000p-26, -0x1.0000000000000p-26, 0x1.0000000000000p-104, -3.0 ]
-    # last two rows: -(2^-104), 2^-104, 2^-27, -(2^-27), 2^-26, -(2^-26), 2^-104, -3
+    # last two rows: 2^-26, -(2^-26), 2^-104, -3
 Results:
   - Result: Result
     Rule: BufferExact

From 0110647742a4fecad7175e45dd0ce729f383654e Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe@gmail.com>
Date: Fri, 27 Mar 2026 10:42:47 +0300
Subject: [PATCH 09/12] add xfail for intel vulkan

---
 test/Feature/HLSLLib/fma.test | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/Feature/HLSLLib/fma.test b/test/Feature/HLSLLib/fma.test
index 40c77ed0a..38fc1665e 100644
--- a/test/Feature/HLSLLib/fma.test
+++ b/test/Feature/HLSLLib/fma.test
@@ -95,9 +95,12 @@ DescriptorSets:
 # Unimplemented https://github.com/llvm/llvm-project/issues/99117
 # XFAIL: Clang
 
-# Bug: https://github.com/llvm/offload-test-suite/issues/1000
+# Bug https://github.com/llvm/offload-test-suite/issues/1000
 # XFAIL: arm64 && WARP && DXC
 
+# Bug https://github.com/llvm/offload-test-suite/issues/1026
+# XFAIL: Intel && Vulkan && DXC
+
 # REQUIRES: Double
 # RUN: split-file %s %t
 # RUN: %dxc_target -Gis -HV 202x -T cs_6_5 -Fo %t.o %t/source.hlsl

From 53871f19e36837a3aff7a9a28ca8a8a104c2c142 Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe@gmail.com>
Date: Sat, 28 Mar 2026 09:42:57 +0300
Subject: [PATCH 10/12] delete clang xfail

---
 test/Feature/HLSLLib/fma.matrix.test | 5 +----
 test/Feature/HLSLLib/fma.test        | 3 ---
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/test/Feature/HLSLLib/fma.matrix.test b/test/Feature/HLSLLib/fma.matrix.test
index 4c7deaac0..51fc90797 100644
--- a/test/Feature/HLSLLib/fma.matrix.test
+++ b/test/Feature/HLSLLib/fma.matrix.test
@@ -132,10 +132,7 @@ DescriptorSets:
         Binding: 3
 #--- end
 
-# Unimplemented https://github.com/llvm/llvm-project/issues/99117
-# XFAIL: Clang
-
-# Bug: https://github.com/llvm/offload-test-suite/issues/1000
+# Bug https://github.com/llvm/offload-test-suite/issues/1000
 # XFAIL: arm64 && WARP && DXC
 
 # REQUIRES: Double
diff --git a/test/Feature/HLSLLib/fma.test b/test/Feature/HLSLLib/fma.test
index 38fc1665e..3967f2489 100644
--- a/test/Feature/HLSLLib/fma.test
+++ b/test/Feature/HLSLLib/fma.test
@@ -92,9 +92,6 @@ DescriptorSets:
         Binding: 3
 #--- end
 
-# Unimplemented https://github.com/llvm/llvm-project/issues/99117
-# XFAIL: Clang
-
 # Bug https://github.com/llvm/offload-test-suite/issues/1000
 # XFAIL: arm64 && WARP && DXC
 

From 35c626f5b8feb8b5e2aef8e10b8329af13b4e7fa Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe@gmail.com>
Date: Sat, 28 Mar 2026 14:34:36 +0300
Subject: [PATCH 11/12] xfail arm64 && warp on clang

---
 test/Feature/HLSLLib/fma.matrix.test | 2 +-
 test/Feature/HLSLLib/fma.test        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/Feature/HLSLLib/fma.matrix.test b/test/Feature/HLSLLib/fma.matrix.test
index 51fc90797..90336d880 100644
--- a/test/Feature/HLSLLib/fma.matrix.test
+++ b/test/Feature/HLSLLib/fma.matrix.test
@@ -133,7 +133,7 @@ DescriptorSets:
 #--- end
 
 # Bug https://github.com/llvm/offload-test-suite/issues/1000
-# XFAIL: arm64 && WARP && DXC
+# XFAIL: arm64 && WARP
 
 # REQUIRES: Double
 # RUN: split-file %s %t
diff --git a/test/Feature/HLSLLib/fma.test b/test/Feature/HLSLLib/fma.test
index 3967f2489..60f1cf02b 100644
--- a/test/Feature/HLSLLib/fma.test
+++ b/test/Feature/HLSLLib/fma.test
@@ -93,7 +93,7 @@ DescriptorSets:
 #--- end
 
 # Bug https://github.com/llvm/offload-test-suite/issues/1000
-# XFAIL: arm64 && WARP && DXC
+# XFAIL: arm64 && WARP
 
 # Bug https://github.com/llvm/offload-test-suite/issues/1026
 # XFAIL: Intel && Vulkan && DXC

From df1ccbc25169a324ba9ce9726d642b7ef34b1771 Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe@gmail.com>
Date: Sat, 4 Apr 2026 12:32:55 +0300
Subject: [PATCH 12/12] switch to ULP tolerance and update the link to the
 issue

---
 test/Feature/HLSLLib/fma.matrix.test | 3 ++-
 test/Feature/HLSLLib/fma.test        | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/Feature/HLSLLib/fma.matrix.test b/test/Feature/HLSLLib/fma.matrix.test
index 90336d880..89004ab9a 100644
--- a/test/Feature/HLSLLib/fma.matrix.test
+++ b/test/Feature/HLSLLib/fma.matrix.test
@@ -97,7 +97,8 @@ Buffers:
             0.0, -20.0, -2.0, -4.75 ]
 Results:
   - Result: Result
-    Rule: BufferExact
+    Rule: BufferFloatULP
+    ULPT: 1
     Actual: Out
     Expected: Expected
 DescriptorSets:
diff --git a/test/Feature/HLSLLib/fma.test b/test/Feature/HLSLLib/fma.test
index 60f1cf02b..a728e1839 100644
--- a/test/Feature/HLSLLib/fma.test
+++ b/test/Feature/HLSLLib/fma.test
@@ -57,7 +57,8 @@ Buffers:
     # last two rows: 2^-26, -(2^-26), 2^-104, -3
 Results:
   - Result: Result
-    Rule: BufferExact
+    Rule: BufferFloatULP
+    ULPT: 1
     Actual: Out
     Expected: Expected
 DescriptorSets:
@@ -92,7 +93,7 @@ DescriptorSets:
         Binding: 3
 #--- end
 
-# Bug https://github.com/llvm/offload-test-suite/issues/1000
+# Bug https://github.com/microsoft/DirectXShaderCompiler/issues/8330
 # XFAIL: arm64 && WARP
 
 # Bug https://github.com/llvm/offload-test-suite/issues/1026