diff --git a/test/WaveOps/QuadReadAcrossDiagonal.int64.test b/test/WaveOps/QuadReadAcrossDiagonal.int64.test
index f14147866..e699493d2 100644
--- a/test/WaveOps/QuadReadAcrossDiagonal.int64.test
+++ b/test/WaveOps/QuadReadAcrossDiagonal.int64.test
@@ -232,7 +232,7 @@ DescriptorSets:
 
 # REQUIRES: Int64
 
-# Bug: https://github.com/llvm/offload-test-suite/issues/988
+# Bug: https://github.com/llvm/offload-test-suite/issues/959
 # XFAIL: Metal
 
 # RUN: split-file %s %t
diff --git a/test/WaveOps/QuadReadAcrossX.int64.test b/test/WaveOps/QuadReadAcrossX.int64.test
index eb50f3281..1737bb5f5 100644
--- a/test/WaveOps/QuadReadAcrossX.int64.test
+++ b/test/WaveOps/QuadReadAcrossX.int64.test
@@ -232,8 +232,7 @@ DescriptorSets:
 
 # REQUIRES: Int64
 
-# Bug: https://github.com/llvm/offload-test-suite/issues/988
-# Bug: https://github.com/llvm/offload-test-suite/issues/989
+# Bug: https://github.com/llvm/offload-test-suite/issues/959
 # XFAIL: Metal
 
 # RUN: split-file %s %t
diff --git a/test/WaveOps/QuadReadAcrossY.int64.test b/test/WaveOps/QuadReadAcrossY.int64.test
index e461d9c53..65903c436 100644
--- a/test/WaveOps/QuadReadAcrossY.int64.test
+++ b/test/WaveOps/QuadReadAcrossY.int64.test
@@ -232,7 +232,7 @@ DescriptorSets:
 
 # REQUIRES: Int64
 
-# Bug: https://github.com/llvm/offload-test-suite/issues/989
+# Bug: https://github.com/llvm/offload-test-suite/issues/959
 # XFAIL: Metal
 
 # RUN: split-file %s %t
diff --git a/test/WaveOps/QuadReadLaneAt.32.test b/test/WaveOps/QuadReadLaneAt.32.test
new file mode 100644
index 000000000..ede0ccca5
--- /dev/null
+++ b/test/WaveOps/QuadReadLaneAt.32.test
@@ -0,0 +1,373 @@
+#--- source.hlsl
+// ints: In is the source data; Out1-Out4 receive the scalar, int2, int3 and int4 results.
+StructuredBuffer<int4> In : register(t0);
+RWStructuredBuffer<int4> Out1 : register(u1);
+RWStructuredBuffer<int4> Out2 : register(u2);
+RWStructuredBuffer<int4> Out3 : register(u3);
+RWStructuredBuffer<int4> Out4 : register(u4);
+
+// uints: same layout as the int buffers, bound to t5 and u6-u9.
+StructuredBuffer<uint4> UIn : register(t5);
+RWStructuredBuffer<uint4> UOut1 : register(u6);
+RWStructuredBuffer<uint4> UOut2 : register(u7);
+RWStructuredBuffer<uint4> UOut3 : register(u8);
+RWStructuredBuffer<uint4> UOut4 : register(u9);
+
+// floats: same layout as the int buffers, bound to t10 and u11-u14.
+StructuredBuffer<float4> FIn : register(t10);
+RWStructuredBuffer<float4> FOut1 : register(u11);
+RWStructuredBuffer<float4> FOut2 : register(u12);
+RWStructuredBuffer<float4> FOut3 : register(u13);
+RWStructuredBuffer<float4> FOut4 : register(u14);
+
+[numthreads(2,2,1)]
+void main(uint3 dtid : SV_DispatchThreadID) {
+  uint index = dtid.y * 2 + dtid.x; // row-major flattening; the expected data covers indices 0..3
+
+  // int case: read lane 2's value for scalar, int2, int3 and int4 operands.
+  int4 v = In[index];
+#ifdef __spirv__
+  // SPIR-V requires the lane index to be a compile-time constant (or
+  // dynamically uniform in SPIR-V 1.5+), so we pass a literal.
+  int scalar = QuadReadLaneAt(v.x, 2);
+  int2 vec2 = QuadReadLaneAt(v.xy, 2);
+  int3 vec3 = QuadReadLaneAt(v.xyz, 2);
+  int4 vec4 = QuadReadLaneAt(v, 2);
+#else
+  // DXIL permits a non-uniform (per-lane) lane index. Demonstrate this with
+  // an identity shuffle where each lane passes its own `index`, then
+  // broadcast lane 2's values so the result matches the SPIR-V path.
+  int4 id = QuadReadLaneAt(v, index);
+  int scalar = QuadReadLaneAt(id.x, 2);
+  int2 vec2 = QuadReadLaneAt(id.xy, 2);
+  int3 vec3 = QuadReadLaneAt(id.xyz, 2);
+  int4 vec4 = QuadReadLaneAt(id, 2);
+#endif
+
+  Out1[index].x = scalar; // only the listed components are written; expected data has 0 elsewhere
+  Out2[index].xy = vec2;
+  Out3[index].xyz = vec3;
+  Out4[index] = vec4;
+
+  // uint case: same sequence as the int case, applied to the uint4 input.
+  uint4 uv = UIn[index];
+#ifdef __spirv__
+  uint uscalar = QuadReadLaneAt(uv.x, 2);
+  uint2 uvec2 = QuadReadLaneAt(uv.xy, 2);
+  uint3 uvec3 = QuadReadLaneAt(uv.xyz, 2);
+  uint4 uvec4 = QuadReadLaneAt(uv, 2);
+#else
+  uint4 uid = QuadReadLaneAt(uv, index);
+  uint uscalar = QuadReadLaneAt(uid.x, 2);
+  uint2 uvec2 = QuadReadLaneAt(uid.xy, 2);
+  uint3 uvec3 = QuadReadLaneAt(uid.xyz, 2);
+  uint4 uvec4 = QuadReadLaneAt(uid, 2);
+#endif
+
+  UOut1[index].x = uscalar;
+  UOut2[index].xy = uvec2;
+  UOut3[index].xyz = uvec3;
+  UOut4[index] = uvec4;
+
+  // float case: same sequence as the int case, applied to the float4 input.
+  float4 fv = FIn[index];
+#ifdef __spirv__
+  float fscalar = QuadReadLaneAt(fv.x, 2);
+  float2 fvec2 = QuadReadLaneAt(fv.xy, 2);
+  float3 fvec3 = QuadReadLaneAt(fv.xyz, 2);
+  float4 fvec4 = QuadReadLaneAt(fv, 2);
+#else
+  float4 fid = QuadReadLaneAt(fv, index);
+  float fscalar = QuadReadLaneAt(fid.x, 2);
+  float2 fvec2 = QuadReadLaneAt(fid.xy, 2);
+  float3 fvec3 = QuadReadLaneAt(fid.xyz, 2);
+  float4 fvec4 = QuadReadLaneAt(fid, 2);
+#endif
+
+  FOut1[index].x = fscalar;
+  FOut2[index].xy = fvec2;
+  FOut3[index].xyz = fvec3;
+  FOut4[index] = fvec4;
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+Buffers:
+  - Name: In
+    Format: Int32
+    Stride: 16
+    Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]
+  - Name: Out1
+    Format: Int32
+    Stride: 16
+    FillSize: 64
+  - Name: Out2
+    Format: Int32
+    Stride: 16
+    FillSize: 64
+  - Name: Out3
+    Format: Int32
+    Stride: 16
+    FillSize: 64
+  - Name: Out4
+    Format: Int32
+    Stride: 16
+    FillSize: 64
+  - Name: ExpectedOut1
+    Format: Int32
+    Stride: 16
+    Data: [ 9, 0, 0, 0, 9, 0, 0, 0, 9, 0, 0, 0, 9, 0, 0, 0 ]
+  - Name: ExpectedOut2
+    Format: Int32
+    Stride: 16
+    Data: [ 9, 10, 0, 0, 9, 10, 0, 0, 9, 10, 0, 0, 9, 10, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int32
+    Stride: 16
+    Data: [ 9, 10, 11, 0, 9, 10, 11, 0, 9, 10, 11, 0, 9, 10, 11, 0 ]
+  - Name: ExpectedOut4
+    Format: Int32
+    Stride: 16
+    Data: [ 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12 ]
+  - Name: UIn
+    Format: UInt32
+    Stride: 16
+    Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]
+  - Name: UOut1
+    Format: UInt32
+    Stride: 16
+    FillSize: 64
+  - Name: UOut2
+    Format: UInt32
+    Stride: 16
+    FillSize: 64
+  - Name: UOut3
+    Format: UInt32
+    Stride: 16
+    FillSize: 64
+  - Name: UOut4
+    Format: UInt32
+    Stride: 16
+    FillSize: 64
+  - Name: UExpectedOut1
+    Format: UInt32
+    Stride: 16
+    Data: [ 9, 0, 0, 0, 9, 0, 0, 0, 9, 0, 0, 0, 9, 0, 0, 0 ]
+  - Name: UExpectedOut2
+    Format: UInt32
+    Stride: 16
+    Data: [ 9, 10, 0, 0, 9, 10, 0, 0, 9, 10, 0, 0, 9, 10, 0, 0 ]
+  - Name: UExpectedOut3
+    Format: UInt32
+    Stride: 16
+    Data: [ 9, 10, 11, 0, 9, 10, 11, 0, 9, 10, 11, 0, 9, 10, 11, 0 ]
+  - Name: UExpectedOut4
+    Format: UInt32
+    Stride: 16
+    Data: [ 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12 ]
+  - Name: FIn
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0 ]
+  - Name: FOut1
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FOut2
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FOut3
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FOut4
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FExpectedOut1
+    Format: Float32
+    Stride: 16
+    Data: [ 9.0, 0.0, 0.0, 0.0, 9.0, 0.0, 0.0, 0.0, 9.0, 0.0, 0.0, 0.0, 9.0, 0.0, 0.0, 0.0 ]
+  - Name: FExpectedOut2
+    Format: Float32
+    Stride: 16
+    Data: [ 9.0, 10.0, 0.0, 0.0, 9.0, 10.0, 0.0, 0.0, 9.0, 10.0, 0.0, 0.0, 9.0, 10.0, 0.0, 0.0 ]
+  - Name: FExpectedOut3
+    Format: Float32
+    Stride: 16
+    Data: [ 9.0, 10.0, 11.0, 0.0, 9.0, 10.0, 11.0, 0.0, 9.0, 10.0, 11.0, 0.0, 9.0, 10.0, 11.0, 0.0 ]
+  - Name: FExpectedOut4
+    Format: Float32
+    Stride: 16
+    Data: [ 9.0, 10.0, 11.0, 12.0, 9.0, 10.0, 11.0, 12.0, 9.0, 10.0, 11.0, 12.0, 9.0, 10.0, 11.0, 12.0 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: UExpectedOut1
+    Rule: BufferExact
+    Actual: UOut1
+    Expected: UExpectedOut1
+  - Result: UExpectedOut2
+    Rule: BufferExact
+    Actual: UOut2
+    Expected: UExpectedOut2
+  - Result: UExpectedOut3
+    Rule: BufferExact
+    Actual: UOut3
+    Expected: UExpectedOut3
+  - Result: UExpectedOut4
+    Rule: BufferExact
+    Actual: UOut4
+    Expected: UExpectedOut4
+  - Result: FExpectedOut1
+    Rule: BufferExact
+    Actual: FOut1
+    Expected: FExpectedOut1
+  - Result: FExpectedOut2
+    Rule: BufferExact
+    Actual: FOut2
+    Expected: FExpectedOut2
+  - Result: FExpectedOut3
+    Rule: BufferExact
+    Actual: FOut3
+    Expected: FExpectedOut3
+  - Result: FExpectedOut4
+    Rule: BufferExact
+    Actual: FOut4
+    Expected: FExpectedOut4
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: UIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: UOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+    - Name: UOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 7
+        Space: 0
+      VulkanBinding:
+        Binding: 7
+    - Name: UOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 8
+        Space: 0
+      VulkanBinding:
+        Binding: 8
+    - Name: UOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 9
+        Space: 0
+      VulkanBinding:
+        Binding: 9
+    - Name: FIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 10
+        Space: 0
+      VulkanBinding:
+        Binding: 10
+    - Name: FOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 11
+        Space: 0
+      VulkanBinding:
+        Binding: 11
+    - Name: FOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 12
+        Space: 0
+      VulkanBinding:
+        Binding: 12
+    - Name: FOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 13
+        Space: 0
+      VulkanBinding:
+        Binding: 13
+    - Name: FOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 14
+        Space: 0
+      VulkanBinding:
+        Binding: 14
+
+...
+#--- end
+
+# QuadReadLaneAt is not yet supported in Clang; an implementation PR is in progress.
+# XFAIL: Clang
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/QuadReadLaneAt.convergence.test b/test/WaveOps/QuadReadLaneAt.convergence.test
new file mode 100644
index 000000000..c27b200e8
--- /dev/null
+++ b/test/WaveOps/QuadReadLaneAt.convergence.test
@@ -0,0 +1,85 @@
+#--- source.hlsl
+StructuredBuffer<float> In : register(t0); // source value, one float per thread
+RWStructuredBuffer<float> Out : register(u1); // result, one float per thread
+
+[numthreads(2, 2, 1)]
+void main(uint3 dtid : SV_DispatchThreadID) {
+  uint index = dtid.y * 2 + dtid.x; // row-major flattening; the expected data covers indices 0..3
+  float value = In[index];
+  // Exercises QuadReadLaneAt under divergent control flow; each branch only reads lanes that are active in it, to avoid UB.
+
+  if(index < 2) {
+    // Reads lane 0, i.e. thread (0, 0), which takes this branch (index 0 < 2).
+#ifdef __spirv__
+    // SPIR-V requires the lane index to be a compile-time constant (or
+    // dynamically uniform in SPIR-V 1.5+).
+    float value_quad_l = QuadReadLaneAt(value, 0);
+#else
+    // DXIL permits a non-uniform (per-lane) lane index. Do a non-uniform
+    // identity shuffle (each active lane reads its own lane) and then
+    // broadcast lane 0's value so the result matches the SPIR-V path.
+    float id = QuadReadLaneAt(value, index);
+    float value_quad_l = QuadReadLaneAt(id, 0);
+#endif
+    Out[index] = value - value_quad_l; // subtraction marks results produced by this branch
+  } else {
+    // Reads lane 3, i.e. thread (1, 1), which takes this branch (index 3 >= 2).
+#ifdef __spirv__
+    float value_quad_l = QuadReadLaneAt(value, 3);
+#else
+    float id = QuadReadLaneAt(value, index);
+    float value_quad_l = QuadReadLaneAt(id, 3);
+#endif
+    Out[index] = value + value_quad_l; // addition marks results produced by this branch
+  }
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+Buffers:
+  - Name: In
+    Format: Float32
+    Stride: 4
+    Data: [ 1.0, 10.0, 2.0, 20.0 ]
+  - Name: Out
+    Format: Float32
+    Stride: 4
+    FillSize: 16
+  - Name: ExpectedOut
+    Format: Float32
+    Stride: 4
+    Data: [ 0.0, 9.0, 22.0, 40.0 ]
+Results:
+  - Result: ExpectedOut
+    Rule: BufferExact
+    Actual: Out
+    Expected: ExpectedOut
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+...
+#--- end
+
+# QuadReadLaneAt is not yet supported in Clang; an implementation PR is in progress.
+# XFAIL: Clang
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_0 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/QuadReadLaneAt.fp16.test b/test/WaveOps/QuadReadLaneAt.fp16.test
new file mode 100644
index 000000000..fe695c1aa
--- /dev/null
+++ b/test/WaveOps/QuadReadLaneAt.fp16.test
@@ -0,0 +1,145 @@
+#--- source.hlsl
+StructuredBuffer<half4> In: register(t0); // source values, one half4 per thread
+RWStructuredBuffer<half4> Out1 : register(u1); // scalar result, written to .x only
+RWStructuredBuffer<half4> Out2 : register(u2); // half2 result, written to .xy
+RWStructuredBuffer<half4> Out3 : register(u3); // half3 result, written to .xyz
+RWStructuredBuffer<half4> Out4 : register(u4); // half4 result, all components written
+
+[numthreads(2,2,1)]
+void main(uint3 dtid : SV_DispatchThreadID) {
+  uint index = dtid.y * 2 + dtid.x; // row-major flattening; the expected data covers indices 0..3
+  half4 v = In[index];
+
+#ifdef __spirv__
+  // SPIR-V requires the lane index to be a compile-time constant (or
+  // dynamically uniform in SPIR-V 1.5+), so we pass a literal.
+  half scalar = QuadReadLaneAt(v.x, 2);
+  half2 vec2 = QuadReadLaneAt(v.xy, 2);
+  half3 vec3 = QuadReadLaneAt(v.xyz, 2);
+  half4 vec4 = QuadReadLaneAt(v, 2);
+#else
+  // DXIL permits a non-uniform (per-lane) lane index. Demonstrate this with
+  // an identity shuffle where each lane passes its own `index`, then
+  // broadcast lane 2's values so the result matches the SPIR-V path.
+  half4 id = QuadReadLaneAt(v, index);
+  half scalar = QuadReadLaneAt(id.x, 2);
+  half2 vec2 = QuadReadLaneAt(id.xy, 2);
+  half3 vec3 = QuadReadLaneAt(id.xyz, 2);
+  half4 vec4 = QuadReadLaneAt(id, 2);
+#endif
+
+  Out1[index].x = scalar; // only the listed components are written; expected data has 0x0 elsewhere
+  Out2[index].xy = vec2;
+  Out3[index].xyz = vec3;
+  Out4[index] = vec4;
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+Buffers:
+  - Name: In
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x4000, 0x4200, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800, 0x4880, 0x4900, 0x4980, 0x4a00, 0x4a80, 0x4b00, 0x4b80, 0x4c00 ]
+  - Name: Out1
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: Out2
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: Out3
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: Out4
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: ExpectedOut1
+    Format: Float16
+    Stride: 8
+    Data: [ 0x4880, 0x0, 0x0, 0x0, 0x4880, 0x0, 0x0, 0x0, 0x4880, 0x0, 0x0, 0x0, 0x4880, 0x0, 0x0, 0x0 ]
+  - Name: ExpectedOut2
+    Format: Float16
+    Stride: 8
+    Data: [ 0x4880, 0x4900, 0x0, 0x0, 0x4880, 0x4900, 0x0, 0x0, 0x4880, 0x4900, 0x0, 0x0, 0x4880, 0x4900, 0x0, 0x0 ]
+  - Name: ExpectedOut3
+    Format: Float16
+    Stride: 8
+    Data: [ 0x4880, 0x4900, 0x4980, 0x0, 0x4880, 0x4900, 0x4980, 0x0, 0x4880, 0x4900, 0x4980, 0x0, 0x4880, 0x4900, 0x4980, 0x0 ]
+  - Name: ExpectedOut4
+    Format: Float16
+    Stride: 8
+    Data: [ 0x4880, 0x4900, 0x4980, 0x4a00, 0x4880, 0x4900, 0x4980, 0x4a00, 0x4880, 0x4900, 0x4980, 0x4a00, 0x4880, 0x4900, 0x4980, 0x4a00 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+
+...
+#--- end
+
+# REQUIRES: Half
+
+# QuadReadLaneAt is not yet supported in Clang; an implementation PR is in progress.
+# XFAIL: Clang
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/QuadReadLaneAt.fp64.test b/test/WaveOps/QuadReadLaneAt.fp64.test
new file mode 100644
index 000000000..e25e72510
--- /dev/null
+++ b/test/WaveOps/QuadReadLaneAt.fp64.test
@@ -0,0 +1,145 @@
+#--- source.hlsl
+StructuredBuffer<double4> In: register(t0); // source values, one double4 per thread
+RWStructuredBuffer<double4> Out1 : register(u1); // scalar result, written to .x only
+RWStructuredBuffer<double4> Out2 : register(u2); // double2 result, written to .xy
+RWStructuredBuffer<double4> Out3 : register(u3); // double3 result, written to .xyz
+RWStructuredBuffer<double4> Out4 : register(u4); // double4 result, all components written
+
+[numthreads(2,2,1)]
+void main(uint3 dtid : SV_DispatchThreadID) {
+  uint index = dtid.y * 2 + dtid.x; // row-major flattening; the expected data covers indices 0..3
+  double4 v = In[index];
+
+#ifdef __spirv__
+  // SPIR-V requires the lane index to be a compile-time constant (or
+  // dynamically uniform in SPIR-V 1.5+), so we pass a literal.
+  double scalar = QuadReadLaneAt(v.x, 2);
+  double2 vec2 = QuadReadLaneAt(v.xy, 2);
+  double3 vec3 = QuadReadLaneAt(v.xyz, 2);
+  double4 vec4 = QuadReadLaneAt(v, 2);
+#else
+  // DXIL permits a non-uniform (per-lane) lane index. Demonstrate this with
+  // an identity shuffle where each lane passes its own `index`, then
+  // broadcast lane 2's values so the result matches the SPIR-V path.
+  double4 id = QuadReadLaneAt(v, index);
+  double scalar = QuadReadLaneAt(id.x, 2);
+  double2 vec2 = QuadReadLaneAt(id.xy, 2);
+  double3 vec3 = QuadReadLaneAt(id.xyz, 2);
+  double4 vec4 = QuadReadLaneAt(id, 2);
+#endif
+
+  Out1[index].x = scalar; // only the listed components are written; expected data has 0.0 elsewhere
+  Out2[index].xy = vec2;
+  Out3[index].xyz = vec3;
+  Out4[index] = vec4;
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+Buffers:
+  - Name: In
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0 ]
+  - Name: Out1
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: Out2
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: Out3
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: Out4
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: ExpectedOut1
+    Format: Float64
+    Stride: 32
+    Data: [ 9.0, 0.0, 0.0, 0.0, 9.0, 0.0, 0.0, 0.0, 9.0, 0.0, 0.0, 0.0, 9.0, 0.0, 0.0, 0.0 ]
+  - Name: ExpectedOut2
+    Format: Float64
+    Stride: 32
+    Data: [ 9.0, 10.0, 0.0, 0.0, 9.0, 10.0, 0.0, 0.0, 9.0, 10.0, 0.0, 0.0, 9.0, 10.0, 0.0, 0.0 ]
+  - Name: ExpectedOut3
+    Format: Float64
+    Stride: 32
+    Data: [ 9.0, 10.0, 11.0, 0.0, 9.0, 10.0, 11.0, 0.0, 9.0, 10.0, 11.0, 0.0, 9.0, 10.0, 11.0, 0.0 ]
+  - Name: ExpectedOut4
+    Format: Float64
+    Stride: 32
+    Data: [ 9.0, 10.0, 11.0, 12.0, 9.0, 10.0, 11.0, 12.0, 9.0, 10.0, 11.0, 12.0, 9.0, 10.0, 11.0, 12.0 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+
+...
+#--- end
+
+# REQUIRES: Double
+
+# QuadReadLaneAt is not yet supported in Clang; an implementation PR is in progress.
+# XFAIL: Clang
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/QuadReadLaneAt.int16.test b/test/WaveOps/QuadReadLaneAt.int16.test
new file mode 100644
index 000000000..7c4104807
--- /dev/null
+++ b/test/WaveOps/QuadReadLaneAt.int16.test
@@ -0,0 +1,261 @@
+#--- source.hlsl
+// ints: signed 16-bit source data (t0) and one output buffer per operand width (u1-u4).
+StructuredBuffer<int16_t4> In: register(t0);
+RWStructuredBuffer<int16_t4> Out1 : register(u1); // scalar result, written to .x only
+RWStructuredBuffer<int16_t4> Out2 : register(u2); // int16_t2 result, written to .xy
+RWStructuredBuffer<int16_t4> Out3 : register(u3); // int16_t3 result, written to .xyz
+RWStructuredBuffer<int16_t4> Out4 : register(u4); // int16_t4 result, all components written
+
+// uints: unsigned 16-bit source data (t5) and one output buffer per operand width (u6-u9).
+StructuredBuffer<uint16_t4> UIn: register(t5);
+RWStructuredBuffer<uint16_t4> UOut1 : register(u6); // scalar result, written to .x only
+RWStructuredBuffer<uint16_t4> UOut2 : register(u7); // uint16_t2 result, written to .xy
+RWStructuredBuffer<uint16_t4> UOut3 : register(u8); // uint16_t3 result, written to .xyz
+RWStructuredBuffer<uint16_t4> UOut4 : register(u9); // uint16_t4 result, all components written
+
+[numthreads(2,2,1)]
+void main(uint3 dtid : SV_DispatchThreadID) {
+  uint index = dtid.y * 2 + dtid.x; // row-major flattening; the expected data covers indices 0..3
+
+  // int case: read lane 2's value for scalar, int16_t2, int16_t3 and int16_t4 operands.
+  int16_t4 v = In[index];
+#ifdef __spirv__
+  // SPIR-V requires the lane index to be a compile-time constant (or
+  // dynamically uniform in SPIR-V 1.5+), so we pass a literal.
+  int16_t scalar = QuadReadLaneAt(v.x, 2);
+  int16_t2 vec2 = QuadReadLaneAt(v.xy, 2);
+  int16_t3 vec3 = QuadReadLaneAt(v.xyz, 2);
+  int16_t4 vec4 = QuadReadLaneAt(v, 2);
+#else
+  // DXIL permits a non-uniform (per-lane) lane index. Demonstrate this with
+  // an identity shuffle where each lane passes its own `index`, then
+  // broadcast lane 2's values so the result matches the SPIR-V path.
+  int16_t4 id = QuadReadLaneAt(v, index);
+  int16_t scalar = QuadReadLaneAt(id.x, 2);
+  int16_t2 vec2 = QuadReadLaneAt(id.xy, 2);
+  int16_t3 vec3 = QuadReadLaneAt(id.xyz, 2);
+  int16_t4 vec4 = QuadReadLaneAt(id, 2);
+#endif
+
+  Out1[index].x = scalar; // only the listed components are written; expected data has 0 elsewhere
+  Out2[index].xy = vec2;
+  Out3[index].xyz = vec3;
+  Out4[index] = vec4;
+
+  // uint case: same sequence as the int case, applied to the uint16_t4 input.
+  uint16_t4 uv = UIn[index];
+#ifdef __spirv__
+  uint16_t uscalar = QuadReadLaneAt(uv.x, 2);
+  uint16_t2 uvec2 = QuadReadLaneAt(uv.xy, 2);
+  uint16_t3 uvec3 = QuadReadLaneAt(uv.xyz, 2);
+  uint16_t4 uvec4 = QuadReadLaneAt(uv, 2);
+#else
+  uint16_t4 uid = QuadReadLaneAt(uv, index);
+  uint16_t uscalar = QuadReadLaneAt(uid.x, 2);
+  uint16_t2 uvec2 = QuadReadLaneAt(uid.xy, 2);
+  uint16_t3 uvec3 = QuadReadLaneAt(uid.xyz, 2);
+  uint16_t4 uvec4 = QuadReadLaneAt(uid, 2);
+#endif
+
+  UOut1[index].x = uscalar;
+  UOut2[index].xy = uvec2;
+  UOut3[index].xyz = uvec3;
+  UOut4[index] = uvec4;
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+Buffers:
+  - Name: In
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]
+  - Name: Out1
+    Format: Int16
+    Stride: 8
+    FillSize: 32
+  - Name: Out2
+    Format: Int16
+    Stride: 8
+    FillSize: 32
+  - Name: Out3
+    Format: Int16
+    Stride: 8
+    FillSize: 32
+  - Name: Out4
+    Format: Int16
+    Stride: 8
+    FillSize: 32
+  - Name: ExpectedOut1
+    Format: Int16
+    Stride: 8
+    Data: [ 9, 0, 0, 0, 9, 0, 0, 0, 9, 0, 0, 0, 9, 0, 0, 0 ]
+  - Name: ExpectedOut2
+    Format: Int16
+    Stride: 8
+    Data: [ 9, 10, 0, 0, 9, 10, 0, 0, 9, 10, 0, 0, 9, 10, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int16
+    Stride: 8
+    Data: [ 9, 10, 11, 0, 9, 10, 11, 0, 9, 10, 11, 0, 9, 10, 11, 0 ]
+  - Name: ExpectedOut4
+    Format: Int16
+    Stride: 8
+    Data: [ 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12 ]
+  - Name: UIn
+    Format: UInt16
+    Stride: 8
+    Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]
+  - Name: UOut1
+    Format: UInt16
+    Stride: 8
+    FillSize: 32
+  - Name: UOut2
+    Format: UInt16
+    Stride: 8
+    FillSize: 32
+  - Name: UOut3
+    Format: UInt16
+    Stride: 8
+    FillSize: 32
+  - Name: UOut4
+    Format: UInt16
+    Stride: 8
+    FillSize: 32
+  - Name: UExpectedOut1
+    Format: UInt16
+    Stride: 8
+    Data: [ 9, 0, 0, 0, 9, 0, 0, 0, 9, 0, 0, 0, 9, 0, 0, 0 ]
+  - Name: UExpectedOut2
+    Format: UInt16
+    Stride: 8
+    Data: [ 9, 10, 0, 0, 9, 10, 0, 0, 9, 10, 0, 0, 9, 10, 0, 0 ]
+  - Name: UExpectedOut3
+    Format: UInt16
+    Stride: 8
+    Data: [ 9, 10, 11, 0, 9, 10, 11, 0, 9, 10, 11, 0, 9, 10, 11, 0 ]
+  - Name: UExpectedOut4
+    Format: UInt16
+    Stride: 8
+    Data: [ 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: UExpectedOut1
+    Rule: BufferExact
+    Actual: UOut1
+    Expected: UExpectedOut1
+  - Result: UExpectedOut2
+    Rule: BufferExact
+    Actual: UOut2
+    Expected: UExpectedOut2
+  - Result: UExpectedOut3
+    Rule: BufferExact
+    Actual: UOut3
+    Expected: UExpectedOut3
+  - Result: UExpectedOut4
+    Rule: BufferExact
+    Actual: UOut4
+    Expected: UExpectedOut4
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: UIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: UOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+    - Name: UOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 7
+        Space: 0
+      VulkanBinding:
+        Binding: 7
+    - Name: UOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 8
+        Space: 0
+      VulkanBinding:
+        Binding: 8
+    - Name: UOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 9
+        Space: 0
+      VulkanBinding:
+        Binding: 9
+
+...
+#--- end
+
+# REQUIRES: Int16
+
+# QuadReadLaneAt is not yet supported in Clang; an implementation PR is in progress.
+# XFAIL: Clang
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/QuadReadLaneAt.int64.test b/test/WaveOps/QuadReadLaneAt.int64.test
new file mode 100644
index 000000000..ba6eb614f
--- /dev/null
+++ b/test/WaveOps/QuadReadLaneAt.int64.test
@@ -0,0 +1,264 @@
+#--- source.hlsl
+// Signed 64-bit resources: one input buffer plus one output per operand width.
+StructuredBuffer<int64_t4> In: register(t0);
+RWStructuredBuffer<int64_t4> Out1 : register(u1); // scalar result, stored in .x
+RWStructuredBuffer<int64_t4> Out2 : register(u2); // int64_t2 result, stored in .xy
+RWStructuredBuffer<int64_t4> Out3 : register(u3); // int64_t3 result, stored in .xyz
+RWStructuredBuffer<int64_t4> Out4 : register(u4); // int64_t4 result, all components
+
+// Unsigned 64-bit resources, mirroring the signed set above.
+StructuredBuffer<uint64_t4> UIn: register(t5);
+RWStructuredBuffer<uint64_t4> UOut1 : register(u6); // scalar result, stored in .x
+RWStructuredBuffer<uint64_t4> UOut2 : register(u7); // uint64_t2 result, stored in .xy
+RWStructuredBuffer<uint64_t4> UOut3 : register(u8); // uint64_t3 result, stored in .xyz
+RWStructuredBuffer<uint64_t4> UOut4 : register(u9); // uint64_t4 result, all components
+
+[numthreads(2,2,1)]
+void main(uint3 dtid : SV_DispatchThreadID) { // 2x2x1 threads per group
+  uint index = dtid.y * 2 + dtid.x; // row-major flattening of (x, y), width 2
+
+  // Signed case: each thread loads its own int64_t4 element of In.
+  int64_t4 v = In[index];
+#ifdef __spirv__
+  // SPIR-V requires the lane index to be a compile-time constant (or
+  // dynamically uniform in SPIR-V 1.5+), so we pass a literal.
+  int64_t scalar = QuadReadLaneAt(v.x, 2); // scalar operand
+  int64_t2 vec2 = QuadReadLaneAt(v.xy, 2); // 2-component operand
+  int64_t3 vec3 = QuadReadLaneAt(v.xyz, 2); // 3-component operand
+  int64_t4 vec4 = QuadReadLaneAt(v, 2); // 4-component operand
+#else
+  // DXIL permits a non-uniform (per-lane) lane index. Exercise that with a
+  // shuffle in which each lane passes its own `index` (an identity, assuming
+  // quad lane IDs equal `index`), then broadcast quad lane 2 as in SPIR-V.
+  int64_t4 id = QuadReadLaneAt(v, index);
+  int64_t scalar = QuadReadLaneAt(id.x, 2);
+  int64_t2 vec2 = QuadReadLaneAt(id.xy, 2);
+  int64_t3 vec3 = QuadReadLaneAt(id.xyz, 2);
+  int64_t4 vec4 = QuadReadLaneAt(id, 2);
+#endif
+
+  Out1[index].x = scalar; // writes .x only
+  Out2[index].xy = vec2; // writes .xy only
+  Out3[index].xyz = vec3; // writes .xyz only
+  Out4[index] = vec4; // writes all four components
+
+  // Unsigned case: same sequence as the signed case, on uint64_t4 data from UIn.
+  uint64_t4 uv = UIn[index];
+#ifdef __spirv__
+  uint64_t uscalar = QuadReadLaneAt(uv.x, 2); // literal lane index, as in the signed case
+  uint64_t2 uvec2 = QuadReadLaneAt(uv.xy, 2);
+  uint64_t3 uvec3 = QuadReadLaneAt(uv.xyz, 2);
+  uint64_t4 uvec4 = QuadReadLaneAt(uv, 2);
+#else
+  uint64_t4 uid = QuadReadLaneAt(uv, index); // per-lane (non-uniform) lane index
+  uint64_t uscalar = QuadReadLaneAt(uid.x, 2);
+  uint64_t2 uvec2 = QuadReadLaneAt(uid.xy, 2);
+  uint64_t3 uvec3 = QuadReadLaneAt(uid.xyz, 2);
+  uint64_t4 uvec4 = QuadReadLaneAt(uid, 2);
+#endif
+
+  UOut1[index].x = uscalar; // writes .x only
+  UOut2[index].xy = uvec2; // writes .xy only
+  UOut3[index].xyz = uvec3; // writes .xyz only
+  UOut4[index] = uvec4; // writes all four components
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+Buffers:
+  - Name: In
+    Format: Int64
+    Stride: 32
+    Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]
+  - Name: Out1
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: Out2
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: Out3
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: Out4
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: ExpectedOut1 # compared against Out1; the shader reads quad lane 2, and element 2 of In is (9, 10, 11, 12)
+    Format: Int64
+    Stride: 32
+    Data: [ 9, 0, 0, 0, 9, 0, 0, 0, 9, 0, 0, 0, 9, 0, 0, 0 ] # .x = 9 in every element; .yzw are never written and expected to be 0
+  - Name: ExpectedOut2 # compared against Out2 (int64_t2 read)
+    Format: Int64
+    Stride: 32
+    Data: [ 9, 10, 0, 0, 9, 10, 0, 0, 9, 10, 0, 0, 9, 10, 0, 0 ] # .xy = (9, 10); .zw expected to be 0
+  - Name: ExpectedOut3 # compared against Out3 (int64_t3 read)
+    Format: Int64
+    Stride: 32
+    Data: [ 9, 10, 11, 0, 9, 10, 11, 0, 9, 10, 11, 0, 9, 10, 11, 0 ] # .xyz = (9, 10, 11); .w expected to be 0
+  - Name: ExpectedOut4 # compared against Out4 (int64_t4 read)
+    Format: Int64
+    Stride: 32
+    Data: [ 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12 ] # the full (9, 10, 11, 12) element in every slot
+  - Name: UIn
+    Format: UInt64
+    Stride: 32
+    Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]
+  - Name: UOut1
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UOut2
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UOut3
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UOut4
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UExpectedOut1 # compared against UOut1; the shader reads quad lane 2, and element 2 of UIn is (9, 10, 11, 12)
+    Format: UInt64
+    Stride: 32
+    Data: [ 9, 0, 0, 0, 9, 0, 0, 0, 9, 0, 0, 0, 9, 0, 0, 0 ] # .x = 9 in every element; .yzw are never written and expected to be 0
+  - Name: UExpectedOut2 # compared against UOut2 (uint64_t2 read)
+    Format: UInt64
+    Stride: 32
+    Data: [ 9, 10, 0, 0, 9, 10, 0, 0, 9, 10, 0, 0, 9, 10, 0, 0 ] # .xy = (9, 10); .zw expected to be 0
+  - Name: UExpectedOut3 # compared against UOut3 (uint64_t3 read)
+    Format: UInt64
+    Stride: 32
+    Data: [ 9, 10, 11, 0, 9, 10, 11, 0, 9, 10, 11, 0, 9, 10, 11, 0 ] # .xyz = (9, 10, 11); .w expected to be 0
+  - Name: UExpectedOut4 # compared against UOut4 (uint64_t4 read)
+    Format: UInt64
+    Stride: 32
+    Data: [ 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12 ] # the full (9, 10, 11, 12) element in every slot
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: UExpectedOut1
+    Rule: BufferExact
+    Actual: UOut1
+    Expected: UExpectedOut1
+  - Result: UExpectedOut2
+    Rule: BufferExact
+    Actual: UOut2
+    Expected: UExpectedOut2
+  - Result: UExpectedOut3
+    Rule: BufferExact
+    Actual: UOut3
+    Expected: UExpectedOut3
+  - Result: UExpectedOut4
+    Rule: BufferExact
+    Actual: UOut4
+    Expected: UExpectedOut4
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: UIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: UOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+    - Name: UOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 7
+        Space: 0
+      VulkanBinding:
+        Binding: 7
+    - Name: UOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 8
+        Space: 0
+      VulkanBinding:
+        Binding: 8
+    - Name: UOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 9
+        Space: 0
+      VulkanBinding:
+        Binding: 9
+
+...
+#--- end
+
+# REQUIRES: Int64
+
+# QuadReadLaneAt is not yet supported by Clang (an implementation PR is in progress), so this test is expected to fail there.
+# XFAIL: Clang
+
+# Bug: https://github.com/llvm/offload-test-suite/issues/959
+# XFAIL: Metal
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o