Skip to content

Commit d15e0b6

Browse files
blake-snc and claude authored
Remove asm_dialect=AD_ATT from all llvm.inline_asm calls (Dao-AILab#98)
Remove asm_dialect from all inline asm calls, not just the braced ones. PTX is neither AT&T nor Intel syntax — the default (no dialect) passes the asm string through to ptxas unmodified, which is correct.

Contributed by Second Nature Computing (https://joinsecondnature.com)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 25fc394 commit d15e0b6

5 files changed

Lines changed: 0 additions & 12 deletions

File tree

quack/activation.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ def tanh(a: float | Float32, *, loc=None, ip=None) -> Float32:
3030
"=f,f",
3131
has_side_effects=False,
3232
is_align_stack=False,
33-
asm_dialect=llvm.AsmDialect.AD_ATT,
3433
)
3534
)
3635

quack/copy_utils.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,6 @@ def cpasync_reduce_bulk_add_f32(
584584
# "l,r,r,l",
585585
has_side_effects=True,
586586
is_align_stack=False,
587-
asm_dialect=llvm.AsmDialect.AD_ATT,
588587
)
589588

590589

@@ -715,7 +714,6 @@ def tma_gather4_load(
715714
"r,l,r,r,r,r,r,r", # constraints: register, long, 6x register
716715
has_side_effects=True,
717716
is_align_stack=False,
718-
asm_dialect=llvm.AsmDialect.AD_ATT,
719717
loc=loc,
720718
ip=ip,
721719
)

quack/rounding.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ def mul_wide_u32(a: Uint32, b: Uint32, *, loc=None, ip=None) -> tuple:
5252
"=r,=r,r,r",
5353
has_side_effects=False,
5454
is_align_stack=False,
55-
asm_dialect=llvm.AsmDialect.AD_ATT,
5655
)
5756
i32_ty = T.i32()
5857
hi = cutlass.Uint32(llvm.extractvalue(i32_ty, result, [0], loc=loc, ip=ip))
@@ -85,7 +84,6 @@ def cvt_f32x2_bf16x2_rs(
8584
"=r,f,f,r",
8685
has_side_effects=False,
8786
is_align_stack=False,
88-
asm_dialect=llvm.AsmDialect.AD_ATT,
8987
)
9088
)
9189

quack/tensormap_manager.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,6 @@ def update_tensormap_shape(
8787
"r,r",
8888
has_side_effects=True,
8989
is_align_stack=False,
90-
asm_dialect=llvm.AsmDialect.AD_ATT,
9190
)
9291
# wait until it's safe to update tensormap in global memory
9392
with cute.arch.elect_one():
@@ -109,7 +108,6 @@ def update_tensormap_shape(
109108
"l,r",
110109
has_side_effects=True,
111110
is_align_stack=False,
112-
asm_dialect=llvm.AsmDialect.AD_ATT,
113111
)
114112
cute.arch.sync_warp()
115113
cute.nvgpu.cpasync.fence_tma_desc_release()

quack/utils.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ def set_block_rank(
3939
"=r,r,r",
4040
has_side_effects=False,
4141
is_align_stack=False,
42-
asm_dialect=llvm.AsmDialect.AD_ATT,
4342
)
4443
)
4544

@@ -72,7 +71,6 @@ def store_shared_remote(
7271
f"r,{constraint},r",
7372
has_side_effects=True,
7473
is_align_stack=False,
75-
asm_dialect=llvm.AsmDialect.AD_ATT,
7674
)
7775

7876

@@ -120,7 +118,6 @@ def store_shared_remote_x4(
120118
f"r,r,{constraint},{constraint},{constraint},{constraint}",
121119
has_side_effects=True,
122120
is_align_stack=False,
123-
asm_dialect=llvm.AsmDialect.AD_ATT,
124121
)
125122

126123

@@ -156,7 +153,6 @@ def sqrt(a: float | Float32, *, loc=None, ip=None) -> Float32:
156153
"=f,f",
157154
has_side_effects=False,
158155
is_align_stack=False,
159-
asm_dialect=llvm.AsmDialect.AD_ATT,
160156
)
161157
)
162158

@@ -171,7 +167,6 @@ def ceil(a: float | Float32, *, loc=None, ip=None) -> Int32:
171167
"=r,f",
172168
has_side_effects=False,
173169
is_align_stack=False,
174-
asm_dialect=llvm.AsmDialect.AD_ATT,
175170
)
176171
)
177172

0 commit comments

Comments
 (0)