From cde62c42418af9eba4a8c3481370cfa83718d841 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Apr 2026 15:42:05 +0200 Subject: [PATCH 1/6] Add LICM pass. --- src/compiler/passes/licm.jl | 195 ++++++++++++++++++++++++++++++++ src/compiler/passes/pipeline.jl | 2 + src/cuTile.jl | 3 +- 3 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 src/compiler/passes/licm.jl diff --git a/src/compiler/passes/licm.jl b/src/compiler/passes/licm.jl new file mode 100644 index 00000000..809d2850 --- /dev/null +++ b/src/compiler/passes/licm.jl @@ -0,0 +1,195 @@ +# Loop-Invariant Code Motion (LICM) +# +# Single-pass depth-tracking algorithm that hoists loop-invariant operations +# out of loops. Port of cuTile Python's `hoist_loop_invariants` (code_motion.py). +# +# The algorithm walks the IR recursively while tracking the definition depth +# of each value. An operation whose data dependencies all resolve to depths +# *less than* its containing loop can be hoisted above that loop. A stack of +# SSAMaps collects operations at their target depth; at the end of each block, +# the original body is replaced with the (filtered) rebuilt map. + +# Whether a block can be moved, based on the operations it contains. +@enum BlockMobility begin + IMMOVABLE # contains stores, returns, or nested IMMOVABLE blocks + CAN_MOVE_WITH_LOOP # contains continue/break + CAN_MOVE # pure operations only +end + +struct BlockResult + mobility::BlockMobility + min_depth::Int # minimum depth any op in this block needs +end + +mutable struct DependencyInfo + must_stay::Bool + max_outside_depth::Int +end + +function update!(di::DependencyInfo, dep_depth::Int, cur_depth::Int) + if dep_depth >= cur_depth + di.must_stay = true + else + di.max_outside_depth = max(di.max_outside_depth, dep_depth) + end +end + +struct StackItem + new_body::SSAMap + is_loop_body::Bool +end + +""" + licm_pass!(sci::StructuredIRCode) + +Hoist loop-invariant operations out of loops. Must run after rewrite_patterns! +and before token_order_pass! (which inserts tokens that should not be moved). +""" +function licm_pass!(sci::StructuredIRCode) + def_depth = Dict{Any, Int}() + for i in 1:length(sci.argtypes) + def_depth[Argument(i)] = 0 + end + _hoist!(sci.entry, StackItem[], def_depth, false) + return +end + +function _hoist!(block::Block, stack::Vector{StackItem}, def_depth::Dict{Any,Int}, + is_loop_body::Bool) + depth = length(stack) + new_body = SSAMap() + push!(stack, StackItem(new_body, is_loop_body)) + + mobility = CAN_MOVE + min_depth = 0 + + # Register block args at current depth + for ba in block.args + def_depth[ba] = depth + end + + for inst in instructions(block) + s = stmt(inst) + depinfo = DependencyInfo(!is_loop_body, 0) + + if s isa ForOp || s isa LoopOp + body = s.body + # ForOp's iv_arg is separate from body.args (which holds only carries) + if s isa ForOp + def_depth[s.iv_arg] = depth + 1 + end + for ba in body.args + def_depth[ba] = depth + 1 + end + body_result = _hoist!(body, stack, def_depth, true) + if body_result.mobility == IMMOVABLE + mobility = IMMOVABLE + depinfo.must_stay = true + end + for v in s.init_values + _update_from_value!(depinfo, def_depth, v, depth) + end + if s isa ForOp + for v in (s.lower, s.upper, s.step) + _update_from_value!(depinfo, def_depth, v, depth) + end + end + update!(depinfo, body_result.min_depth, depth) + + elseif s isa WhileOp + for ba in s.before.args + def_depth[ba] = depth + 1 + end + for ba in s.after.args + def_depth[ba] = depth + 1 + end + before_result = _hoist!(s.before, stack, def_depth, true) + after_result = _hoist!(s.after, stack, def_depth, true) + worst = min(before_result.mobility, after_result.mobility) + if worst == IMMOVABLE + mobility = IMMOVABLE + depinfo.must_stay = true + end + for v in s.init_values + _update_from_value!(depinfo, def_depth, v, depth) + end + update!(depinfo, before_result.min_depth, depth) + update!(depinfo, after_result.min_depth, depth) + + elseif s isa IfOp + _update_from_value!(depinfo, def_depth, s.condition, depth) + then_result = _hoist!(s.then_region, stack, def_depth, false) + else_result = _hoist!(s.else_region, stack, def_depth, false) + update!(depinfo, then_result.min_depth, depth) + update!(depinfo, else_result.min_depth, depth) + for r in (then_result, else_result) + if r.mobility != CAN_MOVE + mobility = min(mobility, r.mobility) + depinfo.must_stay = true + end + end + + elseif _is_memory_store(block, s) + mobility = IMMOVABLE + depinfo.must_stay = true + else + # Pure operation: check operand depths + _update_operand_depths!(depinfo, def_depth, s, depth) + end + + # Determine target depth + target_depth = depth + if depinfo.must_stay + min_depth = max(min_depth, depinfo.max_outside_depth) + else + while target_depth > depinfo.max_outside_depth && stack[target_depth + 1].is_loop_body + target_depth -= 1 + end + end + + # Place at target depth + push!(stack[target_depth + 1].new_body, (inst.ssa_idx, s, inst.typ)) + + # Record definition depth AFTER hoisting (enables cascading hoists) + def_depth[SSAValue(inst.ssa_idx)] = target_depth + end + + # Handle terminator operands for min_depth computation + term = block.terminator + if term isa ContinueOp || term isa BreakOp + mobility = min(mobility, CAN_MOVE_WITH_LOOP) + end + + pop!(stack) + block.body = new_body + return BlockResult(mobility, min_depth) +end + +# Check if a statement is a memory store (IMMOVABLE for LICM purposes). +# Loads are hoistable (they're pure if operands are invariant). +function _is_memory_store(block::Block, @nospecialize(s)) + s isa Expr || return false + call = resolve_call(block, s) + call === nothing && return false + resolved_func, _ = call + effect = classify_memory_op(resolved_func) + return effect == MEM_STORE +end + +# Update DependencyInfo from a single IR value +function _update_from_value!(di::DependencyInfo, def_depth::Dict{Any,Int}, @nospecialize(val), cur_depth::Int) + d = get(def_depth, val, nothing) + d !== nothing && update!(di, d, cur_depth) +end + +# Update DependencyInfo from all operands of a statement +function _update_operand_depths!(di::DependencyInfo, def_depth::Dict{Any,Int}, @nospecialize(s), cur_depth::Int) + if s isa Expr + start = s.head === :invoke ? 3 : 2 + for i in start:length(s.args) + _update_from_value!(di, def_depth, s.args[i], cur_depth) + end + elseif s isa Core.PiNode + _update_from_value!(di, def_depth, s.val, cur_depth) + end +end diff --git a/src/compiler/passes/pipeline.jl b/src/compiler/passes/pipeline.jl index 261eed4b..cac2b18a 100644 --- a/src/compiler/passes/pipeline.jl +++ b/src/compiler/passes/pipeline.jl @@ -327,6 +327,8 @@ function run_passes!(sci::StructuredIRCode) constants = propagate_constants(sci) rewrite_patterns!(sci, OPTIMIZATION_RULES; constants) + licm_pass!(sci) + alias_result = alias_analysis_pass!(sci) token_order_pass!(sci, alias_result) diff --git a/src/cuTile.jl b/src/cuTile.jl index f761f739..7a94b581 100644 --- a/src/cuTile.jl +++ b/src/cuTile.jl @@ -1,7 +1,7 @@ module cuTile using IRStructurizer -using IRStructurizer: Block, ControlFlowOp, BlockArgument, +using IRStructurizer: Block, ControlFlowOp, BlockArgument, SSAMap, YieldOp, ContinueOp, BreakOp, ConditionOp, IfOp, ForOp, WhileOp, LoopOp, Undef, SourceLocation, source_location @@ -44,6 +44,7 @@ include("compiler/passes/canonicalize.jl") include("compiler/passes/alias_analysis.jl") include("compiler/passes/token_keys.jl") include("compiler/passes/token_order.jl") +include("compiler/passes/licm.jl") include("compiler/passes/dce.jl") include("compiler/passes/pipeline.jl") include("compiler/codegen/debug.jl") From e93e199f267bba824f4fe03987911465ffd80db8 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Apr 2026 15:49:25 +0200 Subject: [PATCH 2/6] Use IRStructurizer APIs. --- src/compiler/passes/licm.jl | 22 +++++++++++++--------- src/cuTile.jl | 2 +- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/compiler/passes/licm.jl b/src/compiler/passes/licm.jl index 809d2850..0ba2475e 100644 --- a/src/compiler/passes/licm.jl +++ b/src/compiler/passes/licm.jl @@ -6,8 +6,8 @@ # The algorithm walks the IR recursively while tracking the definition depth # of each value. An operation whose data dependencies all resolve to depths # *less than* its containing loop can be hoisted above that loop. A stack of -# SSAMaps collects operations at their target depth; at the end of each block, -# the original body is replaced with the (filtered) rebuilt map. +# instruction lists collects operations at their target depth; at the end of +# each block, the original body is rebuilt from the filtered list. # Whether a block can be moved, based on the operations it contains. @enum BlockMobility begin @@ -35,7 +35,7 @@ function update!(di::DependencyInfo, dep_depth::Int, cur_depth::Int) end struct StackItem - new_body::SSAMap + entries::Vector{Tuple{Int,Any,Any}} # (ssa_idx, stmt, typ) triples is_loop_body::Bool end @@ -57,8 +57,7 @@ end function _hoist!(block::Block, stack::Vector{StackItem}, def_depth::Dict{Any,Int}, is_loop_body::Bool) depth = length(stack) - new_body = SSAMap() - push!(stack, StackItem(new_body, is_loop_body)) + push!(stack, StackItem(Tuple{Int,Any,Any}[], is_loop_body)) mobility = CAN_MOVE min_depth = 0 @@ -148,20 +147,25 @@ function _hoist!(block::Block, stack::Vector{StackItem}, def_depth::Dict{Any,Int end # Place at target depth - push!(stack[target_depth + 1].new_body, (inst.ssa_idx, s, inst.typ)) + push!(stack[target_depth + 1].entries, (inst.ssa_idx, s, inst.typ)) # Record definition depth AFTER hoisting (enables cascading hoists) def_depth[SSAValue(inst.ssa_idx)] = target_depth end - # Handle terminator operands for min_depth computation + # Handle terminator for mobility term = block.terminator if term isa ContinueOp || term isa BreakOp mobility = min(mobility, CAN_MOVE_WITH_LOOP) end - pop!(stack) - block.body = new_body + # Rebuild block body from collected entries + entries = pop!(stack).entries + empty!(block) + for (idx, s, typ) in entries + push!(block, idx, s, typ) + end + return BlockResult(mobility, min_depth) end diff --git a/src/cuTile.jl b/src/cuTile.jl index 7a94b581..1fbab4bc 100644 --- a/src/cuTile.jl +++ b/src/cuTile.jl @@ -1,7 +1,7 @@ module cuTile using IRStructurizer -using IRStructurizer: Block, ControlFlowOp, BlockArgument, SSAMap, +using IRStructurizer: Block, ControlFlowOp, BlockArgument, YieldOp, ContinueOp, BreakOp, ConditionOp, IfOp, ForOp, WhileOp, LoopOp, Undef, SourceLocation, source_location From bfe76a365875456a1178a9ae8ca82f4c35911af7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sun, 5 Apr 2026 22:55:35 +0200 Subject: [PATCH 3/6] Rewrite LICM to focus on alias-safe load hoisting. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous LICM pass hoisted all loop-invariant operations (arithmetic, broadcasts, view constructors, etc.) — all of which are marked Pure in the MLIR Tile IR dialect and already hoisted by MLIR's built-in LICM at optLevel >= 2. Benchmarks confirmed zero performance difference when the pass was disabled entirely. The new pass focuses on what MLIR structurally cannot do: hoisting memory loads out of loops. After token ordering, loads have token dependencies that anchor them inside loops. By hoisting before token insertion, we avoid creating unnecessary token carries. Key changes: - Run alias_analysis_pass! before licm_pass! (was after) - Only hoist loads, not pure ops (MLIR handles those) - Verify alias safety: a load is only hoisted when no store in the loop body writes to an overlapping alias set - Simplified from 200 to 150 lines with clearer structure Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/passes/licm.jl | 248 +++++++++++++++----------------- src/compiler/passes/pipeline.jl | 5 +- 2 files changed, 116 insertions(+), 137 deletions(-) diff --git a/src/compiler/passes/licm.jl b/src/compiler/passes/licm.jl index 0ba2475e..06426b4e 100644 --- a/src/compiler/passes/licm.jl +++ b/src/compiler/passes/licm.jl @@ -1,99 +1,100 @@ # Loop-Invariant Code Motion (LICM) # -# Single-pass depth-tracking algorithm that hoists loop-invariant operations -# out of loops. Port of cuTile Python's `hoist_loop_invariants` (code_motion.py). +# Hoists loop-invariant loads (and their dependency chains) out of loops. # -# The algorithm walks the IR recursively while tracking the definition depth -# of each value. An operation whose data dependencies all resolve to depths -# *less than* its containing loop can be hoisted above that loop. A stack of -# instruction lists collects operations at their target depth; at the end of -# each block, the original body is rebuilt from the filtered list. - -# Whether a block can be moved, based on the operations it contains. -@enum BlockMobility begin - IMMOVABLE # contains stores, returns, or nested IMMOVABLE blocks - CAN_MOVE_WITH_LOOP # contains continue/break - CAN_MOVE # pure operations only -end - -struct BlockResult - mobility::BlockMobility - min_depth::Int # minimum depth any op in this block needs -end - -mutable struct DependencyInfo - must_stay::Bool - max_outside_depth::Int -end - -function update!(di::DependencyInfo, dep_depth::Int, cur_depth::Int) - if dep_depth >= cur_depth - di.must_stay = true - else - di.max_outside_depth = max(di.max_outside_depth, dep_depth) - end -end - -struct StackItem - entries::Vector{Tuple{Int,Any,Any}} # (ssa_idx, stmt, typ) triples - is_loop_body::Bool -end +# Pure operations (arithmetic, broadcasts, view constructors) are NOT hoisted +# here — MLIR's built-in LICM handles those at optLevel >= 2. +# +# This pass targets what MLIR cannot hoist: memory loads. After token ordering, +# loads have token dependencies that anchor them inside loops. By hoisting +# before token insertion, we avoid creating unnecessary token carries. +# +# Safety: a load is hoistable only when (1) all its operands are loop-invariant, +# and (2) no store in the loop body aliases with the load's memory region. +# Alias information comes from alias_analysis_pass!, which must run first. """ - licm_pass!(sci::StructuredIRCode) + licm_pass!(sci::StructuredIRCode, alias_result::Dict{Any, AliasSet}) + +Hoist loop-invariant loads out of loops. Must run after alias_analysis_pass! +and before token_order_pass!. -Hoist loop-invariant operations out of loops. Must run after rewrite_patterns! -and before token_order_pass! (which inserts tokens that should not be moved). +A load is hoistable when: +- All operands are defined outside the loop +- No store in the loop body writes to an aliasing memory region """ -function licm_pass!(sci::StructuredIRCode) +function licm_pass!(sci::StructuredIRCode, alias_result::Dict{Any, AliasSet}) def_depth = Dict{Any, Int}() for i in 1:length(sci.argtypes) def_depth[Argument(i)] = 0 end - _hoist!(sci.entry, StackItem[], def_depth, false) + _hoist_loads!(sci.entry, Vector{Vector{Tuple{Int,Any,Any}}}(), def_depth, + alias_result, false) return end -function _hoist!(block::Block, stack::Vector{StackItem}, def_depth::Dict{Any,Int}, - is_loop_body::Bool) - depth = length(stack) - push!(stack, StackItem(Tuple{Int,Any,Any}[], is_loop_body)) +# Collect alias sets of all stores in a block (recursively through nested CFs). +function _collect_store_aliases(block::Block, alias_result::Dict{Any, AliasSet}) + store_aliases = AliasSet[] + for inst in instructions(block) + s = stmt(inst) + if s isa ControlFlowOp + for b in blocks(s) + append!(store_aliases, _collect_store_aliases(b, alias_result)) + end + else + call = resolve_call(block, s) + call === nothing && continue + resolved_func, operands = call + if classify_memory_op(resolved_func) == MEM_STORE + aset = get_alias_set_for_operand(alias_result, first(operands)) + push!(store_aliases, aset) + end + end + end + return store_aliases +end + +# Check if a load's alias set conflicts with any store alias set in the loop. +function _aliases_with_store(load_alias::AliasSet, store_aliases::Vector{AliasSet}) + for sa in store_aliases + if load_alias isa AliasUniverse || sa isa AliasUniverse + return true + end + if !isempty(intersect(load_alias, sa)) + return true + end + end + return false +end - mobility = CAN_MOVE - min_depth = 0 +function _hoist_loads!(block::Block, stack::Vector{Vector{Tuple{Int,Any,Any}}}, + def_depth::Dict{Any,Int}, alias_result::Dict{Any, AliasSet}, + is_loop_body::Bool) + depth = length(stack) + push!(stack, Tuple{Int,Any,Any}[]) # Register block args at current depth for ba in block.args def_depth[ba] = depth end + # If this is a loop body, collect store alias sets for the load safety check + store_aliases = is_loop_body ? _collect_store_aliases(block, alias_result) : AliasSet[] + for inst in instructions(block) s = stmt(inst) - depinfo = DependencyInfo(!is_loop_body, 0) + hoisted = false if s isa ForOp || s isa LoopOp body = s.body - # ForOp's iv_arg is separate from body.args (which holds only carries) if s isa ForOp def_depth[s.iv_arg] = depth + 1 end for ba in body.args def_depth[ba] = depth + 1 end - body_result = _hoist!(body, stack, def_depth, true) - if body_result.mobility == IMMOVABLE - mobility = IMMOVABLE - depinfo.must_stay = true - end - for v in s.init_values - _update_from_value!(depinfo, def_depth, v, depth) - end - if s isa ForOp - for v in (s.lower, s.upper, s.step) - _update_from_value!(depinfo, def_depth, v, depth) - end - end - update!(depinfo, body_result.min_depth, depth) + _hoist_loads!(body, stack, def_depth, alias_result, true) elseif s isa WhileOp for ba in s.before.args @@ -102,98 +103,75 @@ function _hoist!(block::Block, stack::Vector{StackItem}, def_depth::Dict{Any,Int for ba in s.after.args def_depth[ba] = depth + 1 end - before_result = _hoist!(s.before, stack, def_depth, true) - after_result = _hoist!(s.after, stack, def_depth, true) - worst = min(before_result.mobility, after_result.mobility) - if worst == IMMOVABLE - mobility = IMMOVABLE - depinfo.must_stay = true - end - for v in s.init_values - _update_from_value!(depinfo, def_depth, v, depth) - end - update!(depinfo, before_result.min_depth, depth) - update!(depinfo, after_result.min_depth, depth) + _hoist_loads!(s.before, stack, def_depth, alias_result, true) + _hoist_loads!(s.after, stack, def_depth, alias_result, true) elseif s isa IfOp - _update_from_value!(depinfo, def_depth, s.condition, depth) - then_result = _hoist!(s.then_region, stack, def_depth, false) - else_result = _hoist!(s.else_region, stack, def_depth, false) - update!(depinfo, then_result.min_depth, depth) - update!(depinfo, else_result.min_depth, depth) - for r in (then_result, else_result) - if r.mobility != CAN_MOVE - mobility = min(mobility, r.mobility) - depinfo.must_stay = true - end - end - - elseif _is_memory_store(block, s) - mobility = IMMOVABLE - depinfo.must_stay = true - else - # Pure operation: check operand depths - _update_operand_depths!(depinfo, def_depth, s, depth) - end - - # Determine target depth - target_depth = depth - if depinfo.must_stay - min_depth = max(min_depth, depinfo.max_outside_depth) - else - while target_depth > depinfo.max_outside_depth && stack[target_depth + 1].is_loop_body + _hoist_loads!(s.then_region, stack, def_depth, alias_result, false) + _hoist_loads!(s.else_region, stack, def_depth, alias_result, false) + + elseif is_loop_body && _is_hoistable_load(block, s, def_depth, depth, + alias_result, store_aliases) + # Hoist this load to the enclosing scope + target_depth = depth - 1 + while target_depth > 0 && _can_hoist_to(stack, target_depth) target_depth -= 1 end + push!(stack[target_depth + 1], (inst.ssa_idx, s, inst.typ)) + def_depth[SSAValue(inst.ssa_idx)] = target_depth + hoisted = true end - # Place at target depth - push!(stack[target_depth + 1].entries, (inst.ssa_idx, s, inst.typ)) - - # Record definition depth AFTER hoisting (enables cascading hoists) - def_depth[SSAValue(inst.ssa_idx)] = target_depth - end - - # Handle terminator for mobility - term = block.terminator - if term isa ContinueOp || term isa BreakOp - mobility = min(mobility, CAN_MOVE_WITH_LOOP) + if !hoisted + # Keep at current depth + push!(stack[depth + 1], (inst.ssa_idx, s, inst.typ)) + def_depth[SSAValue(inst.ssa_idx)] = depth + end end # Rebuild block body from collected entries - entries = pop!(stack).entries + entries = pop!(stack) empty!(block) for (idx, s, typ) in entries push!(block, idx, s, typ) end +end - return BlockResult(mobility, min_depth) +# Check if a stack entry is a loop body (for multi-level hoisting) +function _can_hoist_to(stack::Vector{Vector{Tuple{Int,Any,Any}}}, target_depth::Int) + # We'd need to track is_loop_body per stack entry to do multi-level hoisting. + # For now, only hoist one level out. + return false end -# Check if a statement is a memory store (IMMOVABLE for LICM purposes). -# Loads are hoistable (they're pure if operands are invariant). -function _is_memory_store(block::Block, @nospecialize(s)) +# Check if a statement is a load that can be safely hoisted. +function _is_hoistable_load(block::Block, @nospecialize(s), def_depth::Dict{Any,Int}, + cur_depth::Int, alias_result::Dict{Any, AliasSet}, + store_aliases::Vector{AliasSet}) s isa Expr || return false call = resolve_call(block, s) call === nothing && return false - resolved_func, _ = call - effect = classify_memory_op(resolved_func) - return effect == MEM_STORE -end + resolved_func, operands = call -# Update DependencyInfo from a single IR value -function _update_from_value!(di::DependencyInfo, def_depth::Dict{Any,Int}, @nospecialize(val), cur_depth::Int) - d = get(def_depth, val, nothing) - d !== nothing && update!(di, d, cur_depth) + # Must be a load operation + classify_memory_op(resolved_func) == MEM_LOAD || return false + + # All operands must be defined outside this loop + _all_operands_outside(s, def_depth, cur_depth) || return false + + # Load must not alias with any store in the loop + load_alias = get_alias_set_for_operand(alias_result, first(operands)) + return !_aliases_with_store(load_alias, store_aliases) end -# Update DependencyInfo from all operands of a statement -function _update_operand_depths!(di::DependencyInfo, def_depth::Dict{Any,Int}, @nospecialize(s), cur_depth::Int) - if s isa Expr - start = s.head === :invoke ? 3 : 2 - for i in start:length(s.args) - _update_from_value!(di, def_depth, s.args[i], cur_depth) - end - elseif s isa Core.PiNode - _update_from_value!(di, def_depth, s.val, cur_depth) +# Check that all operands of a statement are defined at depth < cur_depth. +function _all_operands_outside(@nospecialize(s), def_depth::Dict{Any,Int}, cur_depth::Int) + s isa Expr || return true + start = s.head === :invoke ? 3 : 2 + for i in start:length(s.args) + d = get(def_depth, s.args[i], nothing) + d === nothing && continue # constants/literals are always available + d >= cur_depth && return false end + return true end diff --git a/src/compiler/passes/pipeline.jl b/src/compiler/passes/pipeline.jl index cac2b18a..13328c27 100644 --- a/src/compiler/passes/pipeline.jl +++ b/src/compiler/passes/pipeline.jl @@ -327,9 +327,10 @@ function run_passes!(sci::StructuredIRCode) constants = propagate_constants(sci) rewrite_patterns!(sci, OPTIMIZATION_RULES; constants) - licm_pass!(sci) - alias_result = alias_analysis_pass!(sci) + + licm_pass!(sci, alias_result) + token_order_pass!(sci, alias_result) dce_pass!(sci) From b0658dff30bbded6503f554e8a6c72d67e4fe7f5 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 6 Apr 2026 11:24:33 +0200 Subject: [PATCH 4/6] Rewrite LICM to hoist all loop-invariant ops after token ordering. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous LICM only targeted loads and ran before token ordering, but failed to hoist anything because load dependencies (make_partition_view, Core.tuple) were always generated inline inside the loop body. The new approach mirrors cuTile Python's code_motion.py: run after token_order_pass! and hoist ALL loop-invariant operations based on data dependencies. Token dependencies naturally prevent unsafe hoisting of loads that alias with stores — no separate alias analysis needed for LICM. This correctly hoists loop-invariant loads and their entire dependency chain (tensor_view → partition_view → load → reshape → broadcast). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/passes/licm.jl | 285 ++++++++++++++++++-------------- src/compiler/passes/pipeline.jl | 4 +- test/codegen/integration.jl | 32 ++++ 3 files changed, 199 insertions(+), 122 deletions(-) diff --git a/src/compiler/passes/licm.jl b/src/compiler/passes/licm.jl index 06426b4e..87800a77 100644 --- a/src/compiler/passes/licm.jl +++ b/src/compiler/passes/licm.jl @@ -1,100 +1,134 @@ # Loop-Invariant Code Motion (LICM) # -# Hoists loop-invariant loads (and their dependency chains) out of loops. +# Hoists loop-invariant operations out of loops. Runs AFTER token_order_pass! +# so that token dependencies correctly prevent unsafe hoisting of aliasing loads. # -# Pure operations (arithmetic, broadcasts, view constructors) are NOT hoisted -# here — MLIR's built-in LICM handles those at optLevel >= 2. +# Operations classified as stores (store_partition_view, store_ptr_tko, atomics, +# print_tko) and control flow exits (return) are never hoisted. All other +# operations — including loads, arithmetic, partition views, token nodes — are +# hoisted when all their data dependencies are defined outside the loop. # -# This pass targets what MLIR cannot hoist: memory loads. After token ordering, -# loads have token dependencies that anchor them inside loops. By hoisting -# before token insertion, we avoid creating unnecessary token carries. -# -# Safety: a load is hoistable only when (1) all its operands are loop-invariant, -# and (2) no store in the loop body aliases with the load's memory region. -# Alias information comes from alias_analysis_pass!, which must run first. +# This mirrors cuTile Python's code_motion.py:hoist_loop_invariants. + +# Indicates whether a block could in theory be moved, based on the operations +# it contains (side effects, jumps). Does not consider data dependencies. +@enum _BlockMobility::Int8 begin + # The block (or any ancestor) cannot be moved due to side effects. + _IMMOVABLE = 0 + # The block itself can't be hoisted alone, but its containing loop can. + # Happens when the block contains Continue or Break. + _CAN_MOVE_WITH_LOOP = 1 + # The block can move (subject to data dependencies). + _CAN_MOVE = 2 +end -""" - licm_pass!(sci::StructuredIRCode, alias_result::Dict{Any, AliasSet}) +struct _BlockResult + mobility::_BlockMobility + min_depth::Int # deepest outside dependency of any hoisted-out op +end -Hoist loop-invariant loads out of loops. Must run after alias_analysis_pass! -and before token_order_pass!. +# Helper for accumulating data dependency information per operation. +mutable struct _DepInfo + must_stay::Bool + max_outside_depth::Int +end -A load is hoistable when: -- All operands are defined outside the loop -- No store in the loop body writes to an aliasing memory region -""" -function licm_pass!(sci::StructuredIRCode, alias_result::Dict{Any, AliasSet}) - def_depth = Dict{Any, Int}() - for i in 1:length(sci.argtypes) - def_depth[Argument(i)] = 0 +function _update_dep!(di::_DepInfo, dep_depth::Int, cur_depth::Int) + if dep_depth >= cur_depth + di.must_stay = true + else + di.max_outside_depth = max(di.max_outside_depth, dep_depth) end - _hoist_loads!(sci.entry, Vector{Vector{Tuple{Int,Any,Any}}}(), def_depth, - alias_result, false) - return end -# Collect alias sets of all stores in a block (recursively through nested CFs). -function _collect_store_aliases(block::Block, alias_result::Dict{Any, AliasSet}) - store_aliases = AliasSet[] - for inst in instructions(block) - s = stmt(inst) - if s isa ControlFlowOp - for b in blocks(s) - append!(store_aliases, _collect_store_aliases(b, alias_result)) - end - else - call = resolve_call(block, s) - call === nothing && continue - resolved_func, operands = call - if classify_memory_op(resolved_func) == MEM_STORE - aset = get_alias_set_for_operand(alias_result, first(operands)) - push!(store_aliases, aset) - end - end - end - return store_aliases +# Update dependency info from an SSA value or literal. +function _check_val!(di::_DepInfo, val, def_depth::Dict{Any,Int}, cur_depth::Int) + d = get(def_depth, val, nothing) + d === nothing && return # constants/literals always available + _update_dep!(di, d, cur_depth) end -# Check if a load's alias set conflicts with any store alias set in the loop. -function _aliases_with_store(load_alias::AliasSet, store_aliases::Vector{AliasSet}) - for sa in store_aliases - if load_alias isa AliasUniverse || sa isa AliasUniverse - return true +# Extract all SSA dependencies from a statement. +function _check_stmt_deps!(di::_DepInfo, @nospecialize(s), def_depth::Dict{Any,Int}, + cur_depth::Int) + if s isa Expr + start = s.head === :invoke ? 3 : 2 + for i in start:length(s.args) + _check_val!(di, s.args[i], def_depth, cur_depth) end - if !isempty(intersect(load_alias, sa)) - return true + elseif s isa JoinTokensNode + for tok in s.tokens + _check_val!(di, tok, def_depth, cur_depth) end + elseif s isa TokenResultNode + _check_val!(di, SSAValue(s.mem_op_ssa), def_depth, cur_depth) + end + # MakeTokenNode, PiNode, GlobalRef, literals: no SSA deps +end + +struct _StackItem + entries::Vector{Tuple{Int,Any,Any}} # (ssa_idx, stmt, type) + is_loop_body::Bool +end + +""" + licm_pass!(sci::StructuredIRCode) + +Hoist loop-invariant operations out of loops. Must run after token_order_pass!. +""" +function licm_pass!(sci::StructuredIRCode) + def_depth = Dict{Any,Int}() + for i in 1:length(sci.argtypes) + def_depth[Argument(i)] = 0 end - return false + _hoist!(sci.entry, _StackItem[], def_depth, false) + return end -function _hoist_loads!(block::Block, stack::Vector{Vector{Tuple{Int,Any,Any}}}, - def_depth::Dict{Any,Int}, alias_result::Dict{Any, AliasSet}, - is_loop_body::Bool) +function _hoist!(block::Block, stack::Vector{_StackItem}, def_depth::Dict{Any,Int}, + is_loop_body::Bool) depth = length(stack) - push!(stack, Tuple{Int,Any,Any}[]) + push!(stack, _StackItem(Tuple{Int,Any,Any}[], is_loop_body)) - # Register block args at current depth for ba in block.args def_depth[ba] = depth end - # If this is a loop body, collect store alias sets for the load safety check - store_aliases = is_loop_body ? _collect_store_aliases(block, alias_result) : AliasSet[] + mobility = _CAN_MOVE + min_depth = 0 for inst in instructions(block) s = stmt(inst) - hoisted = false + di = _DepInfo(!is_loop_body, 0) - if s isa ForOp || s isa LoopOp - body = s.body - if s isa ForOp - def_depth[s.iv_arg] = depth + 1 + if s isa ForOp + def_depth[s.iv_arg] = depth + 1 + for ba in s.body.args + def_depth[ba] = depth + 1 + end + body_res = _hoist!(s.body, stack, def_depth, true) + if body_res.mobility == _IMMOVABLE + mobility = _IMMOVABLE + di.must_stay = true + end + for v in s.init_values + _check_val!(di, v, def_depth, depth) end - for ba in body.args + _update_dep!(di, body_res.min_depth, depth) + + elseif s isa LoopOp + for ba in s.body.args def_depth[ba] = depth + 1 end - _hoist_loads!(body, stack, def_depth, alias_result, true) + body_res = _hoist!(s.body, stack, def_depth, true) + if body_res.mobility == _IMMOVABLE + mobility = _IMMOVABLE + di.must_stay = true + end + for v in s.init_values + _check_val!(di, v, def_depth, depth) + end + _update_dep!(di, body_res.min_depth, depth) elseif s isa WhileOp for ba in s.before.args @@ -103,75 +137,86 @@ function _hoist_loads!(block::Block, stack::Vector{Vector{Tuple{Int,Any,Any}}}, for ba in s.after.args def_depth[ba] = depth + 1 end - _hoist_loads!(s.before, stack, def_depth, alias_result, true) - _hoist_loads!(s.after, stack, def_depth, alias_result, true) + before_res = _hoist!(s.before, stack, def_depth, true) + after_res = _hoist!(s.after, stack, def_depth, true) + if min(before_res.mobility, after_res.mobility) == _IMMOVABLE + mobility = _IMMOVABLE + di.must_stay = true + end + for v in s.init_values + _check_val!(di, v, def_depth, depth) + end + _update_dep!(di, before_res.min_depth, depth) + _update_dep!(di, after_res.min_depth, depth) elseif s isa IfOp - _hoist_loads!(s.then_region, stack, def_depth, alias_result, false) - _hoist_loads!(s.else_region, stack, def_depth, alias_result, false) - - elseif is_loop_body && _is_hoistable_load(block, s, def_depth, depth, - alias_result, store_aliases) - # Hoist this load to the enclosing scope - target_depth = depth - 1 - while target_depth > 0 && _can_hoist_to(stack, target_depth) - target_depth -= 1 + _check_val!(di, s.condition, def_depth, depth) + for region in (s.then_region, s.else_region) + branch_res = _hoist!(region, stack, def_depth, false) + _update_dep!(di, branch_res.min_depth, depth) + if branch_res.mobility != _CAN_MOVE + mobility = min(mobility, branch_res.mobility) + di.must_stay = true + end end - push!(stack[target_depth + 1], (inst.ssa_idx, s, inst.typ)) - def_depth[SSAValue(inst.ssa_idx)] = target_depth - hoisted = true + + elseif _is_store(block, s) + mobility = _IMMOVABLE + di.must_stay = true + + elseif s isa ContinueOp || s isa BreakOp + mobility = min(mobility, _CAN_MOVE_WITH_LOOP) + di.must_stay = true + + elseif s isa YieldOp || s isa ConditionOp || s isa ReturnNode + di.must_stay = true + # Track deps for YieldOp/ConditionOp so min_depth is correct + if s isa YieldOp + for v in s.values + _check_val!(di, v, def_depth, depth) + end + elseif s isa ConditionOp + _check_val!(di, s.condition, def_depth, depth) + for v in s.args + _check_val!(di, v, def_depth, depth) + end + end + + else + # Movable operation: loads, arithmetic, make_partition_view, etc. + _check_stmt_deps!(di, s, def_depth, depth) end - if !hoisted - # Keep at current depth - push!(stack[depth + 1], (inst.ssa_idx, s, inst.typ)) - def_depth[SSAValue(inst.ssa_idx)] = depth + # Determine target depth + target_depth = depth + if di.must_stay + min_depth = max(min_depth, di.max_outside_depth) + else + while target_depth > di.max_outside_depth && stack[target_depth].is_loop_body + target_depth -= 1 + end end + + push!(stack[target_depth + 1].entries, (inst.ssa_idx, s, inst.typ)) + + # Record definition depth AFTER hoisting so subsequent ops see the new depth + def_depth[SSAValue(inst.ssa_idx)] = target_depth end # Rebuild block body from collected entries - entries = pop!(stack) + entries = pop!(stack).entries empty!(block) for (idx, s, typ) in entries push!(block, idx, s, typ) end -end -# Check if a stack entry is a loop body (for multi-level hoisting) -function _can_hoist_to(stack::Vector{Vector{Tuple{Int,Any,Any}}}, target_depth::Int) - # We'd need to track is_loop_body per stack entry to do multi-level hoisting. - # For now, only hoist one level out. - return false + return _BlockResult(mobility, min_depth) end -# Check if a statement is a load that can be safely hoisted. -function _is_hoistable_load(block::Block, @nospecialize(s), def_depth::Dict{Any,Int}, - cur_depth::Int, alias_result::Dict{Any, AliasSet}, - store_aliases::Vector{AliasSet}) - s isa Expr || return false +# Check if a statement is a store/atomic (side-effecting memory write). +function _is_store(block::Block, @nospecialize(s)) call = resolve_call(block, s) call === nothing && return false - resolved_func, operands = call - - # Must be a load operation - classify_memory_op(resolved_func) == MEM_LOAD || return false - - # All operands must be defined outside this loop - _all_operands_outside(s, def_depth, cur_depth) || return false - - # Load must not alias with any store in the loop - load_alias = get_alias_set_for_operand(alias_result, first(operands)) - return !_aliases_with_store(load_alias, store_aliases) -end - -# Check that all operands of a statement are defined at depth < cur_depth. -function _all_operands_outside(@nospecialize(s), def_depth::Dict{Any,Int}, cur_depth::Int) - s isa Expr || return true - start = s.head === :invoke ? 3 : 2 - for i in start:length(s.args) - d = get(def_depth, s.args[i], nothing) - d === nothing && continue # constants/literals are always available - d >= cur_depth && return false - end - return true + resolved_func, _ = call + return classify_memory_op(resolved_func) == MEM_STORE end diff --git a/src/compiler/passes/pipeline.jl b/src/compiler/passes/pipeline.jl index 13328c27..f0c3a885 100644 --- a/src/compiler/passes/pipeline.jl +++ b/src/compiler/passes/pipeline.jl @@ -329,9 +329,9 @@ function run_passes!(sci::StructuredIRCode) alias_result = alias_analysis_pass!(sci) - licm_pass!(sci, alias_result) - token_order_pass!(sci, alias_result) + licm_pass!(sci) + dce_pass!(sci) end diff --git a/test/codegen/integration.jl b/test/codegen/integration.jl index 2b66bfaf..c25baf2f 100644 --- a/test/codegen/integration.jl +++ b/test/codegen/integration.jl @@ -420,6 +420,38 @@ end end end + @testset "loop-invariant load (manually hoisted)" begin + # Test that a manually-hoisted loop-invariant load appears before the loop. + # Pattern: Y[n, m] = X[n, m] * W[m], iterating over N-tiles. + # W[bid_m] doesn't depend on the loop variable, so the user hoists it. + spec2d = ct.ArraySpec{2}(16, true) + spec1d = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + # W load must appear BEFORE the for loop + @check "load_view_tko" + @check "for %loopIdx in" + # Inside the loop: only the X load + @check "load_view_tko" + @check "mulf" + @check "store_view_tko" + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,1,spec1d}, + ct.TileArray{Float32,2,spec2d}, ct.Constant{Int,1024}}) do X, W, Y, TILE_N + bid_m = ct.bid(1) + num_tiles = ct.num_tiles(X, 1, (TILE_N, 1)) + # Hoisted: W load before loop + w = ct.load(W; index=bid_m, shape=(1,)) + for j in Int32(1):num_tiles + x = ct.load(X; index=(j, bid_m), shape=(TILE_N, 1), + padding_mode=ct.PaddingMode.Zero) + y = x .* w + ct.store(Y; index=(j, bid_m), tile=y) + end + return + end + end + end + #========================================================================= Gather/Scatter Operations =========================================================================# From c927a47dac0c2521d082a9e4b34ffa9cf168b6c8 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 6 Apr 2026 11:31:22 +0200 Subject: [PATCH 5/6] Drop underscore prefixes from internal LICM names. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/passes/licm.jl | 102 ++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/src/compiler/passes/licm.jl b/src/compiler/passes/licm.jl index 87800a77..9ff3eb64 100644 --- a/src/compiler/passes/licm.jl +++ b/src/compiler/passes/licm.jl @@ -12,28 +12,28 @@ # Indicates whether a block could in theory be moved, based on the operations # it contains (side effects, jumps). Does not consider data dependencies. -@enum _BlockMobility::Int8 begin +@enum BlockMobility::Int8 begin # The block (or any ancestor) cannot be moved due to side effects. - _IMMOVABLE = 0 + IMMOVABLE = 0 # The block itself can't be hoisted alone, but its containing loop can. # Happens when the block contains Continue or Break. - _CAN_MOVE_WITH_LOOP = 1 + CAN_MOVE_WITH_LOOP = 1 # The block can move (subject to data dependencies). - _CAN_MOVE = 2 + CAN_MOVE = 2 end -struct _BlockResult - mobility::_BlockMobility +struct BlockResult + mobility::BlockMobility min_depth::Int # deepest outside dependency of any hoisted-out op end # Helper for accumulating data dependency information per operation. -mutable struct _DepInfo +mutable struct DepInfo must_stay::Bool max_outside_depth::Int end -function _update_dep!(di::_DepInfo, dep_depth::Int, cur_depth::Int) +function update_dep!(di::DepInfo, dep_depth::Int, cur_depth::Int) if dep_depth >= cur_depth di.must_stay = true else @@ -42,31 +42,31 @@ function _update_dep!(di::_DepInfo, dep_depth::Int, cur_depth::Int) end # Update dependency info from an SSA value or literal. -function _check_val!(di::_DepInfo, val, def_depth::Dict{Any,Int}, cur_depth::Int) +function check_val!(di::DepInfo, val, def_depth::Dict{Any,Int}, cur_depth::Int) d = get(def_depth, val, nothing) d === nothing && return # constants/literals always available - _update_dep!(di, d, cur_depth) + update_dep!(di, d, cur_depth) end # Extract all SSA dependencies from a statement. -function _check_stmt_deps!(di::_DepInfo, @nospecialize(s), def_depth::Dict{Any,Int}, - cur_depth::Int) +function check_stmt_deps!(di::DepInfo, @nospecialize(s), def_depth::Dict{Any,Int}, + cur_depth::Int) if s isa Expr start = s.head === :invoke ? 3 : 2 for i in start:length(s.args) - _check_val!(di, s.args[i], def_depth, cur_depth) + check_val!(di, s.args[i], def_depth, cur_depth) end elseif s isa JoinTokensNode for tok in s.tokens - _check_val!(di, tok, def_depth, cur_depth) + check_val!(di, tok, def_depth, cur_depth) end elseif s isa TokenResultNode - _check_val!(di, SSAValue(s.mem_op_ssa), def_depth, cur_depth) + check_val!(di, SSAValue(s.mem_op_ssa), def_depth, cur_depth) end # MakeTokenNode, PiNode, GlobalRef, literals: no SSA deps end -struct _StackItem +struct StackItem entries::Vector{Tuple{Int,Any,Any}} # (ssa_idx, stmt, type) is_loop_body::Bool end @@ -81,54 +81,54 @@ function licm_pass!(sci::StructuredIRCode) for i in 1:length(sci.argtypes) def_depth[Argument(i)] = 0 end - _hoist!(sci.entry, _StackItem[], def_depth, false) + hoist!(sci.entry, StackItem[], def_depth, false) return end -function _hoist!(block::Block, stack::Vector{_StackItem}, def_depth::Dict{Any,Int}, +function hoist!(block::Block, stack::Vector{StackItem}, def_depth::Dict{Any,Int}, is_loop_body::Bool) depth = length(stack) - push!(stack, _StackItem(Tuple{Int,Any,Any}[], is_loop_body)) + push!(stack, StackItem(Tuple{Int,Any,Any}[], is_loop_body)) for ba in block.args def_depth[ba] = depth end - mobility = _CAN_MOVE + mobility = CAN_MOVE min_depth = 0 for inst in instructions(block) s = stmt(inst) - di = _DepInfo(!is_loop_body, 0) + di = DepInfo(!is_loop_body, 0) if s isa ForOp def_depth[s.iv_arg] = depth + 1 for ba in s.body.args def_depth[ba] = depth + 1 end - body_res = _hoist!(s.body, stack, def_depth, true) - if body_res.mobility == _IMMOVABLE - mobility = _IMMOVABLE + body_res = hoist!(s.body, stack, def_depth, true) + if body_res.mobility == IMMOVABLE + mobility = IMMOVABLE di.must_stay = true end for v in s.init_values - _check_val!(di, v, def_depth, depth) + check_val!(di, v, def_depth, depth) end - _update_dep!(di, body_res.min_depth, depth) + update_dep!(di, body_res.min_depth, depth) elseif s isa LoopOp for ba in s.body.args def_depth[ba] = depth + 1 end - body_res = _hoist!(s.body, stack, def_depth, true) - if body_res.mobility == _IMMOVABLE - mobility = _IMMOVABLE + body_res = hoist!(s.body, stack, def_depth, true) + if body_res.mobility == IMMOVABLE + mobility = IMMOVABLE di.must_stay = true end for v in s.init_values - _check_val!(di, v, def_depth, depth) + check_val!(di, v, def_depth, depth) end - _update_dep!(di, body_res.min_depth, depth) + update_dep!(di, body_res.min_depth, depth) elseif s isa WhileOp for ba in s.before.args @@ -137,35 +137,35 @@ function _hoist!(block::Block, stack::Vector{_StackItem}, def_depth::Dict{Any,In for ba in s.after.args def_depth[ba] = depth + 1 end - before_res = _hoist!(s.before, stack, def_depth, true) - after_res = _hoist!(s.after, stack, def_depth, true) - if min(before_res.mobility, after_res.mobility) == _IMMOVABLE - mobility = _IMMOVABLE + before_res = hoist!(s.before, stack, def_depth, true) + after_res = hoist!(s.after, stack, def_depth, true) + if min(before_res.mobility, after_res.mobility) == IMMOVABLE + mobility = IMMOVABLE di.must_stay = true end for v in s.init_values - _check_val!(di, v, def_depth, depth) + check_val!(di, v, def_depth, depth) end - _update_dep!(di, before_res.min_depth, depth) - _update_dep!(di, after_res.min_depth, depth) + update_dep!(di, before_res.min_depth, depth) + update_dep!(di, after_res.min_depth, depth) elseif s isa IfOp - _check_val!(di, s.condition, def_depth, depth) + check_val!(di, s.condition, def_depth, depth) for region in (s.then_region, s.else_region) - branch_res = _hoist!(region, stack, def_depth, false) - _update_dep!(di, branch_res.min_depth, depth) - if branch_res.mobility != _CAN_MOVE + branch_res = hoist!(region, stack, def_depth, false) + update_dep!(di, branch_res.min_depth, depth) + if branch_res.mobility != CAN_MOVE mobility = min(mobility, branch_res.mobility) di.must_stay = true end end - elseif _is_store(block, s) - mobility = _IMMOVABLE + elseif is_store(block, s) + mobility = IMMOVABLE di.must_stay = true elseif s isa ContinueOp || s isa BreakOp - mobility = min(mobility, _CAN_MOVE_WITH_LOOP) + mobility = min(mobility, CAN_MOVE_WITH_LOOP) di.must_stay = true elseif s isa YieldOp || s isa ConditionOp || s isa ReturnNode @@ -173,18 +173,18 @@ function _hoist!(block::Block, stack::Vector{_StackItem}, def_depth::Dict{Any,In # Track deps for YieldOp/ConditionOp so min_depth is correct if s isa YieldOp for v in s.values - _check_val!(di, v, def_depth, depth) + check_val!(di, v, def_depth, depth) end elseif s isa ConditionOp - _check_val!(di, s.condition, def_depth, depth) + check_val!(di, s.condition, def_depth, depth) for v in s.args - _check_val!(di, v, def_depth, depth) + check_val!(di, v, def_depth, depth) end end else # Movable operation: loads, arithmetic, make_partition_view, etc. - _check_stmt_deps!(di, s, def_depth, depth) + check_stmt_deps!(di, s, def_depth, depth) end # Determine target depth @@ -210,11 +210,11 @@ function _hoist!(block::Block, stack::Vector{_StackItem}, def_depth::Dict{Any,In push!(block, idx, s, typ) end - return _BlockResult(mobility, min_depth) + return BlockResult(mobility, min_depth) end # Check if a statement is a store/atomic (side-effecting memory write). -function _is_store(block::Block, @nospecialize(s)) +function is_store(block::Block, @nospecialize(s)) call = resolve_call(block, s) call === nothing && return false resolved_func, _ = call From 313f1b5cdcd7eab213d51986ee40a8987b59d55b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 6 Apr 2026 12:38:01 +0200 Subject: [PATCH 6/6] Use IRStructurizer's code motion primitives for LICM. Rewrite LICM from a 200-line stack-based depth-tracking algorithm to a simple fixpoint loop using IRStructurizer's is_defined_outside, move_before!, and operands. Processes innermost loops first (post-order), repeatedly hoisting ops whose operands are all defined outside the loop. Co-Authored-By: Claude Opus 4.6 (1M context) --- Project.toml | 2 +- src/compiler/passes/licm.jl | 216 ++++++------------------------------ src/compiler/utils.jl | 5 + src/cuTile.jl | 3 +- 4 files changed, 42 insertions(+), 184 deletions(-) diff --git a/Project.toml b/Project.toml index c9d425ce..26bcd471 100644 --- a/Project.toml +++ b/Project.toml @@ -30,5 +30,5 @@ CUDA_Tile_jll = "13.1" CompilerCaching = "0.2" EnumX = "1.0" GPUArrays = "11" -IRStructurizer = "0.5.1" +IRStructurizer = "0.5.3" julia = "1.11" diff --git a/src/compiler/passes/licm.jl b/src/compiler/passes/licm.jl index 9ff3eb64..5a0af06a 100644 --- a/src/compiler/passes/licm.jl +++ b/src/compiler/passes/licm.jl @@ -8,209 +8,61 @@ # operations — including loads, arithmetic, partition views, token nodes — are # hoisted when all their data dependencies are defined outside the loop. # +# Uses IRStructurizer's `is_defined_outside`, `move_before!`, and `operands` +# primitives. Processes innermost loops first and repeats until fixpoint. +# # This mirrors cuTile Python's code_motion.py:hoist_loop_invariants. -# Indicates whether a block could in theory be moved, based on the operations -# it contains (side effects, jumps). Does not consider data dependencies. -@enum BlockMobility::Int8 begin - # The block (or any ancestor) cannot be moved due to side effects. - IMMOVABLE = 0 - # The block itself can't be hoisted alone, but its containing loop can. - # Happens when the block contains Continue or Break. - CAN_MOVE_WITH_LOOP = 1 - # The block can move (subject to data dependencies). - CAN_MOVE = 2 -end - -struct BlockResult - mobility::BlockMobility - min_depth::Int # deepest outside dependency of any hoisted-out op -end - -# Helper for accumulating data dependency information per operation. -mutable struct DepInfo - must_stay::Bool - max_outside_depth::Int -end - -function update_dep!(di::DepInfo, dep_depth::Int, cur_depth::Int) - if dep_depth >= cur_depth - di.must_stay = true - else - di.max_outside_depth = max(di.max_outside_depth, dep_depth) - end -end - -# Update dependency info from an SSA value or literal. -function check_val!(di::DepInfo, val, def_depth::Dict{Any,Int}, cur_depth::Int) - d = get(def_depth, val, nothing) - d === nothing && return # constants/literals always available - update_dep!(di, d, cur_depth) -end - -# Extract all SSA dependencies from a statement. -function check_stmt_deps!(di::DepInfo, @nospecialize(s), def_depth::Dict{Any,Int}, - cur_depth::Int) - if s isa Expr - start = s.head === :invoke ? 3 : 2 - for i in start:length(s.args) - check_val!(di, s.args[i], def_depth, cur_depth) - end - elseif s isa JoinTokensNode - for tok in s.tokens - check_val!(di, tok, def_depth, cur_depth) - end - elseif s isa TokenResultNode - check_val!(di, SSAValue(s.mem_op_ssa), def_depth, cur_depth) - end - # MakeTokenNode, PiNode, GlobalRef, literals: no SSA deps -end - -struct StackItem - entries::Vector{Tuple{Int,Any,Any}} # (ssa_idx, stmt, type) - is_loop_body::Bool -end - """ licm_pass!(sci::StructuredIRCode) Hoist loop-invariant operations out of loops. Must run after token_order_pass!. """ function licm_pass!(sci::StructuredIRCode) - def_depth = Dict{Any,Int}() - for i in 1:length(sci.argtypes) - def_depth[Argument(i)] = 0 + for (loop_inst, loop_op) in collect_loops(sci.entry) + hoist_from_loop!(loop_inst, loop_op) end - hoist!(sci.entry, StackItem[], def_depth, false) - return end -function hoist!(block::Block, stack::Vector{StackItem}, def_depth::Dict{Any,Int}, - is_loop_body::Bool) - depth = length(stack) - push!(stack, StackItem(Tuple{Int,Any,Any}[], is_loop_body)) - - for ba in block.args - def_depth[ba] = depth - end - - mobility = CAN_MOVE - min_depth = 0 +# Collect (instruction, loop_op) pairs in post-order (innermost first). +function collect_loops(root::Block) + result = Tuple{Instruction, Union{ForOp, LoopOp, WhileOp}}[] + collect_loops!(result, root) + return result +end +function collect_loops!(result, block::Block) for inst in instructions(block) s = stmt(inst) - di = DepInfo(!is_loop_body, 0) - - if s isa ForOp - def_depth[s.iv_arg] = depth + 1 - for ba in s.body.args - def_depth[ba] = depth + 1 - end - body_res = hoist!(s.body, stack, def_depth, true) - if body_res.mobility == IMMOVABLE - mobility = IMMOVABLE - di.must_stay = true - end - for v in s.init_values - check_val!(di, v, def_depth, depth) - end - update_dep!(di, body_res.min_depth, depth) - - elseif s isa LoopOp - for ba in s.body.args - def_depth[ba] = depth + 1 - end - body_res = hoist!(s.body, stack, def_depth, true) - if body_res.mobility == IMMOVABLE - mobility = IMMOVABLE - di.must_stay = true - end - for v in s.init_values - check_val!(di, v, def_depth, depth) - end - update_dep!(di, body_res.min_depth, depth) - + if s isa ForOp || s isa LoopOp + collect_loops!(result, s.body) + push!(result, (inst, s)) elseif s isa WhileOp - for ba in s.before.args - def_depth[ba] = depth + 1 - end - for ba in s.after.args - def_depth[ba] = depth + 1 - end - before_res = hoist!(s.before, stack, def_depth, true) - after_res = hoist!(s.after, stack, def_depth, true) - if min(before_res.mobility, after_res.mobility) == IMMOVABLE - mobility = IMMOVABLE - di.must_stay = true - end - for v in s.init_values - check_val!(di, v, def_depth, depth) - end - update_dep!(di, before_res.min_depth, depth) - update_dep!(di, after_res.min_depth, depth) - - elseif s isa IfOp - check_val!(di, s.condition, def_depth, depth) - for region in (s.then_region, s.else_region) - branch_res = hoist!(region, stack, def_depth, false) - update_dep!(di, branch_res.min_depth, depth) - if branch_res.mobility != CAN_MOVE - mobility = min(mobility, branch_res.mobility) - di.must_stay = true - end + collect_loops!(result, s.before) + collect_loops!(result, s.after) + push!(result, (inst, s)) + elseif s isa ControlFlowOp + for b in blocks(s) + collect_loops!(result, b) end - - elseif is_store(block, s) - mobility = IMMOVABLE - di.must_stay = true - - elseif s isa ContinueOp || s isa BreakOp - mobility = min(mobility, CAN_MOVE_WITH_LOOP) - di.must_stay = true - - elseif s isa YieldOp || s isa ConditionOp || s isa ReturnNode - di.must_stay = true - # Track deps for YieldOp/ConditionOp so min_depth is correct - if s isa YieldOp - for v in s.values - check_val!(di, v, def_depth, depth) - end - elseif s isa ConditionOp - check_val!(di, s.condition, def_depth, depth) - for v in s.args - check_val!(di, v, def_depth, depth) - end - end - - else - # Movable operation: loads, arithmetic, make_partition_view, etc. - check_stmt_deps!(di, s, def_depth, depth) end + end +end - # Determine target depth - target_depth = depth - if di.must_stay - min_depth = max(min_depth, di.max_outside_depth) - else - while target_depth > di.max_outside_depth && stack[target_depth].is_loop_body - target_depth -= 1 +function hoist_from_loop!(loop_inst::Instruction, loop_op) + changed = true + while changed + changed = false + for body in blocks(loop_op) + for inst in collect(instructions(body)) + stmt(inst) isa ControlFlowOp && continue + is_store(body, stmt(inst)) && continue + all(v -> is_defined_outside(v, loop_op), operands(body, inst)) || continue + move_before!(inst, loop_inst) + changed = true end end - - push!(stack[target_depth + 1].entries, (inst.ssa_idx, s, inst.typ)) - - # Record definition depth AFTER hoisting so subsequent ops see the new depth - def_depth[SSAValue(inst.ssa_idx)] = target_depth end - - # Rebuild block body from collected entries - entries = pop!(stack).entries - empty!(block) - for (idx, s, typ) in entries - push!(block, idx, s, typ) - end - - return BlockResult(mobility, min_depth) end # Check if a statement is a store/atomic (side-effecting memory write). diff --git a/src/compiler/utils.jl b/src/compiler/utils.jl index fb84a0ac..e57b8204 100644 --- a/src/compiler/utils.jl +++ b/src/compiler/utils.jl @@ -81,6 +81,11 @@ IRStructurizer.walk_uses!(f, node::JoinTokensNode) = IRStructurizer.walk_uses!(f, ::TokenResultNode) = nothing IRStructurizer.walk_uses!(f, ::MakeTokenNode) = nothing +# operands extensions for cuTile-specific IR nodes. +operands(::Block, s::JoinTokensNode) = s.tokens +operands(::Block, s::TokenResultNode) = Any[SSAValue(s.mem_op_ssa)] +operands(::Block, ::MakeTokenNode) = Any[] + """ is_token_type(typ) -> Bool diff --git a/src/cuTile.jl b/src/cuTile.jl index 1fbab4bc..960c1b56 100644 --- a/src/cuTile.jl +++ b/src/cuTile.jl @@ -4,7 +4,8 @@ using IRStructurizer using IRStructurizer: Block, ControlFlowOp, BlockArgument, YieldOp, ContinueOp, BreakOp, ConditionOp, IfOp, ForOp, WhileOp, LoopOp, Undef, - SourceLocation, source_location + SourceLocation +import IRStructurizer: operands using Base: compilerbarrier, donotdelete using Core: MethodInstance, CodeInfo, SSAValue, Argument, SlotNumber,