Fix attn_metadata bug: populate forward-context attn_metadata before graph recording in ChunkPrefillCompiler

Simon12345777 · Simon12345777 · commit c596fff4b8bd · 2026-05-15T12:05:29.000Z
diff --git a/csrc/engine/compiler/chunk_prefill_compiler.cpp b/csrc/engine/compiler/chunk_prefill_compiler.cpp
@@ -1,4 +1,5 @@
 #include "chunk_prefill_compiler.hpp"
+#include "../../global_state/global_state.hpp"
 #include "infinicore/context/context.hpp"
 
 
@@ -121,6 +122,16 @@ void ChunkPrefillCompiler::compile() {
                     {total_tokens}, infinicore::DataType::I64, infinicore::context::getDevice());
                 set_zeros(input.slot_mapping.value());
 
+                // Attention reads attn_metadata from the thread-local forward context, so populate it before graph recording starts.
+                infinilm::global_state::get_forward_context().attn_metadata = {
+                    input.past_sequence_lengths,
+                    input.total_sequence_lengths,
+                    input.input_offsets,
+                    input.cu_seqlens,
+                    input.block_tables,
+                    input.slot_mapping,
+                };
+
                 barrier_->wait();
                 infinicore::context::startGraphRecording();
                 auto output = model_->forward(input);
diff --git a/scripts/test_perf.py b/scripts/test_perf.py
@@ -29,7 +29,7 @@
 
 NUM_REQUESTS = 64
 CONCURRENCY = 20
-API_URL = "http://127.0.0.1:8000"
+API_URL = "http://127.0.0.1:3456"
 MODEL = "FM9G-7B"