issue/224 - feat: add --warmup flag and disable warmup by default

spike-zhu · spike-zhu · commit a247d405ec0c · 2026-02-12T11:43:32.000+08:00
diff --git a/examples/jiuge.py b/examples/jiuge.py
@@ -131,6 +131,11 @@ def get_args():
         default=1.0,
         help="sampling temperature",
     )
+    parser.add_argument(
+        "--warmup",
+        action="store_true",
+        help="Perform a warmup run before benchmarking/inference."
+    )
 
     return parser.parse_args()
 
@@ -239,39 +244,40 @@ def test(
     # ---------------------------------------------------------------------------- #
     #                                Warmup
     # ---------------------------------------------------------------------------- #
-    warmup_steps = 1
-
-    # Choose a length that approximates the real workload.
-    # It should be long enough to trigger the correct kernel paths,
-    # but not so long that warmup becomes unnecessarily expensive.
-    avg_prompt_len = min(64, max(len(ids) for ids in input_ids_list))
-
-    # Use truncated versions of real prompts for warmup
-    warmup_ids = [
-        ids[:avg_prompt_len] if len(ids) >= avg_prompt_len else ids
-        for ids in input_ids_list
-    ]
+    if args.warmup:
+        warmup_steps = 1
+
+        # Choose a length that approximates the real workload.
+        # It should be long enough to trigger the correct kernel paths,
+        # but not so long that warmup becomes unnecessarily expensive.
+        avg_prompt_len = min(64, max(len(ids) for ids in input_ids_list))
+
+        # Use truncated versions of real prompts for warmup
+        warmup_ids = [
+            ids[:avg_prompt_len] if len(ids) >= avg_prompt_len else ids
+            for ids in input_ids_list
+        ]
 
-    input_ids_infini = infinicore.from_list(warmup_ids)
+        input_ids_infini = infinicore.from_list(warmup_ids)
 
-    print("=================== warmup start ===================")
+        print("=================== warmup start ===================")
 
-    for _ in range(warmup_steps):
-        _ = model.generate(
-            input_ids_infini,
-            GenerationConfig(
-                max_new_tokens=2,  # warmup decode kernel
-                temperature=1,
-                top_k=1,
-                top_p=0.8,
-            ),
-            _measure_and_log_time=False,
-        )
+        for _ in range(warmup_steps):
+            _ = model.generate(
+                input_ids_infini,
+                GenerationConfig(
+                    max_new_tokens=2,  # warmup decode kernel
+                    temperature=temperature,
+                    top_k=top_k,
+                    top_p=top_p,
+                ),
+                _measure_and_log_time=False,
+            )
 
-    print("=================== warmup done ====================")
+        print("=================== warmup done ====================")
 
-    # Reset KV cache 
-    model.reset_cache(cache_config)
+        # Reset KV cache after the warmup run
+        model.reset_cache(cache_config)
 
     # ---------------------------------------------------------------------------- #
     #                        Generate