Clean up comments, amend lighteval

Oseltamivir · Oseltamivir · commit 1d889b8d75d3 · 2025-12-06T11:25:50.000+08:00
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
@@ -160,7 +160,6 @@ run_benchmark_serving() {
 
 _install_lm_eval_deps() {
     python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true
-    # Temporary: workaround issue by using main
     python3 -m pip install -q --no-cache-dir --no-deps \
         "git+https://github.com/EleutherAI/lm-evaluation-harness.git@b315ef3b05176acc9732bb7fdec116abe1ecc476" || true
 }
@@ -356,7 +355,7 @@ META
 # ------------------------------
 
 _install_lighteval_deps() {
-    python3 -m pip install -q --no-cache-dir "lighteval[api]" "litellm" || true
+    python3 -m pip install -q --no-cache-dir "lighteval==0.13.0" "litellm==1.80.7" || true
 }
 
 # Patch lighteval's LiteLLMClient to handle reasoning content and Python name mangling
@@ -615,7 +614,7 @@ run_lighteval_eval() {
     local base_url="http://0.0.0.0:${port}/v1"
     export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}"
 
-    local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,top_p=1,max_new_tokens:2048},concurrent_requests=${concurrent_requests}"
+    local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,top_p:1,max_new_tokens:2048},concurrent_requests=${concurrent_requests}"
     local TASK_SPEC="${task}|${num_fewshot}"
 
     # Respect absolute paths (e.g., /tmp/eval_out); otherwise write under /workspace
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -39,8 +39,6 @@ export TORCH_CUDA_ARCH_LIST="9.0"
 PORT=$(( 8888 + $PORT_OFFSET ))
 MODEL_NAME=${MODEL##*/}
 
-export TORCH_CUDA_ARCH_LIST="9.0"
-
 PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
  --config config.yaml \
  --gpu-memory-utilization 0.9 \
diff --git a/utils/evals/custom_gsm8k.py b/utils/evals/custom_gsm8k.py
@@ -13,7 +13,7 @@
     evaluation_splits=["test"],
     few_shots_split=None,
     few_shots_select="random_sampling_from_train",
-    generation_size=768,         # raised this from 256
+    generation_size=1024,         # raised this from 256
     metrics=[Metrics.expr_gold_metric],
     stop_sequence=None,           # avoid early stop on "Question:"
     version=0,