Resolve issues/nits

Oseltamivir · Oseltamivir · commit 5ec3378de2a8 · 2025-12-04T13:14:30.000+08:00
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
@@ -162,7 +162,7 @@ _install_lm_eval_deps() {
     python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true
     # Temporary: workaround issue by using main
     python3 -m pip install -q --no-cache-dir --no-deps \
-        "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true
+        "git+https://github.com/EleutherAI/lm-evaluation-harness.git@b315ef3b05176acc9732bb7fdec116abe1ecc476" || true
 }
 
 # Patch lm-eval filters to be robust to empty strings via sitecustomize
@@ -450,11 +450,6 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples
             if (not content) and reasoning:
                 return response
 
-            if not content and LITELLM_CACHE:
-                logger.info("Empty content with caching on; retrying uncached once")
-                kwargs["caching"] = False
-                response = litellm.completion(**kwargs)
-
             return response
         except litellm.BadRequestError as e:
             if "message" in e.__dict__ and "policy" in e.__dict__["message"]:
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -53,7 +53,7 @@ run_benchmark_serving \
     --input-len "$ISL" \
     --output-len "$OSL" \
     --random-range-ratio "$RANDOM_RANGE_RATIO" \
-    --num-prompts $(( $CONC * 1 )) \
+    --num-prompts $(( $CONC * 10 )) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
     --result-dir /workspace/
diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh
@@ -12,7 +12,6 @@ docker run --rm --network=host --name=$server_name \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
 -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \
 -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
- ${GH_SUM_ENV} ${GH_SUM_MOUNT} \
 --entrypoint=/bin/bash \
 $IMAGE \
 benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh"
diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh
@@ -15,7 +15,6 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
 -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \
 -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \
-${GH_SUM_ENV} ${GH_SUM_MOUNT} \
 --entrypoint=/bin/bash \
 $IMAGE \
 benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh"
diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh
@@ -15,7 +15,6 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
 -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \
 -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \
- ${GH_SUM_ENV} ${GH_SUM_MOUNT} \
 --entrypoint=/bin/bash \
 $IMAGE \
 benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh"
diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh
@@ -11,7 +11,6 @@ salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no
 JOB_ID=$(squeue -u $USER -h -o %A | head -n1)
 
 srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE"
-
 srun --jobid=$JOB_ID \
 --container-image=$SQUASH_FILE \
 --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
diff --git a/runners/launch_mi325x-tw.sh b/runners/launch_mi325x-tw.sh
@@ -11,7 +11,6 @@ salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no
 JOB_ID=$(squeue -u $USER -h -o %A | head -n1)
 
 srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE"
-
 srun --jobid=$JOB_ID \
 --container-image=$SQUASH_FILE \
 --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh
@@ -37,7 +37,6 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
 -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \
 -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \
-${GH_SUM_ENV} ${GH_SUM_MOUNT} \
 --entrypoint=/bin/bash \
 $IMAGE \
 benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh"
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
@@ -313,9 +313,17 @@ def main():
             continue
 
         # Merge with meta
+        # Prefer explicit hardware identifiers from meta (if present) and fall back to parsed pretty_env_info
+        hw_meta = (
+            meta.get('hw')
+            or meta.get('runner')
+            or meta.get('RUNNER_TYPE')
+            or None
+        )
+        hw_value = hw_meta if hw_meta else m.get('hardware', 'Unknown GPU')
         row = {
             'model': m.get('model') or meta.get('model') or 'unknown',
-            'hw': m.get('hardware', 'Unknown GPU'),
+            'hw': hw_value,
             'framework': (meta.get('framework') or 'unknown').lower(),
             'precision': (meta.get('precision') or 'unknown').lower(),
             'tp': int(meta.get('tp') or 1),
diff --git a/utils/evals/READMEevals.md b/utils/evals/READMEevals.md
@@ -0,0 +1,28 @@
+# Evals
+
+## What?
+Quick, graded Q&A runs that measure model performance. Examples of test suites:
+- **gsm8k**: Grade school math questions
+- **gpqa**: Graduate level, Google-Proof multiple choice questions
+- **math500**: Math questions spanning topics like probability, algebra, trigonometry, and geometry.
+
+## When?
+Evals run at the highest concurrency for both the highest and the lowest TP, per GPU, per model, per ISL/OSL. The selection logic is defined in `mark_eval_entries` in `utils/matrix-logic/generate_sweep_configs.py`.
+
+## Why?
+To verify how model outputs are affected by throughput optimizations. 
+- TP/Conc might affect model outputs
+- Check kernel implementations for correctness
+
+## How?
+- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. Either EleutherAI/lm-evaluation-harness (lmeval) or lighteval with litellm is run, using the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`.
+
+## Misc
+The following files are task definitions from lmeval; more information on the changes can be found within the files:
+- `utils/evals/math500.yaml`
+- `utils/evals/gsm8k.yaml`
+The following files are task definitions from lighteval; more information on the changes can be found within the files:
+- `utils/evals/custom_gsm8k.py`
+
+
+
diff --git a/utils/evals/math500.yaml b/utils/evals/math500.yaml
@@ -0,0 +1,36 @@
+# YAML from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml
+# Changed regex and prompt
+tag:
+  - math_word_problems
+task: hendrycks_math_algebra
+dataset_path: HuggingFaceH4/MATH-500
+process_docs: !function utils.process_docs
+dataset_name: algebra
+output_type: generate_until
+training_split: train
+test_split: test
+doc_to_text: "You are solving competition math problems.\n\nFormat rules:\n- Answer in a new. That line must start with `Answer: ` (capital A, colon, one space).\n- After `Answer: `, write ONLY the answer as inline LaTeX.\n- Use ONLY ASCII LaTeX commands (e.g. \\pi, \\frac{1}{2}, -). NO Unicode symbols.\n- Do NOT wrap the answer in $, $$, \\( \\), \\[ \\], or any other delimiters.\n- Do NOT use \\displaystyle or any display-style commands. Answer only this problem, the rest are examples. Problem: {{problem}}\n"
+process_results: !function utils.process_results
+doc_to_target: "{{answer}}"
+generation_kwargs:
+  until:
+    - "Problem:"
+  do_sample: false
+  temperature: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    regexes_to_ignore:
+      - "\\\\left"
+      - "\\\\right"
+      - "\\s+"
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "Answer:\\s*([^\\n]+)"
+      - function: "take_first"
+metadata:
+  version: 1.0