Skip to content

Commit 5ec3378

Browse files
committed
Resolve issues/nits
1 parent dd96fcf commit 5ec3378

11 files changed

Lines changed: 75 additions & 14 deletions

benchmarks/benchmark_lib.sh

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ _install_lm_eval_deps() {
162162
python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true
163163
# Temporary: workaround issue by using main
164164
python3 -m pip install -q --no-cache-dir --no-deps \
165-
"git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true
165+
"git+https://github.com/EleutherAI/lm-evaluation-harness.git@b315ef3b05176acc9732bb7fdec116abe1ecc476" || true
166166
}
167167

168168
# Patch lm-eval filters to be robust to empty strings via sitecustomize
@@ -450,11 +450,6 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples
450450
if (not content) and reasoning:
451451
return response
452452
453-
if not content and LITELLM_CACHE:
454-
logger.info("Empty content with caching on; retrying uncached once")
455-
kwargs["caching"] = False
456-
response = litellm.completion(**kwargs)
457-
458453
return response
459454
except litellm.BadRequestError as e:
460455
if "message" in e.__dict__ and "policy" in e.__dict__["message"]:

benchmarks/gptoss_fp4_h100_slurm.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ run_benchmark_serving \
5353
--input-len "$ISL" \
5454
--output-len "$OSL" \
5555
--random-range-ratio "$RANDOM_RANGE_RATIO" \
56-
--num-prompts $(( $CONC * 1 )) \
56+
--num-prompts $(( $CONC * 10 )) \
5757
--max-concurrency "$CONC" \
5858
--result-filename "$RESULT_FILENAME" \
5959
--result-dir /workspace/

runners/launch_h100-cr.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ docker run --rm --network=host --name=$server_name \
1212
-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
1313
-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \
1414
-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
15-
${GH_SUM_ENV} ${GH_SUM_MOUNT} \
1615
--entrypoint=/bin/bash \
1716
$IMAGE \
1817
benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh"

runners/launch_mi300x-amd.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \
1515
-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
1616
-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \
1717
-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \
18-
${GH_SUM_ENV} ${GH_SUM_MOUNT} \
1918
--entrypoint=/bin/bash \
2019
$IMAGE \
2120
benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh"

runners/launch_mi300x-cr.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \
1515
-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
1616
-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \
1717
-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \
18-
${GH_SUM_ENV} ${GH_SUM_MOUNT} \
1918
--entrypoint=/bin/bash \
2019
$IMAGE \
2120
benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh"

runners/launch_mi325x-amd.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no
1111
JOB_ID=$(squeue -u $USER -h -o %A | head -n1)
1212

1313
srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE"
14-
1514
srun --jobid=$JOB_ID \
1615
--container-image=$SQUASH_FILE \
1716
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \

runners/launch_mi325x-tw.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no
1111
JOB_ID=$(squeue -u $USER -h -o %A | head -n1)
1212

1313
srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE"
14-
1514
srun --jobid=$JOB_ID \
1615
--container-image=$SQUASH_FILE \
1716
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \

runners/launch_mi355x-amd.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \
3737
-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
3838
-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \
3939
-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \
40-
${GH_SUM_ENV} ${GH_SUM_MOUNT} \
4140
--entrypoint=/bin/bash \
4241
$IMAGE \
4342
benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh"

utils/collect_eval_results.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,9 +313,17 @@ def main():
313313
continue
314314

315315
# Merge with meta
316+
# Prefer explicit hardware identifiers from meta (if present) and fall back to parsed pretty_env_info
317+
hw_meta = (
318+
meta.get('hw')
319+
or meta.get('runner')
320+
or meta.get('RUNNER_TYPE')
321+
or None
322+
)
323+
hw_value = hw_meta if hw_meta else m.get('hardware', 'Unknown GPU')
316324
row = {
317325
'model': m.get('model') or meta.get('model') or 'unknown',
318-
'hw': m.get('hardware', 'Unknown GPU'),
326+
'hw': hw_value,
319327
'framework': (meta.get('framework') or 'unknown').lower(),
320328
'precision': (meta.get('precision') or 'unknown').lower(),
321329
'tp': int(meta.get('tp') or 1),

utils/evals/READMEevals.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Evals
2+
3+
## What?
4+
Quick, graded Q&A tasks that measure model performance. Examples of test suites:
5+
- **gsm8k**: Grade school math questions
6+
- **gpqa**: Graduate-level, Google-proof multiple-choice questions
7+
- **math500**: Math questions spanning topics like probability, algebra, trigonometry, and geometry.
8+
9+
## When?
10+
Evals run at the highest concurrency for both the highest TP and the lowest TP, per GPU, per model, and per ISL/OSL combination. The selection logic is defined in `mark_eval_entries` in `utils/matrix-logic/generate_sweep_configs.py`.
11+
12+
## Why?
13+
To verify how model outputs are affected by throughput optimizations.
14+
- TP/Conc might affect model outputs
15+
- Check kernel implementations for correctness
16+
17+
## How?
18+
- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. Either EleutherAI/lm-evaluation-harness (lmeval) or lighteval with litellm is run, using the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`.
19+
20+
## Misc
21+
The following files are task definitions from lmeval; more information on the changes can be found within the files:
22+
- `utils/evals/math500.yaml`
23+
- `utils/evals/gsm8k.yaml`
24+
The following files are task definitions from lighteval; more information on the changes can be found within the files:
25+
- `utils/evals/custom_gsm8k.py`
26+
27+
28+

0 commit comments

Comments
 (0)