Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/run_maxtext_jetstream_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,9 @@ jobs:
- name: Log message if dependent job succeeded
if: ${{ ! (failure() && github.event.pull_request == null) }}
run: echo "Conditions for creating/updating issue not met. Skipping."
# Copies the manifest file named by the build_stable_stack job output from
# BUILD_MANIFEST_DIR into the manifest-files folder of the artifacts bucket.
# The step is skipped whenever failure() evaluates to true.
- name: Upload manifest to gcs
if: ${{ ! failure() }}
run: gsutil cp ${{ env.BUILD_MANIFEST_DIR }}/${{ needs.build_stable_stack.outputs.manifest_name }} gs://jetstream-inference-stable-stack-artifacts/manifest-files/
- name: Send email
uses: dawidd6/action-send-mail@v3.6.0
with:
Expand All @@ -202,5 +205,5 @@ jobs:
to: singhvijaya@google.com, yuyanpeng@google.com, vipannalla@google.com
from: JetStream Runs
secure: true
attachments: ${{ env.BUILD_MANIFEST_DIR }}/${{ needs.build_stable_stack.outputs.manifest_name }},${{ env.BENCHMARK_REPORT_DIR }}/moe_8x7b.txt,${{ env.BENCHMARK_REPORT_DIR }}/moe_8x22b.txt,${{ env.BENCHMARK_REPORT_DIR }}/moe_8x22b_long_context_8k_prefill.txt,${{ env.BENCHMARK_REPORT_DIR }}/moe_8x7b_jetstream.txt,${{ env.BENCHMARK_REPORT_DIR }}/llama_70b_jetstream.txt
# Action inputs are not shell-expanded, so every path must use the ${{ env.* }} expression form.
# The benchmark scripts' report files are attached from BENCHMARK_REPORT_DIR, including the golden numbers and comparison report.
attachments: ${{ env.BUILD_MANIFEST_DIR }}/${{ needs.build_stable_stack.outputs.manifest_name }},${{ env.BENCHMARK_REPORT_DIR }}/moe_8x7b.txt,${{ env.BENCHMARK_REPORT_DIR }}/moe_8x22b.txt,${{ env.BENCHMARK_REPORT_DIR }}/moe_8x22b_long_context_8k_prefill.txt,${{ env.BENCHMARK_REPORT_DIR }}/moe_8x7b_jetstream.txt,${{ env.BENCHMARK_REPORT_DIR }}/llama_70b_jetstream.txt,${{ env.BENCHMARK_REPORT_DIR }}/golden-numbers.txt,${{ env.BENCHMARK_REPORT_DIR }}/result_comparison.txt
body: workflow for ${{github.repository}} completed successfully!
9 changes: 7 additions & 2 deletions .github/workflows/test_llama_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ sleep 800
cd ..

python JetStream/benchmarks/benchmark_serving.py --tokenizer maxtext/assets/tokenizer.llama2 --save-result --save-request-outputs --request-outputs-file-path outputs.json --num-prompts 1200 --max-output-length 1024 --dataset openorca --run-eval True > ${OUTPUT_DIR}/llama_70b_jetstream.txt
tail -n25 ${OUTPUT_DIR}/llama_70b_jetstream.txt > ${OUTPUT_DIR}/llama_70b_jetstream.tmp && mv ${OUTPUT_DIR}/llama_70b_jetstream.tmp ${OUTPUT_DIR}/llama_70b_jetstream.txt
#tail -n25 ${OUTPUT_DIR}/llama_70b_jetstream.txt > ${OUTPUT_DIR}/llama_70b_jetstream.tmp && mv ${OUTPUT_DIR}/llama_70b_jetstream.tmp ${OUTPUT_DIR}/llama_70b_jetstream.txt

# kill Jetstream server
kill -9 %%
kill -9 %%
# Trim the benchmark log to its last 25 lines, working through a temp file.
tail -n25 "${OUTPUT_DIR}/llama_70b_jetstream.txt" > "${OUTPUT_DIR}/llama_70b_jetstream.tmp"
# Append a blank line, a header, and the throughput / rouge1 lines to the
# shared comparison report. printf is used because echo does not expand "\n"
# portably; the grep patterns are plain substrings because a backslash-n in a
# grep pattern is not a newline, so the "\n"-prefixed patterns could not match.
# NOTE(review): the header says "8x7b" although this script runs llama 70b -- confirm wording.
printf '\n%s\n' "8x7b Maxtext Jetstream Run throughput and accuracy for llama 70b" >> "${OUTPUT_DIR}/result_comparison.txt"
grep "throughput" "${OUTPUT_DIR}/llama_70b_jetstream.tmp" >> "${OUTPUT_DIR}/result_comparison.txt"
grep "rouge1" "${OUTPUT_DIR}/llama_70b_jetstream.tmp" >> "${OUTPUT_DIR}/result_comparison.txt"
# Replace the full log with the trimmed version.
mv "${OUTPUT_DIR}/llama_70b_jetstream.tmp" "${OUTPUT_DIR}/llama_70b_jetstream.txt"
33 changes: 30 additions & 3 deletions .github/workflows/test_moe_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,25 @@ cd maxtext

# moe 8x7b microbenchmark
LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.inference_microbenchmark MaxText/configs/inference.yml tokenizer_path=assets/tokenizer.mistral-v1 max_prefill_predict_length=1024 max_target_length=2048 model_name=mixtral-8x7b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 ici_context_autoregressive_parallelism=8 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=8 megablox=False quantization=int8 quantize_kvcache=False checkpoint_is_quantized=True load_parameters_path=gs://jetstream-runner/8-7B-int8 capacity_factor=1 attention=dot_product model_call_mode=inference sparse_matmul=False weight_dtype=bfloat16 > ${OUTPUT_DIR}/moe_8x7b.txt
tail -n5 ${OUTPUT_DIR}/moe_8x7b.txt > ${OUTPUT_DIR}/moe_8x7b.tmp && mv ${OUTPUT_DIR}/moe_8x7b.tmp ${OUTPUT_DIR}/moe_8x7b.txt
# Trim the microbenchmark log to its last 5 lines, working through a temp file.
tail -n5 "${OUTPUT_DIR}/moe_8x7b.txt" > "${OUTPUT_DIR}/moe_8x7b.tmp"
# Append a blank line, a header, and the PREFILL / DECODE lines to the shared
# comparison report. printf replaces echo "\n..." (not expanded portably), and
# the grep patterns drop the "\n" prefix, which grep does not treat as a newline.
printf '\n%s\n' "8x7b microbenchmark prefill decode latencies" >> "${OUTPUT_DIR}/result_comparison.txt"
grep "PREFILL" "${OUTPUT_DIR}/moe_8x7b.tmp" >> "${OUTPUT_DIR}/result_comparison.txt"
grep "DECODE" "${OUTPUT_DIR}/moe_8x7b.tmp" >> "${OUTPUT_DIR}/result_comparison.txt"
# Replace the full log with the trimmed version.
mv "${OUTPUT_DIR}/moe_8x7b.tmp" "${OUTPUT_DIR}/moe_8x7b.txt"
#tail -n5 ${OUTPUT_DIR}/moe_8x7b.txt > ${OUTPUT_DIR}/moe_8x7b.tmp && mv ${OUTPUT_DIR}/moe_8x7b.tmp ${OUTPUT_DIR}/moe_8x7b.txt

# moe 8x22B microbenchmark
LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.inference_microbenchmark MaxText/configs/inference.yml load_parameters_path=gs://jetstream-runner/8-22B-int8 max_prefill_predict_length=1024 max_target_length=2048 model_name=mixtral-8x22b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 ici_context_autoregressive_parallelism=8 scan_layers=false per_device_batch_size=24 attention=dot_product megablox=False quantization=int8 checkpoint_is_quantized=True quantize_kvcache=True capacity_factor=1 tokenizer_path=assets/tokenizer.mistral-v3 inference_microbenchmark_prefill_lengths="128,1024" sparse_matmul=False model_call_mode=inference > ${OUTPUT_DIR}/moe_8x22b.txt
tail -n5 ${OUTPUT_DIR}/moe_8x22b.txt > ${OUTPUT_DIR}/moe_8x22b.tmp && mv ${OUTPUT_DIR}/moe_8x22b.tmp ${OUTPUT_DIR}/moe_8x22b.txt

# Trim the microbenchmark log to its last 5 lines, working through a temp file.
tail -n5 "${OUTPUT_DIR}/moe_8x22b.txt" > "${OUTPUT_DIR}/moe_8x22b.tmp"
# Append a blank line, a header, and the PREFILL / DECODE lines to the shared
# comparison report. The header goes to result_comparison.txt like every other
# entry (it was previously written to a stray result_comparison.tmp file).
# printf replaces echo "\n..." (not expanded portably), and the grep patterns
# drop the "\n" prefix, which grep does not treat as a newline.
printf '\n%s\n' "8x22b microbenchmark prefill decode latencies" >> "${OUTPUT_DIR}/result_comparison.txt"
grep "PREFILL" "${OUTPUT_DIR}/moe_8x22b.tmp" >> "${OUTPUT_DIR}/result_comparison.txt"
grep "DECODE" "${OUTPUT_DIR}/moe_8x22b.tmp" >> "${OUTPUT_DIR}/result_comparison.txt"
# Replace the full log with the trimmed version.
mv "${OUTPUT_DIR}/moe_8x22b.tmp" "${OUTPUT_DIR}/moe_8x22b.txt"

#tail -n5 ${OUTPUT_DIR}/moe_8x22b.txt > ${OUTPUT_DIR}/moe_8x22b.tmp && mv ${OUTPUT_DIR}/moe_8x22b.tmp ${OUTPUT_DIR}/moe_8x22b.txt

# moe 8x22B 8k context length chunked prefill with 2k prefill chunk size
LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.benchmark_chunked_prefill MaxText/configs/inference.yml load_parameters_path=gs://jetstream-runner/8-22B-int8 max_prefill_predict_length=8192 max_target_length=9000 model_name=mixtral-8x22b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 scan_layers=false per_device_batch_size=24 attention=dot_product megablox=False quantization=int8 checkpoint_is_quantized=True quantize_kvcache=False capacity_factor=1 tokenizer_path=assets/tokenizer.mistral-v3 inference_microbenchmark_prefill_lengths="8192" sparse_matmul=False model_call_mode=inference ici_context_autoregressive_parallelism=8 use_chunked_prefill=True prefill_chunk_size=2048 > ${OUTPUT_DIR}/moe_8x22b_long_context_8k_prefill.txt
Expand All @@ -29,7 +43,20 @@ sleep 600
cd ..

python JetStream/benchmarks/benchmark_serving.py --tokenizer maxtext/assets/tokenizer.mistral-v1 --save-result --save-request-outputs --request-outputs-file-path outputs.json --num-prompts 1200 --max-output-length 1024 --dataset openorca --run-eval True > ${OUTPUT_DIR}/moe_8x7b_jetstream.txt
tail -n25 ${OUTPUT_DIR}/moe_8x7b_jetstream.txt > ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp && mv ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp ${OUTPUT_DIR}/moe_8x7b_jetstream.txt
# tail -n25 ${OUTPUT_DIR}/moe_8x7b_jetstream.txt > ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp && mv ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp ${OUTPUT_DIR}/moe_8x7b_jetstream.txt

# kill Jetstream server
kill -9 %%

# Trim the benchmark log to its last 25 lines, working through a temp file.
tail -n25 "${OUTPUT_DIR}/moe_8x7b_jetstream.txt" > "${OUTPUT_DIR}/moe_8x7b_jetstream.tmp"

# Append a blank line, a header, and the throughput / rouge1 lines to the
# shared comparison report. printf replaces echo "\n..." (not expanded
# portably), and the grep patterns drop the "\n" prefix, which grep does not
# treat as a newline.
printf '\n%s\n' "8x7b Maxtext Jetstream Run throughput and accuracy for Mixtral 8x7B" >> "${OUTPUT_DIR}/result_comparison.txt"
grep "throughput" "${OUTPUT_DIR}/moe_8x7b_jetstream.tmp" >> "${OUTPUT_DIR}/result_comparison.txt"
grep "rouge1" "${OUTPUT_DIR}/moe_8x7b_jetstream.tmp" >> "${OUTPUT_DIR}/result_comparison.txt"

# Replace the full log with the trimmed version.
mv "${OUTPUT_DIR}/moe_8x7b_jetstream.tmp" "${OUTPUT_DIR}/moe_8x7b_jetstream.txt"


# Download the golden numbers file from GCS into the output directory.
# The destination is quoted so the path is not subject to word splitting or globbing.
gsutil cp gs://jetstream-inference-stable-stack-artifacts/golden-numbers/golden-numbers.txt "${OUTPUT_DIR}/"