diff --git a/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile index 987eefc5..a3d84895 100644 --- a/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile +++ b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile @@ -1,7 +1,7 @@ # Copyright (c) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -FROM vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:1.20.0-543 +FROM vault.habana.ai/gaudi-docker/1.21.2/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:1.21.2-76 # Need node to build doc HTML. Taken from https://stackoverflow.com/a/67491580 RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -11,16 +11,18 @@ RUN npm install n -g && \ n latest RUN python3 -m pip install --no-cache-dir --upgrade pip -RUN python3 -m pip install --upgrade-strategy eager optimum[habana] RUN python3 -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 RUN mkdir -p /workspace WORKDIR /workspace -RUN git clone https://github.com/huggingface/optimum-habana && cd optimum-habana && git checkout v1.16.0 +# Install optimum-habana from the cloned v1.18.0 tag (not PyPI) so the library matches the examples used in the next steps. +RUN git clone -b v1.18.0 https://github.com/huggingface/optimum-habana && \ + cd optimum-habana && \ + python3 -m pip install --no-cache-dir . WORKDIR /workspace/optimum-habana/examples/text-generation RUN python3 -m pip install -r requirements.txt RUN python3 -m pip install -r requirements_lm_eval.txt COPY . . 
-COPY Gaudi_1-20.json Gaudi.json +COPY Gaudi_1-21.json Gaudi.json COPY HQT_1-20.zip HQT.zip RUN python3 -m pip install -r requirements_bm.txt diff --git a/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Gaudi_1-21.json b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Gaudi_1-21.json new file mode 100644 index 00000000..76ac73d1 --- /dev/null +++ b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Gaudi_1-21.json @@ -0,0 +1,756 @@ +{ + "Gaudi3": [ + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-2-70b-hf --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "128", + "bs": "1750", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1750 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": 
"4853" + }, + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "2048", + "bs": "512", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "6835" + }, + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "128", + "bs": "242", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 242 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "506" + }, + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "2048", + "bs": "241", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 241 --flash_attention_causal_mask 
--use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2859" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 1 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "128", + "output_len": "128", + "bs": "1536", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1536 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "25097" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "128", + "output_len": "2048", + "bs": "768", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache 
--use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "20425" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "2048", + "output_len": "128", + "bs": "256", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2765" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "2048", + "output_len": "2048", + "bs": "256", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "9013" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 
../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "128", + "bs": "2048", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2048 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "5466" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "2048", + "bs": "450", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 450 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + 
"HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "6535" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "128", + "bs": "223", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 223 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "663" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "2048", + "bs": "175", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 175 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2891" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits 
--batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "128", + "output_len": "128", + "bs": "4000", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 4000 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "18290" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "128", + "output_len": "2048", + "bs": "768", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "21138" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", 
+ "input_len": "2048", + "output_len": "128", + "bs": "512", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2273" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "2048", + "output_len": "2048", + "bs": "600", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 600 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "10600" + }, + { + "model": "Llama3.3_70b", + "num_cards": "8", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute 
--flash_attention_causal_mask", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.3_70b", + "num_cards": "8", + "input_len": "128", + "output_len": "128", + "bs": "3986", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 3986 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "16622" + }, + { + "model": "Llama3.3_70b", + "num_cards": "8", + "input_len": "128", + "output_len": "2048", + "bs": "2048", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 2048 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "24705" + }, + { + "model": "Llama3.3_70b", + "num_cards": "8", + "input_len": "2048", + "output_len": "128", + "bs": "774", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 
--trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 774 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "1890" + }, + { + "model": "Llama3.3_70b", + "num_cards": "8", + "input_len": "2048", + "output_len": "2048", + "bs": "719", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 719 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "11043" + }, + { + "model": "Llama3.1_405b", + "num_cards": "8", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_405b", + 
"num_cards": "8", + "input_len": "128", + "output_len": "128", + "bs": "2996", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2996 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "3488" + }, + { + "model": "Llama3.1_405b", + "num_cards": "8", + "input_len": "128", + "output_len": "2048", + "bs": "460", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 460 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "4998" + }, + { + "model": "Llama3.1_405b", + "num_cards": "8", + "input_len": "2048", + "output_len": "128", + "bs": "195", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 195 
--flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "394" + }, + { + "model": "Llama3.1_405b", + "num_cards": "8", + "input_len": "2048", + "output_len": "2048", + "bs": "180", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 180 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2238" + } + ], + "Gaudi2": [ + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-2-70b-hf --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "128", + "bs": "1750", + "run_cmd": "python3 
../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1750 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2935" + }, + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "2048", + "bs": "256", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "3963" + }, + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "128", + "bs": "95", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 95 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": 
"true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "318" + }, + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "2048", + "bs": "159", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 159 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "1767" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 1 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "128", + "output_len": "128", + "bs": "2816", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 
128 --bf16 --batch_size 2816 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 3", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "19907" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "128", + "output_len": "2048", + "bs": "512", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "14866" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "2048", + "output_len": "128", + "bs": "179", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 179 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 3", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2099" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "2048", + "output_len": "2048", + "bs": "256", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache 
--use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "6060" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "128", + "bs": "1792", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1792 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "3421" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + 
"input_len": "128", + "output_len": "2048", + "bs": "256", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "3827" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "128", + "bs": "142", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 142 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "462" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "2048", + "bs": "139", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 139 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": 
{ + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "1665" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "128", + "output_len": "128", + "bs": "4000", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 4000 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "10404" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "128", + "output_len": "2048", + "bs": "768", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 
--use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "13639" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "2048", + "output_len": "128", + "bs": "383", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 383 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "1553" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "2048", + "output_len": "2048", + "bs": "476", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 476 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "PT_HPU_LAZY_MODE": "1", + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "6715" + } + ] +}