diff --git a/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile
index 987eefc5..a3d84895 100644
--- a/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile
+++ b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile
@@ -1,7 +1,7 @@
 # Copyright (c) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-FROM vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:1.20.0-543
+FROM vault.habana.ai/gaudi-docker/1.21.2/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:1.21.2-76
 
 # Need node to build doc HTML. Taken from https://stackoverflow.com/a/67491580
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -11,16 +11,18 @@ RUN npm install n -g && \
     n latest
 
 RUN python3 -m pip install --no-cache-dir --upgrade pip
-RUN python3 -m pip install --upgrade-strategy eager optimum[habana]
+RUN python3 -m pip install --upgrade-strategy eager
 RUN python3 -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
 RUN mkdir -p /workspace
 WORKDIR /workspace
-RUN git clone https://github.com/huggingface/optimum-habana && cd optimum-habana && git checkout v1.16.0
+RUN git clone -b v1.18.0 https://github.com/huggingface/optimum-habana && \
+    cd optimum-habana && \
+    python3 setup.py install
 
 WORKDIR /workspace/optimum-habana/examples/text-generation
 RUN python3 -m pip install -r requirements.txt
 RUN python3 -m pip install -r requirements_lm_eval.txt
 COPY . .
-COPY Gaudi_1-20.json Gaudi.json
+COPY Gaudi_1-21.json Gaudi.json
 COPY HQT_1-20.zip HQT.zip
 RUN python3 -m pip install -r requirements_bm.txt
diff --git a/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Gaudi_1-21.json b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Gaudi_1-21.json
new file mode 100644
index 00000000..76ac73d1
--- /dev/null
+++ b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Gaudi_1-21.json
@@ -0,0 +1,756 @@
+{
+  "Gaudi3": [
+    {
+      "model": "Llama2_70b",
+      "num_cards": "2",
+      "input_len": "",
+      "output_len": "",
+      "bs": "",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-2-70b-hf --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": ""
+    },
+    {
+      "model": "Llama2_70b",
+      "num_cards": "2",
+      "input_len": "128",
+      "output_len": "128",
+      "bs": "1750",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1750 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "4853"
+    },
+    {
+      "model": "Llama2_70b",
+      "num_cards": "2",
+      "input_len": "128",
+      "output_len": "2048",
+      "bs": "512",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "6835"
+    },
+    {
+      "model": "Llama2_70b",
+      "num_cards": "2",
+      "input_len": "2048",
+      "output_len": "128",
+      "bs": "242",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 242 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "506"
+    },
+    {
+      "model": "Llama2_70b",
+      "num_cards": "2",
+      "input_len": "2048",
+      "output_len": "2048",
+      "bs": "241",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 241 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "2859"
+    },
+    {
+      "model": "Llama3.1_8b",
+      "num_cards": "1",
+      "input_len": "",
+      "output_len": "",
+      "bs": "",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 1 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": ""
+    },
+    {
+      "model": "Llama3.1_8b",
+      "num_cards": "1",
+      "input_len": "128",
+      "output_len": "128",
+      "bs": "1536",
+      "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1536 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "25097"
+    },
+    {
+      "model": "Llama3.1_8b",
+      "num_cards": "1",
+      "input_len": "128",
+      "output_len": "2048",
+      "bs": "768",
+      "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "20425"
+    },
+    {
+      "model": "Llama3.1_8b",
+      "num_cards": "1",
+      "input_len": "2048",
+      "output_len": "128",
+      "bs": "256",
+      "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "2765"
+    },
+    {
+      "model": "Llama3.1_8b",
+      "num_cards": "1",
+      "input_len": "2048",
+      "output_len": "2048",
+      "bs": "256",
+      "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "9013"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "2",
+      "input_len": "",
+      "output_len": "",
+      "bs": "",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": ""
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "2",
+      "input_len": "128",
+      "output_len": "128",
+      "bs": "2048",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2048 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "5466"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "2",
+      "input_len": "128",
+      "output_len": "2048",
+      "bs": "450",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 450 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "6535"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "2",
+      "input_len": "2048",
+      "output_len": "128",
+      "bs": "223",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 223 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "663"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "2",
+      "input_len": "2048",
+      "output_len": "2048",
+      "bs": "175",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 175 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "2891"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "8",
+      "input_len": "",
+      "output_len": "",
+      "bs": "",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": ""
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "8",
+      "input_len": "128",
+      "output_len": "128",
+      "bs": "4000",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 4000 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "18290"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "8",
+      "input_len": "128",
+      "output_len": "2048",
+      "bs": "768",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "21138"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "8",
+      "input_len": "2048",
+      "output_len": "128",
+      "bs": "512",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "2273"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "8",
+      "input_len": "2048",
+      "output_len": "2048",
+      "bs": "600",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 600 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "10600"
+    },
+    {
+      "model": "Llama3.3_70b",
+      "num_cards": "8",
+      "input_len": "",
+      "output_len": "",
+      "bs": "",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": ""
+    },
+    {
+      "model": "Llama3.3_70b",
+      "num_cards": "8",
+      "input_len": "128",
+      "output_len": "128",
+      "bs": "3986",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 3986 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "16622"
+    },
+    {
+      "model": "Llama3.3_70b",
+      "num_cards": "8",
+      "input_len": "128",
+      "output_len": "2048",
+      "bs": "2048",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 2048 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "24705"
+    },
+    {
+      "model": "Llama3.3_70b",
+      "num_cards": "8",
+      "input_len": "2048",
+      "output_len": "128",
+      "bs": "774",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 774 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "1890"
+    },
+    {
+      "model": "Llama3.3_70b",
+      "num_cards": "8",
+      "input_len": "2048",
+      "output_len": "2048",
+      "bs": "719",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 719 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "11043"
+    },
+    {
+      "model": "Llama3.1_405b",
+      "num_cards": "8",
+      "input_len": "",
+      "output_len": "",
+      "bs": "",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": ""
+    },
+    {
+      "model": "Llama3.1_405b",
+      "num_cards": "8",
+      "input_len": "128",
+      "output_len": "128",
+      "bs": "2996",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2996 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "3488"
+    },
+    {
+      "model": "Llama3.1_405b",
+      "num_cards": "8",
+      "input_len": "128",
+      "output_len": "2048",
+      "bs": "460",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 460 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "4998"
+    },
+    {
+      "model": "Llama3.1_405b",
+      "num_cards": "8",
+      "input_len": "2048",
+      "output_len": "128",
+      "bs": "195",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 195 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "394"
+    },
+    {
+      "model": "Llama3.1_405b",
+      "num_cards": "8",
+      "input_len": "2048",
+      "output_len": "2048",
+      "bs": "180",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 180 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "2238"
+    }
+  ],
+  "Gaudi2": [
+    {
+      "model": "Llama2_70b",
+      "num_cards": "2",
+      "input_len": "",
+      "output_len": "",
+      "bs": "",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-2-70b-hf --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": ""
+    },
+    {
+      "model": "Llama2_70b",
+      "num_cards": "2",
+      "input_len": "128",
+      "output_len": "128",
+      "bs": "1750",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1750 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "2935"
+    },
+    {
+      "model": "Llama2_70b",
+      "num_cards": "2",
+      "input_len": "128",
+      "output_len": "2048",
+      "bs": "256",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "3963"
+    },
+    {
+      "model": "Llama2_70b",
+      "num_cards": "2",
+      "input_len": "2048",
+      "output_len": "128",
+      "bs": "95",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 95 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "318"
+    },
+    {
+      "model": "Llama2_70b",
+      "num_cards": "2",
+      "input_len": "2048",
+      "output_len": "2048",
+      "bs": "159",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 159 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "1767"
+    },
+    {
+      "model": "Llama3.1_8b",
+      "num_cards": "1",
+      "input_len": "",
+      "output_len": "",
+      "bs": "",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 1 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": ""
+    },
+    {
+      "model": "Llama3.1_8b",
+      "num_cards": "1",
+      "input_len": "128",
+      "output_len": "128",
+      "bs": "2816",
+      "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2816 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 3",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "19907"
+    },
+    {
+      "model": "Llama3.1_8b",
+      "num_cards": "1",
+      "input_len": "128",
+      "output_len": "2048",
+      "bs": "512",
+      "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "14866"
+    },
+    {
+      "model": "Llama3.1_8b",
+      "num_cards": "1",
+      "input_len": "2048",
+      "output_len": "128",
+      "bs": "179",
+      "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 179 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 3",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "2099"
+    },
+    {
+      "model": "Llama3.1_8b",
+      "num_cards": "1",
+      "input_len": "2048",
+      "output_len": "2048",
+      "bs": "256",
+      "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "6060"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "2",
+      "input_len": "",
+      "output_len": "",
+      "bs": "",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": ""
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "2",
+      "input_len": "128",
+      "output_len": "128",
+      "bs": "1792",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1792 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "3421"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "2",
+      "input_len": "128",
+      "output_len": "2048",
+      "bs": "256",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "3827"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "2",
+      "input_len": "2048",
+      "output_len": "128",
+      "bs": "142",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 142 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "462"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "2",
+      "input_len": "2048",
+      "output_len": "2048",
+      "bs": "139",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 139 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "1665"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "8",
+      "input_len": "",
+      "output_len": "",
+      "bs": "",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_measure.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": ""
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "8",
+      "input_len": "128",
+      "output_len": "128",
+      "bs": "4000",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 4000 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "10404"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "8",
+      "input_len": "128",
+      "output_len": "2048",
+      "bs": "768",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "13639"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "8",
+      "input_len": "2048",
+      "output_len": "128",
+      "bs": "383",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 383 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "1553"
+    },
+    {
+      "model": "Llama3.1_70b",
+      "num_cards": "8",
+      "input_len": "2048",
+      "output_len": "2048",
+      "bs": "476",
+      "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 476 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute",
+      "env_vars": {
+        "PT_HPU_LAZY_MODE": "1",
+        "HF_DATASETS_TRUST_REMOTE_CODE": "true",
+        "QUANT_CONFIG": "./quantization_config/maxabs_quant.json",
+        "TQDM_DISABLE": "1"
+      },
+      "ref_perf": "6715"
+    }
+  ]
+}