JakeStevens
diff --git a/‎backends/qualcomm/tests/test_qnn_delegate.py‎
Lines changed: 27 additions & 5 deletions b/‎backends/qualcomm/tests/test_qnn_delegate.py‎
Lines changed: 27 additions & 5 deletions
diff --git a/‎examples/qualcomm/oss_scripts/llama/README.md‎
Lines changed: 78 additions & 30 deletions b/‎examples/qualcomm/oss_scripts/llama/README.md‎
Lines changed: 78 additions & 30 deletions
@@ -8199,7 +8199,6 @@ def test_static_llm_model(self):  # noqa: C901
             "1024",
             "--max_context_len",
             "1024",
-            "--skip_user_prompt_calibration",
         ]
 
         match self.static_llm_eval_method:
@@ -8249,10 +8248,17 @@ def test_static_llm_model(self):  # noqa: C901
                     ]
                 )
             case _:
-                cmds.remove("--skip_user_prompt_calibration")
                 logging.warning(
                     "No llm eval method chosen. Only generate model output."
                 )
+                cmds.extend(
+                    [
+                        "--calib_tasks",
+                        "wikitext",
+                        "--calib_limit",
+                        "1",
+                    ]
+                )
 
         if is_llama_model:
             cmds.extend(
@@ -8425,6 +8431,10 @@ def test_codegen2_1b(self):
             "128",
             "--max_context_len",
             "128",
+            "--calib_tasks",
+            "wikitext",
+            "--calib_limit",
+            "1",
         ]
         self.add_default_cmds(cmds)
 
@@ -8486,6 +8496,10 @@ def test_llama_stories_260k(self):
             "128",
             "--max_context_len",
             "128",
+            "--calib_tasks",
+            "wikitext",
+            "--calib_limit",
+            "1",
         ]
         self.add_default_cmds(cmds)
 
@@ -8549,6 +8563,10 @@ def test_llama_stories_110m(self):
             "128",
             "--max_context_len",
             "128",
+            "--calib_tasks",
+            "wikitext",
+            "--calib_limit",
+            "1",
         ]
         if self.use_fp16:
             cmds.append("--use_fp16")
@@ -8702,7 +8720,7 @@ class VLMSpecs(MLLMSpecs):
     def setUp(self):
         self.alm_specs = {
             "granite_speech_3_3-2b": TestExampleMultimodalityScript.ALMSpecs(
-                max_seq_len=512,
+                max_seq_len=1024,
                 sm8650_token_rate=5,
                 sm8750_token_rate=8,
                 encoder_pte_size=900_000_000,  # 900MB
@@ -8714,7 +8732,7 @@ def setUp(self):
         }
         self.vlm_specs = {
             "smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs(
-                max_seq_len=128,
+                max_seq_len=1024,
                 sm8650_token_rate=50,
                 sm8750_token_rate=55,
                 encoder_pte_size=110_000_000,  # 110MB
@@ -8724,7 +8742,7 @@ def setUp(self):
                 golden_image_feature="city",
             ),
             "internvl3_1b": TestExampleMultimodalityScript.VLMSpecs(
-                max_seq_len=320,
+                max_seq_len=1024,
                 sm8650_token_rate=11,
                 sm8750_token_rate=13,
                 encoder_pte_size=425_000_000,  # 425MB
@@ -8776,6 +8794,8 @@ def test_static_asr(self):
             "kv",
             "--max_seq_len",
             f"{alm_specs.max_seq_len}",
+            "--calib_samples",
+            "./examples/qualcomm/oss_scripts/llama/assets/samples/audio.json",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
@@ -8859,6 +8879,8 @@ def test_static_vlm(self):
             "kv",
             "--max_seq_len",
             f"{vlm_specs.max_seq_len}",
+            "--calib_samples",
+            "./examples/qualcomm/oss_scripts/llama/assets/samples/vision.json",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
 
@@ -130,12 +130,12 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL
 Default example using hybrid mode.
 ```bash
 python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --calib_tasks wikitext --calib_limit 1
-```
+```
 
 #### Codegen2
 Default example using kv mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model codegen2_1b --model_mode kv --max_seq_len 1024 --prompt "def hello_world():"
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model codegen2_1b --model_mode kv --max_seq_len 1024 --prompt "def hello_world():" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/text.json
 ```
 
 #### Gemma 2B
@@ -210,7 +210,17 @@ Default example using hybrid mode.
 python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smollm3-3b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --calib_tasks wikitext --calib_limit 1
 ```
 
-## Multimodal Support
+#### Using custom calibration samples for LLMs
+
+Instead of `--calib_tasks`, you can supply your own conversation JSON files via `--calib_samples`. The samples are fed into the quantization calibration pass to collect activation observer statistics — they do not affect the inference prompt. This is useful when you want to calibrate on domain-specific or instruct-format data rather than a generic lm_eval task.
+
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smollm2_135m --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/text.json
+```
+
+You can also provide both `--calib_tasks` and `--calib_samples` at the same time; the pipeline concatenates both data sources for calibration.
+
+## Multimodal Support
 
 ### Overview
 
@@ -268,7 +278,7 @@ pip install soundfile
 
 Default example using hybrid mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model granite_speech_3_3-2b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "can you transcribe the speech into a written format?" --audio_path "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true"
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model granite_speech_3_3-2b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "can you transcribe the speech into a written format?" --audio_path "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/audio.json
 ```
 
 ### Specifying Custom Audio
@@ -281,9 +291,6 @@ You can specify a custom audio file for ALM models using the `--audio_path` flag
 - **Local file paths**: Absolute or relative paths to `.wav` files on your system
   - Example: `"/path/to/your/audio.wav"`
 
-**Default behavior:**
-If `--audio_path` is not specified, the system will automatically use the default audio file defined in the model's configuration file (`encoder/encoder_config.py`).
-
 #### Audio Preprocessing
 
 The audio encoder configuration is defined in `encoder/encoder_config.py`:
@@ -294,7 +301,6 @@ The audio encoder configuration is defined in `encoder/encoder_config.py`:
 class GraniteSpeechEncoder(AudioModalityConfig):
     encoder_class = GraniteSpeechCTCEncoderWrapper
     audio_seq_len = 171
-    audio_url = "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true"  # Default audio (content: "After his nap, ...")
     quant_recipe = GraniteSpeechEncoderQuantRecipe
 ```
 
@@ -351,13 +357,13 @@ Vision-Language Models (VLMs) combine computer vision and natural language proce
 #### SmolVLM 500M
 Default example using hybrid mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode hybrid --prefill_ar_len 16 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode hybrid --prefill_ar_len 16 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/vision.json
 ```
 
 #### InternVL 1B
 Default example using hybrid mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model internvl3_1b --model_mode hybrid --prefill_ar_len 32 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "http://images.cocodataset.org/val2017/000000039769.jpg"
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model internvl3_1b --model_mode hybrid --prefill_ar_len 32 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/vision.json
 ```
 
 ### Specifying Custom Image
@@ -370,9 +376,6 @@ Take a example image of Statue-of-Liberty in New York Bay
 - **Local file paths**: Absolute or relative paths to image files on your system
   - Example: [`./examples/qualcomm/oss_scripts/llama/assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png`](assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png)
 
-**Default behavior:**
-If `--image_path` is not specified, the system will automatically use the default image URL defined in the model's configuration file (`encoder/encoder_config.py`).
-
 #### Image Preprocessing
 
 Each VLM model has specific preprocessing requirements defined in its configuration:
@@ -385,7 +388,6 @@ class SmolVLMEncoder(VisionModalityConfig):
     img_seq_len = 64
     img_resized_h = 512
     img_resized_w = 512
-    img_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"  # Default image
     quant_recipe = SmolVLMEncoderQuantRecipe
 ```
 
@@ -427,7 +429,7 @@ PROMPT2="Answer the question: What's the main object in first image?"
 PROMPT3="<image>Caption this image."
 
 # Execute the multi-turn conversation
-python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL"
+python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL" --calib_samples examples/qualcomm/oss_scripts/llama/assets/samples/vision.json
 ```
 
 **How it works:**
@@ -453,16 +455,19 @@ The VLM inference pipeline consists of:
    - Special tokens (e.g., `<image>`, `<|fake_token_around_image|>`, `<fake_token_around_image>`) mark modality boundaries (see [tokenizer.py](tokenizer.py))
 
    ```python
-   # Special tokens for Vision-Language Model
-   VLM_SPECIAL_TOKENS = {
-       "smolvlm_500m_instruct": {
-           "image_token": "<image>",
-           "global_img": "<global-img>",
-           "fake_wrap_start": "<fake_token_around_image>",
-           "fake_wrap_end": "<fake_token_around_image>",
-       },
-       ...
-   }
+   # Token fields on each encoder config subclass (encoder/encoder_config.py)
+   @dataclass(init=False, frozen=True)
+   class SmolVLMEncoder(VisionModalityConfig):
+       img_token = "<image>"
+       fake_wrap_start = "<fake_token_around_image>"
+       fake_wrap_end = "<fake_token_around_image>"
+       global_img_token = "<global-img>"
+
+   @dataclass(init=False, frozen=True)
+   class InternVL3Encoder(VisionModalityConfig):
+       img_token = "<IMG_CONTEXT>"
+       fake_wrap_start = "<img>"
+       fake_wrap_end = "</img>"
    ```
    - Final fused sequence: `[batch, img_seq_len + text_seq_len, hidden_dim]`
 
@@ -545,16 +550,13 @@ From the example script above, 1 wikitext sample is used to evaluate all 3 phase
 Example:
 ```bash
 # 1st run to compile with --calib_limit 1
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --calib_tasks wikitext --calib_limit 1 --compile_only
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --calib_tasks wikitext --calib_limit 1 -a ${FOLDER_TO_PRE_GEN_PTE} --compile_only
 ```
 ```bash
 # 2nd run to perform QNN device execution with --eval_limit 3
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --eval_tasks wikitext --eval_limit 3 --pre_gen_pte ${PATH_TO_ARTIFACT_IN_1ST_RUN} --quant_attrs_path ${PATH_TO_ARTIFACT_IN_1ST_RUN}/kv_llama_qnn_quant_attrs.json
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods tasks_eval --eval_tasks wikitext --eval_limit 3 --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE}
 ```
 
-#### Tasks quantization calibration
-If `--calib_tasks ${TASK}` is not provided, the program will use `--prompt ${PROMPT}` as the dataset for quantization calibration.
-`--calib_tasks` and `--eval_tasks` are independent flags. `--calib_tasks` controls which tasks are used for quantization calibration, while `--eval_tasks` controls which tasks are used for perplexity evaluation. They can be set to different tasks or limits as needed.
 
 #### SQNR Evaluation
 To evaluate QNN's output logits against the golden logits from `nn.Module`, users can provide the flag `--sqnr_eval`. Please note that SQNR evaluation will only compare the logits of the user's prompt and will not compare the new tokens generated by the model.
@@ -563,6 +565,52 @@ Example:
 python examples/qualcomm/oss_scripts/llama/llama.py --build_folder build-android --device ${SERIAL_NUM} --soc_model ${SOC_MODEL} --prompt "I would like to learn python, could you teach me with a simple example?" --temperature 0 --model_mode kv --max_seq_len 1024 --decoder_model qwen2_5-0_5b --eval_methods sqnr_eval
 ```
 
+
+
+#### Quantization
+
+The calibration data is independent of the runtime evaluation set: it only determines quantization quality and is never used as the inference prompt.
+
+Calibration data is required for compilation. There are two ways to supply it:
+
+1. **`--calib_tasks`** — calibrate on one or more lm_eval tasks (tune with `--calib_limit` and `--calib_num_fewshot`). LLM-only.
+2. **`--calib_samples`** — calibrate on custom conversation samples provided as JSON files (see format below). Required for multimodal models (VLM/ALM).
+
+For LLMs, provide at least one of the two; for multimodal models, `--calib_samples` is mandatory.
+
+Calibration and runtime evaluation use separate flag sets and can target different tasks or limits as needed:
+
+| Purpose | Flags |
+|---|---|
+| Calibration data (lm_eval tasks) | `--calib_tasks`, `--calib_limit`, `--calib_num_fewshot` |
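+| Calibration data (custom samples) | `--calib_samples` (JSON files, HuggingFace message format) |
+| Runtime evaluation (lm_eval tasks) | `--eval_methods tasks_eval`, `--eval_tasks`, `--eval_limit` |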
+
+##### Custom calibration samples (`--calib_samples`)
+
+`--calib_samples` accepts one or more JSON files. Each file is a flat list of sample objects. Each sample has a `messages` field following the HuggingFace chat template, and an optional `files` field for media inputs (local paths or URLs):
+
+```json
+[
+  {
+    "files": ["path/or/url/to/files"],
+    "messages": [
+      {"role": "user",    "content": "..." },
+      {"role": "assistant", "content": "..."}
+    ]
+  }
+]
+```
+
+`files` is only required for multimodal models (VLM: image paths/URLs, ALM: audio paths/URLs). For LLM-only models, `files` can be omitted. `content` can be a plain string or a list of HuggingFace content blocks (e.g. `[{"type": "image"}, {"type": "text", "text": "..."}]` for vision inputs).
+
+Ready-to-use examples for each model type are provided under `assets/samples/`:
+
+| Model type | Example file |
+|---|---|
+| LLM | [assets/samples/text.json](assets/samples/text.json) |
+| ALM (audio) | [assets/samples/audio.json](assets/samples/audio.json) |
+| VLM (vision) | [assets/samples/vision.json](assets/samples/vision.json) |
+
 #### Quantization Guidance
 
 To automatically identify sensitive layers and generate a mixed-precision recipe suggestion, add the `--quant_recipe_suggestion` flag. During calibration, the analyzer compares FP32 and QDQ intermediate outputs layer-by-layer using SQNR, then writes two files to the working directory: