rsenthilkumar6
diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml
Lines changed: 2 additions & 2 deletions
diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
Lines changed: 2 additions & 2 deletions
diff --git a/.github/workflows/build-and-test-snapdragon.yml b/.github/workflows/build-and-test-snapdragon.yml
Lines changed: 2 additions & 2 deletions
diff --git a/common/arg.cpp b/common/arg.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/common/common.cpp b/common/common.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
Lines changed: 3 additions & 3 deletions
diff --git a/docs/backend/snapdragon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json
Lines changed: 4 additions & 4 deletions
diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
Lines changed: 2 additions & 0 deletions
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
Lines changed: 1 addition & 1 deletion
@@ -100,8 +100,8 @@ body:
       label: Relevant log output
       description: >
           Please copy and paste any relevant log output, including the command that you entered and any generated text.
-          For very long logs (thousands of lines), preferably upload them as files instead.
-          On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
+          For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
+          On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
       value: |
         <details>
         <summary>Logs</summary>
 
@@ -88,8 +88,8 @@ body:
       description: >
           If applicable, please copy and paste any relevant log output, including any generated text.
           If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
-          For very long logs (thousands of lines), please upload them as files instead.
-          On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
+          For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
+          On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
       value: |
         <details>
         <summary>Logs</summary>
 
@@ -31,7 +31,7 @@ jobs:
   android-ndk-snapdragon:
     runs-on: ubuntu-latest
     container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
+      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.6'
     defaults:
       run:
         shell: bash
@@ -61,7 +61,7 @@ jobs:
   linux-iot-snapdragon:
     runs-on: ubuntu-latest
     container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1'
+      image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.6'
     defaults:
       run:
         shell: bash
 
@@ -3364,7 +3364,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             " - 1: error\n"
             " - 2: warning\n"
             " - 3: info\n"
-            " - 4: debug\n"
+            " - 4: trace (more info)\n"
+            " - 5: debug\n"
             "(default: %d)\n", params.verbosity),
         [](common_params & params, int value) {
             params.verbosity = value;
 
@@ -1173,7 +1173,7 @@ common_init_result::common_init_result(common_params & params, bool model_only)
             params.tensor_buft_overrides.data(),
             params.fit_params_target.data(),
             params.fit_params_min_ctx,
-            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+            params.verbosity >= LOG_LEVEL_DEBUG ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
     }
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
@@ -1366,7 +1366,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
     }
 
     if (params.warmup) {
-        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
+        LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         llama_set_warmup(lctx, true);
 
 
@@ -115,15 +115,15 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--mmproj", action="store_true",
-        help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.",
+        help="Export multimodal projector (mmproj) for vision models. This will only work on some vision models. An 'mmproj-' prefix will be added to the output file name.",
     )
     parser.add_argument(
         "--mtp", action="store_true",
-        help="(Experimental) Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. Output file name will get a '-MTP' suffix.",
+        help="Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. An 'mtp-' prefix will be added to the output file name.",
     )
     parser.add_argument(
         "--no-mtp", action="store_true",
-        help="(Experimental) Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, so the bundled default is more space-efficient overall.",
+        help="Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, but even though the bundled default is more space-efficient overall, this allows differing quantization which may be more performant.",
     )
     parser.add_argument(
         "--mistral-format", action="store_true",
 
@@ -10,8 +10,8 @@
             "ANDROID_ABI":      "arm64-v8a",
             "ANDROID_PLATFORM": "android-31",
             "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
-            "CMAKE_C_FLAGS":   "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
-            "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
+            "CMAKE_C_FLAGS":   "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
+            "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
             "CMAKE_C_FLAGS_RELEASE":          "-O3 -DNDEBUG",
             "CMAKE_CXX_FLAGS_RELEASE":        "-O3 -DNDEBUG",
             "CMAKE_C_FLAGS_RELWITHDEBINFO":   "-O3 -DNDEBUG -g",
@@ -59,8 +59,8 @@
         "toolset":      { "value": "host=x86_64", "strategy": "external" },
         "cacheVariables": {
             "CMAKE_TOOLCHAIN_FILE": "cmake/arm64-linux-clang.cmake",
-            "CMAKE_C_FLAGS":   "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
-            "CMAKE_CXX_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
+            "CMAKE_C_FLAGS":   "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE",
+            "CMAKE_CXX_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE",
             "CMAKE_C_FLAGS_RELEASE":          "-O3 -DNDEBUG",
             "CMAKE_CXX_FLAGS_RELEASE":        "-O3 -DNDEBUG",
             "CMAKE_C_FLAGS_RELWITHDEBINFO":   "-O3 -DNDEBUG -g",
 
@@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
 This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
 
 ```
-~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3
+~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.6
 [d]/> cd /workspace
 ```
 
 
@@ -359,7 +359,9 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q4_K:
+                    return 8;
                 case GGML_TYPE_Q6_K:
+                    return 2;
                 case GGML_TYPE_IQ4_NL:
                     return 8;
                 default:
 
@@ -2661,7 +2661,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
 
     int mode = op_params[2];
 
-    if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
+    if (mode == GGML_ROPE_TYPE_VISION) {
         return false;
     }
     if (mode & 1) {
Original file line number	Diff line number	Diff line change
`@@ -1173,7 +1173,7 @@ common_init_result::common_init_result(common_params & params, bool model_only)`
`1173`	`1173`	`params.tensor_buft_overrides.data(),`
`1174`	`1174`	`params.fit_params_target.data(),`
`1175`	`1175`	`params.fit_params_min_ctx,`
`1176`		`- params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);`
	`1176`	`+ params.verbosity >= LOG_LEVEL_DEBUG ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);`
`1177`	`1177`	`}`
`1178`	`1178`
`1179`	`1179`	`llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);`
`@@ -1366,7 +1366,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode`
`1366`	`1366`	`}`
`1367`	`1367`
`1368`	`1368`	`if (params.warmup) {`
`1369`		`- LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);`
	`1369`	`+ LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);`
`1370`	`1370`
`1371`	`1371`	`llama_set_warmup(lctx, true);`
`1372`	`1372`
Original file line number	Diff line number	Diff line change
`@@ -115,15 +115,15 @@ def parse_args() -> argparse.Namespace:`
`115`	`115`	`)`
`116`	`116`	`parser.add_argument(`
`117`	`117`	`"--mmproj", action="store_true",`
`118`		`- help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.",`
	`118`	`+ help="Export multimodal projector (mmproj) for vision models. This will only work on some vision models. An 'mmproj-' prefix will be added to the output file name.",`
`119`	`119`	`)`
`120`	`120`	`parser.add_argument(`
`121`	`121`	`"--mtp", action="store_true",`
`122`		`- help="(Experimental) Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. Output file name will get a '-MTP' suffix.",`
	`122`	`+ help="Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. An 'mtp-' prefix will be added to the output file name.",`
`123`	`123`	`)`
`124`	`124`	`parser.add_argument(`
`125`	`125`	`"--no-mtp", action="store_true",`
`126`		`- help="(Experimental) Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, so the bundled default is more space-efficient overall.",`
	`126`	`+ help="Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, but even though the bundled default is more space-efficient overall, this allows differing quantization which may be more performant.",`
`127`	`127`	`)`
`128`	`128`	`parser.add_argument(`
`129`	`129`	`"--mistral-format", action="store_true",`
Original file line number	Diff line number	Diff line change
`@@ -2661,7 +2661,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess`
`2661`	`2661`
`2662`	`2662`	`int mode = op_params[2];`
`2663`	`2663`
`2664`		`- if ((mode & GGML_ROPE_TYPE_MROPE) \|\| (mode & GGML_ROPE_TYPE_VISION)) {`
	`2664`	`+ if (mode == GGML_ROPE_TYPE_VISION) {`
`2665`	`2665`	`return false;`
`2666`	`2666`	`}`
`2667`	`2667`	`if (mode & 1) {`