quant-wht: add IQ support and configurable skip list

mengqin · mengqin · commit f73287f748ec · 2026-05-02T19:15:41.000-07:00
Extend GGUF-level quant_wht to IQ tensor types and make WHT skipping
metadata-driven. --quant-wht now uses the default low-precision skip list,
--quant-wht-full rotates all eligible tensors, and --quant-wht-skip-type
allows overriding skipped GGML tensor types.

Persist general.quant_wht.skip_types in GGUF and make model loading/decode
honor that list; a GGUF without skip_types is loaded with an empty skip list
(every eligible tensor is treated as rotated), so existing files keep working.
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
@@ -56,7 +56,16 @@ static bool ggml_cuda_quant_wht_type_supported(const ggml_type type) {
            type == GGML_TYPE_Q4_K ||
            type == GGML_TYPE_Q5_K ||
            type == GGML_TYPE_Q6_K ||
-           type == GGML_TYPE_Q8_0;
+           type == GGML_TYPE_Q8_0 ||
+           type == GGML_TYPE_IQ1_S ||
+           type == GGML_TYPE_IQ1_M ||
+           type == GGML_TYPE_IQ2_XXS ||
+           type == GGML_TYPE_IQ2_XS ||
+           type == GGML_TYPE_IQ2_S ||
+           type == GGML_TYPE_IQ3_XXS ||
+           type == GGML_TYPE_IQ3_S ||
+           type == GGML_TYPE_IQ4_NL ||
+           type == GGML_TYPE_IQ4_XS;
 }
 
 static void ggml_cuda_quant_wht_log_once(const ggml_type type, const char * path) {
diff --git a/include/llama.h b/include/llama.h
@@ -411,7 +411,9 @@ extern "C" {
         bool pure;                                                  // quantize all tensors to the default type
         bool keep_split;                                            // quantize to the same number of shards
         bool dry_run;                                               // calculate and show the final quantization size without performing quantization
-        bool quant_wht;                                             // store eligible Q_K tensors in WHT-rotated domain
+        bool quant_wht;                                             // store eligible Q_K/Q8_0/IQ tensors in WHT-rotated domain
+        bool quant_wht_full;                                        // rotate every eligible tensor instead of using the skip list
+        const char * quant_wht_skip_types;                          // comma-separated GGML tensor types to leave unrotated when quant_wht_full is false
         uint32_t quant_wht_dim;                                      // WHT dimension, currently only 256 is supported
         const struct llama_model_imatrix_data * imatrix;            // pointer to importance matrix data
         const struct llama_model_kv_override * kv_overrides;        // pointer to kv overrides
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
@@ -40,6 +40,7 @@ struct llama_hparams {
     bool use_par_res;
     bool swin_norm;
     bool quant_wht_enabled = false;
+    char quant_wht_skip_types[512] = {};
 
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
@@ -7,6 +7,7 @@
 
 #include <algorithm>
 #include <array>
+#include <cctype>
 #include <cinttypes>
 #include <cstdint>
 #include <cstdlib>
@@ -1061,7 +1062,82 @@ static bool llama_model_quant_wht_type_supported(ggml_type type) {
            type == GGML_TYPE_Q4_K ||
            type == GGML_TYPE_Q5_K ||
            type == GGML_TYPE_Q6_K ||
-           type == GGML_TYPE_Q8_0;
+           type == GGML_TYPE_Q8_0 ||
+           type == GGML_TYPE_IQ1_S ||
+           type == GGML_TYPE_IQ1_M ||
+           type == GGML_TYPE_IQ2_XXS ||
+           type == GGML_TYPE_IQ2_XS ||
+           type == GGML_TYPE_IQ2_S ||
+           type == GGML_TYPE_IQ3_XXS ||
+           type == GGML_TYPE_IQ3_S ||
+           type == GGML_TYPE_IQ4_NL ||
+           type == GGML_TYPE_IQ4_XS;
+}
+
+// Maps a ggml tensor type to the spelling used for it in
+// general.quant_wht.skip_types entries. Only the quantized types listed in the
+// table below have a spelling; every other type yields nullptr.
+static const char * llama_model_quant_wht_type_name(ggml_type type) {
+    static const struct {
+        ggml_type    id;
+        const char * label;
+    } k_known[] = {
+        { GGML_TYPE_Q2_K,    "Q2_K"    },
+        { GGML_TYPE_Q3_K,    "Q3_K"    },
+        { GGML_TYPE_Q4_K,    "Q4_K"    },
+        { GGML_TYPE_Q5_K,    "Q5_K"    },
+        { GGML_TYPE_Q6_K,    "Q6_K"    },
+        { GGML_TYPE_Q8_0,    "Q8_0"    },
+        { GGML_TYPE_IQ1_S,   "IQ1_S"   },
+        { GGML_TYPE_IQ1_M,   "IQ1_M"   },
+        { GGML_TYPE_IQ2_XXS, "IQ2_XXS" },
+        { GGML_TYPE_IQ2_XS,  "IQ2_XS"  },
+        { GGML_TYPE_IQ2_S,   "IQ2_S"   },
+        { GGML_TYPE_IQ3_XXS, "IQ3_XXS" },
+        { GGML_TYPE_IQ3_S,   "IQ3_S"   },
+        { GGML_TYPE_IQ4_NL,  "IQ4_NL"  },
+        { GGML_TYPE_IQ4_XS,  "IQ4_XS"  },
+    };
+
+    for (const auto & entry : k_known) {
+        if (entry.id == type) {
+            return entry.label;
+        }
+    }
+    return nullptr;
+}
+
+// Returns `token` with every whitespace character removed and the remaining
+// characters upper-cased via std::toupper, e.g. " iq4_xs " becomes "IQ4_XS".
+static std::string llama_model_quant_wht_normalize_type_token(std::string token) {
+    std::string cleaned;
+    cleaned.reserve(token.size());
+    for (const char raw : token) {
+        // <cctype> functions require a value representable as unsigned char
+        const unsigned char ch = (unsigned char) raw;
+        if (std::isspace(ch) == 0) {
+            cleaned.push_back((char) std::toupper(ch));
+        }
+    }
+    return cleaned;
+}
+
+// Resolves one skip-list entry to a ggml type. The entry is normalized first,
+// so matching ignores whitespace and letter case. Returns GGML_TYPE_COUNT as
+// the "no match" sentinel when no named type has that spelling.
+static ggml_type llama_model_quant_wht_parse_type_token(const std::string & token) {
+    const std::string wanted = llama_model_quant_wht_normalize_type_token(token);
+
+    ggml_type resolved = GGML_TYPE_COUNT;
+    for (int idx = 0; idx < GGML_TYPE_COUNT; ++idx) {
+        const char * candidate = llama_model_quant_wht_type_name((ggml_type) idx);
+        if (candidate == nullptr) {
+            continue; // this type has no skip-list spelling
+        }
+        if (wanted == candidate) {
+            resolved = (ggml_type) idx;
+            break;
+        }
+    }
+    return resolved;
+}
+
+// Reports whether `type` is named in the comma-separated `skip_types` list.
+//
+// Entries are matched ignoring whitespace and letter case. Entries that are
+// empty after normalization (e.g. the middle of "Q8_0,,IQ4_XS" or a trailing
+// comma) are ignored. Throws std::runtime_error if any entry does not name a
+// type accepted by llama_model_quant_wht_type_supported().
+//
+// Every entry is validated before the result is returned, so a malformed list
+// is rejected regardless of which `type` is queried or where in the list the
+// bad entry sits.
+static bool llama_model_quant_wht_skip_list_has(const std::string & skip_types, ggml_type type) {
+    bool   found = false;
+    size_t start = 0;
+    while (start <= skip_types.size()) {
+        const size_t end = skip_types.find(',', start);
+        // npos as the length means "up to the end of the string" for the last entry
+        const std::string token = skip_types.substr(start, end == std::string::npos ? std::string::npos : end - start);
+        if (!llama_model_quant_wht_normalize_type_token(token).empty()) {
+            const ggml_type parsed = llama_model_quant_wht_parse_type_token(token);
+            if (parsed == GGML_TYPE_COUNT || !llama_model_quant_wht_type_supported(parsed)) {
+                throw std::runtime_error(format("unsupported general.quant_wht.skip_types entry: %s", token.c_str()));
+            }
+            // keep scanning after a match so later entries are still validated
+            found = found || parsed == type;
+        }
+        if (end == std::string::npos) {
+            break;
+        }
+        start = end + 1;
+    }
+    return found;
+}
+
+// True when `type` is both accepted by llama_model_quant_wht_type_supported()
+// and absent from the comma-separated `skip_types` list. May throw
+// std::runtime_error from the skip-list lookup when the list is malformed.
+static bool llama_model_quant_wht_type_enabled(ggml_type type, const std::string & skip_types) {
+    if (!llama_model_quant_wht_type_supported(type)) {
+        // unsupported types never consult the skip list
+        return false;
+    }
+    const bool skipped = llama_model_quant_wht_skip_list_has(skip_types, type);
+    return !skipped;
 }
 
 static bool llama_model_quant_wht_backend_supported(ggml_backend_dev_t dev) {
@@ -1174,12 +1250,12 @@ struct ggml_tensor * llama_model_loader::create_tensor(
         const bool quant_wht_tensor =
             hparams.quant_wht_enabled &&
             (op == GGML_OP_MUL_MAT || op == GGML_OP_MUL_MAT_ID) &&
-            llama_model_quant_wht_type_supported(t_meta->type) &&
+            llama_model_quant_wht_type_enabled(t_meta->type, hparams.quant_wht_skip_types) &&
             llama_model_quant_wht_name_supported(tn);
 
         if (hparams.quant_wht_enabled &&
                 (op == GGML_OP_MUL_MAT || op == GGML_OP_MUL_MAT_ID) &&
-                llama_model_quant_wht_type_supported(t_meta->type) &&
+                llama_model_quant_wht_type_enabled(t_meta->type, hparams.quant_wht_skip_types) &&
                 llama_model_quant_wht_name_supported(tn) &&
                 t_meta->ne[0] % 256 != 0) {
             throw std::runtime_error(format("general.quant_wht tensor %s has unsupported reduction dimension %" PRId64,
@@ -1191,8 +1267,9 @@ struct ggml_tensor * llama_model_loader::create_tensor(
             if (getenv("GGML_CUDA_LOG_QUANT_WHT") != nullptr) {
                 static int n_logged = 0;
                 if (n_logged < 8) {
-                    LLAMA_LOG_INFO("%s: quant_wht tensor flagged: %s type=%s dim=%" PRId64 "\n",
-                            __func__, tn.str().c_str(), ggml_type_name(t_meta->type), t_meta->ne[0]);
+                    LLAMA_LOG_INFO("%s: quant_wht tensor flagged: %s type=%s dim=%" PRId64 " skip_types=%s\n",
+                            __func__, tn.str().c_str(), ggml_type_name(t_meta->type), t_meta->ne[0],
+                            hparams.quant_wht_skip_types[0] == '\0' ? "<none>" : hparams.quant_wht_skip_types);
                     ++n_logged;
                 }
             }
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -22,6 +22,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cfloat>
+#include <cstdio>
 #include <cstdint>
 #include <cstring>
 #include <cmath>
@@ -731,14 +732,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         if (!ml.get_key("general.quant_wht.version", quant_wht_version, false)) {
             throw std::runtime_error("general.quant_wht.enabled=true but general.quant_wht.version is missing");
         }
+        quant_wht_skip_types.clear();
+        ml.get_key("general.quant_wht.skip_types", quant_wht_skip_types, false);
         if (quant_wht_dim != 256 || quant_wht_scheme != "pqk_rht_v1" || quant_wht_version != 1) {
             throw std::runtime_error(format("unsupported general.quant_wht metadata: dim=%u scheme=%s version=%u",
                         quant_wht_dim, quant_wht_scheme.c_str(), quant_wht_version));
         }
         hparams.quant_wht_dim = quant_wht_dim;
         hparams.quant_wht_version = quant_wht_version;
-        LLAMA_LOG_WARN("%s: WARNING: experimental WHT-rotated Q_K GGUF detected (dim=%u, scheme=%s, version=%u)\n",
-                __func__, quant_wht_dim, quant_wht_scheme.c_str(), quant_wht_version);
+        if (quant_wht_skip_types.size() >= sizeof(hparams.quant_wht_skip_types)) {
+            throw std::runtime_error("general.quant_wht.skip_types is too long");
+        }
+        snprintf(hparams.quant_wht_skip_types, sizeof(hparams.quant_wht_skip_types), "%s", quant_wht_skip_types.c_str());
+        LLAMA_LOG_WARN("%s: WARNING: experimental WHT-rotated Q_K/Q8_0/IQ GGUF detected (dim=%u, scheme=%s, version=%u, skip_types=%s)\n",
+                __func__, quant_wht_dim, quant_wht_scheme.c_str(), quant_wht_version,
+                quant_wht_skip_types.empty() ? "<none>" : quant_wht_skip_types.c_str());
     }
 
     // everything past this point is not vocab-related
@@ -8185,6 +8193,7 @@ void llama_model::print_info() const {
     if (quant_wht_enabled) {
         LLAMA_LOG_INFO("%s: quant_wht_dim         = %u\n",     __func__, quant_wht_dim);
         LLAMA_LOG_INFO("%s: quant_wht_scheme      = %s\n",     __func__, quant_wht_scheme.c_str());
+        LLAMA_LOG_INFO("%s: quant_wht_skip_types  = %s\n",     __func__, quant_wht_skip_types.empty() ? "<none>" : quant_wht_skip_types.c_str());
     }
 
     if (!hparams.vocab_only) {
diff --git a/src/llama-model.h b/src/llama-model.h
@@ -566,6 +566,7 @@ struct llama_model {
     uint32_t quant_wht_dim = 0;
     uint32_t quant_wht_version = 0;
     std::string quant_wht_scheme;
+    std::string quant_wht_skip_types;
 
     // list of devices used in this model
     std::vector<llama_device> devices;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp