Skip to content

Commit a53e4cf

Browse files
committed
WIP: SpectralQuant
1 parent b670c83 commit a53e4cf

59 files changed

Lines changed: 10280 additions & 79 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

SpectralQuant.md

Lines changed: 197 additions & 0 deletions
Large diffs are not rendered by default.

common/arg.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,9 @@ const std::vector<ggml_type> kv_cache_types = {
390390
GGML_TYPE_TURBO2_0,
391391
GGML_TYPE_TURBO3_0,
392392
GGML_TYPE_TURBO4_0,
393+
GGML_TYPE_SKV2_0,
394+
GGML_TYPE_SKV3_0,
395+
GGML_TYPE_SKV4_0,
393396
};
394397

395398
static ggml_type kv_cache_type_from_str(const std::string & s) {
@@ -2031,6 +2034,25 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
20312034
params.cache_type_v = kv_cache_type_from_str(value);
20322035
}
20332036
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
2037+
add_opt(common_arg(
2038+
{"--spectral-calibration"}, "FILE",
2039+
"spectral calibration GGUF sidecar for SKV cache types",
2040+
[](common_params & params, const std::string & value) {
2041+
params.spectral_calibration = value;
2042+
}
2043+
).set_env("LLAMA_ARG_SPECTRAL_CALIBRATION"));
2044+
add_opt(common_arg(
2045+
{"--spectral-profile"}, "PROFILE",
2046+
string_format(
2047+
"spectral profile filter for SKV/SQ runtime\n"
2048+
"allowed values: auto, all, nonuniform, selcorr\n"
2049+
"(default: %s)",
2050+
params.spectral_profile.c_str()
2051+
),
2052+
[](common_params & params, const std::string & value) {
2053+
params.spectral_profile = value;
2054+
}
2055+
).set_env("LLAMA_ARG_SPECTRAL_PROFILE"));
20342056
add_opt(common_arg(
20352057
{"--hellaswag"},
20362058
"compute HellaSwag score over random tasks from datafile supplied with -f",

common/common.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1449,6 +1449,8 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
14491449
mparams.progress_callback = params.load_progress_callback;
14501450
mparams.progress_callback_user_data = params.load_progress_callback_user_data;
14511451
mparams.no_alloc = params.no_alloc;
1452+
mparams.spectral_calibration = params.spectral_calibration.empty() ? nullptr : params.spectral_calibration.c_str();
1453+
mparams.spectral_profile = params.spectral_profile.empty() ? nullptr : params.spectral_profile.c_str();
14521454

14531455
return mparams;
14541456
}
@@ -1485,6 +1487,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
14851487

14861488
cparams.type_k = params.cache_type_k;
14871489
cparams.type_v = params.cache_type_v;
1490+
cparams.spectral_calibration = params.spectral_calibration.empty() ? nullptr : params.spectral_calibration.c_str();
1491+
cparams.spectral_profile = params.spectral_profile.empty() ? nullptr : params.spectral_profile.c_str();
14881492

14891493
return cparams;
14901494
}

common/common.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,8 @@ struct common_params_speculative {
343343

344344
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
345345
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
346+
std::string spectral_calibration; // spectral calibration GGUF sidecar for SKV runtime
347+
std::string spectral_profile = "auto"; // spectral profile filter: auto, nonuniform, selcorr, all
346348

347349
struct cpu_params cpuparams;
348350
struct cpu_params cpuparams_batch;
@@ -548,6 +550,8 @@ struct common_params {
548550

549551
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
550552
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
553+
std::string spectral_calibration; // spectral calibration GGUF sidecar for SKV runtime
554+
std::string spectral_profile = "auto"; // spectral profile filter: auto, nonuniform, selcorr, all
551555

552556
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
553557

ggml/include/ggml.h

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,13 @@ extern "C" {
433433
GGML_TYPE_TURBO2_0 = 43, // TurboQuant 2-bit KV cache: 2-bit PolarQuant (no QJL)
434434
GGML_TYPE_TQ3_1S = 44, // TurboQuant 3-bit weight: WHT-rotated 8-level Lloyd-Max, block_size=32
435435
GGML_TYPE_TQ4_1S = 45, // TurboQuant 4-bit weight: WHT-rotated 16-level Lloyd-Max, block_size=32
436-
GGML_TYPE_COUNT = 46,
436+
GGML_TYPE_SQ2_0 = 46, // SpectralQuant 2-bit weight: spectral-domain non-uniform codes + semantic correction
437+
GGML_TYPE_SQ3_1S = 47, // SpectralQuant 3-bit weight: spectral-domain non-uniform codes + semantic correction
438+
GGML_TYPE_SQ4_1S = 48, // SpectralQuant 4-bit weight: spectral-domain non-uniform codes + semantic correction
439+
GGML_TYPE_SKV2_0 = 49, // SpectralQuant 2-bit KV cache: separate semantic/tail codebooks
440+
GGML_TYPE_SKV3_0 = 50, // SpectralQuant 3-bit KV cache: separate semantic/tail codebooks
441+
GGML_TYPE_SKV4_0 = 51, // SpectralQuant 4-bit KV cache: separate semantic/tail codebooks
442+
GGML_TYPE_COUNT = 52,
437443
};
438444

439445
// precision
@@ -2717,6 +2723,67 @@ extern "C" {
27172723
int64_t n_per_row,
27182724
const float * imatrix);
27192725

2726+
// Metadata describing the spectral transform and codebooks used by the
// SpectralQuant weight types. A pointer to this struct is taken by
// ggml_quantize_spectral_weight() and ggml_spectral_register_tensor().
// The struct only stores raw const pointers; ownership and the required
// lifetime of the pointed-to arrays are not stated in this header.
// NOTE(review): confirm lifetime requirements against the implementation.
struct ggml_spectral_weight_meta {
2727+
uint32_t dim; // NOTE(review): presumably the dimension of the spectral basis -- confirm
2728+
uint32_t split; // NOTE(review): presumably the boundary between "semantic" and "tail" coordinates -- confirm
2729+
uint32_t correction_dim; // NOTE(review): presumably the number of leading coordinates that carry a correction -- confirm
2730+
uint32_t semantic_codebook_size; // NOTE(review): presumably the entry count of semantic_codebook -- confirm
2731+
uint32_t tail_codebook_size; // NOTE(review): presumably the entry count of tail_codebook -- confirm
2732+
const float * basis; // read-only basis data; layout is not specified here
2733+
const float * semantic_codebook; // read-only codebook values for the semantic part
2734+
const float * tail_codebook; // read-only codebook values for the tail part
2735+
};
2736+
2737+
// Type predicate for the SpectralQuant weight formats.
// NOTE(review): presumably true for GGML_TYPE_SQ2_0 / SQ3_1S / SQ4_1S only -- confirm.
GGML_API bool ggml_is_spectral_weight_type(enum ggml_type type);
2738+
// Quantizes float data into a SpectralQuant weight type using the supplied metadata.
//   type      - target ggml type
//   src       - input floats
//   dst       - output buffer for the quantized data
//   nrows     - number of rows to quantize
//   n_per_row - number of elements per row
//   meta      - spectral basis / codebook description (see ggml_spectral_weight_meta)
// NOTE(review): the size_t result is presumably the number of bytes written to dst,
// as with the other ggml quantize entry points -- confirm.
GGML_API size_t ggml_quantize_spectral_weight(
2739+
enum ggml_type type,
2740+
const float * src,
2741+
void * dst,
2742+
int64_t nrows,
2743+
int64_t n_per_row,
2744+
const struct ggml_spectral_weight_meta * meta);
2745+
// Associates a quantized tensor buffer (data, size bytes, of the given type) with
// its spectral metadata under an opaque owner key.
// NOTE(review): the meaning of the bool result (presumably success) and whether
// meta is copied or merely referenced are not visible in this header -- confirm.
GGML_API bool ggml_spectral_register_tensor(
2746+
const void * owner,
2747+
const void * data,
2748+
size_t size,
2749+
enum ggml_type type,
2750+
const struct ggml_spectral_weight_meta * meta);
2751+
// Counterpart of ggml_spectral_register_tensor(), keyed by the same opaque owner.
// NOTE(review): presumably drops every registration made for that owner -- confirm.
GGML_API void ggml_spectral_unregister_owner(const void * owner);
2752+
2753+
// Spectral metadata for the SKV KV-cache types. ggml_spectral_kv_meta refers to
// these entries through its `heads` pointer. All pointers are read-only views;
// ownership and lifetime are not stated in this header.
struct ggml_spectral_kv_head_meta {
2754+
uint32_t dim; // NOTE(review): presumably the dimension of this head's basis -- confirm
2755+
uint32_t split; // NOTE(review): presumably the boundary between "semantic" and "tail" coordinates -- confirm
2756+
uint32_t semantic_codebook_size; // NOTE(review): presumably the entry count of semantic_codebook -- confirm
2757+
uint32_t tail_codebook_size; // NOTE(review): presumably the entry count of tail_codebook -- confirm
2758+
const float * basis; // read-only basis data; layout is not specified here
2759+
const float * semantic_codebook; // read-only codebook values for the semantic part
2760+
const float * tail_codebook; // read-only codebook values for the tail part
2761+
const float * qjl_matrix; // NOTE(review): presumably the QJL projection matrix; shape is not specified here -- confirm
2762+
};
2763+
2764+
// Metadata for one SpectralQuant KV-cache tensor, passed to
// ggml_spectral_kv_register_tensor(). Besides read-only head descriptions it
// carries three mutable buffers (non-const pointers).
// NOTE(review): buffer sizes and ownership are not stated in this header -- confirm.
struct ggml_spectral_kv_meta {
2765+
bool is_key; // NOTE(review): presumably true for the K cache, false for the V cache -- confirm
2766+
bool use_correction; // NOTE(review): presumably enables the residual / QJL correction path -- confirm
2767+
uint32_t n_head;
2768+
uint32_t head_dim;
2769+
uint32_t head_dim_padded; // NOTE(review): presumably head_dim rounded up to a block multiple -- confirm
2770+
uint32_t n_rows;
2771+
uint32_t qjl_bytes_per_head;
2772+
const struct ggml_spectral_kv_head_meta * heads; // NOTE(review): presumably an array of n_head entries -- confirm
2773+
float * vec_norms; // writable buffer of norms
2774+
float * residual_norms; // writable buffer of residual norms
2775+
uint8_t * qjl_signs; // writable buffer of QJL sign bits; see qjl_bytes_per_head
2776+
};
2777+
2778+
// Type predicate for the SpectralQuant KV-cache formats.
// NOTE(review): presumably true for GGML_TYPE_SKV2_0 / SKV3_0 / SKV4_0 only -- confirm.
GGML_API bool ggml_is_spectral_kv_type(enum ggml_type type);
2779+
// Associates a KV-cache tensor buffer (data, size bytes, of the given type) with
// its spectral KV metadata under an opaque owner key. Unlike the weight variant,
// data is a non-const pointer here.
// NOTE(review): the meaning of the bool result (presumably success) and whether
// meta is copied or merely referenced are not visible in this header -- confirm.
GGML_API bool ggml_spectral_kv_register_tensor(
2780+
const void * owner,
2781+
void * data,
2782+
size_t size,
2783+
enum ggml_type type,
2784+
const struct ggml_spectral_kv_meta * meta);
2785+
// Counterpart of ggml_spectral_kv_register_tensor(), keyed by the same opaque owner.
// NOTE(review): presumably drops every registration made for that owner -- confirm.
GGML_API void ggml_spectral_kv_unregister_owner(const void * owner);
2786+
27202787
#ifdef __cplusplus
27212788
// restrict not standard in C++
27222789
# if defined(__GNUC__)

ggml/src/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,14 @@ if (GGML_BACKEND_DL)
218218
target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
219219
endif()
220220

221+
# Link ggml-base against the Accelerate framework when building on Apple
# platforms with GGML_ACCELERATE enabled.
if (APPLE AND GGML_ACCELERATE)
222+
find_library(ACCELERATE_FRAMEWORK Accelerate)
223+
# Only define GGML_USE_ACCELERATE and link when the framework was found;
# if the lookup fails the build continues without it and without a message.
if (ACCELERATE_FRAMEWORK)
224+
# PRIVATE: the definition and the link dependency apply to ggml-base itself
# and are not propagated to targets that link against it.
target_compile_definitions(ggml-base PRIVATE GGML_USE_ACCELERATE)
225+
target_link_libraries(ggml-base PRIVATE ${ACCELERATE_FRAMEWORK})
226+
endif()
227+
endif()
228+
221229
if (GGML_SCHED_NO_REALLOC)
222230
target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
223231
endif()

ggml/src/ggml-common.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,55 @@ typedef struct {
357357
} block_tq4_1s; // 20 bytes total
358358
static_assert(sizeof(block_tq4_1s) == 20, "wrong tq4_1s block size");
359359

360+
// SpectralQuant SQ2_0 / SQ3_1S / SQ4_1S:
361+
// spectral-domain weight quantization with per-tensor learned basis,
362+
// separate semantic/tail codebooks, and optional semantic correction.
363+
// The first GGML_SQ_CORR_MAX spectral coordinates can store a 4-bit residual.
364+
#define QK_SQ 32
365+
#define GGML_SQ_CORR_MAX 8
366+
#define GGML_SQ_CORR_BYTES (GGML_SQ_CORR_MAX / 2)
367+
368+
// SQ2_0 block: QK_SQ (32) weights stored as 2-bit packed indices, preceded by a
// half-precision correction scale and GGML_SQ_CORR_BYTES of packed 4-bit residuals.
typedef struct {
369+
ggml_half dc; // 2 bytes: semantic correction scale
370+
uint8_t corr[GGML_SQ_CORR_BYTES]; // 4 bytes: signed 4-bit residuals for leading semantic coeffs (two per byte)
371+
uint8_t qs[QK_SQ / 4]; // 8 bytes: 2-bit packed indices (four per byte)
372+
} block_sq2_0; // 14 bytes total
373+
// Pins the layout: 2 + 4 + 8 bytes with no padding.
static_assert(sizeof(block_sq2_0) == 14, "wrong sq2_0 block size");
374+
375+
// SQ3_1S block: same field layout as block_sq2_0, but the QK_SQ (32) indices
// are packed at 3 bits each.
typedef struct {
376+
ggml_half dc; // 2 bytes: half-precision correction field (see block_sq2_0)
377+
uint8_t corr[GGML_SQ_CORR_BYTES]; // 4 bytes: packed correction residuals (see block_sq2_0)
378+
uint8_t qs[QK_SQ * 3 / 8]; // 12 bytes: 3-bit packed indices (32 * 3 bits)
379+
} block_sq3_1s; // 18 bytes total
380+
// Pins the layout: 2 + 4 + 12 bytes with no padding.
static_assert(sizeof(block_sq3_1s) == 18, "wrong sq3_1s block size");
381+
382+
// SQ4_1S block: same field layout as block_sq2_0, but the QK_SQ (32) indices
// are packed at 4 bits each.
typedef struct {
383+
ggml_half dc; // 2 bytes: half-precision correction field (see block_sq2_0)
384+
uint8_t corr[GGML_SQ_CORR_BYTES]; // 4 bytes: packed correction residuals (see block_sq2_0)
385+
uint8_t qs[QK_SQ / 2]; // 16 bytes: 4-bit packed indices (two per byte)
386+
} block_sq4_1s; // 22 bytes total
387+
// Pins the layout: 2 + 4 + 16 bytes with no padding.
static_assert(sizeof(block_sq4_1s) == 22, "wrong sq4_1s block size");
388+
389+
// SpectralQuant SKV2_0 / SKV3_0 / SKV4_0:
390+
// spectral-domain KV-cache quantization. Per-row norms / QJL signs live in
391+
// runtime side-buffers, so the on-tensor payload is indices-only.
392+
#define QK_SKV 32
393+
394+
// SKV2_0 block: QK_SKV (32) KV-cache elements as 2-bit packed indices.
// The block holds indices only; it has no scale or norm field.
typedef struct {
395+
uint8_t qs[QK_SKV / 4]; // 8 bytes: 2-bit packed indices (four per byte)
396+
} block_skv2_0;
397+
// Pins the layout: exactly the 8 index bytes.
static_assert(sizeof(block_skv2_0) == 8, "wrong skv2_0 block size");
398+
399+
// SKV3_0 block: QK_SKV (32) KV-cache elements as 3-bit packed indices.
// The block holds indices only; it has no scale or norm field.
typedef struct {
400+
uint8_t qs[QK_SKV * 3 / 8]; // 12 bytes: 3-bit packed indices (32 * 3 bits)
401+
} block_skv3_0;
402+
// Pins the layout: exactly the 12 index bytes.
static_assert(sizeof(block_skv3_0) == 12, "wrong skv3_0 block size");
403+
404+
// SKV4_0 block: QK_SKV (32) KV-cache elements as 4-bit packed indices.
// The block holds indices only; it has no scale or norm field.
typedef struct {
405+
uint8_t qs[QK_SKV / 2]; // 16 bytes: 4-bit packed indices (two per byte)
406+
} block_skv4_0;
407+
// Pins the layout: exactly the 16 index bytes.
static_assert(sizeof(block_skv4_0) == 16, "wrong skv4_0 block size");
408+
360409
//
361410
// Super-block quantization structures
362411
//

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,24 @@ static void ggml_vec_dot_tq3_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
221221
static void ggml_vec_dot_tq4_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
222222
const void * GGML_RESTRICT vx, size_t bx,
223223
const void * GGML_RESTRICT vy, size_t by, int nrc);
224+
// Forward declarations of the SpectralQuant vec_dot kernels referenced by
// type_traits_cpu[] below. All take the quantized row in vx and an F32 vector in vy.
// The three SQ (weight) kernels are file-local (static) and defined later in this file.
static void ggml_vec_dot_sq2_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
225+
const void * GGML_RESTRICT vx, size_t bx,
226+
const void * GGML_RESTRICT vy, size_t by, int nrc);
227+
static void ggml_vec_dot_sq3_1s_f32(int n, float * GGML_RESTRICT s, size_t bs,
228+
const void * GGML_RESTRICT vx, size_t bx,
229+
const void * GGML_RESTRICT vy, size_t by, int nrc);
230+
static void ggml_vec_dot_sq4_1s_f32(int n, float * GGML_RESTRICT s, size_t bs,
231+
const void * GGML_RESTRICT vx, size_t bx,
232+
const void * GGML_RESTRICT vy, size_t by, int nrc);
233+
// The SKV (KV-cache) kernels are declared with external linkage.
// NOTE(review): presumably defined in another translation unit; consider moving
// these prototypes into a shared header so declaration and definition cannot drift.
void ggml_vec_dot_skv2_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
234+
const void * GGML_RESTRICT vx, size_t bx,
235+
const void * GGML_RESTRICT vy, size_t by, int nrc);
236+
void ggml_vec_dot_skv3_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
237+
const void * GGML_RESTRICT vx, size_t bx,
238+
const void * GGML_RESTRICT vy, size_t by, int nrc);
239+
void ggml_vec_dot_skv4_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
240+
const void * GGML_RESTRICT vx, size_t bx,
241+
const void * GGML_RESTRICT vy, size_t by, int nrc);
224242

225243
static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
226244
[GGML_TYPE_F32] = {
@@ -441,6 +459,42 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
441459
.vec_dot_type = GGML_TYPE_Q8_0,
442460
.nrows = 1,
443461
},
462+
[GGML_TYPE_SQ2_0] = {
463+
.from_float = NULL,
464+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_sq2_0_f32,
465+
.vec_dot_type = GGML_TYPE_F32,
466+
.nrows = 1,
467+
},
468+
[GGML_TYPE_SQ3_1S] = {
469+
.from_float = NULL,
470+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_sq3_1s_f32,
471+
.vec_dot_type = GGML_TYPE_F32,
472+
.nrows = 1,
473+
},
474+
[GGML_TYPE_SQ4_1S] = {
475+
.from_float = NULL,
476+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_sq4_1s_f32,
477+
.vec_dot_type = GGML_TYPE_F32,
478+
.nrows = 1,
479+
},
480+
[GGML_TYPE_SKV2_0] = {
481+
.from_float = (ggml_from_float_t) quantize_row_skv2_0_ref,
482+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_skv2_0_f32,
483+
.vec_dot_type = GGML_TYPE_F32,
484+
.nrows = 1,
485+
},
486+
[GGML_TYPE_SKV3_0] = {
487+
.from_float = (ggml_from_float_t) quantize_row_skv3_0_ref,
488+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_skv3_0_f32,
489+
.vec_dot_type = GGML_TYPE_F32,
490+
.nrows = 1,
491+
},
492+
[GGML_TYPE_SKV4_0] = {
493+
.from_float = (ggml_from_float_t) quantize_row_skv4_0_ref,
494+
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_skv4_0_f32,
495+
.vec_dot_type = GGML_TYPE_F32,
496+
.nrows = 1,
497+
},
444498
};
445499

446500
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -1296,6 +1350,10 @@ void ggml_compute_forward_mul_mat(
12961350
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
12971351
int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows;
12981352

1353+
if (ggml_is_spectral_weight_type(src0->type)) {
1354+
ggml_sq_vec_cache_reset();
1355+
}
1356+
12991357
GGML_ASSERT(ne0 == ne01);
13001358
GGML_ASSERT(ne1 == ne11);
13011359
GGML_ASSERT(ne2 == ne12);
@@ -1574,6 +1632,10 @@ static void ggml_compute_forward_mul_mat_id(
15741632
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
15751633
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
15761634

1635+
if (ggml_is_spectral_weight_type(type)) {
1636+
ggml_sq_vec_cache_reset();
1637+
}
1638+
15771639
// we don't support permuted src0 or src1
15781640
GGML_ASSERT(nb00 == ggml_type_size(type));
15791641
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@@ -3483,6 +3545,75 @@ static void ggml_vec_dot_tq4_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
34833545
*s = sum;
34843546
}
34853547

3548+
// Dot product of one SQ2_0-quantized row with an F32 vector.
//
//   n          - number of elements in the row and in vy
//   s          - receives the scalar result
//   vx         - quantized SQ2_0 row
//   vy         - n floats
//   bs, bx, by - strides; unused because only one row is processed per call
//   nrc        - rows per call; must be 1
//
// ggml_sq_vec_dot_f32() is tried first. When it returns true this function
// returns immediately (NOTE(review): presumably the helper has already stored
// the result in *s -- confirm). Otherwise the row is expanded to floats with
// the type's to_float and reduced with a scalar loop.
static void ggml_vec_dot_sq2_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
                                   const void * GGML_RESTRICT vx, size_t bx,
                                   const void * GGML_RESTRICT vy, size_t by, int nrc) {
    GGML_ASSERT(nrc == 1);
    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);

    if (ggml_sq_vec_dot_f32(GGML_TYPE_SQ2_0, vx, (const float *) vy, n, s)) {
        return;
    }

    // An empty row has a dot product of zero. Handle it before allocating:
    // malloc(0) may legally return NULL, which would trip the assert below.
    if (n == 0) {
        *s = 0.0f;
        return;
    }

    // NOTE(review): this fallback allocates a full-row scratch buffer on every
    // call; if it turns out to be hot, a reusable per-thread buffer would avoid that.
    float * tmp = (float *) malloc((size_t) n * sizeof(float));
    GGML_ASSERT(tmp != NULL);
    ggml_get_type_traits(GGML_TYPE_SQ2_0)->to_float(vx, tmp, n);

    const float * y = (const float *) vy;
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += tmp[i] * y[i];
    }
    free(tmp);
    *s = sum;
}
3570+
3571+
// Dot product of one SQ3_1S-quantized row with an F32 vector.
//
//   n          - number of elements in the row and in vy
//   s          - receives the scalar result
//   vx         - quantized SQ3_1S row
//   vy         - n floats
//   bs, bx, by - strides; unused because only one row is processed per call
//   nrc        - rows per call; must be 1
//
// ggml_sq_vec_dot_f32() is tried first. When it returns true this function
// returns immediately (NOTE(review): presumably the helper has already stored
// the result in *s -- confirm). Otherwise the row is expanded to floats with
// the type's to_float and reduced with a scalar loop.
static void ggml_vec_dot_sq3_1s_f32(int n, float * GGML_RESTRICT s, size_t bs,
                                    const void * GGML_RESTRICT vx, size_t bx,
                                    const void * GGML_RESTRICT vy, size_t by, int nrc) {
    GGML_ASSERT(nrc == 1);
    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);

    if (ggml_sq_vec_dot_f32(GGML_TYPE_SQ3_1S, vx, (const float *) vy, n, s)) {
        return;
    }

    // An empty row has a dot product of zero. Handle it before allocating:
    // malloc(0) may legally return NULL, which would trip the assert below.
    if (n == 0) {
        *s = 0.0f;
        return;
    }

    // NOTE(review): this fallback allocates a full-row scratch buffer on every
    // call; if it turns out to be hot, a reusable per-thread buffer would avoid that.
    float * tmp = (float *) malloc((size_t) n * sizeof(float));
    GGML_ASSERT(tmp != NULL);
    ggml_get_type_traits(GGML_TYPE_SQ3_1S)->to_float(vx, tmp, n);

    const float * y = (const float *) vy;
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += tmp[i] * y[i];
    }
    free(tmp);
    *s = sum;
}
3593+
3594+
// Dot product of one SQ4_1S-quantized row with an F32 vector.
//
//   n          - number of elements in the row and in vy
//   s          - receives the scalar result
//   vx         - quantized SQ4_1S row
//   vy         - n floats
//   bs, bx, by - strides; unused because only one row is processed per call
//   nrc        - rows per call; must be 1
//
// ggml_sq_vec_dot_f32() is tried first. When it returns true this function
// returns immediately (NOTE(review): presumably the helper has already stored
// the result in *s -- confirm). Otherwise the row is expanded to floats with
// the type's to_float and reduced with a scalar loop.
static void ggml_vec_dot_sq4_1s_f32(int n, float * GGML_RESTRICT s, size_t bs,
                                    const void * GGML_RESTRICT vx, size_t bx,
                                    const void * GGML_RESTRICT vy, size_t by, int nrc) {
    GGML_ASSERT(nrc == 1);
    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);

    if (ggml_sq_vec_dot_f32(GGML_TYPE_SQ4_1S, vx, (const float *) vy, n, s)) {
        return;
    }

    // An empty row has a dot product of zero. Handle it before allocating:
    // malloc(0) may legally return NULL, which would trip the assert below.
    if (n == 0) {
        *s = 0.0f;
        return;
    }

    // NOTE(review): this fallback allocates a full-row scratch buffer on every
    // call; if it turns out to be hot, a reusable per-thread buffer would avoid that.
    float * tmp = (float *) malloc((size_t) n * sizeof(float));
    GGML_ASSERT(tmp != NULL);
    ggml_get_type_traits(GGML_TYPE_SQ4_1S)->to_float(vx, tmp, n);

    const float * y = (const float *) vy;
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += tmp[i] * y[i];
    }
    free(tmp);
    *s = sum;
}
3616+
34863617
// Copies n floats from x to y.
//
//   x - source floats
//   y - destination buffer with room for at least n floats
//   n - element count; values <= 0 copy nothing
//
// The ranges must not overlap, because the copy is done with memcpy.
void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
    // A negative count would wrap to an enormous size_t in the byte-size
    // computation below, so reject non-positive counts up front.
    if (n <= 0) {
        return;
    }
    memcpy(y, x, (size_t) n * sizeof(float));
}

0 commit comments

Comments
 (0)