From 5173dfb36944c39e417c277919d6bab12cddab91 Mon Sep 17 00:00:00 2001 From: Rountaboutt Date: Fri, 17 Apr 2026 22:03:55 +0800 Subject: [PATCH 01/12] tools: add embed weight calibration and int8 quantization --- tools/quantize/ncnn2int8.cpp | 21 +++++++------ tools/quantize/ncnn2table.cpp | 56 +++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 11 deletions(-) diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 55db8d79c2af..77796f93f6a0 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -563,6 +563,15 @@ int NetQuantize::quantize_embed() if (layers[i]->type != "Embed") continue; + char key[256]; + snprintf(key, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter = weight_int8scale_table.find(key); + if (iter == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // Embed - quantize weight from fp32 to int8 ncnn::Embed* embed = (ncnn::Embed*)layers[i]; @@ -573,17 +582,7 @@ int NetQuantize::quantize_embed() const int num_output = embed->num_output; const int input_dim = embed->input_dim; - ncnn::Mat weight_data_int8_scales(1); - { - const float* ptr = embed->weight_data; - float absmax = 0.f; - for (int i = 0; i < embed->weight_data.w; i++) - { - absmax = std::max(absmax, (float)fabs(ptr[i])); - } - - weight_data_int8_scales[0] = absmax == 0.f ? 
1.f : 127 / absmax; - } + ncnn::Mat weight_data_int8_scales = iter->second; { ncnn::Mat weight_data_int8; diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index 7edbdd15128d..2ce934f6aaf5 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -38,6 +38,7 @@ #include "layer/convolution.h" #include "layer/convolutiondepthwise.h" #include "layer/innerproduct.h" +#include "layer/embed.h" class QuantBlobStat { @@ -91,11 +92,13 @@ class QuantNet : public ncnn::Net std::vector conv_layers; std::vector conv_bottom_blobs; std::vector conv_top_blobs; + std::vector embed_layers; // result std::vector quant_blob_stats; std::vector weight_scales; std::vector bottom_blob_scales; + std::vector embed_weight_scales; }; QuantNet::QuantNet() @@ -126,14 +129,22 @@ int QuantNet::init() conv_bottom_blobs.push_back(layer->bottoms[0]); conv_top_blobs.push_back(layer->tops[0]); } + + // find embed layers + else if (layer->type == "Embed") + { + embed_layers.push_back(i); + } } const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + const int embed_layer_count = (int)embed_layers.size(); quant_blob_stats.resize(conv_bottom_blob_count); weight_scales.resize(conv_layer_count); bottom_blob_scales.resize(conv_bottom_blob_count); + embed_weight_scales.resize(embed_layer_count); return 0; } @@ -149,6 +160,7 @@ int QuantNet::save_table(const char* tablepath) const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + const int embed_layer_count = (int)embed_layers.size(); fprintf(stdout, "param:%d\n", conv_layer_count); @@ -175,6 +187,14 @@ int QuantNet::save_table(const char* tablepath) } fprintf(fp, "\n"); } + + fprintf(stdout, "param:%d\n", embed_layer_count); + for (int i = 0; i < embed_layer_count; i++) + { + fprintf(fp, "%s_param_0 ", layers[embed_layers[i]]->name.c_str()); + fprintf(fp, "%f ", 
embed_weight_scales[i]); + fprintf(fp, "\n"); + } fclose(fp); @@ -302,6 +322,8 @@ int QuantNet::quantize_KL() const int input_blob_count = (int)input_blobs.size(); const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + const int embed_layer_count = (int)embed_layers.size(); + const int file_count = (int)listspaths[0].size(); const int num_histogram_bins = 2048; @@ -407,6 +429,22 @@ int QuantNet::quantize_KL() } } + // initialize embed weight scales + for (int i = 0; i < embed_layer_count; i++) + { + const ncnn::Layer* layer = layers[embed_layers[i]]; + const ncnn::Embed* embed = (const ncnn::Embed*)layer; + const float* ptr = embed->weight_data; + + float absmax = 0.f; + for (int j = 0; j < embed->weight_data.w; j++) + { + absmax = std::max(absmax, (float)fabs(ptr[j])); + } + embed_weight_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; + + } + // count the absmax #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1) for (int i = 0; i < file_count; i++) @@ -780,6 +818,8 @@ int QuantNet::quantize_ACIQ() const int input_blob_count = (int)input_blobs.size(); const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + const int embed_layer_count = (int)embed_layers.size(); + const int file_count = (int)listspaths[0].size(); std::vector blob_allocators(quantize_num_threads); @@ -887,6 +927,22 @@ int QuantNet::quantize_ACIQ() } } + // initialize embed weight scales + for (int i = 0; i < embed_layer_count; i++) + { + const ncnn::Layer* layer = layers[embed_layers[i]]; + const ncnn::Embed* embed = (const ncnn::Embed*)layer; + const float* ptr = embed->weight_data; + + float absmax = 0.f; + for (int j = 0; j < embed->weight_data.w; j++) + { + absmax = std::max(absmax, (float)fabs(ptr[j])); + } + embed_weight_scales[i] = absmax == 0.f ? 
1.f : 127 / absmax; + + } + // count the absmax #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1) for (int i = 0; i < file_count; i++) From db9850aaa8100076b41e952c1a8d22dcb9909855 Mon Sep 17 00:00:00 2001 From: Rountaboutt Date: Sun, 19 Apr 2026 11:02:45 +0800 Subject: [PATCH 02/12] tools: add embed weight calibration and int8 quantization --- tools/quantize/ncnn2int8.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 77796f93f6a0..b7f567f29ac7 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -1119,4 +1119,4 @@ int main(int argc, char** argv) quantizer.save(outparam, outbin); return 0; -} +} \ No newline at end of file From 0207b6bc43234ebdaa98bb5d07ab520fff89bfc3 Mon Sep 17 00:00:00 2001 From: Rountaboutt Date: Sun, 19 Apr 2026 19:06:19 +0800 Subject: [PATCH 03/12] tools:add MutiHeadAttention layers' weight scales in ncnn2table --- tools/quantize/ncnn2table.cpp | 117 +++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index 2ce934f6aaf5..187e42b1eb03 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -39,6 +39,7 @@ #include "layer/convolutiondepthwise.h" #include "layer/innerproduct.h" #include "layer/embed.h" +#include "layer/multiheadattention.h" class QuantBlobStat { @@ -62,6 +63,15 @@ class QuantBlobStat std::vector histogram_normed; }; +class QuantMHAStat +{ +public: + ncnn::Mat q_weight_scales; + ncnn::Mat k_weight_scales; + ncnn::Mat v_weight_scales; + float out_weight_scale; +}; + class QuantNet : public ncnn::Net { public: @@ -93,12 +103,14 @@ class QuantNet : public ncnn::Net std::vector conv_bottom_blobs; std::vector conv_top_blobs; std::vector embed_layers; + std::vector mha_layers; // result std::vector quant_blob_stats; std::vector weight_scales; std::vector 
bottom_blob_scales; std::vector embed_weight_scales; + std::vector mha_stats; }; QuantNet::QuantNet() @@ -135,16 +147,24 @@ int QuantNet::init() { embed_layers.push_back(i); } + + // find all mha layers + else if (layer->type == "MultiHeadAttention") + { + mha_layers.push_back(i); + } } const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); const int embed_layer_count = (int)embed_layers.size(); + const int mha_layer_count = (int)mha_layers.size(); quant_blob_stats.resize(conv_bottom_blob_count); weight_scales.resize(conv_layer_count); bottom_blob_scales.resize(conv_bottom_blob_count); embed_weight_scales.resize(embed_layer_count); + mha_stats.resize(mha_layer_count); return 0; } @@ -161,6 +181,7 @@ int QuantNet::save_table(const char* tablepath) const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); const int embed_layer_count = (int)embed_layers.size(); + const int mha_layer_count = (int)mha_layers.size(); fprintf(stdout, "param:%d\n", conv_layer_count); @@ -190,12 +211,48 @@ int QuantNet::save_table(const char* tablepath) fprintf(stdout, "param:%d\n", embed_layer_count); for (int i = 0; i < embed_layer_count; i++) - { + { fprintf(fp, "%s_param_0 ", layers[embed_layers[i]]->name.c_str()); fprintf(fp, "%f ", embed_weight_scales[i]); fprintf(fp, "\n"); } + fprintf(stdout, "param:%d\n", mha_layer_count); + for (int i = 0; i < mha_layer_count; i++) + { + // q_weight + const ncnn::Mat q_weight_scales = mha_stats[i].q_weight_scales; + fprintf(fp, "%s_param_0 ", layers[mha_layers[i]]->name.c_str()); + for (int j = 0; j < q_weight_scales.w; j++) + { + fprintf(fp, "%f ", q_weight_scales[j]); + } + fprintf(fp, "\n"); + + // k_weight + const ncnn::Mat k_weight_scales = mha_stats[i].k_weight_scales; + fprintf(fp, "%s_param_1 ", layers[mha_layers[i]]->name.c_str()); + for (int j = 0; j < k_weight_scales.w; j++) + { + fprintf(fp, "%f ", 
k_weight_scales[j]); + } + fprintf(fp, "\n"); + + // v_weight + const ncnn::Mat v_weight_scales = mha_stats[i].v_weight_scales; + fprintf(fp, "%s_param_2 ", layers[mha_layers[i]]->name.c_str()); + for (int j = 0; j < v_weight_scales.w; j++) + { + fprintf(fp, "%f ", v_weight_scales[j]); + } + fprintf(fp, "\n"); + + // out_weight + fprintf(fp, "%s_param_3 ", layers[mha_layers[i]]->name.c_str()); + fprintf(fp, "%f ", mha_stats[i].out_weight_scale); + fprintf(fp, "\n"); + } + fclose(fp); fprintf(stderr, "ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\\(^0^)/...233...\n"); @@ -323,6 +380,7 @@ int QuantNet::quantize_KL() const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); const int embed_layer_count = (int)embed_layers.size(); + const int mha_layer_count = (int)mha_layers.size(); const int file_count = (int)listspaths[0].size(); @@ -445,6 +503,63 @@ int QuantNet::quantize_KL() } + // initialize mha weight scales + for (int i = 0; i < mha_layer_count; i++) + { + const ncnn::Layer* layer = layers[mha_layers[i]]; + const ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*) layer; + + const int qdim = mha->weight_data_size / mha->embed_dim; + mha_stats[i].q_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float q_absmax = 0.f; + + const float* q_ptr = (const float*)mha->q_weight_data + j * qdim; + for (int k = 0; k < qdim; k++) + { + q_absmax = std::max(q_absmax, (float)fabs(q_ptr[k])); + } + mha_stats[i].q_weight_scales[j] = q_absmax == 0.f ? 
1.f : 127 / q_absmax; + } + + const int kdim = mha->kdim; + mha_stats[i].k_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float k_absmax = 0.f; + + const float* k_ptr = (const float*)mha->k_weight_data + j * kdim; + for (int k = 0; k < kdim; k++) + { + k_absmax = std::max(k_absmax, (float)fabs(k_ptr[k])); + } + mha_stats[i].k_weight_scales[j] = k_absmax == 0.f ? 1.f : 127 / k_absmax; + } + + const int vdim = mha->vdim; + mha_stats[i].v_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float v_absmax = 0.f; + + const float* v_ptr = (const float*)mha->v_weight_data + j * vdim; + for (int k = 0; k < vdim; k++) + { + v_absmax = std::max(v_absmax, (float)fabs(v_ptr[k])); + } + mha_stats[i].v_weight_scales[j] = v_absmax == 0.f ? 1.f : 127 / v_absmax; + } + + const float* o_ptr = (const float*)mha->out_weight_data; + float o_absmax = 0.f; + for (int k = 0; k < mha->out_weight_data.w; k++) + { + o_absmax = std::max(o_absmax, (float)fabs(o_ptr[k])); + } + mha_stats[i].out_weight_scale = o_absmax == 0.f ? 
1.f : 127 / o_absmax; + } + // count the absmax #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1) for (int i = 0; i < file_count; i++) From 08988f21dee53753571b1310ff46610e3925889b Mon Sep 17 00:00:00 2001 From: Rountaboutt Date: Sun, 19 Apr 2026 19:57:56 +0800 Subject: [PATCH 04/12] tools:add weight-only mode without calibration in ncnn2table --- tools/quantize/ncnn2table.cpp | 146 ++++++++++++++++++++++++++++------ 1 file changed, 123 insertions(+), 23 deletions(-) diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index 187e42b1eb03..d9059638500c 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -382,13 +382,6 @@ int QuantNet::quantize_KL() const int embed_layer_count = (int)embed_layers.size(); const int mha_layer_count = (int)mha_layers.size(); - const int file_count = (int)listspaths[0].size(); - - const int num_histogram_bins = 2048; - - std::vector blob_allocators(quantize_num_threads); - std::vector workspace_allocators(quantize_num_threads); - // initialize conv weight scales #pragma omp parallel for num_threads(quantize_num_threads) for (int i = 0; i < conv_layer_count; i++) @@ -560,6 +553,16 @@ int QuantNet::quantize_KL() mha_stats[i].out_weight_scale = o_absmax == 0.f ? 
1.f : 127 / o_absmax; } + if (conv_layer_count == 0) + return 0; + + const int file_count = (int)listspaths[0].size(); + + const int num_histogram_bins = 2048; + + std::vector blob_allocators(quantize_num_threads); + std::vector workspace_allocators(quantize_num_threads); + // count the absmax #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1) for (int i = 0; i < file_count; i++) @@ -934,11 +937,7 @@ int QuantNet::quantize_ACIQ() const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); const int embed_layer_count = (int)embed_layers.size(); - - const int file_count = (int)listspaths[0].size(); - - std::vector blob_allocators(quantize_num_threads); - std::vector workspace_allocators(quantize_num_threads); + const int mha_layer_count = (int)mha_layers.size(); // initialize conv weight scales #pragma omp parallel for num_threads(quantize_num_threads) @@ -1058,6 +1057,71 @@ int QuantNet::quantize_ACIQ() } + // initialize mha weight scales + for (int i = 0; i < mha_layer_count; i++) + { + const ncnn::Layer* layer = layers[mha_layers[i]]; + const ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layer; + + const int qdim = mha->weight_data_size / mha->embed_dim; + mha_stats[i].q_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float q_absmax = 0.f; + + const float* q_ptr = (const float*)mha->q_weight_data + j * qdim; + for (int k = 0; k < qdim; k++) + { + q_absmax = std::max(q_absmax, (float)fabs(q_ptr[k])); + } + mha_stats[i].q_weight_scales[j] = q_absmax == 0.f ? 
1.f : 127 / q_absmax; + } + + const int kdim = mha->kdim; + mha_stats[i].k_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float k_absmax = 0.f; + + const float* k_ptr = (const float*)mha->k_weight_data + j * kdim; + for (int k = 0; k < kdim; k++) + { + k_absmax = std::max(k_absmax, (float)fabs(k_ptr[k])); + } + mha_stats[i].k_weight_scales[j] = k_absmax == 0.f ? 1.f : 127 / k_absmax; + } + + const int vdim = mha->vdim; + mha_stats[i].v_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float v_absmax = 0.f; + + const float* v_ptr = (const float*)mha->v_weight_data + j * vdim; + for (int k = 0; k < vdim; k++) + { + v_absmax = std::max(v_absmax, (float)fabs(v_ptr[k])); + } + mha_stats[i].v_weight_scales[j] = v_absmax == 0.f ? 1.f : 127 / v_absmax; + } + + const float* o_ptr = (const float*)mha->out_weight_data; + float o_absmax = 0.f; + for (int k = 0; k < mha->out_weight_data.w; k++) + { + o_absmax = std::max(o_absmax, (float)fabs(o_ptr[k])); + } + mha_stats[i].out_weight_scale = o_absmax == 0.f ? 1.f : 127 / o_absmax; + } + + if (conv_layer_count == 0) + return 0; + + const int file_count = (int)listspaths[0].size(); + + std::vector blob_allocators(quantize_num_threads); + std::vector workspace_allocators(quantize_num_threads); + // count the absmax #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1) for (int i = 0; i < file_count; i++) @@ -1283,6 +1347,9 @@ int QuantNet::quantize_EQ() const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + if (conv_layer_count == 0) + return 0; + std::vector blob_allocators(quantize_num_threads); std::vector workspace_allocators(quantize_num_threads); @@ -1832,6 +1899,7 @@ static void print_pixel_type_list(const std::vector& list) static void show_usage() { fprintf(stderr, "Usage: ncnn2table [ncnnparam] [ncnnbin] [list,...] 
[ncnntable] [(key=value)...]\n"); + fprintf(stderr, " ncnn2table [ncnnparam] [ncnnbin] [ncnntable] [(key=value)...]\n"); fprintf(stderr, " mean=[104.0,117.0,123.0],...\n"); fprintf(stderr, " norm=[1.0,1.0,1.0],...\n"); fprintf(stderr, " shape=[224,224,3],...[w,h,c] or [w,h] **[0,0] will not resize\n"); @@ -1846,7 +1914,7 @@ static void show_usage() int main(int argc, char** argv) { - if (argc < 5) + if (argc < 4) { show_usage(); return -1; @@ -1863,8 +1931,6 @@ int main(int argc, char** argv) const char* inparam = argv[1]; const char* inbin = argv[2]; - char* lists = argv[3]; - const char* outtable = argv[4]; ncnn::Option opt; opt.num_threads = 1; @@ -1880,13 +1946,47 @@ int main(int argc, char** argv) net.init(); - // load lists - net.listspaths = parse_comma_path_list(lists); + const bool need_calibration_dataset = !net.conv_layers.empty(); + + const char* outtable = 0; + int kv_start = 0; + + if (need_calibration_dataset) + { + if (argc < 5) + { + show_usage(); + return -1; + } + + net.listspaths = parse_comma_path_list(argv[3]); + outtable = argv[4]; + kv_start = 5; + } + else + { + if (argc >= 5 && strchr(argv[4], '=')) + { + outtable = argv[3]; + kv_start = 4; + } + else if (argc >= 5) + { + net.listspaths = parse_comma_path_list(argv[3]); + outtable = argv[4]; + kv_start = 5; + } + else + { + outtable = argv[3]; + kv_start = 4; + } + } std::string method = "kl"; net.file_type = 0; - for (int i = 5; i < argc; i++) + for (int i = kv_start; i < argc; i++) { // key=value char* kv = argv[i]; @@ -1922,27 +2022,27 @@ int main(int argc, char** argv) // sanity check const size_t input_blob_count = net.input_blobs.size(); - if (net.listspaths.size() != input_blob_count) + if (need_calibration_dataset && net.listspaths.size() != input_blob_count) { fprintf(stderr, "expect %d lists, but got %d\n", (int)input_blob_count, (int)net.listspaths.size()); return -1; } - if ((0 == net.file_type) && (net.means.size() != input_blob_count)) + if (need_calibration_dataset && (0 == 
net.file_type) && (net.means.size() != input_blob_count)) { fprintf(stderr, "expect %d means, but got %d\n", (int)input_blob_count, (int)net.means.size()); return -1; } - if ((0 == net.file_type) && (net.norms.size() != input_blob_count)) + if (need_calibration_dataset && (0 == net.file_type) && (net.norms.size() != input_blob_count)) { fprintf(stderr, "expect %d norms, but got %d\n", (int)input_blob_count, (int)net.norms.size()); return -1; } - if (net.shapes.size() != input_blob_count) + if (need_calibration_dataset && net.shapes.size() != input_blob_count) { fprintf(stderr, "expect %d shapes, but got %d\n", (int)input_blob_count, (int)net.shapes.size()); return -1; } - if ((0 == net.file_type) && (net.type_to_pixels.size() != input_blob_count)) + if (need_calibration_dataset && (0 == net.file_type) && (net.type_to_pixels.size() != input_blob_count)) { fprintf(stderr, "expect %d pixels, but got %d\n", (int)input_blob_count, (int)net.type_to_pixels.size()); return -1; From bd39d1301b89c24241904f52ce8e76ed669750f9 Mon Sep 17 00:00:00 2001 From: Rountaboutt Date: Sun, 19 Apr 2026 20:20:17 +0800 Subject: [PATCH 05/12] tools:Change the MultiHeadAttention layer scaling factors to be read from the table in ncnn2int8 --- tools/quantize/ncnn2int8.cpp | 97 ++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 55 deletions(-) diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index b7f567f29ac7..76c51556c0a9 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -577,8 +577,6 @@ int NetQuantize::quantize_embed() fprintf(stderr, "quantize_embed %s\n", embed->name.c_str()); - // TODO move to ncnn2table - const int num_output = embed->num_output; const int input_dim = embed->input_dim; @@ -718,29 +716,51 @@ int NetQuantize::quantize_multiheadattention() if (layers[i]->type != "MultiHeadAttention") continue; + char key_q[256]; + snprintf(key_q, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator 
iter_q = weight_int8scale_table.find(key_q); + if (iter_q == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_k[256]; + snprintf(key_k, 256, "%s_param_1", layers[i]->name.c_str()); + std::map::iterator iter_k = weight_int8scale_table.find(key_k); + if (iter_k == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_v[256]; + snprintf(key_v, 256, "%s_param_2", layers[i]->name.c_str()); + std::map::iterator iter_v = weight_int8scale_table.find(key_v); + if (iter_v == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_out[256]; + snprintf(key_out, 256, "%s_param_3", layers[i]->name.c_str()); + std::map::iterator iter_out = weight_int8scale_table.find(key_out); + if (iter_out == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // MultiHeadAttention - quantize weight from fp32 to int8 ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layers[i]; fprintf(stderr, "quantize_multiheadattention %s\n", mha->name.c_str()); - // TODO move to ncnn2table - const int qdim = mha->weight_data_size / mha->embed_dim; { - mha->q_weight_data_int8_scales.create(mha->embed_dim); - for (int i = 0; i < mha->embed_dim; i++) - { - float absmax = 0.f; - - const float* ptr = (const float*)mha->q_weight_data + i * qdim; - for (int j = 0; j < qdim; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - - mha->q_weight_data_int8_scales[i] = absmax == 0.f ? 
1.f : 127 / absmax; - } + mha->q_weight_data_int8_scales = iter_q->second; ncnn::Mat q_weight_data = mha->q_weight_data.reshape(qdim, mha->embed_dim); ncnn::Mat q_weight_data_int8; @@ -756,19 +776,7 @@ int NetQuantize::quantize_multiheadattention() } { - mha->k_weight_data_int8_scales.create(mha->embed_dim); - for (int i = 0; i < mha->embed_dim; i++) - { - float absmax = 0.f; - - const float* ptr = (const float*)mha->k_weight_data + i * mha->kdim; - for (int j = 0; j < mha->kdim; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - - mha->k_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; - } + mha->k_weight_data_int8_scales = iter_k->second; ncnn::Mat k_weight_data = mha->k_weight_data.reshape(mha->kdim, mha->embed_dim); ncnn::Mat k_weight_data_int8; @@ -784,19 +792,7 @@ int NetQuantize::quantize_multiheadattention() } { - mha->v_weight_data_int8_scales.create(mha->embed_dim); - for (int i = 0; i < mha->embed_dim; i++) - { - float absmax = 0.f; - - const float* ptr = (const float*)mha->v_weight_data + i * mha->vdim; - for (int j = 0; j < mha->vdim; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - - mha->v_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; - } + mha->v_weight_data_int8_scales = iter_v->second; ncnn::Mat v_weight_data = mha->v_weight_data.reshape(mha->vdim, mha->embed_dim); ncnn::Mat v_weight_data_int8; @@ -812,17 +808,8 @@ int NetQuantize::quantize_multiheadattention() } { - const float* ptr = mha->out_weight_data; - float absmax = 0.f; - for (int j = 0; j < mha->out_weight_data.w; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - - mha->out_weight_data_int8_scale = absmax == 0.f ? 
1.f : 127 / absmax; - - ncnn::Mat out_weight_data_int8_scales(1); - out_weight_data_int8_scales[0] = mha->out_weight_data_int8_scale; + ncnn::Mat out_weight_data_int8_scales = iter_out->second; + mha->out_weight_data_int8_scale = out_weight_data_int8_scales[0]; ncnn::Mat out_weight_data_int8; @@ -1119,4 +1106,4 @@ int main(int argc, char** argv) quantizer.save(outparam, outbin); return 0; -} \ No newline at end of file +} From 94c834a91878232cb6f09fbdf092033dea74de7b Mon Sep 17 00:00:00 2001 From: Rountaboutt Date: Mon, 20 Apr 2026 12:44:04 +0800 Subject: [PATCH 06/12] complete rnn,gru,lstm layers --- tools/quantize/ncnn2int8.cpp | 149 ++++++--------- tools/quantize/ncnn2table.cpp | 338 ++++++++++++++++++++++++++++++++++ 2 files changed, 399 insertions(+), 88 deletions(-) diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 76c51556c0a9..a9438c64dba8 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -317,43 +317,34 @@ int NetQuantize::quantize_rnn() if (layers[i]->type != "RNN") continue; + char key_xc[256]; + snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter_xc = weight_int8scale_table.find(key_xc); + if (iter_xc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_hc[256]; + snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str()); + std::map::iterator iter_hc = weight_int8scale_table.find(key_hc); + if (iter_hc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // RNN - quantize weight from fp32 to int8 ncnn::RNN* rnn = (ncnn::RNN*)layers[i]; fprintf(stderr, "quantize_rnn %s\n", rnn->name.c_str()); - // TODO move to ncnn2table const int num_directions = rnn->direction == 2 ? 
2 : 1; const int size = rnn->weight_data_size / num_directions / rnn->num_output; - ncnn::Mat weight_xc_data_int8_scales(rnn->num_output * num_directions); - ncnn::Mat weight_hc_data_int8_scales(rnn->num_output * num_directions); - - for (int d = 0; d < num_directions; d++) - { - for (int q = 0; q < rnn->num_output; q++) - { - { - const float* weight_xc_ptr = rnn->weight_xc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i])); - } - weight_xc_data_int8_scales[d * rnn->num_output + q] = 127 / absmax; - } - - { - const float* weight_hc_ptr = rnn->weight_hc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i])); - } - weight_hc_data_int8_scales[d * rnn->num_output + q] = 127 / absmax; - } - } - } + ncnn::Mat weight_xc_data_int8_scales = iter_xc->second; + ncnn::Mat weight_hc_data_int8_scales = iter_hc->second; { ncnn::Mat weight_xc_data_r2 = rnn->weight_xc_data.reshape(size, rnn->num_output * num_directions); @@ -399,43 +390,34 @@ int NetQuantize::quantize_lstm() if (layers[i]->type != "LSTM") continue; + char key_xc[256]; + snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter_xc = weight_int8scale_table.find(key_xc); + if (iter_xc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_hc[256]; + snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str()); + std::map::iterator iter_hc = weight_int8scale_table.find(key_hc); + if (iter_hc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // LSTM - quantize weight from fp32 to int8 ncnn::LSTM* lstm = (ncnn::LSTM*)layers[i]; fprintf(stderr, "quantize_lstm %s\n", lstm->name.c_str()); - // TODO move to ncnn2table const int num_directions = 
lstm->direction == 2 ? 2 : 1; const int size = lstm->weight_data_size / num_directions / lstm->hidden_size / 4; - ncnn::Mat weight_xc_data_int8_scales(lstm->hidden_size * 4 * num_directions); - ncnn::Mat weight_hc_data_int8_scales(lstm->hidden_size * 4 * num_directions); - - for (int d = 0; d < num_directions; d++) - { - for (int q = 0; q < lstm->hidden_size * 4; q++) - { - { - const float* weight_xc_ptr = lstm->weight_xc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i])); - } - weight_xc_data_int8_scales[d * lstm->hidden_size * 4 + q] = 127 / absmax; - } - - { - const float* weight_hc_ptr = lstm->weight_hc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i])); - } - weight_hc_data_int8_scales[d * lstm->hidden_size * 4 + q] = 127 / absmax; - } - } - } + ncnn::Mat weight_xc_data_int8_scales = iter_xc->second; + ncnn::Mat weight_hc_data_int8_scales = iter_hc->second; { ncnn::Mat weight_xc_data_r2 = lstm->weight_xc_data.reshape(size, lstm->hidden_size * 4 * num_directions); @@ -481,43 +463,34 @@ int NetQuantize::quantize_gru() if (layers[i]->type != "GRU") continue; + char key_xc[256]; + snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter_xc = weight_int8scale_table.find(key_xc); + if (iter_xc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_hc[256]; + snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str()); + std::map::iterator iter_hc = weight_int8scale_table.find(key_hc); + if (iter_hc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // GRU - quantize weight from fp32 to int8 ncnn::GRU* gru = (ncnn::GRU*)layers[i]; fprintf(stderr, "quantize_gru %s\n", gru->name.c_str()); - 
// TODO move to ncnn2table const int num_directions = gru->direction == 2 ? 2 : 1; const int size = gru->weight_data_size / num_directions / gru->num_output / 3; - ncnn::Mat weight_xc_data_int8_scales(gru->num_output * 3 * num_directions); - ncnn::Mat weight_hc_data_int8_scales(gru->num_output * 3 * num_directions); - - for (int d = 0; d < num_directions; d++) - { - for (int q = 0; q < gru->num_output * 3; q++) - { - { - const float* weight_xc_ptr = gru->weight_xc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i])); - } - weight_xc_data_int8_scales[d * gru->num_output * 3 + q] = 127 / absmax; - } - - { - const float* weight_hc_ptr = gru->weight_hc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i])); - } - weight_hc_data_int8_scales[d * gru->num_output * 3 + q] = 127 / absmax; - } - } - } + ncnn::Mat weight_xc_data_int8_scales = iter_xc->second; + ncnn::Mat weight_hc_data_int8_scales = iter_hc->second; { ncnn::Mat weight_xc_data_r2 = gru->weight_xc_data.reshape(size, gru->num_output * 3 * num_directions); @@ -840,7 +813,7 @@ int NetQuantize::quantize_sdpa() fprintf(stderr, "quantize_sdpa %s\n", sdpa->name.c_str()); - // TODO move to ncnn2table + // SDPA uses dynamic activation quantization in forward_int8 sdpa->int8_scale_term = 2; } diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index d9059638500c..bbadf865efbe 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -40,6 +40,9 @@ #include "layer/innerproduct.h" #include "layer/embed.h" #include "layer/multiheadattention.h" +#include "layer/rnn.h" +#include "layer/lstm.h" +#include "layer/gru.h" class QuantBlobStat { @@ -72,6 +75,14 @@ class QuantMHAStat float out_weight_scale; }; +// rnn, gru, lstm +class QuantRecurrentStat +{ +public: + ncnn::Mat weight_xc_scales; + ncnn::Mat 
weight_hc_scales; +}; + class QuantNet : public ncnn::Net { public: @@ -104,6 +115,9 @@ class QuantNet : public ncnn::Net std::vector conv_top_blobs; std::vector embed_layers; std::vector mha_layers; + std::vector rnn_layers; + std::vector lstm_layers; + std::vector gru_layers; // result std::vector quant_blob_stats; @@ -111,6 +125,9 @@ class QuantNet : public ncnn::Net std::vector bottom_blob_scales; std::vector embed_weight_scales; std::vector mha_stats; + std::vector rnn_stats; + std::vector lstm_stats; + std::vector gru_stats; }; QuantNet::QuantNet() @@ -153,18 +170,36 @@ int QuantNet::init() { mha_layers.push_back(i); } + else if (layer->type == "RNN") + { + rnn_layers.push_back(i); + } + else if (layer->type == "LSTM") + { + lstm_layers.push_back(i); + } + else if (layer->type == "GRU") + { + gru_layers.push_back(i); + } } const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); const int embed_layer_count = (int)embed_layers.size(); const int mha_layer_count = (int)mha_layers.size(); + const int rnn_layer_count = (int)rnn_layers.size(); + const int lstm_layer_count = (int)lstm_layers.size(); + const int gru_layer_count = (int)gru_layers.size(); quant_blob_stats.resize(conv_bottom_blob_count); weight_scales.resize(conv_layer_count); bottom_blob_scales.resize(conv_bottom_blob_count); embed_weight_scales.resize(embed_layer_count); mha_stats.resize(mha_layer_count); + rnn_stats.resize(rnn_layer_count); + lstm_stats.resize(lstm_layer_count); + gru_stats.resize(gru_layer_count); return 0; } @@ -182,6 +217,9 @@ int QuantNet::save_table(const char* tablepath) const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); const int embed_layer_count = (int)embed_layers.size(); const int mha_layer_count = (int)mha_layers.size(); + const int rnn_layer_count = (int)rnn_layers.size(); + const int lstm_layer_count = (int)lstm_layers.size(); + const int gru_layer_count = (int)gru_layers.size(); 
fprintf(stdout, "param:%d\n", conv_layer_count); @@ -253,6 +291,66 @@ int QuantNet::save_table(const char* tablepath) fprintf(fp, "\n"); } + fprintf(stdout, "param:%d\n", rnn_layer_count); + for (int i = 0; i < rnn_layer_count; i++) + { + const ncnn::Mat weight_xc_scales = rnn_stats[i].weight_xc_scales; + fprintf(fp, "%s_param_0 ", layers[rnn_layers[i]]->name.c_str()); + for (int j = 0; j < weight_xc_scales.w; j++) + { + fprintf(fp, "%f ", weight_xc_scales[j]); + } + fprintf(fp, "\n"); + + const ncnn::Mat weight_hc_scales = rnn_stats[i].weight_hc_scales; + fprintf(fp, "%s_param_1 ", layers[rnn_layers[i]]->name.c_str()); + for (int j = 0; j < weight_hc_scales.w; j++) + { + fprintf(fp, "%f ", weight_hc_scales[j]); + } + fprintf(fp, "\n"); + } + + fprintf(stdout, "param:%d\n", lstm_layer_count); + for (int i = 0; i < lstm_layer_count; i++) + { + const ncnn::Mat weight_xc_scales = lstm_stats[i].weight_xc_scales; + fprintf(fp, "%s_param_0 ", layers[lstm_layers[i]]->name.c_str()); + for (int j = 0; j < weight_xc_scales.w; j++) + { + fprintf(fp, "%f ", weight_xc_scales[j]); + } + fprintf(fp, "\n"); + + const ncnn::Mat weight_hc_scales = lstm_stats[i].weight_hc_scales; + fprintf(fp, "%s_param_1 ", layers[lstm_layers[i]]->name.c_str()); + for (int j = 0; j < weight_hc_scales.w; j++) + { + fprintf(fp, "%f ", weight_hc_scales[j]); + } + fprintf(fp, "\n"); + } + + fprintf(stdout, "param:%d\n", gru_layer_count); + for (int i = 0; i < gru_layer_count; i++) + { + const ncnn::Mat weight_xc_scales = gru_stats[i].weight_xc_scales; + fprintf(fp, "%s_param_0 ", layers[gru_layers[i]]->name.c_str()); + for (int j = 0; j < weight_xc_scales.w; j++) + { + fprintf(fp, "%f ", weight_xc_scales[j]); + } + fprintf(fp, "\n"); + + const ncnn::Mat weight_hc_scales = gru_stats[i].weight_hc_scales; + fprintf(fp, "%s_param_1 ", layers[gru_layers[i]]->name.c_str()); + for (int j = 0; j < weight_hc_scales.w; j++) + { + fprintf(fp, "%f ", weight_hc_scales[j]); + } + fprintf(fp, "\n"); + } + fclose(fp); 
fprintf(stderr, "ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\\(^0^)/...233...\n"); @@ -381,6 +479,9 @@ int QuantNet::quantize_KL() const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); const int embed_layer_count = (int)embed_layers.size(); const int mha_layer_count = (int)mha_layers.size(); + const int rnn_layer_count = (int)rnn_layers.size(); + const int lstm_layer_count = (int)lstm_layers.size(); + const int gru_layer_count = (int)gru_layers.size(); // initialize conv weight scales #pragma omp parallel for num_threads(quantize_num_threads) @@ -553,6 +654,123 @@ int QuantNet::quantize_KL() mha_stats[i].out_weight_scale = o_absmax == 0.f ? 1.f : 127 / o_absmax; } + // initialize rnn weight scales + for (int i = 0; i < rnn_layer_count; i++) + { + const ncnn::Layer* layer = layers[rnn_layers[i]]; + const ncnn::RNN* rnn = (const ncnn::RNN*)layer; + + const int num_directions = rnn->direction == 2 ? 2 : 1; + const int size = rnn->weight_data_size / num_directions / rnn->num_output; + + rnn_stats[i].weight_xc_scales.create(rnn->num_output * num_directions); + rnn_stats[i].weight_hc_scales.create(rnn->num_output * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < rnn->num_output; q++) + { + { + const float* weight_xc_ptr = rnn->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + rnn_stats[i].weight_xc_scales[d * rnn->num_output + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + const float* weight_hc_ptr = rnn->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < rnn->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + rnn_stats[i].weight_hc_scales[d * rnn->num_output + q] = absmax == 0.f ? 
1.f : 127 / absmax; + } + } + } + } + + // initialize lstm weight scales + for (int i = 0; i < lstm_layer_count; i++) + { + const ncnn::Layer* layer = layers[lstm_layers[i]]; + const ncnn::LSTM* lstm = (const ncnn::LSTM*)layer; + + const int num_directions = lstm->direction == 2 ? 2 : 1; + const int size = lstm->weight_data_size / num_directions / lstm->hidden_size / 4; + + lstm_stats[i].weight_xc_scales.create(lstm->hidden_size * 4 * num_directions); + lstm_stats[i].weight_hc_scales.create(lstm->hidden_size * 4 * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < lstm->hidden_size * 4; q++) + { + { + const float* weight_xc_ptr = lstm->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + lstm_stats[i].weight_xc_scales[d * lstm->hidden_size * 4 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + const float* weight_hc_ptr = lstm->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < lstm->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + lstm_stats[i].weight_hc_scales[d * lstm->hidden_size * 4 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + } + } + } + + // initialize gru weight scales + for (int i = 0; i < gru_layer_count; i++) + { + const ncnn::Layer* layer = layers[gru_layers[i]]; + const ncnn::GRU* gru = (const ncnn::GRU*)layer; + + const int num_directions = gru->direction == 2 ? 
2 : 1; + const int size = gru->weight_data_size / num_directions / gru->num_output / 3; + + gru_stats[i].weight_xc_scales.create(gru->num_output * 3 * num_directions); + gru_stats[i].weight_hc_scales.create(gru->num_output * 3 * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < gru->num_output * 3; q++) + { + { + const float* weight_xc_ptr = gru->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + gru_stats[i].weight_xc_scales[d * gru->num_output * 3 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + const float* weight_hc_ptr = gru->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < gru->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + gru_stats[i].weight_hc_scales[d * gru->num_output * 3 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + } + } + } + if (conv_layer_count == 0) return 0; @@ -937,6 +1155,9 @@ int QuantNet::quantize_ACIQ() const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); const int embed_layer_count = (int)embed_layers.size(); + const int rnn_layer_count = (int)rnn_layers.size(); + const int lstm_layer_count = (int)lstm_layers.size(); + const int gru_layer_count = (int)gru_layers.size(); const int mha_layer_count = (int)mha_layers.size(); // initialize conv weight scales @@ -1057,6 +1278,123 @@ int QuantNet::quantize_ACIQ() } + // initialize rnn weight scales + for (int i = 0; i < rnn_layer_count; i++) + { + const ncnn::Layer* layer = layers[rnn_layers[i]]; + const ncnn::RNN* rnn = (const ncnn::RNN*)layer; + + const int num_directions = rnn->direction == 2 ? 
2 : 1; + const int size = rnn->weight_data_size / num_directions / rnn->num_output; + + rnn_stats[i].weight_xc_scales.create(rnn->num_output * num_directions); + rnn_stats[i].weight_hc_scales.create(rnn->num_output * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < rnn->num_output; q++) + { + { + const float* weight_xc_ptr = rnn->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + rnn_stats[i].weight_xc_scales[d * rnn->num_output + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + const float* weight_hc_ptr = rnn->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < rnn->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + rnn_stats[i].weight_hc_scales[d * rnn->num_output + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + } + } + } + + // initialize lstm weight scales + for (int i = 0; i < lstm_layer_count; i++) + { + const ncnn::Layer* layer = layers[lstm_layers[i]]; + const ncnn::LSTM* lstm = (const ncnn::LSTM*)layer; + + const int num_directions = lstm->direction == 2 ? 2 : 1; + const int size = lstm->weight_data_size / num_directions / lstm->hidden_size / 4; + + lstm_stats[i].weight_xc_scales.create(lstm->hidden_size * 4 * num_directions); + lstm_stats[i].weight_hc_scales.create(lstm->hidden_size * 4 * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < lstm->hidden_size * 4; q++) + { + { + const float* weight_xc_ptr = lstm->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + lstm_stats[i].weight_xc_scales[d * lstm->hidden_size * 4 + q] = absmax == 0.f ? 
1.f : 127 / absmax; + } + + { + const float* weight_hc_ptr = lstm->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < lstm->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + lstm_stats[i].weight_hc_scales[d * lstm->hidden_size * 4 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + } + } + } + + // initialize gru weight scales + for (int i = 0; i < gru_layer_count; i++) + { + const ncnn::Layer* layer = layers[gru_layers[i]]; + const ncnn::GRU* gru = (const ncnn::GRU*)layer; + + const int num_directions = gru->direction == 2 ? 2 : 1; + const int size = gru->weight_data_size / num_directions / gru->num_output / 3; + + gru_stats[i].weight_xc_scales.create(gru->num_output * 3 * num_directions); + gru_stats[i].weight_hc_scales.create(gru->num_output * 3 * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < gru->num_output * 3; q++) + { + { + const float* weight_xc_ptr = gru->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + gru_stats[i].weight_xc_scales[d * gru->num_output * 3 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + const float* weight_hc_ptr = gru->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < gru->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + gru_stats[i].weight_hc_scales[d * gru->num_output * 3 + q] = absmax == 0.f ? 
1.f : 127 / absmax; + } + } + } + } + // initialize mha weight scales for (int i = 0; i < mha_layer_count; i++) { From 43caf20071a72068e0a2c7735c5a0fee41ccc8e0 Mon Sep 17 00:00:00 2001 From: Rountaboutt Date: Mon, 20 Apr 2026 13:23:56 +0800 Subject: [PATCH 07/12] supplement documents and printing information --- docs/how-to-use-and-FAQ/quantized-int8-inference.md | 8 ++++---- tools/quantize/ncnn2table.cpp | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/how-to-use-and-FAQ/quantized-int8-inference.md b/docs/how-to-use-and-FAQ/quantized-int8-inference.md index bb0b0a7330b7..030215d5be5e 100644 --- a/docs/how-to-use-and-FAQ/quantized-int8-inference.md +++ b/docs/how-to-use-and-FAQ/quantized-int8-inference.md @@ -89,16 +89,16 @@ filelist_in2.txt ``` **Here shape is WHC, because the order of the arguments to `ncnn::Mat`.** -### 3. Quantize model +For RNN,GRU,LSTM,MultiHeadAttention and Embed layers,ncnn2table also supports tableless quantization. ```shell -./ncnn2int8 mobilenet-opt.param mobilenet-opt.bin mobilenet-int8.param mobilenet-int8.bin mobilenet.table +./ncnn2table rnn.param rnn.bin rnn.table method=kl ``` -If you don’t need static quantization, ncnn supports RNN/LSTM/GRU dynamic quantization. In this case, you can omit the table file. +### 3. 
Quantize model ```shell -./ncnn2int8 rnn-model.param rnn-model.bin rnn-model-int8.param rnn-model-int8.bin +./ncnn2int8 mobilenet-opt.param mobilenet-opt.bin mobilenet-int8.param mobilenet-int8.bin mobilenet.table ``` ## use ncnn int8 inference diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index bbadf865efbe..cc6d7403e09f 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -2248,6 +2248,7 @@ static void show_usage() fprintf(stderr, "Sample usage:\n"); fprintf(stderr, " ncnn2table squeezenet.param squeezenet.bin filelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n"); fprintf(stderr, " ncnn2table test.param test.bin filelist.txt squeezenet.table shape=[227,227,3] method=kl type=1\n"); + fprintf(stderr, " ncnn2table rnn.param rnn.bin rnn.table method=kl\n"); } int main(int argc, char** argv) From fe827598da71d57dcc07892119aadf48b36a56d5 Mon Sep 17 00:00:00 2001 From: Roundaboutt <93021080+Roundaboutt@users.noreply.github.com> Date: Mon, 20 Apr 2026 06:29:13 +0000 Subject: [PATCH 08/12] apply code-format changes --- tools/quantize/ncnn2int8.cpp | 2 +- tools/quantize/ncnn2table.cpp | 26 ++++++++++++-------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index a9438c64dba8..a92305a75324 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -1079,4 +1079,4 @@ int main(int argc, char** argv) quantizer.save(outparam, outbin); return 0; -} +} diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index cc6d7403e09f..55aac995c0e6 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -158,7 +158,7 @@ int QuantNet::init() conv_bottom_blobs.push_back(layer->bottoms[0]); conv_top_blobs.push_back(layer->tops[0]); } - + // find embed layers else if (layer->type == "Embed") { @@ -246,10 +246,10 @@ int 
QuantNet::save_table(const char* tablepath) } fprintf(fp, "\n"); } - + fprintf(stdout, "param:%d\n", embed_layer_count); for (int i = 0; i < embed_layer_count; i++) - { + { fprintf(fp, "%s_param_0 ", layers[embed_layers[i]]->name.c_str()); fprintf(fp, "%f ", embed_weight_scales[i]); fprintf(fp, "\n"); @@ -257,7 +257,7 @@ int QuantNet::save_table(const char* tablepath) fprintf(stdout, "param:%d\n", mha_layer_count); for (int i = 0; i < mha_layer_count; i++) - { + { // q_weight const ncnn::Mat q_weight_scales = mha_stats[i].q_weight_scales; fprintf(fp, "%s_param_0 ", layers[mha_layers[i]]->name.c_str()); @@ -265,8 +265,8 @@ int QuantNet::save_table(const char* tablepath) { fprintf(fp, "%f ", q_weight_scales[j]); } - fprintf(fp, "\n"); - + fprintf(fp, "\n"); + // k_weight const ncnn::Mat k_weight_scales = mha_stats[i].k_weight_scales; fprintf(fp, "%s_param_1 ", layers[mha_layers[i]]->name.c_str()); @@ -274,8 +274,8 @@ int QuantNet::save_table(const char* tablepath) { fprintf(fp, "%f ", k_weight_scales[j]); } - fprintf(fp, "\n"); - + fprintf(fp, "\n"); + // v_weight const ncnn::Mat v_weight_scales = mha_stats[i].v_weight_scales; fprintf(fp, "%s_param_2 ", layers[mha_layers[i]]->name.c_str()); @@ -283,12 +283,12 @@ int QuantNet::save_table(const char* tablepath) { fprintf(fp, "%f ", v_weight_scales[j]); } - fprintf(fp, "\n"); - + fprintf(fp, "\n"); + // out_weight fprintf(fp, "%s_param_3 ", layers[mha_layers[i]]->name.c_str()); fprintf(fp, "%f ", mha_stats[i].out_weight_scale); - fprintf(fp, "\n"); + fprintf(fp, "\n"); } fprintf(stdout, "param:%d\n", rnn_layer_count); @@ -594,14 +594,13 @@ int QuantNet::quantize_KL() absmax = std::max(absmax, (float)fabs(ptr[j])); } embed_weight_scales[i] = absmax == 0.f ? 
1.f : 127 / absmax; - } // initialize mha weight scales for (int i = 0; i < mha_layer_count; i++) { const ncnn::Layer* layer = layers[mha_layers[i]]; - const ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*) layer; + const ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layer; const int qdim = mha->weight_data_size / mha->embed_dim; mha_stats[i].q_weight_scales.create(mha->embed_dim); @@ -1275,7 +1274,6 @@ int QuantNet::quantize_ACIQ() absmax = std::max(absmax, (float)fabs(ptr[j])); } embed_weight_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; - } // initialize rnn weight scales From 943151c1bf4d60880a2c78d5f82086c89e31abdd Mon Sep 17 00:00:00 2001 From: Rountaboutt Date: Fri, 8 May 2026 14:25:42 +0800 Subject: [PATCH 09/12] Fix the issue of 'const' in ncnn2table and correct docs --- docs/how-to-use-and-FAQ/quantized-int8-inference.md | 2 +- tools/quantize/ncnn2table.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/how-to-use-and-FAQ/quantized-int8-inference.md b/docs/how-to-use-and-FAQ/quantized-int8-inference.md index 030215d5be5e..10ab96843c92 100644 --- a/docs/how-to-use-and-FAQ/quantized-int8-inference.md +++ b/docs/how-to-use-and-FAQ/quantized-int8-inference.md @@ -89,7 +89,7 @@ filelist_in2.txt ``` **Here shape is WHC, because the order of the arguments to `ncnn::Mat`.** -For RNN,GRU,LSTM,MultiHeadAttention and Embed layers,ncnn2table also supports tableless quantization. 
+ncnn2table can generate static weight scales without a calibration dataset for RNN, GRU, LSTM, MultiHeadAttention and Embed layers: ```shell ./ncnn2table rnn.param rnn.bin rnn.table method=kl diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index 55aac995c0e6..7ad0b8e174a9 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -1397,7 +1397,7 @@ int QuantNet::quantize_ACIQ() for (int i = 0; i < mha_layer_count; i++) { const ncnn::Layer* layer = layers[mha_layers[i]]; - const ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layer; + const ncnn::MultiHeadAttention* mha = (const ncnn::MultiHeadAttention*)layer; const int qdim = mha->weight_data_size / mha->embed_dim; mha_stats[i].q_weight_scales.create(mha->embed_dim); From 43bec3db337fcb3a7a3144442118d6e5aa8a5625 Mon Sep 17 00:00:00 2001 From: Rountaboutt Date: Fri, 8 May 2026 14:53:25 +0800 Subject: [PATCH 10/12] Fix the issue of 'const' in ncnn2table and correct docs --- tools/quantize/ncnn2table.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index 7ad0b8e174a9..65139af681b1 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -600,7 +600,7 @@ int QuantNet::quantize_KL() for (int i = 0; i < mha_layer_count; i++) { const ncnn::Layer* layer = layers[mha_layers[i]]; - const ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layer; + const ncnn::MultiHeadAttention* mha = (const ncnn::MultiHeadAttention*)layer; const int qdim = mha->weight_data_size / mha->embed_dim; mha_stats[i].q_weight_scales.create(mha->embed_dim); From a6836281e8f184f774183612e4612e76be29815f Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 18 May 2026 16:50:02 +0800 Subject: [PATCH 11/12] cc --- tools/modelwriter.h | 1 + tools/quantize/ncnn2int8.cpp | 24 ++ tools/quantize/ncnn2table.cpp | 437 ++++++++++------------------------ 3 files changed, 154 insertions(+), 308
deletions(-) diff --git a/tools/modelwriter.h b/tools/modelwriter.h index fdebca7d2c93..a7e1d9a72baa 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -2057,6 +2057,7 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 4=%d", vdim) fprintf_param_value(" 5=%d", attn_mask) fprintf_param_value(" 6=%e", scale) + fprintf_param_value(" 7=%d", kv_cache) fprintf_param_value(" 18=%d", int8_scale_term) fwrite_weight_tag_data(op->q_weight_data, bp); diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index a92305a75324..41a2115b094a 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -128,6 +128,8 @@ class NetQuantize : public ModelWriter int quantize_sdpa(); int fuse_requantize(); + + int check_int8scale_table_requirement(const char* int8scale_table_path) const; }; NetQuantize::NetQuantize() @@ -135,6 +137,25 @@ NetQuantize::NetQuantize() { } +int NetQuantize::check_int8scale_table_requirement(const char* int8scale_table_path) const +{ + if (int8scale_table_path) + return 0; + + for (size_t i = 0; i < layers.size(); i++) + { + const std::string& type = layers[i]->type; + if (type != "Embed" && type != "MultiHeadAttention" && type != "RNN" && type != "LSTM" && type != "GRU") + continue; + + fprintf(stderr, "%s (%s): calibration table is required for static weight quantization\n", layers[i]->name.c_str(), type.c_str()); + fprintf(stderr, "run ncnn2table to generate weight scales and pass the table to ncnn2int8\n"); + return -1; + } + + return 0; +} + int NetQuantize::quantize_convolution() { const int layer_count = static_cast(layers.size()); @@ -1062,6 +1083,9 @@ int main(int argc, char** argv) else quantizer.load_model(inbin); + if (quantizer.check_int8scale_table_requirement(int8scale_table_path) != 0) + return -1; + quantizer.quantize_convolution(); quantizer.quantize_convolutiondepthwise(); quantizer.quantize_innerproduct(); diff --git a/tools/quantize/ncnn2table.cpp 
b/tools/quantize/ncnn2table.cpp index 65139af681b1..a0af1f3e8182 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -104,6 +104,7 @@ class QuantNet : public ncnn::Net int init(); void print_quant_info() const; int save_table(const char* tablepath); + void initialize_static_weight_scales(); int quantize_KL(); int quantize_ACIQ(); int quantize_EQ(); @@ -459,128 +460,14 @@ inline ncnn::Mat read_and_resize_image(const std::vector& shape, const std: return ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h); } -static float compute_kl_divergence(const std::vector& a, const std::vector& b) -{ - const size_t length = a.size(); - - float result = 0; - for (size_t i = 0; i < length; i++) - { - result += a[i] * log(a[i] / b[i]); - } - - return result; -} - -int QuantNet::quantize_KL() +void QuantNet::initialize_static_weight_scales() { - const int input_blob_count = (int)input_blobs.size(); - const int conv_layer_count = (int)conv_layers.size(); - const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); const int embed_layer_count = (int)embed_layers.size(); const int mha_layer_count = (int)mha_layers.size(); const int rnn_layer_count = (int)rnn_layers.size(); const int lstm_layer_count = (int)lstm_layers.size(); const int gru_layer_count = (int)gru_layers.size(); - // initialize conv weight scales - #pragma omp parallel for num_threads(quantize_num_threads) - for (int i = 0; i < conv_layer_count; i++) - { - const ncnn::Layer* layer = layers[conv_layers[i]]; - - if (layer->type == "Convolution") - { - const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer; - - const int num_output = convolution->num_output; - const int kernel_w = convolution->kernel_w; - const int kernel_h = convolution->kernel_h; - const int dilation_w = convolution->dilation_w; - const int dilation_h = convolution->dilation_h; - const int stride_w = convolution->stride_w; - const int stride_h = 
convolution->stride_h; - - const int weight_data_size_output = convolution->weight_data_size / num_output; - - // int8 winograd F43 needs weight data to use 6bit quantization - // TODO proper condition for winograd 3x3 int8 - bool quant_6bit = false; - if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - quant_6bit = true; - - weight_scales[i].create(num_output); - - for (int n = 0; n < num_output; n++) - { - const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output); - - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) - { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); - } - - if (quant_6bit) - { - weight_scales[i][n] = 31 / absmax; - } - else - { - weight_scales[i][n] = 127 / absmax; - } - } - } - - if (layer->type == "ConvolutionDepthWise") - { - const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer; - - const int group = convolutiondepthwise->group; - const int weight_data_size_output = convolutiondepthwise->weight_data_size / group; - - std::vector scales; - - weight_scales[i].create(group); - - for (int n = 0; n < group; n++) - { - const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output); - - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) - { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); - } - - weight_scales[i][n] = 127 / absmax; - } - } - - if (layer->type == "InnerProduct") - { - const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer; - - const int num_output = innerproduct->num_output; - const int weight_data_size_output = innerproduct->weight_data_size / num_output; - - weight_scales[i].create(num_output); - - for (int n = 0; n < num_output; n++) - { - const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output 
* n, weight_data_size_output); - - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) - { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); - } - - weight_scales[i][n] = 127 / absmax; - } - } - } - // initialize embed weight scales for (int i = 0; i < embed_layer_count; i++) { @@ -769,6 +656,126 @@ int QuantNet::quantize_KL() } } } +} + +static float compute_kl_divergence(const std::vector& a, const std::vector& b) +{ + const size_t length = a.size(); + + float result = 0; + for (size_t i = 0; i < length; i++) + { + result += a[i] * log(a[i] / b[i]); + } + + return result; +} + +int QuantNet::quantize_KL() +{ + const int input_blob_count = (int)input_blobs.size(); + const int conv_layer_count = (int)conv_layers.size(); + const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + + // initialize conv weight scales + #pragma omp parallel for num_threads(quantize_num_threads) + for (int i = 0; i < conv_layer_count; i++) + { + const ncnn::Layer* layer = layers[conv_layers[i]]; + + if (layer->type == "Convolution") + { + const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer; + + const int num_output = convolution->num_output; + const int kernel_w = convolution->kernel_w; + const int kernel_h = convolution->kernel_h; + const int dilation_w = convolution->dilation_w; + const int dilation_h = convolution->dilation_h; + const int stride_w = convolution->stride_w; + const int stride_h = convolution->stride_h; + + const int weight_data_size_output = convolution->weight_data_size / num_output; + + // int8 winograd F43 needs weight data to use 6bit quantization + // TODO proper condition for winograd 3x3 int8 + bool quant_6bit = false; + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + quant_6bit = true; + + weight_scales[i].create(num_output); + + for (int n = 0; n < num_output; n++) + { + const ncnn::Mat weight_data_n = 
convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output); + + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } + + if (quant_6bit) + { + weight_scales[i][n] = 31 / absmax; + } + else + { + weight_scales[i][n] = 127 / absmax; + } + } + } + + if (layer->type == "ConvolutionDepthWise") + { + const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer; + + const int group = convolutiondepthwise->group; + const int weight_data_size_output = convolutiondepthwise->weight_data_size / group; + + std::vector scales; + + weight_scales[i].create(group); + + for (int n = 0; n < group; n++) + { + const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output); + + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } + + weight_scales[i][n] = 127 / absmax; + } + } + + if (layer->type == "InnerProduct") + { + const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer; + + const int num_output = innerproduct->num_output; + const int weight_data_size_output = innerproduct->weight_data_size / num_output; + + weight_scales[i].create(num_output); + + for (int n = 0; n < num_output; n++) + { + const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output); + + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } + + weight_scales[i][n] = 127 / absmax; + } + } + } + + initialize_static_weight_scales(); if (conv_layer_count == 0) return 0; @@ -1153,11 +1160,6 @@ int QuantNet::quantize_ACIQ() const int input_blob_count = (int)input_blobs.size(); const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = 
(int)conv_bottom_blobs.size(); - const int embed_layer_count = (int)embed_layers.size(); - const int rnn_layer_count = (int)rnn_layers.size(); - const int lstm_layer_count = (int)lstm_layers.size(); - const int gru_layer_count = (int)gru_layers.size(); - const int mha_layer_count = (int)mha_layers.size(); // initialize conv weight scales #pragma omp parallel for num_threads(quantize_num_threads) @@ -1261,194 +1263,7 @@ int QuantNet::quantize_ACIQ() } } - // initialize embed weight scales - for (int i = 0; i < embed_layer_count; i++) - { - const ncnn::Layer* layer = layers[embed_layers[i]]; - const ncnn::Embed* embed = (const ncnn::Embed*)layer; - const float* ptr = embed->weight_data; - - float absmax = 0.f; - for (int j = 0; j < embed->weight_data.w; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - embed_weight_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; - } - - // initialize rnn weight scales - for (int i = 0; i < rnn_layer_count; i++) - { - const ncnn::Layer* layer = layers[rnn_layers[i]]; - const ncnn::RNN* rnn = (const ncnn::RNN*)layer; - - const int num_directions = rnn->direction == 2 ? 2 : 1; - const int size = rnn->weight_data_size / num_directions / rnn->num_output; - - rnn_stats[i].weight_xc_scales.create(rnn->num_output * num_directions); - rnn_stats[i].weight_hc_scales.create(rnn->num_output * num_directions); - - for (int d = 0; d < num_directions; d++) - { - for (int q = 0; q < rnn->num_output; q++) - { - { - const float* weight_xc_ptr = rnn->weight_xc_data.channel(d).row(q); - float absmax = 0.f; - for (int j = 0; j < size; j++) - { - absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); - } - rnn_stats[i].weight_xc_scales[d * rnn->num_output + q] = absmax == 0.f ? 
1.f : 127 / absmax; - } - - { - const float* weight_hc_ptr = rnn->weight_hc_data.channel(d).row(q); - float absmax = 0.f; - for (int j = 0; j < rnn->num_output; j++) - { - absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); - } - rnn_stats[i].weight_hc_scales[d * rnn->num_output + q] = absmax == 0.f ? 1.f : 127 / absmax; - } - } - } - } - - // initialize lstm weight scales - for (int i = 0; i < lstm_layer_count; i++) - { - const ncnn::Layer* layer = layers[lstm_layers[i]]; - const ncnn::LSTM* lstm = (const ncnn::LSTM*)layer; - - const int num_directions = lstm->direction == 2 ? 2 : 1; - const int size = lstm->weight_data_size / num_directions / lstm->hidden_size / 4; - - lstm_stats[i].weight_xc_scales.create(lstm->hidden_size * 4 * num_directions); - lstm_stats[i].weight_hc_scales.create(lstm->hidden_size * 4 * num_directions); - - for (int d = 0; d < num_directions; d++) - { - for (int q = 0; q < lstm->hidden_size * 4; q++) - { - { - const float* weight_xc_ptr = lstm->weight_xc_data.channel(d).row(q); - float absmax = 0.f; - for (int j = 0; j < size; j++) - { - absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); - } - lstm_stats[i].weight_xc_scales[d * lstm->hidden_size * 4 + q] = absmax == 0.f ? 1.f : 127 / absmax; - } - - { - const float* weight_hc_ptr = lstm->weight_hc_data.channel(d).row(q); - float absmax = 0.f; - for (int j = 0; j < lstm->num_output; j++) - { - absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); - } - lstm_stats[i].weight_hc_scales[d * lstm->hidden_size * 4 + q] = absmax == 0.f ? 1.f : 127 / absmax; - } - } - } - } - - // initialize gru weight scales - for (int i = 0; i < gru_layer_count; i++) - { - const ncnn::Layer* layer = layers[gru_layers[i]]; - const ncnn::GRU* gru = (const ncnn::GRU*)layer; - - const int num_directions = gru->direction == 2 ? 
2 : 1; - const int size = gru->weight_data_size / num_directions / gru->num_output / 3; - - gru_stats[i].weight_xc_scales.create(gru->num_output * 3 * num_directions); - gru_stats[i].weight_hc_scales.create(gru->num_output * 3 * num_directions); - - for (int d = 0; d < num_directions; d++) - { - for (int q = 0; q < gru->num_output * 3; q++) - { - { - const float* weight_xc_ptr = gru->weight_xc_data.channel(d).row(q); - float absmax = 0.f; - for (int j = 0; j < size; j++) - { - absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); - } - gru_stats[i].weight_xc_scales[d * gru->num_output * 3 + q] = absmax == 0.f ? 1.f : 127 / absmax; - } - - { - const float* weight_hc_ptr = gru->weight_hc_data.channel(d).row(q); - float absmax = 0.f; - for (int j = 0; j < gru->num_output; j++) - { - absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); - } - gru_stats[i].weight_hc_scales[d * gru->num_output * 3 + q] = absmax == 0.f ? 1.f : 127 / absmax; - } - } - } - } - - // initialize mha weight scales - for (int i = 0; i < mha_layer_count; i++) - { - const ncnn::Layer* layer = layers[mha_layers[i]]; - const ncnn::MultiHeadAttention* mha = (const ncnn::MultiHeadAttention*)layer; - - const int qdim = mha->weight_data_size / mha->embed_dim; - mha_stats[i].q_weight_scales.create(mha->embed_dim); - for (int j = 0; j < mha->embed_dim; j++) - { - float q_absmax = 0.f; - - const float* q_ptr = (const float*)mha->q_weight_data + j * qdim; - for (int k = 0; k < qdim; k++) - { - q_absmax = std::max(q_absmax, (float)fabs(q_ptr[k])); - } - mha_stats[i].q_weight_scales[j] = q_absmax == 0.f ? 
1.f : 127 / q_absmax; - } - - const int kdim = mha->kdim; - mha_stats[i].k_weight_scales.create(mha->embed_dim); - for (int j = 0; j < mha->embed_dim; j++) - { - float k_absmax = 0.f; - - const float* k_ptr = (const float*)mha->k_weight_data + j * kdim; - for (int k = 0; k < kdim; k++) - { - k_absmax = std::max(k_absmax, (float)fabs(k_ptr[k])); - } - mha_stats[i].k_weight_scales[j] = k_absmax == 0.f ? 1.f : 127 / k_absmax; - } - - const int vdim = mha->vdim; - mha_stats[i].v_weight_scales.create(mha->embed_dim); - for (int j = 0; j < mha->embed_dim; j++) - { - float v_absmax = 0.f; - - const float* v_ptr = (const float*)mha->v_weight_data + j * vdim; - for (int k = 0; k < vdim; k++) - { - v_absmax = std::max(v_absmax, (float)fabs(v_ptr[k])); - } - mha_stats[i].v_weight_scales[j] = v_absmax == 0.f ? 1.f : 127 / v_absmax; - } - - const float* o_ptr = (const float*)mha->out_weight_data; - float o_absmax = 0.f; - for (int k = 0; k < mha->out_weight_data.w; k++) - { - o_absmax = std::max(o_absmax, (float)fabs(o_ptr[k])); - } - mha_stats[i].out_weight_scale = o_absmax == 0.f ? 
1.f : 127 / o_absmax; - } + initialize_static_weight_scales(); if (conv_layer_count == 0) return 0; @@ -2295,6 +2110,12 @@ int main(int argc, char** argv) show_usage(); return -1; } + if (strchr(argv[4], '=')) + { + fprintf(stderr, "calibration dataset is required for activation calibration\n"); + show_usage(); + return -1; + } net.listspaths = parse_comma_path_list(argv[3]); outtable = argv[4]; From 7a11ec27a83375201fdb273cc7260415e4f41652 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 18 May 2026 17:09:40 +0800 Subject: [PATCH 12/12] allow data-free quantize --- tools/quantize/ncnn2table.cpp | 358 +++++++++++++++++----------------- 1 file changed, 176 insertions(+), 182 deletions(-) diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index a0af1f3e8182..8c41205db638 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -99,6 +99,7 @@ class QuantNet : public ncnn::Net std::vector type_to_pixels; int quantize_num_threads; int file_type; + bool use_calibration_dataset; public: int init(); @@ -135,6 +136,7 @@ QuantNet::QuantNet() : blobs(mutable_blobs()), layers(mutable_layers()) { quantize_num_threads = ncnn::get_cpu_count(); + use_calibration_dataset = false; } int QuantNet::init() @@ -222,9 +224,9 @@ int QuantNet::save_table(const char* tablepath) const int lstm_layer_count = (int)lstm_layers.size(); const int gru_layer_count = (int)gru_layers.size(); - fprintf(stdout, "param:%d\n", conv_layer_count); + fprintf(stdout, "param:%d\n", use_calibration_dataset ? 
conv_layer_count : 0); - for (int i = 0; i < conv_layer_count; i++) + for (int i = 0; use_calibration_dataset && i < conv_layer_count; i++) { const ncnn::Mat& weight_scale = weight_scales[i]; @@ -236,7 +238,7 @@ int QuantNet::save_table(const char* tablepath) fprintf(fp, "\n"); } - for (int i = 0; i < conv_bottom_blob_count; i++) + for (int i = 0; use_calibration_dataset && i < conv_bottom_blob_count; i++) { const ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i]; @@ -361,6 +363,9 @@ int QuantNet::save_table(const char* tablepath) void QuantNet::print_quant_info() const { + if (!use_calibration_dataset) + return; + for (int i = 0; i < (int)conv_bottom_blobs.size(); i++) { const QuantBlobStat& stat = quant_blob_stats[i]; @@ -677,107 +682,110 @@ int QuantNet::quantize_KL() const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); - // initialize conv weight scales - #pragma omp parallel for num_threads(quantize_num_threads) - for (int i = 0; i < conv_layer_count; i++) + if (use_calibration_dataset) { - const ncnn::Layer* layer = layers[conv_layers[i]]; - - if (layer->type == "Convolution") + // initialize conv weight scales + #pragma omp parallel for num_threads(quantize_num_threads) + for (int i = 0; i < conv_layer_count; i++) { - const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer; + const ncnn::Layer* layer = layers[conv_layers[i]]; - const int num_output = convolution->num_output; - const int kernel_w = convolution->kernel_w; - const int kernel_h = convolution->kernel_h; - const int dilation_w = convolution->dilation_w; - const int dilation_h = convolution->dilation_h; - const int stride_w = convolution->stride_w; - const int stride_h = convolution->stride_h; + if (layer->type == "Convolution") + { + const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer; - const int weight_data_size_output = convolution->weight_data_size / num_output; + const int num_output = 
convolution->num_output; + const int kernel_w = convolution->kernel_w; + const int kernel_h = convolution->kernel_h; + const int dilation_w = convolution->dilation_w; + const int dilation_h = convolution->dilation_h; + const int stride_w = convolution->stride_w; + const int stride_h = convolution->stride_h; - // int8 winograd F43 needs weight data to use 6bit quantization - // TODO proper condition for winograd 3x3 int8 - bool quant_6bit = false; - if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - quant_6bit = true; + const int weight_data_size_output = convolution->weight_data_size / num_output; - weight_scales[i].create(num_output); + // int8 winograd F43 needs weight data to use 6bit quantization + // TODO proper condition for winograd 3x3 int8 + bool quant_6bit = false; + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + quant_6bit = true; - for (int n = 0; n < num_output; n++) - { - const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output); + weight_scales[i].create(num_output); - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) + for (int n = 0; n < num_output; n++) { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); - } + const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output); - if (quant_6bit) - { - weight_scales[i][n] = 31 / absmax; - } - else - { - weight_scales[i][n] = 127 / absmax; + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } + + if (quant_6bit) + { + weight_scales[i][n] = 31 / absmax; + } + else + { + weight_scales[i][n] = 127 / absmax; + } } } - } - if (layer->type == "ConvolutionDepthWise") - { - const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const 
ncnn::ConvolutionDepthWise*)layer; + if (layer->type == "ConvolutionDepthWise") + { + const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer; - const int group = convolutiondepthwise->group; - const int weight_data_size_output = convolutiondepthwise->weight_data_size / group; + const int group = convolutiondepthwise->group; + const int weight_data_size_output = convolutiondepthwise->weight_data_size / group; - std::vector scales; + std::vector scales; - weight_scales[i].create(group); + weight_scales[i].create(group); - for (int n = 0; n < group; n++) - { - const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output); - - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) + for (int n = 0; n < group; n++) { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); - } + const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output); - weight_scales[i][n] = 127 / absmax; - } - } + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } - if (layer->type == "InnerProduct") - { - const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer; + weight_scales[i][n] = 127 / absmax; + } + } - const int num_output = innerproduct->num_output; - const int weight_data_size_output = innerproduct->weight_data_size / num_output; + if (layer->type == "InnerProduct") + { + const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer; - weight_scales[i].create(num_output); + const int num_output = innerproduct->num_output; + const int weight_data_size_output = innerproduct->weight_data_size / num_output; - for (int n = 0; n < num_output; n++) - { - const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output); + 
weight_scales[i].create(num_output); - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) + for (int n = 0; n < num_output; n++) { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); - } + const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output); + + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } - weight_scales[i][n] = 127 / absmax; + weight_scales[i][n] = 127 / absmax; + } } } } initialize_static_weight_scales(); - if (conv_layer_count == 0) + if (conv_layer_count == 0 || !use_calibration_dataset) return 0; const int file_count = (int)listspaths[0].size(); @@ -1161,111 +1169,114 @@ int QuantNet::quantize_ACIQ() const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); - // initialize conv weight scales - #pragma omp parallel for num_threads(quantize_num_threads) - for (int i = 0; i < conv_layer_count; i++) + if (use_calibration_dataset) { - const ncnn::Layer* layer = layers[conv_layers[i]]; - - if (layer->type == "Convolution") + // initialize conv weight scales + #pragma omp parallel for num_threads(quantize_num_threads) + for (int i = 0; i < conv_layer_count; i++) { - const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer; + const ncnn::Layer* layer = layers[conv_layers[i]]; - const int num_output = convolution->num_output; - const int kernel_w = convolution->kernel_w; - const int kernel_h = convolution->kernel_h; - const int dilation_w = convolution->dilation_w; - const int dilation_h = convolution->dilation_h; - const int stride_w = convolution->stride_w; - const int stride_h = convolution->stride_h; + if (layer->type == "Convolution") + { + const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer; - const int weight_data_size_output = convolution->weight_data_size / num_output; + const 
int num_output = convolution->num_output; + const int kernel_w = convolution->kernel_w; + const int kernel_h = convolution->kernel_h; + const int dilation_w = convolution->dilation_w; + const int dilation_h = convolution->dilation_h; + const int stride_w = convolution->stride_w; + const int stride_h = convolution->stride_h; - // int8 winograd F43 needs weight data to use 6bit quantization - // TODO proper condition for winograd 3x3 int8 - bool quant_6bit = false; - if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - quant_6bit = true; + const int weight_data_size_output = convolution->weight_data_size / num_output; - weight_scales[i].create(num_output); + // int8 winograd F43 needs weight data to use 6bit quantization + // TODO proper condition for winograd 3x3 int8 + bool quant_6bit = false; + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + quant_6bit = true; - for (int n = 0; n < num_output; n++) - { - const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output); + weight_scales[i].create(num_output); - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) + for (int n = 0; n < num_output; n++) { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); - } + const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output); - if (quant_6bit) - { - const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output, 6); - weight_scales[i][n] = 31 / threshold; - } - else - { - const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output); - weight_scales[i][n] = 127 / threshold; + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } + + if (quant_6bit) + { + const float threshold = 
compute_aciq_gaussian_clip(absmax, weight_data_size_output, 6); + weight_scales[i][n] = 31 / threshold; + } + else + { + const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output); + weight_scales[i][n] = 127 / threshold; + } } } - } - if (layer->type == "ConvolutionDepthWise") - { - const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer; - - const int group = convolutiondepthwise->group; - const int weight_data_size_output = convolutiondepthwise->weight_data_size / group; + if (layer->type == "ConvolutionDepthWise") + { + const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer; - std::vector scales; + const int group = convolutiondepthwise->group; + const int weight_data_size_output = convolutiondepthwise->weight_data_size / group; - weight_scales[i].create(group); + std::vector scales; - for (int n = 0; n < group; n++) - { - const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output); + weight_scales[i].create(group); - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) + for (int n = 0; n < group; n++) { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); - } + const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output); - const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output); - weight_scales[i][n] = 127 / threshold; - } - } + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } - if (layer->type == "InnerProduct") - { - const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer; + const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output); + weight_scales[i][n] = 127 / threshold; + } + } - const int num_output = innerproduct->num_output; - 
const int weight_data_size_output = innerproduct->weight_data_size / num_output; + if (layer->type == "InnerProduct") + { + const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer; - weight_scales[i].create(num_output); + const int num_output = innerproduct->num_output; + const int weight_data_size_output = innerproduct->weight_data_size / num_output; - for (int n = 0; n < num_output; n++) - { - const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output); + weight_scales[i].create(num_output); - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) + for (int n = 0; n < num_output; n++) { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); - } + const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output); + + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } - const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output); - weight_scales[i][n] = 127 / threshold; + const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output); + weight_scales[i][n] = 127 / threshold; + } } } } initialize_static_weight_scales(); - if (conv_layer_count == 0) + if (conv_layer_count == 0 || !use_calibration_dataset) return 0; const int file_count = (int)listspaths[0].size(); @@ -1498,7 +1509,7 @@ int QuantNet::quantize_EQ() const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); - if (conv_layer_count == 0) + if (conv_layer_count == 0 || !use_calibration_dataset) return 0; std::vector blob_allocators(quantize_num_threads); @@ -2098,47 +2109,30 @@ int main(int argc, char** argv) net.init(); - const bool need_calibration_dataset = !net.conv_layers.empty(); - const char* outtable = 0; int kv_start = 0; - if (need_calibration_dataset) + if 
(argc >= 5 && strchr(argv[4], '=')) + { + outtable = argv[3]; + kv_start = 4; + } + else if (argc >= 5) { - if (argc < 5) - { - show_usage(); - return -1; - } - if (strchr(argv[4], '=')) - { - fprintf(stderr, "calibration dataset is required for activation calibration\n"); - show_usage(); - return -1; - } - net.listspaths = parse_comma_path_list(argv[3]); + net.use_calibration_dataset = true; outtable = argv[4]; kv_start = 5; } else { - if (argc >= 5 && strchr(argv[4], '=')) - { - outtable = argv[3]; - kv_start = 4; - } - else if (argc >= 5) - { - net.listspaths = parse_comma_path_list(argv[3]); - outtable = argv[4]; - kv_start = 5; - } - else - { - outtable = argv[3]; - kv_start = 4; - } + outtable = argv[3]; + kv_start = 4; + } + + if (!net.conv_layers.empty() && !net.use_calibration_dataset) + { + fprintf(stderr, "warning: calibration dataset not provided, skip activation calibration and generate weight-only table\n"); } std::string method = "kl"; @@ -2180,27 +2174,27 @@ int main(int argc, char** argv) // sanity check const size_t input_blob_count = net.input_blobs.size(); - if (need_calibration_dataset && net.listspaths.size() != input_blob_count) + if (net.use_calibration_dataset && net.listspaths.size() != input_blob_count) { fprintf(stderr, "expect %d lists, but got %d\n", (int)input_blob_count, (int)net.listspaths.size()); return -1; } - if (need_calibration_dataset && (0 == net.file_type) && (net.means.size() != input_blob_count)) + if (net.use_calibration_dataset && (0 == net.file_type) && (net.means.size() != input_blob_count)) { fprintf(stderr, "expect %d means, but got %d\n", (int)input_blob_count, (int)net.means.size()); return -1; } - if (need_calibration_dataset && (0 == net.file_type) && (net.norms.size() != input_blob_count)) + if (net.use_calibration_dataset && (0 == net.file_type) && (net.norms.size() != input_blob_count)) { fprintf(stderr, "expect %d norms, but got %d\n", (int)input_blob_count, (int)net.norms.size()); return -1; } - if 
(need_calibration_dataset && net.shapes.size() != input_blob_count) + if (net.use_calibration_dataset && net.shapes.size() != input_blob_count) { fprintf(stderr, "expect %d shapes, but got %d\n", (int)input_blob_count, (int)net.shapes.size()); return -1; } - if (need_calibration_dataset && (0 == net.file_type) && (net.type_to_pixels.size() != input_blob_count)) + if (net.use_calibration_dataset && (0 == net.file_type) && (net.type_to_pixels.size() != input_blob_count)) { fprintf(stderr, "expect %d pixels, but got %d\n", (int)input_blob_count, (int)net.type_to_pixels.size()); return -1;