diff --git a/docs/how-to-use-and-FAQ/quantized-int8-inference.md b/docs/how-to-use-and-FAQ/quantized-int8-inference.md index bb0b0a7330b7..10ab96843c92 100644 --- a/docs/how-to-use-and-FAQ/quantized-int8-inference.md +++ b/docs/how-to-use-and-FAQ/quantized-int8-inference.md @@ -89,16 +89,16 @@ filelist_in2.txt ``` **Here shape is WHC, because the order of the arguments to `ncnn::Mat`.** -### 3. Quantize model +ncnn2table can generate static weight scales without a calibration dataset for RNN,GRU,LSTM,MultiHeadAttention and Embed layers ```shell -./ncnn2int8 mobilenet-opt.param mobilenet-opt.bin mobilenet-int8.param mobilenet-int8.bin mobilenet.table +./ncnn2table rnn.param rnn.bin rnn.table method=kl ``` -If you don’t need static quantization, ncnn supports RNN/LSTM/GRU dynamic quantization. In this case, you can omit the table file. +### 3. Quantize model ```shell -./ncnn2int8 rnn-model.param rnn-model.bin rnn-model-int8.param rnn-model-int8.bin +./ncnn2int8 mobilenet-opt.param mobilenet-opt.bin mobilenet-int8.param mobilenet-int8.bin mobilenet.table ``` ## use ncnn int8 inference diff --git a/tools/modelwriter.h b/tools/modelwriter.h index fdebca7d2c93..a7e1d9a72baa 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -2057,6 +2057,7 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 4=%d", vdim) fprintf_param_value(" 5=%d", attn_mask) fprintf_param_value(" 6=%e", scale) + fprintf_param_value(" 7=%d", kv_cache) fprintf_param_value(" 18=%d", int8_scale_term) fwrite_weight_tag_data(op->q_weight_data, bp); diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 55db8d79c2af..41a2115b094a 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -128,6 +128,8 @@ class NetQuantize : public ModelWriter int quantize_sdpa(); int fuse_requantize(); + + int check_int8scale_table_requirement(const char* int8scale_table_path) const; }; NetQuantize::NetQuantize() @@ -135,6 +137,25 @@ NetQuantize::NetQuantize() { } +int NetQuantize::check_int8scale_table_requirement(const char* int8scale_table_path) const +{ + if (int8scale_table_path) + return 0; + + for (size_t i = 0; i < layers.size(); i++) + { + const std::string& type = layers[i]->type; + if (type != "Embed" && type != "MultiHeadAttention" && type != "RNN" && type != "LSTM" && type != "GRU") + continue; + + fprintf(stderr, "%s (%s): calibration table is required for static weight quantization\n", layers[i]->name.c_str(), type.c_str()); + fprintf(stderr, "run ncnn2table to generate weight scales and pass the table to ncnn2int8\n"); + return -1; + } + + return 0; +} + int NetQuantize::quantize_convolution() { const int layer_count = static_cast(layers.size()); @@ -317,43 +338,34 @@ int NetQuantize::quantize_rnn() if (layers[i]->type != "RNN") continue; + char key_xc[256]; + snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter_xc = weight_int8scale_table.find(key_xc); + if (iter_xc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_hc[256]; + snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str()); + std::map::iterator iter_hc = weight_int8scale_table.find(key_hc); + if (iter_hc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // RNN - quantize weight from fp32 to int8 ncnn::RNN* rnn = (ncnn::RNN*)layers[i]; fprintf(stderr, "quantize_rnn %s\n", rnn->name.c_str()); - // TODO move to ncnn2table const int num_directions = rnn->direction == 2 ? 2 : 1; const int size = rnn->weight_data_size / num_directions / rnn->num_output; - ncnn::Mat weight_xc_data_int8_scales(rnn->num_output * num_directions); - ncnn::Mat weight_hc_data_int8_scales(rnn->num_output * num_directions); - - for (int d = 0; d < num_directions; d++) - { - for (int q = 0; q < rnn->num_output; q++) - { - { - const float* weight_xc_ptr = rnn->weight_xc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i])); - } - weight_xc_data_int8_scales[d * rnn->num_output + q] = 127 / absmax; - } - - { - const float* weight_hc_ptr = rnn->weight_hc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i])); - } - weight_hc_data_int8_scales[d * rnn->num_output + q] = 127 / absmax; - } - } - } + ncnn::Mat weight_xc_data_int8_scales = iter_xc->second; + ncnn::Mat weight_hc_data_int8_scales = iter_hc->second; { ncnn::Mat weight_xc_data_r2 = rnn->weight_xc_data.reshape(size, rnn->num_output * num_directions); @@ -399,43 +411,34 @@ int NetQuantize::quantize_lstm() if (layers[i]->type != "LSTM") continue; + char key_xc[256]; + snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter_xc = weight_int8scale_table.find(key_xc); + if (iter_xc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_hc[256]; + snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str()); + std::map::iterator iter_hc = weight_int8scale_table.find(key_hc); + if (iter_hc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // LSTM - quantize weight from fp32 to int8 ncnn::LSTM* lstm = (ncnn::LSTM*)layers[i]; fprintf(stderr, "quantize_lstm %s\n", lstm->name.c_str()); - // TODO move to ncnn2table const int num_directions = lstm->direction == 2 ? 2 : 1; const int size = lstm->weight_data_size / num_directions / lstm->hidden_size / 4; - ncnn::Mat weight_xc_data_int8_scales(lstm->hidden_size * 4 * num_directions); - ncnn::Mat weight_hc_data_int8_scales(lstm->hidden_size * 4 * num_directions); - - for (int d = 0; d < num_directions; d++) - { - for (int q = 0; q < lstm->hidden_size * 4; q++) - { - { - const float* weight_xc_ptr = lstm->weight_xc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i])); - } - weight_xc_data_int8_scales[d * lstm->hidden_size * 4 + q] = 127 / absmax; - } - - { - const float* weight_hc_ptr = lstm->weight_hc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i])); - } - weight_hc_data_int8_scales[d * lstm->hidden_size * 4 + q] = 127 / absmax; - } - } - } + ncnn::Mat weight_xc_data_int8_scales = iter_xc->second; + ncnn::Mat weight_hc_data_int8_scales = iter_hc->second; { ncnn::Mat weight_xc_data_r2 = lstm->weight_xc_data.reshape(size, lstm->hidden_size * 4 * num_directions); @@ -481,43 +484,34 @@ int NetQuantize::quantize_gru() if (layers[i]->type != "GRU") continue; + char key_xc[256]; + snprintf(key_xc, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter_xc = weight_int8scale_table.find(key_xc); + if (iter_xc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_hc[256]; + snprintf(key_hc, 256, "%s_param_1", layers[i]->name.c_str()); + std::map::iterator iter_hc = weight_int8scale_table.find(key_hc); + if (iter_hc == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // GRU - quantize weight from fp32 to int8 ncnn::GRU* gru = (ncnn::GRU*)layers[i]; fprintf(stderr, "quantize_gru %s\n", gru->name.c_str()); - // TODO move to ncnn2table const int num_directions = gru->direction == 2 ? 2 : 1; const int size = gru->weight_data_size / num_directions / gru->num_output / 3; - ncnn::Mat weight_xc_data_int8_scales(gru->num_output * 3 * num_directions); - ncnn::Mat weight_hc_data_int8_scales(gru->num_output * 3 * num_directions); - - for (int d = 0; d < num_directions; d++) - { - for (int q = 0; q < gru->num_output * 3; q++) - { - { - const float* weight_xc_ptr = gru->weight_xc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_xc_ptr[i])); - } - weight_xc_data_int8_scales[d * gru->num_output * 3 + q] = 127 / absmax; - } - - { - const float* weight_hc_ptr = gru->weight_hc_data.channel(d).row(q); - float absmax = 0.f; - for (int i = 0; i < size; i++) - { - absmax = std::max(absmax, (float)fabs(weight_hc_ptr[i])); - } - weight_hc_data_int8_scales[d * gru->num_output * 3 + q] = 127 / absmax; - } - } - } + ncnn::Mat weight_xc_data_int8_scales = iter_xc->second; + ncnn::Mat weight_hc_data_int8_scales = iter_hc->second; { ncnn::Mat weight_xc_data_r2 = gru->weight_xc_data.reshape(size, gru->num_output * 3 * num_directions); @@ -563,27 +557,24 @@ int NetQuantize::quantize_embed() if (layers[i]->type != "Embed") continue; + char key[256]; + snprintf(key, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter = weight_int8scale_table.find(key); + if (iter == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // Embed - quantize weight from fp32 to int8 ncnn::Embed* embed = (ncnn::Embed*)layers[i]; fprintf(stderr, "quantize_embed %s\n", embed->name.c_str()); - // TODO move to ncnn2table - const int num_output = embed->num_output; const int input_dim = embed->input_dim; - ncnn::Mat weight_data_int8_scales(1); - { - const float* ptr = embed->weight_data; - float absmax = 0.f; - for (int i = 0; i < embed->weight_data.w; i++) - { - absmax = std::max(absmax, (float)fabs(ptr[i])); - } - - weight_data_int8_scales[0] = absmax == 0.f ? 1.f : 127 / absmax; - } + ncnn::Mat weight_data_int8_scales = iter->second; { ncnn::Mat weight_data_int8; @@ -719,29 +710,51 @@ int NetQuantize::quantize_multiheadattention() if (layers[i]->type != "MultiHeadAttention") continue; + char key_q[256]; + snprintf(key_q, 256, "%s_param_0", layers[i]->name.c_str()); + std::map::iterator iter_q = weight_int8scale_table.find(key_q); + if (iter_q == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_k[256]; + snprintf(key_k, 256, "%s_param_1", layers[i]->name.c_str()); + std::map::iterator iter_k = weight_int8scale_table.find(key_k); + if (iter_k == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_v[256]; + snprintf(key_v, 256, "%s_param_2", layers[i]->name.c_str()); + std::map::iterator iter_v = weight_int8scale_table.find(key_v); + if (iter_v == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + char key_out[256]; + snprintf(key_out, 256, "%s_param_3", layers[i]->name.c_str()); + std::map::iterator iter_out = weight_int8scale_table.find(key_out); + if (iter_out == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + // MultiHeadAttention - quantize weight from fp32 to int8 ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layers[i]; fprintf(stderr, "quantize_multiheadattention %s\n", mha->name.c_str()); - // TODO move to ncnn2table - const int qdim = mha->weight_data_size / mha->embed_dim; { - mha->q_weight_data_int8_scales.create(mha->embed_dim); - for (int i = 0; i < mha->embed_dim; i++) - { - float absmax = 0.f; - - const float* ptr = (const float*)mha->q_weight_data + i * qdim; - for (int j = 0; j < qdim; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - - mha->q_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; - } + mha->q_weight_data_int8_scales = iter_q->second; ncnn::Mat q_weight_data = mha->q_weight_data.reshape(qdim, mha->embed_dim); ncnn::Mat q_weight_data_int8; @@ -757,19 +770,7 @@ int NetQuantize::quantize_multiheadattention() } { - mha->k_weight_data_int8_scales.create(mha->embed_dim); - for (int i = 0; i < mha->embed_dim; i++) - { - float absmax = 0.f; - - const float* ptr = (const float*)mha->k_weight_data + i * mha->kdim; - for (int j = 0; j < mha->kdim; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - - mha->k_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; - } + mha->k_weight_data_int8_scales = iter_k->second; ncnn::Mat k_weight_data = mha->k_weight_data.reshape(mha->kdim, mha->embed_dim); ncnn::Mat k_weight_data_int8; @@ -785,19 +786,7 @@ int NetQuantize::quantize_multiheadattention() } { - mha->v_weight_data_int8_scales.create(mha->embed_dim); - for (int i = 0; i < mha->embed_dim; i++) - { - float absmax = 0.f; - - const float* ptr = (const float*)mha->v_weight_data + i * mha->vdim; - for (int j = 0; j < mha->vdim; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - - mha->v_weight_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; - } + mha->v_weight_data_int8_scales = iter_v->second; ncnn::Mat v_weight_data = mha->v_weight_data.reshape(mha->vdim, mha->embed_dim); ncnn::Mat v_weight_data_int8; @@ -813,17 +802,8 @@ int NetQuantize::quantize_multiheadattention() } { - const float* ptr = mha->out_weight_data; - float absmax = 0.f; - for (int j = 0; j < mha->out_weight_data.w; j++) - { - absmax = std::max(absmax, (float)fabs(ptr[j])); - } - - mha->out_weight_data_int8_scale = absmax == 0.f ? 1.f : 127 / absmax; - - ncnn::Mat out_weight_data_int8_scales(1); - out_weight_data_int8_scales[0] = mha->out_weight_data_int8_scale; + ncnn::Mat out_weight_data_int8_scales = iter_out->second; + mha->out_weight_data_int8_scale = out_weight_data_int8_scales[0]; ncnn::Mat out_weight_data_int8; @@ -854,7 +834,7 @@ int NetQuantize::quantize_sdpa() fprintf(stderr, "quantize_sdpa %s\n", sdpa->name.c_str()); - // TODO move to ncnn2table + // SDPA uses dynamic activation quantization in forward_int8 sdpa->int8_scale_term = 2; } @@ -1103,6 +1083,9 @@ int main(int argc, char** argv) else quantizer.load_model(inbin); + if (quantizer.check_int8scale_table_requirement(int8scale_table_path) != 0) + return -1; + quantizer.quantize_convolution(); quantizer.quantize_convolutiondepthwise(); quantizer.quantize_innerproduct(); diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index 7edbdd15128d..8c41205db638 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -38,6 +38,11 @@ #include "layer/convolution.h" #include "layer/convolutiondepthwise.h" #include "layer/innerproduct.h" +#include "layer/embed.h" +#include "layer/multiheadattention.h" +#include "layer/rnn.h" +#include "layer/lstm.h" +#include "layer/gru.h" class QuantBlobStat { @@ -61,6 +66,23 @@ class QuantBlobStat std::vector histogram_normed; }; +class QuantMHAStat +{ +public: + ncnn::Mat q_weight_scales; + ncnn::Mat k_weight_scales; + ncnn::Mat v_weight_scales; + float out_weight_scale; +}; + +// rnn, gru, lstm +class QuantRecurrentStat +{ +public: + ncnn::Mat weight_xc_scales; + ncnn::Mat weight_hc_scales; +}; + class QuantNet : public ncnn::Net { public: @@ -77,11 +99,13 @@ class QuantNet : public ncnn::Net std::vector type_to_pixels; int quantize_num_threads; int file_type; + bool use_calibration_dataset; public: int init(); void print_quant_info() const; int save_table(const char* tablepath); + void initialize_static_weight_scales(); int quantize_KL(); int quantize_ACIQ(); int quantize_EQ(); @@ -91,17 +115,28 @@ class QuantNet : public ncnn::Net std::vector conv_layers; std::vector conv_bottom_blobs; std::vector conv_top_blobs; + std::vector embed_layers; + std::vector mha_layers; + std::vector rnn_layers; + std::vector lstm_layers; + std::vector gru_layers; // result std::vector quant_blob_stats; std::vector weight_scales; std::vector bottom_blob_scales; + std::vector embed_weight_scales; + std::vector mha_stats; + std::vector rnn_stats; + std::vector lstm_stats; + std::vector gru_stats; }; QuantNet::QuantNet() : blobs(mutable_blobs()), layers(mutable_layers()) { quantize_num_threads = ncnn::get_cpu_count(); + use_calibration_dataset = false; } int QuantNet::init() @@ -126,14 +161,48 @@ int QuantNet::init() conv_bottom_blobs.push_back(layer->bottoms[0]); conv_top_blobs.push_back(layer->tops[0]); } + + // find embed layers + else if (layer->type == "Embed") + { + embed_layers.push_back(i); + } + + // find all mha layers + else if (layer->type == "MultiHeadAttention") + { + mha_layers.push_back(i); + } + else if (layer->type == "RNN") + { + rnn_layers.push_back(i); + } + else if (layer->type == "LSTM") + { + lstm_layers.push_back(i); + } + else if (layer->type == "GRU") + { + gru_layers.push_back(i); + } } const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + const int embed_layer_count = (int)embed_layers.size(); + const int mha_layer_count = (int)mha_layers.size(); + const int rnn_layer_count = (int)rnn_layers.size(); + const int lstm_layer_count = (int)lstm_layers.size(); + const int gru_layer_count = (int)gru_layers.size(); quant_blob_stats.resize(conv_bottom_blob_count); weight_scales.resize(conv_layer_count); bottom_blob_scales.resize(conv_bottom_blob_count); + embed_weight_scales.resize(embed_layer_count); + mha_stats.resize(mha_layer_count); + rnn_stats.resize(rnn_layer_count); + lstm_stats.resize(lstm_layer_count); + gru_stats.resize(gru_layer_count); return 0; } @@ -149,10 +218,15 @@ int QuantNet::save_table(const char* tablepath) const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + const int embed_layer_count = (int)embed_layers.size(); + const int mha_layer_count = (int)mha_layers.size(); + const int rnn_layer_count = (int)rnn_layers.size(); + const int lstm_layer_count = (int)lstm_layers.size(); + const int gru_layer_count = (int)gru_layers.size(); - fprintf(stdout, "param:%d\n", conv_layer_count); + fprintf(stdout, "param:%d\n", use_calibration_dataset ? conv_layer_count : 0); - for (int i = 0; i < conv_layer_count; i++) + for (int i = 0; use_calibration_dataset && i < conv_layer_count; i++) { const ncnn::Mat& weight_scale = weight_scales[i]; @@ -164,7 +238,7 @@ int QuantNet::save_table(const char* tablepath) fprintf(fp, "\n"); } - for (int i = 0; i < conv_bottom_blob_count; i++) + for (int i = 0; use_calibration_dataset && i < conv_bottom_blob_count; i++) { const ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i]; @@ -176,6 +250,110 @@ int QuantNet::save_table(const char* tablepath) fprintf(fp, "\n"); } + fprintf(stdout, "param:%d\n", embed_layer_count); + for (int i = 0; i < embed_layer_count; i++) + { + fprintf(fp, "%s_param_0 ", layers[embed_layers[i]]->name.c_str()); + fprintf(fp, "%f ", embed_weight_scales[i]); + fprintf(fp, "\n"); + } + + fprintf(stdout, "param:%d\n", mha_layer_count); + for (int i = 0; i < mha_layer_count; i++) + { + // q_weight + const ncnn::Mat q_weight_scales = mha_stats[i].q_weight_scales; + fprintf(fp, "%s_param_0 ", layers[mha_layers[i]]->name.c_str()); + for (int j = 0; j < q_weight_scales.w; j++) + { + fprintf(fp, "%f ", q_weight_scales[j]); + } + fprintf(fp, "\n"); + + // k_weight + const ncnn::Mat k_weight_scales = mha_stats[i].k_weight_scales; + fprintf(fp, "%s_param_1 ", layers[mha_layers[i]]->name.c_str()); + for (int j = 0; j < k_weight_scales.w; j++) + { + fprintf(fp, "%f ", k_weight_scales[j]); + } + fprintf(fp, "\n"); + + // v_weight + const ncnn::Mat v_weight_scales = mha_stats[i].v_weight_scales; + fprintf(fp, "%s_param_2 ", layers[mha_layers[i]]->name.c_str()); + for (int j = 0; j < v_weight_scales.w; j++) + { + fprintf(fp, "%f ", v_weight_scales[j]); + } + fprintf(fp, "\n"); + + // out_weight + fprintf(fp, "%s_param_3 ", layers[mha_layers[i]]->name.c_str()); + fprintf(fp, "%f ", mha_stats[i].out_weight_scale); + fprintf(fp, "\n"); + } + + fprintf(stdout, "param:%d\n", rnn_layer_count); + for (int i = 0; i < rnn_layer_count; i++) + { + const ncnn::Mat weight_xc_scales = rnn_stats[i].weight_xc_scales; + fprintf(fp, "%s_param_0 ", layers[rnn_layers[i]]->name.c_str()); + for (int j = 0; j < weight_xc_scales.w; j++) + { + fprintf(fp, "%f ", weight_xc_scales[j]); + } + fprintf(fp, "\n"); + + const ncnn::Mat weight_hc_scales = rnn_stats[i].weight_hc_scales; + fprintf(fp, "%s_param_1 ", layers[rnn_layers[i]]->name.c_str()); + for (int j = 0; j < weight_hc_scales.w; j++) + { + fprintf(fp, "%f ", weight_hc_scales[j]); + } + fprintf(fp, "\n"); + } + + fprintf(stdout, "param:%d\n", lstm_layer_count); + for (int i = 0; i < lstm_layer_count; i++) + { + const ncnn::Mat weight_xc_scales = lstm_stats[i].weight_xc_scales; + fprintf(fp, "%s_param_0 ", layers[lstm_layers[i]]->name.c_str()); + for (int j = 0; j < weight_xc_scales.w; j++) + { + fprintf(fp, "%f ", weight_xc_scales[j]); + } + fprintf(fp, "\n"); + + const ncnn::Mat weight_hc_scales = lstm_stats[i].weight_hc_scales; + fprintf(fp, "%s_param_1 ", layers[lstm_layers[i]]->name.c_str()); + for (int j = 0; j < weight_hc_scales.w; j++) + { + fprintf(fp, "%f ", weight_hc_scales[j]); + } + fprintf(fp, "\n"); + } + + fprintf(stdout, "param:%d\n", gru_layer_count); + for (int i = 0; i < gru_layer_count; i++) + { + const ncnn::Mat weight_xc_scales = gru_stats[i].weight_xc_scales; + fprintf(fp, "%s_param_0 ", layers[gru_layers[i]]->name.c_str()); + for (int j = 0; j < weight_xc_scales.w; j++) + { + fprintf(fp, "%f ", weight_xc_scales[j]); + } + fprintf(fp, "\n"); + + const ncnn::Mat weight_hc_scales = gru_stats[i].weight_hc_scales; + fprintf(fp, "%s_param_1 ", layers[gru_layers[i]]->name.c_str()); + for (int j = 0; j < weight_hc_scales.w; j++) + { + fprintf(fp, "%f ", weight_hc_scales[j]); + } + fprintf(fp, "\n"); + } + fclose(fp); fprintf(stderr, "ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\\(^0^)/...233...\n"); @@ -185,6 +363,9 @@ int QuantNet::save_table(const char* tablepath) void QuantNet::print_quant_info() const { + if (!use_calibration_dataset) + return; + for (int i = 0; i < (int)conv_bottom_blobs.size(); i++) { const QuantBlobStat& stat = quant_blob_stats[i]; @@ -284,129 +465,336 @@ inline ncnn::Mat read_and_resize_image(const std::vector& shape, const std: return ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h); } -static float compute_kl_divergence(const std::vector& a, const std::vector& b) +void QuantNet::initialize_static_weight_scales() { - const size_t length = a.size(); - - float result = 0; - for (size_t i = 0; i < length; i++) + const int embed_layer_count = (int)embed_layers.size(); + const int mha_layer_count = (int)mha_layers.size(); + const int rnn_layer_count = (int)rnn_layers.size(); + const int lstm_layer_count = (int)lstm_layers.size(); + const int gru_layer_count = (int)gru_layers.size(); + + // initialize embed weight scales + for (int i = 0; i < embed_layer_count; i++) { - result += a[i] * log(a[i] / b[i]); + const ncnn::Layer* layer = layers[embed_layers[i]]; + const ncnn::Embed* embed = (const ncnn::Embed*)layer; + const float* ptr = embed->weight_data; + + float absmax = 0.f; + for (int j = 0; j < embed->weight_data.w; j++) + { + absmax = std::max(absmax, (float)fabs(ptr[j])); + } + embed_weight_scales[i] = absmax == 0.f ? 1.f : 127 / absmax; } - return result; -} + // initialize mha weight scales + for (int i = 0; i < mha_layer_count; i++) + { + const ncnn::Layer* layer = layers[mha_layers[i]]; + const ncnn::MultiHeadAttention* mha = (const ncnn::MultiHeadAttention*)layer; -int QuantNet::quantize_KL() -{ - const int input_blob_count = (int)input_blobs.size(); - const int conv_layer_count = (int)conv_layers.size(); - const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); - const int file_count = (int)listspaths[0].size(); + const int qdim = mha->weight_data_size / mha->embed_dim; + mha_stats[i].q_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float q_absmax = 0.f; - const int num_histogram_bins = 2048; + const float* q_ptr = (const float*)mha->q_weight_data + j * qdim; + for (int k = 0; k < qdim; k++) + { + q_absmax = std::max(q_absmax, (float)fabs(q_ptr[k])); + } + mha_stats[i].q_weight_scales[j] = q_absmax == 0.f ? 1.f : 127 / q_absmax; + } - std::vector blob_allocators(quantize_num_threads); - std::vector workspace_allocators(quantize_num_threads); + const int kdim = mha->kdim; + mha_stats[i].k_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) + { + float k_absmax = 0.f; - // initialize conv weight scales - #pragma omp parallel for num_threads(quantize_num_threads) - for (int i = 0; i < conv_layer_count; i++) - { - const ncnn::Layer* layer = layers[conv_layers[i]]; + const float* k_ptr = (const float*)mha->k_weight_data + j * kdim; + for (int k = 0; k < kdim; k++) + { + k_absmax = std::max(k_absmax, (float)fabs(k_ptr[k])); + } + mha_stats[i].k_weight_scales[j] = k_absmax == 0.f ? 1.f : 127 / k_absmax; + } - if (layer->type == "Convolution") + const int vdim = mha->vdim; + mha_stats[i].v_weight_scales.create(mha->embed_dim); + for (int j = 0; j < mha->embed_dim; j++) { - const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer; + float v_absmax = 0.f; + + const float* v_ptr = (const float*)mha->v_weight_data + j * vdim; + for (int k = 0; k < vdim; k++) + { + v_absmax = std::max(v_absmax, (float)fabs(v_ptr[k])); + } + mha_stats[i].v_weight_scales[j] = v_absmax == 0.f ? 1.f : 127 / v_absmax; + } - const int num_output = convolution->num_output; - const int kernel_w = convolution->kernel_w; - const int kernel_h = convolution->kernel_h; - const int dilation_w = convolution->dilation_w; - const int dilation_h = convolution->dilation_h; - const int stride_w = convolution->stride_w; - const int stride_h = convolution->stride_h; + const float* o_ptr = (const float*)mha->out_weight_data; + float o_absmax = 0.f; + for (int k = 0; k < mha->out_weight_data.w; k++) + { + o_absmax = std::max(o_absmax, (float)fabs(o_ptr[k])); + } + mha_stats[i].out_weight_scale = o_absmax == 0.f ? 1.f : 127 / o_absmax; + } - const int weight_data_size_output = convolution->weight_data_size / num_output; + // initialize rnn weight scales + for (int i = 0; i < rnn_layer_count; i++) + { + const ncnn::Layer* layer = layers[rnn_layers[i]]; + const ncnn::RNN* rnn = (const ncnn::RNN*)layer; - // int8 winograd F43 needs weight data to use 6bit quantization - // TODO proper condition for winograd 3x3 int8 - bool quant_6bit = false; - if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - quant_6bit = true; + const int num_directions = rnn->direction == 2 ? 2 : 1; + const int size = rnn->weight_data_size / num_directions / rnn->num_output; - weight_scales[i].create(num_output); + rnn_stats[i].weight_xc_scales.create(rnn->num_output * num_directions); + rnn_stats[i].weight_hc_scales.create(rnn->num_output * num_directions); - for (int n = 0; n < num_output; n++) + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < rnn->num_output; q++) { - const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output); + { + const float* weight_xc_ptr = rnn->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + rnn_stats[i].weight_xc_scales[d * rnn->num_output + q] = absmax == 0.f ? 1.f : 127 / absmax; + } - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + const float* weight_hc_ptr = rnn->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < rnn->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + rnn_stats[i].weight_hc_scales[d * rnn->num_output + q] = absmax == 0.f ? 1.f : 127 / absmax; } + } + } + } + + // initialize lstm weight scales + for (int i = 0; i < lstm_layer_count; i++) + { + const ncnn::Layer* layer = layers[lstm_layers[i]]; + const ncnn::LSTM* lstm = (const ncnn::LSTM*)layer; - if (quant_6bit) + const int num_directions = lstm->direction == 2 ? 2 : 1; + const int size = lstm->weight_data_size / num_directions / lstm->hidden_size / 4; + + lstm_stats[i].weight_xc_scales.create(lstm->hidden_size * 4 * num_directions); + lstm_stats[i].weight_hc_scales.create(lstm->hidden_size * 4 * num_directions); + + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < lstm->hidden_size * 4; q++) + { { - weight_scales[i][n] = 31 / absmax; + const float* weight_xc_ptr = lstm->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + lstm_stats[i].weight_xc_scales[d * lstm->hidden_size * 4 + q] = absmax == 0.f ? 1.f : 127 / absmax; } - else + { - weight_scales[i][n] = 127 / absmax; + const float* weight_hc_ptr = lstm->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < lstm->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + lstm_stats[i].weight_hc_scales[d * lstm->hidden_size * 4 + q] = absmax == 0.f ? 1.f : 127 / absmax; } } } + } - if (layer->type == "ConvolutionDepthWise") - { - const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer; - - const int group = convolutiondepthwise->group; - const int weight_data_size_output = convolutiondepthwise->weight_data_size / group; + // initialize gru weight scales + for (int i = 0; i < gru_layer_count; i++) + { + const ncnn::Layer* layer = layers[gru_layers[i]]; + const ncnn::GRU* gru = (const ncnn::GRU*)layer; - std::vector scales; + const int num_directions = gru->direction == 2 ? 2 : 1; + const int size = gru->weight_data_size / num_directions / gru->num_output / 3; - weight_scales[i].create(group); + gru_stats[i].weight_xc_scales.create(gru->num_output * 3 * num_directions); + gru_stats[i].weight_hc_scales.create(gru->num_output * 3 * num_directions); - for (int n = 0; n < group; n++) + for (int d = 0; d < num_directions; d++) + { + for (int q = 0; q < gru->num_output * 3; q++) { - const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output); - - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + const float* weight_xc_ptr = gru->weight_xc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < size; j++) + { + absmax = std::max(absmax, (float)fabs(weight_xc_ptr[j])); + } + gru_stats[i].weight_xc_scales[d * gru->num_output * 3 + q] = absmax == 0.f ? 1.f : 127 / absmax; } - weight_scales[i][n] = 127 / absmax; + { + const float* weight_hc_ptr = gru->weight_hc_data.channel(d).row(q); + float absmax = 0.f; + for (int j = 0; j < gru->num_output; j++) + { + absmax = std::max(absmax, (float)fabs(weight_hc_ptr[j])); + } + gru_stats[i].weight_hc_scales[d * gru->num_output * 3 + q] = absmax == 0.f ? 1.f : 127 / absmax; + } } } + } +} + +static float compute_kl_divergence(const std::vector& a, const std::vector& b) +{ + const size_t length = a.size(); + + float result = 0; + for (size_t i = 0; i < length; i++) + { + result += a[i] * log(a[i] / b[i]); + } + + return result; +} - if (layer->type == "InnerProduct") +int QuantNet::quantize_KL() +{ + const int input_blob_count = (int)input_blobs.size(); + const int conv_layer_count = (int)conv_layers.size(); + const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + + if (use_calibration_dataset) + { + // initialize conv weight scales + #pragma omp parallel for num_threads(quantize_num_threads) + for (int i = 0; i < conv_layer_count; i++) { - const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer; + const ncnn::Layer* layer = layers[conv_layers[i]]; - const int num_output = innerproduct->num_output; - const int weight_data_size_output = innerproduct->weight_data_size / num_output; + if (layer->type == "Convolution") + { + const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer; + + const int num_output = convolution->num_output; + const int kernel_w = convolution->kernel_w; + const int kernel_h = convolution->kernel_h; + const int dilation_w = convolution->dilation_w; + const int dilation_h = convolution->dilation_h; + const int stride_w = convolution->stride_w; + const int stride_h = convolution->stride_h; + + const int weight_data_size_output = convolution->weight_data_size / num_output; + + // int8 winograd F43 needs weight data to use 6bit quantization + // TODO proper condition for winograd 3x3 int8 + bool quant_6bit = false; + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + quant_6bit = true; + + weight_scales[i].create(num_output); + + for (int n = 0; n < num_output; n++) + { + const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output); + + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } - weight_scales[i].create(num_output); + if (quant_6bit) + { + weight_scales[i][n] = 31 / absmax; + } + else + { + weight_scales[i][n] = 127 / absmax; + } + } + } - for (int n = 0; n < num_output; n++) + if (layer->type == "ConvolutionDepthWise") { - const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output); + const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer; - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) + const int group = convolutiondepthwise->group; + const int weight_data_size_output = convolutiondepthwise->weight_data_size / group; + + std::vector scales; + + weight_scales[i].create(group); + + for (int n = 0; n < group; n++) { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output); + + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } + + weight_scales[i][n] = 127 / absmax; } + } - weight_scales[i][n] = 127 / absmax; + if (layer->type == "InnerProduct") + { + const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer; + + const int num_output = innerproduct->num_output; + const int weight_data_size_output = innerproduct->weight_data_size / num_output; + + weight_scales[i].create(num_output); + + for (int n = 0; n < num_output; n++) + { + const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output); + + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } + + weight_scales[i][n] = 127 / absmax; + } } } } + initialize_static_weight_scales(); + + if (conv_layer_count == 0 || !use_calibration_dataset) + return 0; + + const int file_count = (int)listspaths[0].size(); + + const int num_histogram_bins = 2048; + + std::vector blob_allocators(quantize_num_threads); + std::vector workspace_allocators(quantize_num_threads); + // count the absmax #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1) for (int i = 0; i < file_count; i++) @@ -780,113 +1168,122 @@ int QuantNet::quantize_ACIQ() const int input_blob_count = (int)input_blobs.size(); const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); - const int file_count = (int)listspaths[0].size(); - std::vector blob_allocators(quantize_num_threads); - std::vector workspace_allocators(quantize_num_threads); - - // initialize conv weight scales - #pragma omp parallel for num_threads(quantize_num_threads) - for (int i = 0; i < conv_layer_count; i++) + if (use_calibration_dataset) { - const ncnn::Layer* layer = layers[conv_layers[i]]; - - if (layer->type == "Convolution") + // initialize conv weight scales + #pragma omp parallel for num_threads(quantize_num_threads) + for (int i = 0; i < conv_layer_count; i++) { - const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer; + const ncnn::Layer* layer = layers[conv_layers[i]]; - const int num_output = convolution->num_output; - const int kernel_w = convolution->kernel_w; - const int kernel_h = convolution->kernel_h; - const int dilation_w = convolution->dilation_w; - const int dilation_h = convolution->dilation_h; - const int stride_w = convolution->stride_w; - const int stride_h = convolution->stride_h; + if (layer->type == "Convolution") + { + const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer; - const int weight_data_size_output = convolution->weight_data_size / num_output; + const int num_output = convolution->num_output; + const int kernel_w = convolution->kernel_w; + const int kernel_h = convolution->kernel_h; + const int dilation_w = convolution->dilation_w; + const int dilation_h = convolution->dilation_h; + const int stride_w = convolution->stride_w; + const int stride_h = convolution->stride_h; - // int8 winograd F43 needs weight data to use 6bit quantization - // TODO proper condition for winograd 3x3 int8 - bool quant_6bit = false; - if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - quant_6bit = true; + const int weight_data_size_output = convolution->weight_data_size / num_output; - weight_scales[i].create(num_output); + // int8 winograd F43 needs weight data to use 6bit quantization + // TODO proper condition for winograd 3x3 int8 + bool quant_6bit = false; + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + quant_6bit = true; - for (int n = 0; n < num_output; n++) - { - const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output); + weight_scales[i].create(num_output); - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) + for (int n = 0; n < num_output; n++) { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); - } + const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output); - if (quant_6bit) - { - const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output, 6); - weight_scales[i][n] = 31 / threshold; - } - else - { - const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output); - weight_scales[i][n] = 127 / threshold; + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } + + if (quant_6bit) + { + const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output, 6); + weight_scales[i][n] = 31 / threshold; + } + else + { + const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output); + weight_scales[i][n] = 127 / threshold; + } } } - } - - if (layer->type == "ConvolutionDepthWise") - { - const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer; - const int group = convolutiondepthwise->group; - const int weight_data_size_output = convolutiondepthwise->weight_data_size / group; + if (layer->type == "ConvolutionDepthWise") + { + const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer; - std::vector scales; + const int group = convolutiondepthwise->group; + const int weight_data_size_output = convolutiondepthwise->weight_data_size / group; - weight_scales[i].create(group); + std::vector scales; - for (int n = 0; n < group; n++) - { - const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output); + weight_scales[i].create(group); - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) + for (int n = 0; n < group; n++) { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); - } + const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output); - const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output); - weight_scales[i][n] = 127 / threshold; - } - } + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } - if (layer->type == "InnerProduct") - { - const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer; + const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output); + weight_scales[i][n] = 127 / threshold; + } + } - const int num_output = innerproduct->num_output; - const int weight_data_size_output = innerproduct->weight_data_size / num_output; + if (layer->type == "InnerProduct") + { + const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer; - weight_scales[i].create(num_output); + const int num_output = innerproduct->num_output; + const int weight_data_size_output = innerproduct->weight_data_size / num_output; - for (int n = 0; n < num_output; n++) - { - const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output); + weight_scales[i].create(num_output); - float absmax = 0.f; - for (int k = 0; k < weight_data_size_output; k++) + for (int n = 0; n < num_output; n++) { - absmax = std::max(absmax, (float)fabs(weight_data_n[k])); - } + const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output); + + float absmax = 0.f; + for (int k = 0; k < weight_data_size_output; k++) + { + absmax = std::max(absmax, (float)fabs(weight_data_n[k])); + } - const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output); - weight_scales[i][n] = 127 / threshold; + const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output); + weight_scales[i][n] = 127 / threshold; + } } } } + initialize_static_weight_scales(); + + if (conv_layer_count == 0 || !use_calibration_dataset) + return 0; + + const int file_count = (int)listspaths[0].size(); + + std::vector blob_allocators(quantize_num_threads); + std::vector workspace_allocators(quantize_num_threads); + // count the absmax #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1) for (int i = 0; i < file_count; i++) @@ -1112,6 +1509,9 @@ int QuantNet::quantize_EQ() const int conv_layer_count = (int)conv_layers.size(); const int conv_bottom_blob_count = (int)conv_bottom_blobs.size(); + if (conv_layer_count == 0 || !use_calibration_dataset) + return 0; + std::vector blob_allocators(quantize_num_threads); std::vector workspace_allocators(quantize_num_threads); @@ -1661,6 +2061,7 @@ static void print_pixel_type_list(const std::vector& list) static void show_usage() { fprintf(stderr, "Usage: ncnn2table [ncnnparam] [ncnnbin] [list,...] [ncnntable] [(key=value)...]\n"); + fprintf(stderr, " ncnn2table [ncnnparam] [ncnnbin] [ncnntable] [(key=value)...]\n"); fprintf(stderr, " mean=[104.0,117.0,123.0],...\n"); fprintf(stderr, " norm=[1.0,1.0,1.0],...\n"); fprintf(stderr, " shape=[224,224,3],...[w,h,c] or [w,h] **[0,0] will not resize\n"); @@ -1671,11 +2072,12 @@ static void show_usage() fprintf(stderr, "Sample usage:\n"); fprintf(stderr, " ncnn2table squeezenet.param squeezenet.bin filelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n"); fprintf(stderr, " ncnn2table test.param test.bin filelist.txt squeezenet.table shape=[227,227,3] method=kl type=1\n"); + fprintf(stderr, " ncnn2table rnn.param rnn.bin rnn.table method=kl\n"); } int main(int argc, char** argv) { - if (argc < 5) + if (argc < 4) { show_usage(); return -1; @@ -1692,8 +2094,6 @@ int main(int argc, char** argv) const char* inparam = argv[1]; const char* inbin = argv[2]; - char* lists = argv[3]; - const char* outtable = argv[4]; ncnn::Option opt; opt.num_threads = 1; @@ -1709,13 +2109,36 @@ int main(int argc, char** argv) net.init(); - // load lists - net.listspaths = parse_comma_path_list(lists); + const char* outtable = 0; + int kv_start = 0; + + if (argc >= 5 && strchr(argv[4], '=')) + { + outtable = argv[3]; + kv_start = 4; + } + else if (argc >= 5) + { + net.listspaths = parse_comma_path_list(argv[3]); + net.use_calibration_dataset = true; + outtable = argv[4]; + kv_start = 5; + } + else + { + outtable = argv[3]; + kv_start = 4; + } + + if (!net.conv_layers.empty() && !net.use_calibration_dataset) + { + fprintf(stderr, "warning: calibration dataset not provided, skip activation calibration and generate weight-only table\n"); + } std::string method = "kl"; net.file_type = 0; - for (int i = 5; i < argc; i++) + for (int i = kv_start; i < argc; i++) { // key=value char* kv = argv[i]; @@ -1751,27 +2174,27 @@ int main(int argc, char** argv) // sanity check const size_t input_blob_count = net.input_blobs.size(); - if (net.listspaths.size() != input_blob_count) + if (net.use_calibration_dataset && net.listspaths.size() != input_blob_count) { fprintf(stderr, "expect %d lists, but got %d\n", (int)input_blob_count, (int)net.listspaths.size()); return -1; } - if ((0 == net.file_type) && (net.means.size() != input_blob_count)) + if (net.use_calibration_dataset && (0 == net.file_type) && (net.means.size() != input_blob_count)) { fprintf(stderr, "expect %d means, but got %d\n", (int)input_blob_count, (int)net.means.size()); return -1; } - if ((0 == net.file_type) && (net.norms.size() != input_blob_count)) + if (net.use_calibration_dataset && (0 == net.file_type) && (net.norms.size() != input_blob_count)) { fprintf(stderr, "expect %d norms, but got %d\n", (int)input_blob_count, (int)net.norms.size()); return -1; } - if (net.shapes.size() != input_blob_count) + if (net.use_calibration_dataset && net.shapes.size() != input_blob_count) { fprintf(stderr, "expect %d shapes, but got %d\n", (int)input_blob_count, (int)net.shapes.size()); return -1; } - if ((0 == net.file_type) && (net.type_to_pixels.size() != input_blob_count)) + if (net.use_calibration_dataset && (0 == net.file_type) && (net.type_to_pixels.size() != input_blob_count)) { fprintf(stderr, "expect %d pixels, but got %d\n", (int)input_blob_count, (int)net.type_to_pixels.size()); return -1;