|
| 1 | +{ |
| 2 | + "input_model": { "type": "HfModel", "model_path": "microsoft/Phi-4-reasoning" }, |
| 3 | + "systems": { |
| 4 | + "qnn_system": { |
| 5 | + "type": "PythonEnvironment", |
| 6 | + "python_environment_path": "/path/to/qnn/env/bin", |
| 7 | + "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] |
| 8 | + } |
| 9 | + }, |
| 10 | + "data_configs": [ |
| 11 | + { |
| 12 | + "name": "wikitext2_train_joined", |
| 13 | + "type": "HuggingfaceContainer", |
| 14 | + "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" }, |
| 15 | + "pre_process_data_config": { |
| 16 | + "strategy": "join", |
| 17 | + "add_special_tokens": false, |
| 18 | + "max_seq_len": 4096, |
| 19 | + "max_samples": 128 |
| 20 | + } |
| 21 | + }, |
| 22 | + { |
| 23 | + "name": "wikitext2_train_act", |
| 24 | + "type": "HuggingfaceContainer", |
| 25 | + "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" }, |
| 26 | + "pre_process_data_config": { |
| 27 | + "strategy": "line-by-line", |
| 28 | + "add_special_tokens": true, |
| 29 | + "max_samples": 256, |
| 30 | + "max_seq_len": 4096 |
| 31 | + } |
| 32 | + } |
| 33 | + ], |
| 34 | + "passes": { |
| 35 | + "q": { "type": "QuaRot" }, |
| 36 | + "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true }, |
| 37 | + "g": { |
| 38 | + "type": "GptqModel", |
| 39 | + "bits": 4, |
| 40 | + "sym": true, |
| 41 | + "group_size": -1, |
| 42 | + "lm_head": true, |
| 43 | + "device": "cuda", |
| 44 | + "data_config": "wikitext2_train_joined", |
| 45 | + "dynamic": { "+:.*lm_head*": { "bits": 4, "sym": true, "group_size": 128, "desc_act": false } } |
| 46 | + }, |
| 47 | + "mb": { |
| 48 | + "type": "ModelBuilder", |
| 49 | + "precision": "int4", |
| 50 | + "int4_block_size": 128, |
| 51 | + "int4_accuracy_level": 4, |
| 52 | + "int4_op_types_to_quantize": [ "Gather" ] |
| 53 | + }, |
| 54 | + "mq": { |
| 55 | + "type": "MatMulNBitsToQDQ", |
| 56 | + "use_int4": true, |
| 57 | + "add_zero_point": true, |
| 58 | + "nodes_to_exclude": [ "/lm_head/MatMulNBits" ], |
| 59 | + "save_as_external_data": true |
| 60 | + }, |
| 61 | + "gs": { |
| 62 | + "type": "GraphSurgeries", |
| 63 | + "surgeries": [ |
| 64 | + { "surgeon": "RemoveRopeMultiCache" }, |
| 65 | + { "surgeon": "AttentionMaskToSequenceLengths" }, |
| 66 | + { "surgeon": "RemoveGidxFromMatMulNBits" }, |
| 67 | + { "surgeon": "SimplifiedLayerNormToL2Norm" } |
| 68 | + ], |
| 69 | + "save_as_external_data": true |
| 70 | + }, |
| 71 | + "f16": { |
| 72 | + "type": "OnnxFloatToFloat16", |
| 73 | + "op_include_list": [ "GroupQueryAttention" ], |
| 74 | + "keep_io_types": [ "logits" ], |
| 75 | + "save_as_external_data": true |
| 76 | + }, |
| 77 | + "sq": { |
| 78 | + "type": "OnnxStaticQuantization", |
| 79 | + "data_config": "wikitext2_train_act", |
| 80 | + "activation_type": "uint16", |
| 81 | + "precision": "uint8", |
| 82 | + "calibration_providers": [ "CUDAExecutionProvider" ], |
| 83 | + "quant_preprocess": true, |
| 84 | + "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ], |
| 85 | + "save_as_external_data": true, |
| 86 | + "extra_options": { "CalibStridedMinMax": 1 } |
| 87 | + }, |
| 88 | + "sp": { "type": "SplitModel" }, |
| 89 | + "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 }, |
| 90 | + "cb": { |
| 91 | + "type": "EPContextBinaryGenerator", |
| 92 | + "provider_options": { |
| 93 | + "htp_performance_mode": "burst", |
| 94 | + "htp_graph_finalization_optimization_mode": "3", |
| 95 | + "soc_model": "60" |
| 96 | + }, |
| 97 | + "weight_sharing": true |
| 98 | + }, |
| 99 | + "cp": { "type": "ComposeOnnxModels" } |
| 100 | + }, |
| 101 | + "target": "qnn_system", |
| 102 | + "log_severity_level": 1, |
| 103 | + "output_dir": "models/phi4_reasoning", |
| 104 | + "cache_dir": "cache", |
| 105 | + "no_artifacts": true |
| 106 | +} |
0 commit comments