olive-recipes/microsoft-Phi-4-mini-instruct/QNN/phi4_mini_qnn_docker.json at main · CodeLinaro/olive-recipes · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
{
  "input_model": {
    "type": "HfModel",
    "model_path": "microsoft/Phi-4-mini-instruct"
  },
  "systems": {
    "qnn_system": {
      "type": "PythonEnvironment",
      "python_environment_path": "/qnn-venv/bin",
      "accelerators": [
        {
          "execution_providers": [
            "QNNExecutionProvider"
          ]
        }
      ]
    },
    "docker_system": {
      "type": "Docker",
      "image_name": "olive",
      "build_context_path": "docker",
      "dockerfile": "Dockerfile",
      "run_params": {
        "shm_size": "128g"
      },
      "accelerators": [
        {
          "device": "gpu",
          "execution_providers": [
            "CUDAExecutionProvider"
          ]
        }
      ]
    }
  },
  "data_configs": [
    {
      "name": "wikitext2_train_joined",
      "type": "HuggingfaceContainer",
      "load_dataset_config": {
        "data_name": "wikitext",
        "subset": "wikitext-2-raw-v1",
        "split": "train"
      },
      "pre_process_data_config": {
        "strategy": "join",
        "add_special_tokens": false,
        "max_seq_len": 4096,
        "max_samples": 128
      }
    },
    {
      "name": "wikitext2_train_act",
      "type": "HuggingfaceContainer",
      "load_dataset_config": {
        "data_name": "wikitext",
        "subset": "wikitext-2-raw-v1",
        "split": "train"
      },
      "pre_process_data_config": {
        "strategy": "line-by-line",
        "add_special_tokens": true,
        "max_samples": 256,
        "max_seq_len": 4096
      }
    }
  ],
  "passes": {
    "q": {
      "type": "QuaRot"
    },
    "g": {
      "type": "GptqQuantizer",
      "sym": true,
      "group_size": -1,
      "desc_act": true,
      "data_config": "wikitext2_train_joined"
    },
    "cs": {
      "type": "CaptureSplitInfo",
      "num_splits": 4,
      "unique_embeds_lm_head_splits": true
    },
    "mb": {
      "type": "ModelBuilder",
      "precision": "int4",
      "int4_block_size": 32,
      "int4_accuracy_level": 4,
      "int4_op_types_to_quantize": [
        "MatMul",
        "Gather"
      ]
    },
    "mq": {
      "type": "MatMulNBitsToQDQ",
      "use_int4": true,
      "add_zero_point": true,
      "nodes_to_exclude": [
        "/lm_head/MatMul_Q4"
      ],
      "save_as_external_data": true
    },
    "gs": {
      "type": "GraphSurgeries",
      "surgeries": [
        {
          "surgeon": "RemoveRopeMultiCache"
        },
        {
          "surgeon": "AttentionMaskToSequenceLengths"
        },
        {
          "surgeon": "SimplifiedLayerNormToL2Norm"
        }
      ],
      "save_as_external_data": true
    },
    "sq": {
      "type": "OnnxStaticQuantization",
      "data_config": "wikitext2_train_act",
      "activation_type": "uint16",
      "precision": "uint8",
      "calibration_providers": [
        "CUDAExecutionProvider"
      ],
      "quant_preprocess": true,
      "op_types_to_exclude": [
        "GatherBlockQuantized",
        "GroupQueryAttention",
        "MatMulNBits"
      ],
      "save_as_external_data": true
    },
    "sp": {
      "type": "SplitModel"
    },
    "st": {
      "type": "StaticLLM",
      "batch_size": 1,
      "context_length": 64
    },
    "cb": {
      "type": "EPContextBinaryGenerator",
      "provider_options": {
        "htp_performance_mode": "burst",
        "htp_graph_finalization_optimization_mode": "3",
        "soc_model": "60"
      },
      "session_options": {
        "intra_op_num_threads": 2,
        "inter_op_num_threads": 1
      },
      "weight_sharing": true
    },
    "cp": {
      "type": "ComposeOnnxModels"
    }
  },
  "host": "docker_system",
  "target": "qnn_system",
  "log_severity_level": 0,
  "output_dir": "models/phi4-mini-qnn-docker",
  "cache_dir": "cache",
  "no_artifacts": true
}