-
Notifications
You must be signed in to change notification settings - Fork 362
Expand file tree
/
Copy pathgraph_utils.py
More file actions
executable file
·2074 lines (1751 loc) · 81.5 KB
/
graph_utils.py
File metadata and controls
executable file
·2074 lines (1751 loc) · 81.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Provides ONNX graph related utils for QDQ placement."""
import re
from collections import defaultdict
from functools import reduce
from typing import Any, cast
import numpy as np
import onnx
import onnx_graphsurgeon as gs
from onnx_graphsurgeon.ir.graph import Graph
from onnx_graphsurgeon.ir.node import Node
from onnx_graphsurgeon.ir.tensor import Constant, Tensor, Variable
from onnxruntime.quantization.calibrate import CalibrationDataReader
from modelopt.onnx.logging_config import logger
from modelopt.onnx.op_types import get_copy_ops, is_copy_op, is_linear_op
from modelopt.onnx.quantization.ort_utils import create_inference_session
from modelopt.onnx.utils import (
find_lowest_common_ancestor,
get_child_nodes,
get_parent_nodes,
infer_shapes,
parse_shapes_spec,
save_onnx,
)
DEFAULT_GATHER_BLOCK_SIZE = 32
DEFAULT_GATHER_QUANTIZE_AXIS = None
def is_const_input(tensor: Tensor) -> bool:
    """Returns whether the given tensor is an initializer or produced by const-foldable nodes."""
    if isinstance(tensor, Constant):
        return True
    if not tensor.inputs:
        # No producer means this is a graph input variable, hence not constant.
        return False
    producer = tensor.inputs[0]  # Tensors generally have a single producer
    if producer.op in ("Constant", "Identity"):
        return True
    # For Squeeze/Unsqueeze the second (axes) input is a constant,
    # so constness is decided by the first (data) input.
    if producer.op in ("Squeeze", "Unsqueeze"):
        return is_const_input(producer.inputs[0])
    # Const -> Clip -> Exp -> Mul pattern matching for swin_v2
    if producer.op == "Exp":
        upstream = producer.i()
        if upstream.op == "Clip" and has_const_input(upstream):
            return True
    return False
def has_const_input(node: Node) -> bool:
    """Returns whether the given node has any constant input."""
    for input_tensor in node.inputs:
        if is_const_input(input_tensor):
            return True
    return False
def get_input_shapes(onnx_path: str) -> dict[str, list[int]]:
    """Returns the input shapes of the given ONNX model."""
    model = onnx.load(onnx_path)
    # Map each graph input name to the list of its dim_value entries.
    return {
        graph_input.name: [dim.dim_value for dim in graph_input.type.tensor_type.shape.dim]
        for graph_input in model.graph.input
    }
def has_path_type(
    node: Node,
    graph: Graph,
    path_type: list[str],
    is_forward: bool,
    wild_card_types: list[str] | None = None,
    path_nodes: list[Node] | None = None,
) -> bool:
    """Checks if the given node is start/end of a given forward/backward path type.

    Note, Path can be forward or backward wrt a node depending on the next level nodes.
    Additionally, this method can work with optional nodes and collect the traversed path.

    Args:
        node: Start node of the path.
        graph: ONNX model graph.
        path_type: Path types to match from the given node.
        is_forward: Whether to match forward or backward path.
        wild_card_types: Wild card types, these type of nodes are skipped and not matched
            with the path_type. Defaults to no wild cards.
        path_nodes: Accumulated nodes in the matched path. A fresh list is created per
            call when omitted.

    Returns:
        Bool, whether the given node is start/end of the given forward/backward path type.
    """
    # Fix for the mutable-default-argument pitfall: the previous `= []` defaults were
    # shared across calls, so `path_nodes` silently accumulated matches between
    # unrelated invocations.
    if wild_card_types is None:
        wild_card_types = []
    if path_nodes is None:
        path_nodes = []
    optional_path_types = ["BiasAdd", "ConstMul"]
    if not path_type:
        # All types matched
        return True
    # Current node type and special type conversion for optional BiasAdd and ConstMul
    # Note, matching path with Add/Mul type nodes with const input will fail
    node_type = node.op
    if node_type == "Add" and has_const_input(node):
        node_type = "BiasAdd"
    elif node_type == "Mul" and has_const_input(node):
        node_type = "ConstMul"
    # Special type conversion from NonBiasAdd to Add if all Add inputs are non-constant.
    # Use a local expected type instead of writing back into `path_type`, which would
    # mutate the caller's list and corrupt later searches reusing the same path spec.
    expected_type = path_type[0]
    if node_type == "Add" and expected_type == "NonBiasAdd":
        expected_type = "Add"
    # Check if current non-wild node type does not match the expected path type
    # And if path type is not optional (ex. BiasAdd)
    is_match = (node_type == expected_type) or (node.op == expected_type)
    is_wild_match = node_type in wild_card_types
    if not is_match and not is_wild_match and (expected_type not in optional_path_types):
        return False
    # Add current node name in the path
    if is_match:
        path_nodes.append(node)
    # If current node type matches the expected path type or path type is optional (ex. BiasAdd),
    # we have a type match. Update the remaining path types to match.
    next_path_type = path_type[:]
    # Non-repeatable optional types should be consumed
    if is_match or (expected_type in optional_path_types):
        next_path_type = path_type[1:]
    # If current node is not wild card and didn't match, go ahead and match with the
    # remaining path types starting with the current node
    if not is_match and not is_wild_match:
        assert expected_type in optional_path_types
        return has_path_type(
            node,
            graph,
            next_path_type,
            is_forward,
            wild_card_types,
            path_nodes,
        )
    next_level_nodes = get_child_nodes(node) if is_forward else get_parent_nodes(node)
    # Check if any child (forward path) or parent (backward path) can match the remaining path types
    for next_node in next_level_nodes:
        sub_path = []
        if has_path_type(next_node, graph, next_path_type, is_forward, wild_card_types, sub_path):
            path_nodes.extend(sub_path)
            return True
    # Path type matches if there is no remaining types to match
    return not next_path_type
def get_fusible_backbone(node: Node, graph: Graph) -> Node | None:
    """Returns the linear backbone node for a given node if it matches the pattern.

    TensorRT fuses convolution with BN, Relu, MaxPool etc. when in some specific pattern.
    This rule tries to match some of those patterns.

    Note. BiasAdd and ConstMul are optional in path types.

    Args:
        node: Start node of the pattern.
        graph: ONNX model graph.

    Returns:
        Backbone node of the given node, None if not found.
    """

    def _get_backbone(root: Node) -> Node | None:
        # Walk backward through non-constant producers until a Conv/ConvTranspose is found.
        if root.op in ["Conv", "ConvTranspose"]:
            return root
        for tensor in root.inputs:
            if not isinstance(tensor, Constant) and tensor.inputs:
                backbone = _get_backbone(tensor.inputs[0])
                if backbone:
                    return backbone
        # Explicit (previously implicit) miss: no conv backbone behind this node.
        return None

    # Candidate fusion patterns, matched backward from `node` toward the conv.
    fusible_linear_path_types = []
    for conv_type in ["Conv", "ConvTranspose"]:
        fusible_linear_path_types += [
            ["BiasAdd", "ConstMul", conv_type],
            ["Relu", "BiasAdd", "ConstMul", conv_type],
            ["BatchNormalization", "BiasAdd", conv_type],
            ["Relu", "BatchNormalization", "BiasAdd", conv_type],
            ["MaxPool", "Relu", "BatchNormalization", "BiasAdd", conv_type],
            ["Mul", "Sigmoid", "BatchNormalization", conv_type],
        ]
    # The original enumerate() index was unused; iterate the patterns directly.
    for path_type in fusible_linear_path_types:
        if has_path_type(node, graph, path_type, is_forward=False, wild_card_types=get_copy_ops()):
            return _get_backbone(node)
    return None
def get_tensor_from_name(graph: onnx.GraphProto, tensor_name: str) -> onnx.ValueInfoProto | None:
    """Returns a ValueInfoProto given a tensor name.

    Args:
        graph: ONNX model graph
        tensor_name: String with tensor name.

    Returns:
        onnx.ValueInfoProto: actual graph tensor.
    """
    # Search order: graph inputs, then outputs, then intermediate value_info entries.
    for collection in (graph.input, graph.output, graph.value_info):
        for value_info in collection:
            if value_info.name == tensor_name:
                return value_info
    return None
def get_tensor_producer_nodes(
    graph: onnx.GraphProto,
    get_initializer_producers: bool = False,
) -> dict[str, onnx.NodeProto]:
    """Returns a dictionary of tensor name and their producer node object mapping.

    Note. we create a special Root type node as external inputs producer for ease of implementation.

    Args:
        graph: ONNX model graph.
        get_initializer_producers: If True, additionally map each initializer name to the
            initializer object itself as its "producer".

    Returns:
        Dictionary, key is tensor name and value is their producer node object
    """
    # A plain dict is used here: the original `defaultdict(None)` had no default
    # factory and therefore behaved exactly like a regular dict anyway.
    tensor_producers: dict[str, onnx.NodeProto] = {}
    # Special Root type producer node
    root_node = onnx.helper.make_node(
        op_type="Root",
        inputs=[],
        outputs=[i.name for i in graph.input],
        name="root_0",
    )
    input_names = {graph_input.name for graph_input in graph.input}
    initializer_names = {initializer.name for initializer in graph.initializer}
    # Sorted set difference: equivalent to the original np.setdiff1d call
    # (which also returns sorted unique values) without the numpy round-trip.
    external_input_names = sorted(input_names - initializer_names)
    # Note. We are marking external inputs as non-constant by adding a parent,
    # so that we can quantize the first node of the graph if appropriate
    for graph_input in external_input_names:
        tensor_producers[graph_input] = root_node
    # Traverse the graph to find producer nodes for each tensor
    for node in graph.node:
        for output_name in node.output:
            tensor_producers[output_name] = node
    if get_initializer_producers:
        for initializer in graph.initializer:
            tensor_producers[initializer.name] = initializer
    return tensor_producers
def get_tensor_consumer_nodes(
    graph: onnx.GraphProto,
) -> dict[str, list[onnx.NodeProto]]:
    """Returns a dictionary of tensor name and their consumer node object mapping.

    Args:
        graph: ONNX model graph.

    Returns:
        Dictionary, key is tensor name and value is their consumer node object
    """
    # Group every node under each tensor name it reads.
    consumers_by_tensor: dict[str, list[onnx.NodeProto]] = defaultdict(list)
    for graph_node in graph.node:
        for tensor_name in graph_node.input:
            consumers_by_tensor[tensor_name].append(graph_node)
    return consumers_by_tensor
def get_tensor_consumer_node_indices(graph: onnx.GraphProto | gs.Graph) -> dict[str, list[int]]:
    """Build a mapping from tensor names to the indices of nodes that use them.

    Args:
        graph: ONNX GraphSurgeon graph to analyze

    Returns:
        Dictionary mapping tensor names to lists of node indices that consume them
    """

    def _key_of(tensor):
        # Inputs may be plain strings (onnx protobuf) or tensor objects (graphsurgeon);
        # anything else falls through and is used as the key directly.
        if isinstance(tensor, str):
            return tensor
        if hasattr(tensor, "name") and isinstance(tensor.name, str):
            return tensor.name
        return tensor

    consumer_map: dict[str, list[int]] = defaultdict(list)
    node_list = graph.nodes if isinstance(graph, gs.Graph) else graph.node
    for node_idx, graph_node in enumerate(node_list):
        node_inputs = graph_node.inputs if isinstance(graph_node, gs.Node) else graph_node.input
        for tensor in node_inputs:
            consumer_map[_key_of(tensor)].append(node_idx)
    return consumer_map
def _is_following_cask_partition(
    node: Node, cask_partition_nodes: set[str], max_depth: int = 10
) -> bool:
    """Check if a CASK fusible partition can be reached by traversing backward through copy ops.

    Args:
        node: The node to check.
        cask_partition_nodes: Set of node names belonging to CASK partitions.
        max_depth: Maximum recursion depth to guard against pathological graphs.

    Returns:
        True if the node belongs to or follows a CASK partition through copy ops.
    """
    # Direct member of a CASK partition.
    if node.name in cask_partition_nodes:
        return True
    # Only copy ops are transparent for this backward search; the search is depth-bounded.
    if max_depth <= 0 or not is_copy_op(node.op):
        return False
    parents = get_parent_nodes(node)
    if not parents:
        return False
    # Every incoming branch must lead back to a CASK partition.
    return all(
        _is_following_cask_partition(parent, cask_partition_nodes, max_depth - 1)
        for parent in parents
    )
def find_conv_to_layernorm_nodes(
    graph: Graph,
    cask_fusible_partitions: list[list[Node]],
) -> list[Node]:
    """Find LayerNormalization nodes whose input comes from a CASK (Conv) partition.

    When a Conv's output feeds into a LayerNormalization, the Conv output should be
    quantized to enable faster INT8 kernels in TRT. This function detects such patterns
    and returns the LayerNormalization nodes that should be added to the quantizable
    nodes list so that Q/DQ pairs are inserted on their input (i.e. the Conv output).

    Args:
        graph: ONNX model graph.
        cask_fusible_partitions: List of CASK fusible partitions.

    Returns:
        List of LayerNormalization nodes that consume CASK partition outputs.
    """
    # Flatten partition membership into a name set for O(1) lookups.
    cask_partition_nodes: set[str] = {
        member.name for partition in cask_fusible_partitions for member in partition
    }
    conv_to_ln_nodes = []
    for ln_node in graph.nodes:
        if ln_node.op != "LayerNormalization":
            continue
        # The first input is the activation; trace its producer (possibly through
        # copy ops such as Reshape/Transpose) back to a CASK partition.
        activation = ln_node.inputs[0]
        if not activation.inputs:
            continue
        if _is_following_cask_partition(activation.inputs[0], cask_partition_nodes):
            conv_to_ln_nodes.append(ln_node)
            logger.debug(
                f"Found Conv->LayerNorm pattern: LayerNorm node '{ln_node.name}' "
                f"consumes CASK partition output"
            )
    logger.info(f"Found {len(conv_to_ln_nodes)} Conv->LayerNorm patterns to quantize")
    return conv_to_ln_nodes
def filter_quantizable_kgen_heads(
    cask_fusible_partitions: list[list[Node]],
    kgen_partitions: list[list[Node]],
    quantizable_op_types: list[str],
    graph: Graph,
) -> tuple[list[Node], list[tuple[Node, Node, str]]]:
    """Returns the list of kgen head names if it follows a CASK partition.

    Args:
        cask_fusible_partitions: Partitions fused by CASK kernels.
        kgen_partitions: KGEN partitions whose head nodes are quantization candidates.
        quantizable_op_types: Op types allowed to be quantized.
        graph: ONNX model graph.

    Returns:
        List of quantizable kgen head nodes.
        List of non-quantizable input edges as (src_node, dst_node, input_name) tuples.
    """
    cask_partition_nodes: set[str] = set()
    for partition in cask_fusible_partitions:
        cask_partition_nodes.update(node.name for node in partition)
    cask_partition_heads = [partition[0] for partition in cask_fusible_partitions]

    def _is_mha_epilogue_pattern(node: Node, graph: Graph):
        # Bug fix: the original tested the enclosing loop's `head_node` here instead of
        # the `node` parameter, silently coupling this helper to the outer loop state.
        if node.op != "Add":
            return False
        # Below are valid patterns:
        # (1)
        # Add -> Softmax -> MatMul
        #
        # (2)
        # Add -> Flatten -> Softmax -> Reshape -> MatMul
        #   \----------Shape-----/
        #
        mha_epilogue_path = ["Softmax", "MatMul"]
        wild_card_types = ["Flatten", "Reshape"]
        add_children = get_child_nodes(node)
        for child in add_children:
            if has_path_type(
                child,
                graph,
                mha_epilogue_path,
                is_forward=True,
                wild_card_types=wild_card_types,
            ):
                return True
        return False

    def _has_other_quantizable_consumer(
        tensor: Tensor, quantizable_kgen_heads: list[Node], head_name: str
    ):
        # Note. this is kinda approximate analysis,
        # all quantizable kgen heads may haven't got discovered yet
        quantizable_ops = [node.name for node in cask_partition_heads + quantizable_kgen_heads]
        # Look for other quantizable consumer than the current kgen head
        if head_name in quantizable_ops:
            quantizable_ops.remove(head_name)
        return any(consumer.name in quantizable_ops for consumer in tensor.outputs)

    quantizable_kgen_heads = []
    no_quantize_inputs = []  # list of tuple [(src_node_name, dst_node_name, input_name), ...]
    # A head consuming the output of one of these op types is also quantizable.
    output_quantization_candidates = [
        "AveragePool",
        "BatchNormalization",
        "GlobalAveragePool",
        "MaxPool",
        "Mul",  # Example: VoVNet
    ]
    for partition in kgen_partitions:
        head_node = partition[0]
        # Check if partition head is of default quantizable type
        if head_node.op not in quantizable_op_types:
            continue
        # If the node has const input, do not quantize
        if has_const_input(head_node):
            continue
        head_parents = get_parent_nodes(head_node)
        no_quantize_inputs_of_head = []
        has_quantizable_input = False
        # Check each of the parent (input producer for partition head)
        # or predecessor nodes and see if output quantization is needed for them
        # and decide which input of kgen head needs quantization
        for parent in head_parents:
            # If the head is consuming output of any quantizable op, then it is quantizable
            if (
                _is_following_cask_partition(parent, cask_partition_nodes)
                or parent.op in output_quantization_candidates
            ):
                # The mask add of MHA should not be quantized
                if _is_mha_epilogue_pattern(head_node, graph):
                    no_quantize_inputs_of_head.append(
                        (parent, partition[0], parent.outputs[0].name)
                    )
                else:
                    quantizable_kgen_heads.append(partition[0])
                    has_quantizable_input = True
            # If the input from the current parent has no other quantizable consumer, do not quantize that input
            elif not _has_other_quantizable_consumer(
                parent.outputs[0], quantizable_kgen_heads, head_node.name
            ):
                no_quantize_inputs_of_head.append((parent, partition[0], parent.outputs[0].name))
        # If at least one input of Add is quantizable, collect if there is any non-quantizable inputs
        if head_node.op == "Add" and has_quantizable_input:
            no_quantize_inputs.extend(no_quantize_inputs_of_head)
    return quantizable_kgen_heads, no_quantize_inputs
def classify_partition_nodes(
    partitions: list[list[Node]],
) -> tuple[list[Node], list[Node], list[tuple[Node, Node, str]]]:
    """We should partially quantize the partition nodes with inputs outside of the partition.

    Args:
        partitions: Partitions created by modelopt ptq algo.

    Returns:
        List of non-quantizable nodes.
        List of quantizable nodes.
        List of partially-quantizable inputs with non-quantizable input info as (src, dst, input_name)
    """
    non_quantizable_partition_nodes = []  # list of Node [node1, ...]
    quantizable_partition_nodes = []  # list of Node [node1, ...]
    no_quantize_inputs = []  # list of tuple [(src_node, dst_node, input_name), ...]
    for partition in partitions:
        assert is_linear_op(partition[0].op)
        # All tensor names produced inside this partition
        partition_node_outputs = [
            output.name for member in partition for output in member.outputs
        ]
        for node in partition:
            has_external_inputs = False
            internal_inputs = []  # Keeps (producer, consumer, tensor)
            for tensor in node.inputs:
                if is_const_input(tensor):
                    continue
                if tensor.name not in partition_node_outputs:
                    # A KGEN op with external non-constant input is considered partially
                    # quantizable; partition heads will be fully quantizable and added
                    has_external_inputs = True
                else:
                    # format: source, target, input
                    # Note. it might happen that this node was not quantized
                    # We just ignore it from no_quantize_inputs list in post-processing
                    internal_inputs.append((tensor.inputs[0], node, tensor.name))
            if not has_external_inputs:
                non_quantizable_partition_nodes.append(node)
            elif internal_inputs:
                no_quantize_inputs.extend(internal_inputs)
            else:
                # partition head is quantizable
                quantizable_partition_nodes.append(node)
    return non_quantizable_partition_nodes, quantizable_partition_nodes, no_quantize_inputs
def classify_partially_quantized_weighted_ops(
    graph: Graph, nodes_to_exclude: list[str]
) -> list[tuple[Node, Node, str]]:
    """Ensures that the input of non-quantizable weighted nodes do not get quantized."""
    excluded_names = set(nodes_to_exclude)
    no_quantize_inputs = []
    for node in graph.nodes:
        # Only excluded linear (weighted) ops are of interest here.
        if node.name not in excluded_names or not is_linear_op(node.op):
            continue
        for tensor in node.inputs:
            if tensor.inputs:
                # (producer, consumer, tensor-name) marks an edge that must stay unquantized.
                no_quantize_inputs.append((tensor.inputs[0], node, tensor.name))
    return no_quantize_inputs
def build_non_residual_input_map(
    graph: Graph,
) -> tuple[dict[str, str], list[tuple[Node, Node, str]]]:
    """Builds a map of non-residual Add input name to the Add node name from the given graph.

    This assumes that the Add layer only has 2 inputs.

    We will refer to a subgraph which has a Convolution node with a single output that is summed (element-wise)
    with another non-constant input-tensor as a "residual-add" subgraph, because it occurs in modern
    convnets that use residual connections.

    Args:
        graph: Onnx model graph.

    Returns:
        Dictionary of Add node names vs their non-residual input name.
        List of partially-quantizable inputs with non-quantizable input info as (src, dst, input_name)
    """
    non_residual_inputs = {}
    no_quantize_inputs = []
    for node in graph.nodes:
        if node.op in ["Add"]:
            # Add nodes with constant or graph input does not have non-residual input
            # Here, A = node.inputs[0], B = node.inputs[1] and A.inputs means producer nodes of A
            # TODO: make this check a util?
            if (
                has_const_input(node)
                or len(node.inputs[0].inputs) == 0
                or len(node.inputs[1].inputs) == 0
            ):
                non_residual_inputs[node.name] = None
                continue
            # Producers of the two Add inputs (node.i(tensor_idx, producer_idx)).
            input1_producer = node.i(0, 0)
            input2_producer = node.i(1, 0)
            # A fusible Conv/ConvTranspose backbone behind an input marks that side as
            # part of a conv chain rather than the residual skip connection.
            backbone1 = get_fusible_backbone(input1_producer, graph)
            backbone2 = get_fusible_backbone(input2_producer, graph)
            # Input in the longest path to LCA is the non-residual input
            lca, d1, d2 = find_lowest_common_ancestor(input1_producer, input2_producer)
            # Generally if both the inputs have a backbone then both backbones are of the same type
            if backbone1 and backbone2:
                if backbone1 == backbone2:
                    # Both inputs trace back to the same conv: not a residual pattern.
                    non_residual_inputs[node.name] = None
                    continue
                # The side farther from the LCA is the conv chain; mark it non-residual
                # and record its edge so it is not quantized later.
                if d1 > d2:
                    non_residual_inputs[node.name] = node.inputs[0].name
                    no_quantize_inputs.append((input1_producer, node, node.inputs[0].name))
                else:
                    non_residual_inputs[node.name] = node.inputs[1].name
                    no_quantize_inputs.append((input2_producer, node, node.inputs[1].name))
            elif backbone1:
                # ConvNext pattern
                # Conv ---------------------- add
                #    \---- non backbone---/
                # This case LCA being backbone itself is not residual Add case.
                if lca and lca == backbone1.name:
                    # Not a residual Add node
                    non_residual_inputs[node.name] = None
                else:
                    non_residual_inputs[node.name] = node.inputs[0].name
                    no_quantize_inputs.append((input1_producer, node, node.inputs[0].name))
            elif backbone2:
                # Mirror of the backbone1-only case for the second input.
                if lca and lca == backbone2.name:
                    # Not a residual Add node
                    non_residual_inputs[node.name] = None
                else:
                    non_residual_inputs[node.name] = node.inputs[1].name
                    no_quantize_inputs.append((input2_producer, node, node.inputs[1].name))
            else:
                # Not a residual Add node
                non_residual_inputs[node.name] = None
    return non_residual_inputs, no_quantize_inputs
def remove_partial_input_qdq(
    graph: Graph,
    no_quantize_inputs: list[tuple[Node, Node, str]],
) -> None:
    """Modifies the onnx model by removing QDQ nodes from the marked inputs, ex. non-residual inputs etc.

    Args:
        graph: Onnx model graph.
        no_quantize_inputs: List non-quantizable input info as (src, dst, input_name)
    """
    logger.info("Deleting QDQ nodes from marked inputs to make certain operations fusible")
    graph_nodes = {node.name: node for node in graph.nodes}
    for source, target, non_qdq_input_name in no_quantize_inputs:
        # Note. no_quantize_inputs objects are from non-quantized input graph
        # we are deleting some QDQ from the new quantized output graph
        source_node = graph_nodes[source.name]
        try:
            dq_node = source_node.o().o()
        except Exception:
            # Reached end of the graph
            continue
        if dq_node.op == "DequantizeLinear":
            dq_output = dq_node.outputs[0]  # source_node->Q->DQ->target_node
            # Look up the specific target node in the quantized graph.
            # With DedicatedQDQPair=False, a shared Q/DQ pair may feed multiple consumers
            # (e.g. Conv activation AND Add residual). Always patch the intended target
            # rather than the first consumer of the DQ output to avoid removing Q/DQ from
            # the wrong branch.
            target_node_in_graph = graph_nodes.get(target.name)
            if target_node_in_graph is None:
                continue
            # Find the input index in the target that is connected to the DQ output
            target_input_idx_arr = [
                idx
                for idx, inp in enumerate(target_node_in_graph.inputs)
                if inp.name == dq_output.name
            ]
            # If no input index is found (dq_output is not actually connected to target node), skip rewiring to
            # prevent silent corruption of the graph.
            if not target_input_idx_arr:
                logger.warning(
                    "Expected DequantizeLinear output '%s' to be an input of node '%s', "
                    "but no matching input was found. Skipping Q/DQ bypass for this edge.",
                    dq_output.name,
                    target_node_in_graph.name,
                )
                continue
            target_input_idx = target_input_idx_arr[0]
            # Connect the target's input directly to source_node's output (bypass Q/DQ)
            target_node_in_graph.inputs[target_input_idx] = source_node.outputs[0]
    # Check for quantized residual Adds where the parallel branch is not being quantized
    for source, target, non_qdq_input_name in no_quantize_inputs:
        if target.op != "Add":
            continue
        target_node = graph_nodes[target.name]
        for inp_idx, inp in enumerate(target_node.inputs):
            # Bug fix: guard against inputs with no producer (graph inputs / constants);
            # the original indexed inp.inputs[0] unconditionally and could raise IndexError.
            if inp.inputs and inp.inputs[0].op == "DequantizeLinear":
                try:
                    parent_node = inp.inputs[0].i().i()
                except Exception:
                    # Reached beginning of the graph
                    continue
                quant_out_count = [
                    out_idx
                    for out_idx, out in enumerate(parent_node.outputs)
                    # Bug fix: also guard out.outputs before indexing (dangling outputs).
                    if out.outputs and out.outputs[0].op == "QuantizeLinear"
                ]
                non_quant_out_count = [
                    out
                    for out in parent_node.outputs
                    for _, _, non_qdq_inp_name in no_quantize_inputs
                    if out.name == non_qdq_inp_name
                ]
                # Bypass QDQ nodes if only one branch is quantized and the parallel branch should not be quantized
                if len(quant_out_count) == 1 and non_quant_out_count:
                    target_node.inputs[inp_idx] = parent_node.outputs[quant_out_count[0]]
    graph.cleanup()
    graph.toposort()
def _find_nodes_from_op_types_to_exclude(graph: Graph, op_types_to_exclude=None) -> list[str]:
nodes_to_exclude = []
if op_types_to_exclude:
nodes_to_exclude = [node.name for node in graph.nodes if node.op in op_types_to_exclude]
return nodes_to_exclude
def _find_int4_quantizable_weights(
    graph: onnx.GraphProto,
    nodes_to_exclude: list[str],
) -> list[tuple[onnx.ValueInfoProto, onnx.ValueInfoProto, bool, int, str]]:
    """Finds the int4 quantizable weights from the graph.

    Returns:
        list of tuples: (act_tensor, weight_tensor, do_transpose, gemm_io_type, node_name)
    """
    initializer_idxs = {init.name: pos for pos, init in enumerate(graph.initializer)}
    wa_pack = []
    for gemm in graph.node:
        # Only non-excluded Gemm/MatMul nodes are candidates.
        if gemm.op_type not in ("Gemm", "MatMul") or gemm.name in nodes_to_exclude:
            continue
        if gemm.input[0] in initializer_idxs:
            # Ex. two const input to MatMul_115 in fastvit0.onnx
            # Note. RTN algorithm will quantize these weights though
            continue
        if gemm.input[1] not in initializer_idxs:
            # Weight must be a graph initializer to be handled here.
            continue
        weight_tensor = graph.initializer[initializer_idxs[gemm.input[1]]]
        if len(weight_tensor.dims) == 1:  # 1D blocked quantization not supported
            continue
        gemm_io_type = cast("int", weight_tensor.data_type)
        act_tensor = onnx.helper.ValueInfoProto()
        act_tensor.name = gemm.input[0]
        # TODO: support transA by transposing activation tensors in _clip_search
        do_transpose = gemm.op_type == "Gemm" and any(
            attr.name == "transB" and attr.i > 0 for attr in gemm.attribute
        )
        # Include node name for proper matching with layers_8bit_set
        wa_pack.append((act_tensor, weight_tensor, do_transpose, gemm_io_type, gemm.name))
    return wa_pack
def should_quantize_to_8bit(layer_name: str, layers_8bit: list[str]):
    """Check if layer should be quantized to 8 bits.

    The layers_8bit list contains ONNX node names like '/model/layers.13/attn/qkv_proj/MatMul'.
    The layer_name argument is an ONNX initializer name like 'model.layers.13.attn.qkv_proj.MatMul.weight'.

    To match these, we:
    - Remove the leading slash from the node name.
    - Replace all '/' with '.' to match the naming convention of the initializer.

    This allows us to correctly identify which weights should be quantized to 8 bits.
    """
    if not layers_8bit:
        return False

    def _tokenize(name: str) -> list[str]:
        # Normalize to dot-delimited tokens so node names and initializer names compare equal.
        return name.lstrip("/").replace("/", ".").split(".")

    haystack = _tokenize(layer_name)
    for pattern in layers_8bit:
        needle = _tokenize(pattern)
        window = len(needle)
        # Require the pattern tokens to appear as an exact contiguous token run.
        if any(
            haystack[start : start + window] == needle
            for start in range(len(haystack) - window + 1)
        ):
            return True
    return False
def validate_8bit_layers(layers_str: str) -> bool:
    """Validate the format of layers_8bit string."""
    # An empty/None spec is trivially valid.
    if not layers_str:
        return True
    # Comma-separated list of path-like tokens (slashes, alphanumerics, '_', '.', '-').
    token = r"[/a-zA-Z0-9_.\-]+"
    full_pattern = rf"^\s*{token}(\s*,\s*{token})*\s*$"
    return re.match(full_pattern, layers_str) is not None
def get_layer_precision_mapping(
    onnx_model: onnx.ModelProto,
    precision_pattern_8bit: str | None = None,
    nodes_to_exclude: list[str] | None = None,
    block_size: int = 128,
    quantize_axis: int = 0,
):
    """Generate a mapping of layer names to their quantization precision (4 bits or 8 bits) for an ONNX model.

    Args:
        onnx_model (onnx.ModelProto): The ONNX model to analyze.
        precision_pattern_8bit (str, optional): Comma-separated string of layer patterns to quantize to 8 bits.
            If None, a default set of patterns is used to select layers for 8 bits quantization.
        nodes_to_exclude (list[str], optional): List of node name patterns to exclude from quantization.
            Defaults to [r"/lm_head"].
        block_size (int): Block size used for 4-bit blocked quantization.
        quantize_axis (int): Quantization axis used for 4-bit quantization.

    Returns:
        dict: A mapping from weight tensor names to their quantization config, e.g.
            {"weight_name": {"precision": 8, "block_size": -1, "axis": 0}}.

    Raises:
        ValueError: If precision_pattern_8bit is not a valid comma-separated layer list.
    """
    if nodes_to_exclude is None:
        # Avoid a mutable default argument; None selects the documented default exclusion.
        nodes_to_exclude = [r"/lm_head"]
    graph = onnx_model.graph
    nodes_to_exclude = expand_node_names_from_patterns(graph, nodes_to_exclude)

    # Collect quantizable weight tensors: (act_tensor, weight_tensor, do_transpose, gemm_io_type, node_name)
    wa_pack = _find_int4_quantizable_weights(graph, nodes_to_exclude)

    if precision_pattern_8bit:
        if not validate_8bit_layers(precision_pattern_8bit):
            raise ValueError("Invalid format for --layers_8bit. Use comma-separated layers.")
        layers_list_8bit = [x.strip() for x in precision_pattern_8bit.split(",") if x.strip()]
    else:
        matmul_nodes = [
            node
            for node in onnx_model.graph.node
            if node.op_type in ["Gemm", "MatMul"] and "lm_head" not in node.name
        ]

        # Only include nodes matching the specified patterns for all layers present in the model.
        # For example, for all i where a node exists with name:
        #     /model/layers.{i}/attn/qkv_proj/MatMul
        #     /model/layers.{i}/attn/v_proj/MatMul
        #     /model/layers.{i}/mlp/down_proj/MatMul
        pattern_regexes = [
            re.compile(r"^/model/layers\.(\d+)/attn/qkv_proj/MatMul$"),
            re.compile(r"^/model/layers\.(\d+)/attn/v_proj/MatMul$"),
            re.compile(r"^/model/layers\.(\d+)/self_attn/qkv_proj/MatMul$"),
            re.compile(r"^/model/layers\.(\d+)/self_attn/v_proj/MatMul$"),
            re.compile(r"^/model/layers\.(\d+)/mlp/down_proj/MatMul$"),
        ]
        filtered_matmul_nodes = [
            node for node in matmul_nodes if any(pat.match(node.name) for pat in pattern_regexes)
        ]

        def extract_group_key(node_name):
            # Group by the two path components before 'MatMul',
            # e.g. "/model/layers.0/attn/qkv_proj/MatMul" -> "attn.qkv_proj".
            parts = node_name.split("/")
            if len(parts) >= 3:
                return ".".join(parts[-3:-1])
            return node_name

        def layer_idx(name):
            # BUGFIX: node names are '/'-delimited ("/model/layers.13/attn/..."), so the
            # previous pattern r"layers\.(\d+)\." (requiring a trailing dot) never matched
            # and the per-group sort below was a silent no-op.
            m = re.search(r"layers\.(\d+)", name)
            return int(m.group(1)) if m else 0

        # Map each group key to the node names in that group.
        group_to_nodes = {}
        for node in filtered_matmul_nodes:
            group_to_nodes.setdefault(extract_group_key(node.name), []).append(node.name)

        layers_8bit_set = set()
        for names in group_to_nodes.values():
            n = len(names)
            if n == 0:
                continue
            names_sorted = sorted(names, key=layer_idx)
            eighth = n // 8
            # First 1/8 of the layers get 8-bit precision.
            layers_8bit_set.update(names_sorted[:eighth])
            # Last 1/8 as well.
            if eighth > 0:
                layers_8bit_set.update(names_sorted[-eighth:])
            # Every third layer in the middle section (excluding first and last eighth).
            rest_start, rest_end = eighth, n - eighth
            for i in range(rest_start, rest_end):
                if (i - rest_start) % 3 == 0:
                    layers_8bit_set.add(names_sorted[i])
        layers_list_8bit = list(layers_8bit_set)

    # Create layer info mapping with precision, block_size, and axis per weight tensor.
    layer_info = {}
    for _act_tensor, weight_tensor, _do_transpose, _gemm_io_type, node_name in wa_pack:
        weight_name = weight_tensor.name
        # Use node_name for matching against layers_8bit patterns.
        if should_quantize_to_8bit(node_name, layers_list_8bit):
            layer_info[weight_name] = {
                "precision": 8,
                "block_size": -1,  # Per-channel for 8-bit
                "axis": 0,
            }
        else:
            layer_info[weight_name] = {
                "precision": 4,
                "block_size": block_size,  # Default block size for 4-bit
                "axis": quantize_axis,
            }
    return layer_info
def get_layer_info(
onnx_model: onnx.ModelProto,
nodes_to_exclude: list[str] | None = [r"/lm_head"],
block_size: int = 128,
quantize_axis: int = 0,
**kwargs: Any,
):
"""Generate a mapping of weight tensor names to their quantization configuration.
This function determines the quantization configuration (precision, block_size, axis) for each
weight tensor in the ONNX model, based on the provided configuration. If mixed quantization
is enabled, it uses the layer precision mapping; otherwise, it returns None.
Args:
onnx_model (onnx.ModelProto): The ONNX model to analyze.
nodes_to_exclude (list[str] | None): List of node name patterns to exclude from quantization.
**kwargs: Additional keyword arguments, such as:
- enable_mixed_quant (bool): Whether to enable mixed quantization.
- layers_8bit (str): Comma-separated list of layer patterns to quantize to 8 bit.
- block_size (int): Default block size for quantization.
- quantize_axis (int): Default quantization axis.
- gather_block_size (int): Default block size for gather quantization.
- gather_quantize_axis (int): Default quantization axis for gather.
Returns:
dict[str, dict[str, Any]] | None: A mapping from weight tensor names to their quantization
configuration (with keys: precision, block_size, axis), or None if mixed quantization is not enabled.
"""
layer_info = None
enable_mixed_quant = kwargs.get("enable_mixed_quant", False)
layers_8bit = kwargs.get("layers_8bit")
gather_block_size = kwargs.get("gather_block_size", DEFAULT_GATHER_BLOCK_SIZE)
gather_quantize_axis = kwargs.get("gather_quantize_axis", DEFAULT_GATHER_QUANTIZE_AXIS)
if enable_mixed_quant or layers_8bit:
layer_info = get_layer_precision_mapping(
onnx_model,
layers_8bit,
nodes_to_exclude,
block_size,
quantize_axis,
)
else:
layer_info = None
if gather_quantize_axis is not None:
if layer_info is None:
layer_info = {}
for node in onnx_model.graph.node:
if node.op_type == "Gather":
layer_info[node.input[0]] = {