|
1 | 1 | import re |
2 | | -from enum import Enum |
3 | | -from typing import Any, Dict, Iterable, Optional, Union |
| 2 | +from typing import Iterable, Optional |
4 | 3 |
|
5 | | -from pydantic import BaseModel, Field, field_validator |
| 4 | +from compressed_tensors import CompressionFormat |
6 | 5 | from torch.nn import Module |
7 | 6 |
|
8 | 7 | from aphrodite.quantization.utils.quant_utils import FUSED_LAYER_NAME_MAPPING |
9 | 8 |
|
10 | 9 |
|
11 | | -class CompressionFormat(Enum): |
12 | | - dense = "dense" |
13 | | - sparse_bitmask = "sparse-bitmask" |
14 | | - naive_quantized = "naive-quantized" |
15 | | - float_quantized = "float-quantized" |
16 | | - int_quantized = "int-quantized" |
17 | | - pack_quantized = "pack-quantized" |
18 | | - marlin_24 = "marlin-24" |
19 | | - |
20 | | - |
21 | | -class QuantizationType(str, Enum): |
22 | | - """ |
23 | | - Enum storing quantization type options |
24 | | - """ |
25 | | - |
26 | | - INT = "int" |
27 | | - FLOAT = "float" |
28 | | - |
29 | | - |
30 | | -class QuantizationStrategy(str, Enum): |
31 | | - """ |
32 | | - Enum storing quantization strategy options |
33 | | - """ |
34 | | - |
35 | | - TENSOR = "tensor" |
36 | | - CHANNEL = "channel" |
37 | | - GROUP = "group" |
38 | | - BLOCK = "block" |
39 | | - TOKEN = "token" |
40 | | - |
41 | | - |
42 | | -class ActivationOrdering(str, Enum): |
43 | | - """ |
44 | | - Enum storing strategies for activation ordering |
45 | | - Group: reorder groups and weight\n |
46 | | - Weight: only reorder weight, not groups. Slightly lower latency and |
47 | | - accuracy compared to group actorder\n |
48 | | - """ |
49 | | - GROUP = "group" |
50 | | - WEIGHT = "weight" |
51 | | - |
52 | | - |
53 | | -class QuantizationArgs(BaseModel): |
54 | | - """ |
55 | | - User facing arguments used to define a quantization config |
56 | | - for weights or activations |
57 | | -
|
58 | | - :param num_bits: quantization bit depth |
59 | | - :param type: dtype to quantized to, either int or float |
60 | | - :param symmetric: whether or not quantization scale is symmetric |
61 | | - :param strategy: string determining the scope of scale/zero-point to apply |
62 | | - :param group_size: group length to use for the group strategy |
63 | | - :param block_structure: 2d block structure to use for the block |
64 | | - strategy, must be of the format "2x4", "8x16", etc. |
65 | | - :param dynamic: set True to perform dynamic quantization - |
66 | | - values will not be calibrated during calibration phase, |
67 | | - instead during inference new quantization ranges will be |
68 | | - observed with every sample. Defaults to False for static |
69 | | - quantization. Note that enabling dynamic quantization |
70 | | - will change the default observer to a memoryless one |
71 | | - :param actorder: whether to apply group quantization in decreasing order of |
72 | | - activation. Defaults to None for arbitrary ordering |
73 | | - """ |
74 | | - |
75 | | - num_bits: int = 8 |
76 | | - type: QuantizationType = QuantizationType.INT |
77 | | - symmetric: bool = True |
78 | | - group_size: Optional[int] = None |
79 | | - strategy: Optional[QuantizationStrategy] = None |
80 | | - block_structure: Optional[str] = None |
81 | | - dynamic: bool = False |
82 | | - actorder: Union[ActivationOrdering, bool, None] = None |
83 | | - observer: str = Field( |
84 | | - default="minmax", |
85 | | - description=("The class to use to compute the quantization param - " |
86 | | - "scale and zero-point'"), |
87 | | - ) |
88 | | - observer_kwargs: Dict[str, Any] = Field( |
89 | | - default_factory=dict, |
90 | | - description= |
91 | | - ("optional dict of kwargs to be passed directly to torch quantization " |
92 | | - "Observers constructor excluding quantization range or symmetry"), |
93 | | - ) |
94 | | - |
95 | | - @field_validator("actorder", mode="before") |
96 | | - def validate_actorder(cls, value) -> Optional[ActivationOrdering]: |
97 | | - if isinstance(value, bool): |
98 | | - return ActivationOrdering.GROUP if value else None |
99 | | - if isinstance(value, str): |
100 | | - return ActivationOrdering(value.lower()) |
101 | | - return value |
102 | | - |
103 | | - |
104 | 10 | def is_activation_quantization_format(format: str) -> bool: |
105 | 11 | _ACTIVATION_QUANTIZATION_FORMATS = [ |
106 | 12 | CompressionFormat.naive_quantized.value, |
|
0 commit comments