Skip to content

Commit 7c825e5

Browse files
authored
fix: correct FP8 support check on Ada+ GPUs by using compressed-tensors (#1110)
* fix: fp8 support check for dynamic fp8
* bump compressed-tensors
1 parent 2bb9c9c commit 7c825e5

8 files changed

Lines changed: 13 additions & 106 deletions

aphrodite/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
from typing import Any, Dict, List, Optional
22

33
import torch
4+
from compressed_tensors.config import CompressionFormat
5+
from compressed_tensors.quantization import (QuantizationArgs,
6+
QuantizationStrategy,
7+
QuantizationType)
48
from pydantic import BaseModel
59

610
from aphrodite.modeling.layers.fused_moe import FusedMoE
@@ -17,8 +21,7 @@
1721
CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
1822
CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
1923
from aphrodite.quantization.compressed_tensors.utils import (
20-
CompressionFormat, QuantizationArgs, QuantizationStrategy,
21-
QuantizationType, find_matched_target, is_activation_quantization_format,
24+
find_matched_target, is_activation_quantization_format,
2225
should_ignore_layer)
2326
from aphrodite.quantization.kv_cache import BaseKVCacheMethod
2427

aphrodite/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,13 @@
33
from typing import Callable, List, Optional
44

55
import torch
6+
from compressed_tensors import CompressionFormat
67

78
from aphrodite import _custom_ops as ops
89
from aphrodite.modeling.layers.fused_moe import FusedMoEMethodBase
910
from aphrodite.modeling.utils import set_weight_attrs
1011
from aphrodite.quantization.compressed_tensors.schemes import (
1112
WNA16_SUPPORTED_BITS)
12-
from aphrodite.quantization.compressed_tensors.utils import CompressionFormat
1313

1414

1515
class GPTQMarlinState(Enum):

aphrodite/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
from typing import Callable, List, Optional
22

33
import torch
4+
from compressed_tensors.quantization import QuantizationStrategy
45

56
from aphrodite.modeling.parameter import (ChannelQuantScaleParameter,
67
ModelWeightParameter,
78
PerTensorScaleParameter)
89
from aphrodite.quantization.compressed_tensors.schemes import (
910
CompressedTensorsScheme)
10-
from aphrodite.quantization.compressed_tensors.utils import (
11-
QuantizationStrategy)
1211
from aphrodite.quantization.utils.marlin_utils_fp8 import (
1312
apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
1413
from aphrodite.quantization.utils.w8a8_utils import convert_to_channelwise

aphrodite/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from typing import Callable, List, Optional
22

33
import torch
4+
from compressed_tensors.quantization import QuantizationStrategy
45
from torch.nn import Parameter
56

67
from aphrodite.common.utils import is_hip
@@ -9,8 +10,6 @@
910
PerTensorScaleParameter)
1011
from aphrodite.quantization.compressed_tensors.schemes import (
1112
CompressedTensorsScheme)
12-
from aphrodite.quantization.compressed_tensors.utils import (
13-
QuantizationStrategy)
1413
from aphrodite.quantization.utils.w8a8_utils import (
1514
apply_fp8_linear, cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz,
1615
requantize_with_max_scale)

aphrodite/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from typing import Callable, List, Optional
22

33
import torch
4+
from compressed_tensors.quantization import QuantizationStrategy
45
from torch.nn import Parameter
56

67
from aphrodite.modeling.parameter import (BaseAphroditeParameter,
@@ -9,8 +10,6 @@
910
PerTensorScaleParameter)
1011
from aphrodite.quantization.compressed_tensors.schemes import (
1112
CompressedTensorsScheme)
12-
from aphrodite.quantization.compressed_tensors.utils import (
13-
QuantizationStrategy)
1413
from aphrodite.quantization.utils.w8a8_utils import (apply_int8_linear,
1514
convert_to_channelwise)
1615

aphrodite/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from typing import Callable, List, Optional, Set
22

33
import torch
4+
from compressed_tensors.quantization import ActivationOrdering
45
from loguru import logger
56

67
from aphrodite.modeling.parameter import (BaseAphroditeParameter,
@@ -10,7 +11,6 @@
1011
RowAphroditeParameter)
1112
from aphrodite.quantization.compressed_tensors.schemes import (
1213
CompressedTensorsScheme)
13-
from aphrodite.quantization.compressed_tensors.utils import ActivationOrdering
1414
from aphrodite.quantization.kernels import (MPLinearLayerConfig,
1515
choose_mp_linear_kernel)
1616
from aphrodite.quantization.utils.marlin_utils import (

aphrodite/quantization/compressed_tensors/utils.py

Lines changed: 2 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -1,106 +1,12 @@
11
import re
2-
from enum import Enum
3-
from typing import Any, Dict, Iterable, Optional, Union
2+
from typing import Iterable, Optional
43

5-
from pydantic import BaseModel, Field, field_validator
4+
from compressed_tensors import CompressionFormat
65
from torch.nn import Module
76

87
from aphrodite.quantization.utils.quant_utils import FUSED_LAYER_NAME_MAPPING
98

109

11-
class CompressionFormat(Enum):
12-
dense = "dense"
13-
sparse_bitmask = "sparse-bitmask"
14-
naive_quantized = "naive-quantized"
15-
float_quantized = "float-quantized"
16-
int_quantized = "int-quantized"
17-
pack_quantized = "pack-quantized"
18-
marlin_24 = "marlin-24"
19-
20-
21-
class QuantizationType(str, Enum):
22-
"""
23-
Enum storing quantization type options
24-
"""
25-
26-
INT = "int"
27-
FLOAT = "float"
28-
29-
30-
class QuantizationStrategy(str, Enum):
31-
"""
32-
Enum storing quantization strategy options
33-
"""
34-
35-
TENSOR = "tensor"
36-
CHANNEL = "channel"
37-
GROUP = "group"
38-
BLOCK = "block"
39-
TOKEN = "token"
40-
41-
42-
class ActivationOrdering(str, Enum):
43-
"""
44-
Enum storing strategies for activation ordering
45-
Group: reorder groups and weight\n
46-
Weight: only reorder weight, not groups. Slightly lower latency and
47-
accuracy compared to group actorder\n
48-
"""
49-
GROUP = "group"
50-
WEIGHT = "weight"
51-
52-
53-
class QuantizationArgs(BaseModel):
54-
"""
55-
User facing arguments used to define a quantization config
56-
for weights or activations
57-
58-
:param num_bits: quantization bit depth
59-
:param type: dtype to quantized to, either int or float
60-
:param symmetric: whether or not quantization scale is symmetric
61-
:param strategy: string determining the scope of scale/zero-point to apply
62-
:param group_size: group length to use for the group strategy
63-
:param block_structure: 2d block structure to use for the block
64-
strategy, must be of the format "2x4", "8x16", etc.
65-
:param dynamic: set True to perform dynamic quantization -
66-
values will not be calibrated during calibration phase,
67-
instead during inference new quantization ranges will be
68-
observed with every sample. Defaults to False for static
69-
quantization. Note that enabling dynamic quantization
70-
will change the default observer to a memoryless one
71-
:param actorder: whether to apply group quantization in decreasing order of
72-
activation. Defaults to None for arbitrary ordering
73-
"""
74-
75-
num_bits: int = 8
76-
type: QuantizationType = QuantizationType.INT
77-
symmetric: bool = True
78-
group_size: Optional[int] = None
79-
strategy: Optional[QuantizationStrategy] = None
80-
block_structure: Optional[str] = None
81-
dynamic: bool = False
82-
actorder: Union[ActivationOrdering, bool, None] = None
83-
observer: str = Field(
84-
default="minmax",
85-
description=("The class to use to compute the quantization param - "
86-
"scale and zero-point'"),
87-
)
88-
observer_kwargs: Dict[str, Any] = Field(
89-
default_factory=dict,
90-
description=
91-
("optional dict of kwargs to be passed directly to torch quantization "
92-
"Observers constructor excluding quantization range or symmetry"),
93-
)
94-
95-
@field_validator("actorder", mode="before")
96-
def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
97-
if isinstance(value, bool):
98-
return ActivationOrdering.GROUP if value else None
99-
if isinstance(value, str):
100-
return ActivationOrdering(value.lower())
101-
return value
102-
103-
10410
def is_activation_quantization_format(format: str) -> bool:
10511
_ACTIVATION_QUANTIZATION_FORMATS = [
10612
CompressionFormat.naive_quantized.value,

requirements-common.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,4 @@ python-multipart
3636
partial-json-parser
3737
opencv-python-headless
3838
einops
39+
compressed-tensors == 0.8.0

0 commit comments

Comments
 (0)