Skip to content

Commit f4a1ae8

Browse files
Complete legacy AQT deprecation and transition to Qwix/FP8
1 parent 72ddb3c commit f4a1ae8

51 files changed

Lines changed: 284 additions & 1348 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

src/dependencies/requirements/base_requirements/requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
absl-py
2-
aqtp
32
array-record
43
chex
54
cloud-accelerator-diagnostics

src/dependencies/requirements/generated_requirements/cuda12-requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ annotated-doc>=0.0.4
1010
annotated-types>=0.7.0
1111
antlr4-python3-runtime>=4.9.3
1212
anyio>=4.13.0
13-
aqtp>=0.9.0
1413
array-record>=0.8.3
1514
astroid>=4.0.4
1615
astunparse>=1.6.3

src/dependencies/requirements/generated_requirements/tpu-post-train-requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ antlr4-python3-runtime>=4.9.3
1414
anyio>=4.13.0
1515
apache-tvm-ffi>=0.1.11
1616
appnope>=0.1.4 ; sys_platform == 'darwin'
17-
aqtp>=0.9.0
1817
array-record>=0.8.3
1918
astor>=0.8.1
2019
astroid>=4.0.4

src/dependencies/requirements/generated_requirements/tpu-requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ annotated-doc>=0.0.4
1010
annotated-types>=0.7.0
1111
antlr4-python3-runtime>=4.9.3
1212
anyio>=4.13.0
13-
aqtp>=0.9.0
1413
array-record>=0.8.3
1514
astroid>=4.0.4
1615
astunparse>=1.6.3

src/dependencies/requirements/requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
absl-py
2-
aqtp
32
array-record
43
cloud-accelerator-diagnostics
54
cloud-tpu-diagnostics

src/dependencies/requirements/requirements_decoupled_jax_0_7.1.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
absl_py>=2.3.1
2-
aqtp>=0.9.0
32
chex>=0.1.90
43
datasets>=4.2.0
54
etils>=1.13.0

src/maxtext/configs/base.yml

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,6 @@ dtype: "bfloat16"
112112
# used to configure quantization in the transformer layers, defaults to null implying bf16.
113113
# possible alternative settings are as follows:
114114
# 'int8' for dynamic range quantization using 8-bits
115-
# 'intmp' for mixed precision quantization for inference as described here: src/maxtext/configs/quantization/readme.md
116115
# 'fp8' for 8-bit floating-point gemms on nvidia gpus.
117116
# 'nanoo_fp8' for 8-bit floating-point gemms on amd mi300/mi325 gpus.
118117
# 'fp8_full' for fp8 quantization with static scaling.
@@ -123,10 +122,6 @@ constant_bound_config: ""
123122
# https://kolonist26-jax-kr.readthedocs.io/en/latest/jax.lax.html#jax.lax.precision
124123
matmul_precision: "default"
125124
activations_in_float32: false # sets activations to float32 before nonlinearity if true, else dtype
126-
# used to replicate the quantization scale to avoid the inefficient xla fusion for 2d sharding.
127-
replicate_quant_scale: false
128-
# path to file with quantization config for intmp.
129-
quant_cfg_path: ""
130125
quantize_kvcache: false # set to true to quantize kv cache values, defaults to false
131126
# valid kv_quant_axis values:
132127
# - "" is valid only when quantize_kvcache is false
@@ -143,7 +138,7 @@ save_quantized_params_path: ""
143138
# when left as is, corresponds to training
144139
# accepted values are "inference"
145140
model_call_mode: ""
146-
use_qwix_quantization: false # [DEPRECATED: AQT will be removed in a future release. It is strongly recommended to set use_qwix_quantization to true] whether to use qwix for quantization. if set to true, the model will be quantized using qwix.
141+
use_qwix_quantization: true # whether to use qwix for quantization. if set to true, the model will be quantized using qwix. the legacy AQT path has been removed (aqtp is no longer a dependency), so qwix is now the default.
147142
use_manual_quantization: false # whether to use manual quantization for batch split. only used if use_batch_split_schedule is true.
148143
# quantization calibration method used for weights and activations. supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
149144
weight_quantization_calibration_method: "absmax"

src/maxtext/configs/inference/multihost/disaggregation/llama3_405b_v6e-16-16.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@ sharding_strategy: "experimental"
55
attention: 'dot_product'
66
allow_split_physical_axes: true
77
tokenizer_path: "assets/tokenizer_llama3.tiktoken"
8-
# Used to replicate the quantization scale to avoid the inefficient XLA fusion.
9-
replicate_quant_scale: true
108

119
inference_server: "ExperimentalMaxtextDisaggregatedServer"
1210

src/maxtext/configs/inference/multihost/interleaved/llama2_70b_v5e-16.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ model_name: "llama2-70b"
88
sharding_strategy: "experimental"
99
attention: 'dot_product'
1010
allow_split_physical_axes: true
11-
# Used to replicate the quantization scale to avoid the inefficient XLA fusion.
12-
replicate_quant_scale: true
1311

1412
logical_axis_rules: [
1513
['embed', []],

src/maxtext/configs/inference/multihost/interleaved/llama3_405b_v5e-64.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@ sharding_strategy: "experimental"
1010
attention: 'dot_product'
1111
allow_split_physical_axes: true
1212
tokenizer_path: "assets/tokenizer_llama3.tiktoken"
13-
# Used to replicate the quantization scale to avoid the inefficient XLA fusion.
14-
replicate_quant_scale: true
1513

1614
logical_axis_rules: [
1715
['embed', []],

0 commit comments

Comments
 (0)