AI-Hypercomputer
diff --git a/‎src/dependencies/requirements/base_requirements/requirements.txt‎
Lines changed: 0 additions & 1 deletion b/‎src/dependencies/requirements/base_requirements/requirements.txt‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/dependencies/requirements/generated_requirements/cuda12-requirements.txt‎
Lines changed: 0 additions & 1 deletion b/‎src/dependencies/requirements/generated_requirements/cuda12-requirements.txt‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/dependencies/requirements/generated_requirements/tpu-post-train-requirements.txt‎
Lines changed: 0 additions & 1 deletion b/‎src/dependencies/requirements/generated_requirements/tpu-post-train-requirements.txt‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/dependencies/requirements/generated_requirements/tpu-requirements.txt‎
Lines changed: 0 additions & 1 deletion b/‎src/dependencies/requirements/generated_requirements/tpu-requirements.txt‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/dependencies/requirements/requirements.txt‎
Lines changed: 0 additions & 1 deletion b/‎src/dependencies/requirements/requirements.txt‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/dependencies/requirements/requirements_decoupled_jax_0_7.1.txt‎
Lines changed: 0 additions & 1 deletion b/‎src/dependencies/requirements/requirements_decoupled_jax_0_7.1.txt‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/maxtext/configs/base.yml‎
Lines changed: 1 addition & 6 deletions b/‎src/maxtext/configs/base.yml‎
Lines changed: 1 addition & 6 deletions
diff --git a/‎src/maxtext/configs/inference/multihost/disaggregation/llama3_405b_v6e-16-16.yml‎
Lines changed: 0 additions & 2 deletions b/‎src/maxtext/configs/inference/multihost/disaggregation/llama3_405b_v6e-16-16.yml‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎src/maxtext/configs/inference/multihost/interleaved/llama2_70b_v5e-16.yml‎
Lines changed: 0 additions & 2 deletions b/‎src/maxtext/configs/inference/multihost/interleaved/llama2_70b_v5e-16.yml‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎src/maxtext/configs/inference/multihost/interleaved/llama3_405b_v5e-64.yml‎
Lines changed: 0 additions & 2 deletions b/‎src/maxtext/configs/inference/multihost/interleaved/llama3_405b_v5e-64.yml‎
Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,4 @@
 absl-py
-aqtp
 array-record
 chex
 cloud-accelerator-diagnostics
 
@@ -10,7 +10,6 @@ annotated-doc>=0.0.4
 annotated-types>=0.7.0
 antlr4-python3-runtime>=4.9.3
 anyio>=4.13.0
-aqtp>=0.9.0
 array-record>=0.8.3
 astroid>=4.0.4
 astunparse>=1.6.3
 
@@ -14,7 +14,6 @@ antlr4-python3-runtime>=4.9.3
 anyio>=4.13.0
 apache-tvm-ffi>=0.1.11
 appnope>=0.1.4 ; sys_platform == 'darwin'
-aqtp>=0.9.0
 array-record>=0.8.3
 astor>=0.8.1
 astroid>=4.0.4
 
@@ -10,7 +10,6 @@ annotated-doc>=0.0.4
 annotated-types>=0.7.0
 antlr4-python3-runtime>=4.9.3
 anyio>=4.13.0
-aqtp>=0.9.0
 array-record>=0.8.3
 astroid>=4.0.4
 astunparse>=1.6.3
 
@@ -1,5 +1,4 @@
 absl-py
-aqtp
 array-record
 cloud-accelerator-diagnostics
 cloud-tpu-diagnostics
 
@@ -1,5 +1,4 @@
 absl_py>=2.3.1
-aqtp>=0.9.0
 chex>=0.1.90
 datasets>=4.2.0
 etils>=1.13.0
 
@@ -112,7 +112,6 @@ dtype: "bfloat16"
 # used to configure quantization in the transformer layers, defaults to null implying bf16.
 # possible alternative settings are as follows:
 # 'int8' for dynamic range quantization using 8-bits
-# 'intmp' for mixed precision quantization for inference as described here: src/maxtext/configs/quantization/readme.md
 # 'fp8' for 8-bit floating-point gemms on nvidia gpus.
 # 'nanoo_fp8' for 8-bit floating-point gemms on amd mi300/mi325 gpus.
 # 'fp8_full' for fp8 quantization with static scaling.
@@ -123,10 +122,6 @@ constant_bound_config: ""
 # https://kolonist26-jax-kr.readthedocs.io/en/latest/jax.lax.html#jax.lax.precision
 matmul_precision: "default"
 activations_in_float32: false # if true, casts activations to float32 before the nonlinearity; otherwise they stay in dtype
-# used to replicate the quantization scale to avoid the inefficient xla fusion for 2d sharding.
-replicate_quant_scale: false
-# path to file with quantization config for intmp.
-quant_cfg_path: ""
 quantize_kvcache: false # set to true to quantize kv cache values, defaults to false
 # valid kv_quant_axis values:
 #   - "" is valid only when quantize_kvcache is false
@@ -143,7 +138,7 @@ save_quantized_params_path: ""
 # when left as is, corresponds to training
 # accepted values are "inference"
 model_call_mode: ""
-use_qwix_quantization: false # [DEPRECATED: AQT will be removed in a future release. It is strongly recommended to set use_qwix_quantization to true] whether to use qwix for quantization. if set to true, the model will be quantized using qwix.
+use_qwix_quantization: true # whether to use qwix for quantization. if set to true, the model will be quantized using qwix. [DEPRECATION NOTE: AQT will be removed in a future release, so it is strongly recommended to keep use_qwix_quantization set to true.]
 use_manual_quantization: false # whether to use manual quantization for batch split. Only used if use_batch_split_schedule is true.
 # quantization calibration method used for weights and activations. supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#l70-l80
 weight_quantization_calibration_method: "absmax"
 
@@ -5,8 +5,6 @@ sharding_strategy: "experimental"
 attention: 'dot_product'
 allow_split_physical_axes: true
 tokenizer_path: "assets/tokenizer_llama3.tiktoken"
-# Used to replicate the quantization scale to avoid the inefficient XLA fusion.
-replicate_quant_scale: true
 
 inference_server: "ExperimentalMaxtextDisaggregatedServer"
 
 
@@ -8,8 +8,6 @@ model_name: "llama2-70b"
 sharding_strategy: "experimental"
 attention: 'dot_product'
 allow_split_physical_axes: true
-# Used to replicate the quantization scale to avoid the inefficient XLA fusion.
-replicate_quant_scale: true
 
 logical_axis_rules: [
                       ['embed', []],
 
@@ -10,8 +10,6 @@ sharding_strategy: "experimental"
 attention: 'dot_product'
 allow_split_physical_axes: true
 tokenizer_path: "assets/tokenizer_llama3.tiktoken"
-# Used to replicate the quantization scale to avoid the inefficient XLA fusion.
-replicate_quant_scale: true
 
 logical_axis_rules: [
                       ['embed', []],
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`	`1`	`absl-py`
`2`		`-aqtp`
`3`	`2`	`array-record`
`4`	`3`	`chex`
`5`	`4`	`cloud-accelerator-diagnostics`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`	`1`	`absl_py>=2.3.1`
`2`		`-aqtp>=0.9.0`
`3`	`2`	`chex>=0.1.90`
`4`	`3`	`datasets>=4.2.0`
`5`	`4`	`etils>=1.13.0`