Extend LoRA adapter tests with transpose scenarios and skip unsupported cases

Shehrozkashif · Shehrozkashif · commit 67b1544e5e38 · 2026-03-14T21:45:03.000Z
diff --git a/src/nncf/common/tensor_statistics/statistics.py b/src/nncf/common/tensor_statistics/statistics.py
@@ -276,15 +276,13 @@ def __eq__(self, other: Any) -> bool:
         return mean_values_equal
 
     def _get_serialized_data(self) -> dict[str, Tensor]:
-        backend = self.mean_values[0].backend
-        device = self.mean_values[0].device
         return {
             self.MEAN_STAT: fns.stack(self.mean_values),
             self.SHAPE_STAT: fns.tensor(
                 self.shape_values,
-                backend=backend,
+                backend=self.mean_values[0].backend,
                 dtype=TensorDataType.int32,
-                device=device,
+                device=self.mean_values[0].device,
             ),
         }
 
diff --git a/src/nncf/quantization/algorithms/weight_compression/activation_stats.py b/src/nncf/quantization/algorithms/weight_compression/activation_stats.py
@@ -17,7 +17,12 @@
 from nncf.tensor import functions as fns
 
 
-def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int = -1) -> tuple[Tensor, Tensor]:
+def process_stats(
+    stats: WCTensorStatistic,
+    subset_size: int,
+    act_ch_axis: int = -1,
+    transpose_a: bool = False,
+) -> tuple[Tensor, Tensor]:
     """
     A function for processing activations. Shared between AWQ, Scale Estimation and LoRA Correction algorithms.
 
@@ -37,8 +42,13 @@ def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int =
     axes = list(range(1, len(X.shape))) + [0]
     X_full = fns.transpose(X, axes=axes)
 
-    # The sample dimension is always the last axis after transpose
-    sample_axis = -1
+    if transpose_a:
+        axes = list(range(len(X_full.shape)))
+        axes[-1], axes[-2] = axes[-2], axes[-1]
+        X_full = fns.transpose(X_full, axes=axes)
+
+    # The sample dimension is axis -1 by default, but moves to -2 if transpose_a is True
+    sample_axis = -2 if transpose_a else -1
 
     # Prevent high memory and time consumption by sampling
     if X_full.shape[sample_axis] > subset_size and subset_size > 0:
@@ -47,11 +57,13 @@ def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int =
         ]
         step = X_full.shape[sample_axis] // subset_size
         idxs = [i[0] for i in sorted(enumerate(lens), key=lambda x: -x[1])][::step]
-        X = X_full[..., idxs]
+        if transpose_a:
+            X = X_full[..., idxs, :]
+        else:
+            X = X_full[..., idxs]
     else:
         X = X_full
 
-    # Compute max magnitude along the sample axis (last axis)
-    # Result: [HiddenDim] or [No. of Experts, HiddenDim]
+    # Compute max magnitude along the sample axis
     s = fns.max(fns.abs(X_full), axis=sample_axis)
     return s, X
diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -1152,11 +1152,6 @@ def apply_with_parameters(
                 )
 
             if self._lora_correction:
-                for wc_params in all_weight_params:
-                    if self._backend_entity.matmul_has_transposed_activations(wc_params.node_with_weight, graph):
-                        msg = "Transposed activations are not supported yet for the LoRa correction algorithm"
-                        raise nncf.UnsupportedModelError(msg)
-
                 lora_correction_params = self._advanced_parameters.lora_correction_params
                 lora_correction_algo = LoraCorrectionAlgorithm(statistics, lora_correction_params)
                 description += " with correction of low-rank adapters"
@@ -1370,7 +1365,7 @@ def _get_statistics_for_weights_compression(
         # Where mean_value is a 1D tensor representing an activation reduced over batch and sequence length dimensions,
         # shape is an original shape of an activation before reduction, n is the size of the dataset (or subset_size).
         statistics = {}
-        for (act_node, output_port_id, _), matmul_nodes in matmul_input_to_output_nodes_map.items():
+        for (act_node, output_port_id, _act_channel_axis), matmul_nodes in matmul_input_to_output_nodes_map.items():
             tensor_collectors = list(
                 statistic_points.get_algo_statistics_for_node(
                     act_node.node_name,
diff --git a/src/nncf/quantization/algorithms/weight_compression/lora_correction.py b/src/nncf/quantization/algorithms/weight_compression/lora_correction.py
@@ -9,9 +9,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from pathlib import Path
-from typing import Optional
-
-import pandas as pd
 
 import nncf
 from nncf.common.logging import nncf_logger
@@ -43,9 +40,10 @@ def __init__(self):
     def add_noises(self, layer_name: str, value: float):
         self._noise_per_layer[layer_name] = value
 
-    @skip_if_dependency_unavailable(dependencies=["matplotlib.pyplot"])
+    @skip_if_dependency_unavailable(dependencies=["matplotlib.pyplot", "pandas"])
     def dump_data(self):
         import matplotlib.pyplot as plt
+        import pandas as pd
 
         if not self._noise_per_layer:
             return
@@ -108,27 +106,35 @@ def is_applicable(self, wc_params: WeightCompressionParameters):
         return wc_params.compression_config.num_bits == 4
 
     def calculate_adapters(
-        self, weight: Tensor, compressed_weight: CompressedWeight, wc_params: WeightCompressionParameters
+        self,
+        weight: Tensor,
+        compressed_weight: CompressedWeight,
+        wc_params: WeightCompressionParameters,
+        act_ch_axis: int,
     ) -> tuple[Tensor, Tensor, list[float]]:
         """
         Calculates low rank matrices for a given original and compressed weights.
 
         :param weight: original floating-point weight matrix.
         :param compressed_weight: compressed weight matrix.
         :param wc_params: parameters of weight compression.
+        :param act_ch_axis: axis number of the activation tensor which corresponds to its channel.
         :return: two low rank matrices in the order of execution of corresponding linear layers.
         """
         layer_name = wc_params.node_with_weight.node_name
         layer_statistics = self._statistics[layer_name]
         is_debug = self._debug_interface is not None
+        transpose_a_flag = getattr(wc_params.node_with_weight, "transpose_a", False)
         lora_A, lora_B, mean_noises = self.calculate_low_rank_matrices(
             weight,
             compressed_weight,
             wc_params.compression_config,
             wc_params.reduction_axes,
             self._lora_correction_params,
             layer_statistics,
+            act_ch_axis,
             is_debug,
+            transpose_a=transpose_a_flag,
         )
         if is_debug:
             self._debug_interface.add_noises(layer_name, mean_noises)
@@ -142,7 +148,9 @@ def calculate_low_rank_matrices(
         reduction_axes: tuple[int, ...],
         lora_correction_params: AdvancedLoraCorrectionParameters,
         layer_statistics: WCTensorStatistic,
-        is_debug: Optional[bool] = False,
+        act_ch_axis: int,
+        is_debug: bool | None = False,
+        transpose_a: bool = False,
     ):
         """
         Calculates low rank matrices for a given original and compressed weights.
@@ -157,6 +165,7 @@ def calculate_low_rank_matrices(
         :param reduction_axes: axes along which different statistics reduced.
         :param lora_correction_params: parameters to configure the algorithm.
         :param layer_statistics: an object containing statistics for the layer.
+        :param act_ch_axis: axis number of the activation tensor which corresponds to its channel.
         :param is_debug: whether to collect debug information, defaults to False.
         :return: two low rank matrices in the order of execution of corresponding linear layers and list of mean noises.
             Noises are collected from each step of the algorithm if debug was enabled.
@@ -170,7 +179,15 @@ def calculate_low_rank_matrices(
         )
         mode = compression_config.mode
         assert len(reduction_axes) == 1, "Assumed a single reduction axis"
-        reduction_axis = reduction_axes[0] if compression_config.group_size != -1 else -1
+
+        if compression_config.group_size != -1:
+            reduction_axis = reduction_axes[0]
+        else:
+            reduction_axis = -1
+
+        if transpose_a and reduction_axis != -1:
+            reduction_axis = 1
+
         if mode in (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM):
             fq_weights = do_integer_dequantization(
                 compressed_weight.tensor,
@@ -194,16 +211,8 @@ def calculate_low_rank_matrices(
             svd_residual = fns.transpose(svd_residual)
         residual = svd_residual.clone()  # [H, O]
 
-        # Get the activation channel axis
-        act_ch_axis = getattr(layer_statistics, "act_ch_axis", -1)  # default to last axis
-
-        # Pass it to process_stats
-        s, X = process_stats(layer_statistics, subset_size, act_ch_axis)
-
-        # Conditionally transpose X so samples are rows and channels are columns
-        if act_ch_axis != 0:  # if channel is not already the first axis
-            X = fns.transpose(X, axes=(1, 0))  # [SS, H]
-
+        # Call process_stats with act_ch_axis and transpose_a=True to get X in [SS, H] layout
+        s, X = process_stats(layer_statistics, subset_size, act_ch_axis, transpose_a=True)
         if compression_config.group_size > 0:
             # Multiply residual of weights by maximum channel magnitude of activations normalized per quantization
             # group. As a consequence, weights corresponding to a "noisy" activations has a higher error to correct.
diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py
@@ -206,7 +206,8 @@ def insert_adapters(
             A_W = opset.constant(lora_A.data)
             B_W = opset.constant(lora_B.data)
 
-        A_MM = opset.matmul(input_node, A_W, transpose_a=False, transpose_b=True)
+        transpose_a = wc_params.node_with_weight.layer_attributes.input_attributes["transpose"]
+        A_MM = opset.matmul(input_node, A_W, transpose_a=transpose_a, transpose_b=True)
         B_MM = opset.matmul(A_MM, B_W, transpose_a=False, transpose_b=True)
 
         node_output_port = mm_node.output(0)
@@ -349,7 +350,15 @@ def transform_model(
                 compressed_weight.tensor = compressed_weight.tensor.as_numpy_tensor()
                 if compressed_weight.zero_point is not None:
                     compressed_weight.zero_point = compressed_weight.zero_point.as_numpy_tensor()
-                adapters = lora_correction_algo.calculate_adapters(weight, compressed_weight, wc_params)
+
+                activation_port_id = self.get_activation_port_id(wc_params.node_with_weight, graph)
+                activation_edge = graph.get_input_edge_by_port_id(wc_params.node_with_weight, activation_port_id)
+                activation_shape = activation_edge.tensor_shape
+                act_ch_axis = self.get_activation_channel_axis(
+                    wc_params.node_with_weight, activation_port_id, activation_shape
+                )
+
+                adapters = lora_correction_algo.calculate_adapters(weight, compressed_weight, wc_params, act_ch_axis)
                 self.insert_adapters(wc_params, *adapters, int8_lora=lora_correction_algo.use_int8_adapters)
         self.name_to_node_mapping = None
 
diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py