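Allow multiple convolution layers instead of a fixed two-layer setup in subsampler

Also make pos_emb optional in SpeechContextNetwork and add an optional post_downsample
convolution that is applied after the context layers.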

kmxyvb · wangkuiyi · commit b3bef65b5254 · 2026-05-13T20:06:51.000Z
GitOrigin-RevId: 0115da7aeac05b5d9661a334ee55133be968b4ef
diff --git a/axlearn/audio/encoder_asr.py b/axlearn/audio/encoder_asr.py
@@ -21,6 +21,7 @@
 from axlearn.common.base_layer import BaseLayer
 from axlearn.common.config import REQUIRED, Required, config_class
 from axlearn.common.conformer import RepeatedConformerLayer
+from axlearn.common.convolution import Conv1DWithPadding
 from axlearn.common.ein_ops import rearrange
 from axlearn.common.layers import Dropout, Linear
 from axlearn.common.module import Module, nowrap
@@ -139,7 +140,9 @@ class Config(BaseLayer.Config):
         # Dropout applied after projection.
         dropout: Dropout.Config = Dropout.default_config()
         # Positional embeddings.
-        pos_emb: BaseLayer.Config = SinusoidalPositionalEmbedding.default_config()
+        pos_emb: Optional[BaseLayer.Config] = SinusoidalPositionalEmbedding.default_config()
+        # Optional Conv1D downsampling applied after the context layers; disabled if None.
+        post_downsample: Optional[Conv1DWithPadding.Config] = None
         # Context layers, e.g. a conformer stack.
         context: BaseLayer.Config = RepeatedConformerLayer.default_config()
 
@@ -150,8 +153,14 @@ def __init__(self, cfg: Config, *, parent: Optional[Module]):
             "input_linear", cfg.input_linear.set(input_dim=cfg.input_dim, output_dim=cfg.output_dim)
         )
         self._add_child("dropout", cfg.dropout)
-        self._add_child("pos_emb", cfg.pos_emb.set(dim=cfg.output_dim))
+        if cfg.pos_emb is not None:
+            self._add_child("pos_emb", cfg.pos_emb.set(dim=cfg.output_dim))
         self._add_child("context", cfg.context.set(input_dim=cfg.output_dim))
+        if cfg.post_downsample is not None:
+            self._add_child(
+                "post_downsample",
+                cfg.post_downsample.set(input_dim=cfg.output_dim, output_dim=cfg.output_dim),
+            )
 
     def forward(self, inputs: Tensor, *, segment_ids: Tensor) -> dict[str, Tensor]:
         """Computes context features.
@@ -172,7 +181,8 @@ def forward(self, inputs: Tensor, *, segment_ids: Tensor) -> dict[str, Tensor]:
 
         if isinstance(cfg.context, RepeatedConformerLayer.Config):
             positions = _segment_relative_positions(segment_ids)
-            x = x + self.pos_emb(positions)
+            if cfg.pos_emb is not None:
+                x = x + self.pos_emb(positions)
             x = self.context(inputs=x, segment_ids=segment_ids)
         elif isinstance(cfg.context, RepeatedTransformerLayer.Config):
             # We don't need to do add pos_emb for transformer block
@@ -188,6 +198,9 @@ def forward(self, inputs: Tensor, *, segment_ids: Tensor) -> dict[str, Tensor]:
             activations=x,
             activation_paddings=segment_ids == 0,
         )
+        if cfg.post_downsample is not None:
+            x, _ = self.post_downsample(x=x, paddings=segment_ids == 0)
+            segment_ids = self.post_downsample.conv_paddings(segment_ids)
         return dict(outputs=x * (segment_ids != 0)[..., None], segment_ids=segment_ids)
 
 
diff --git a/axlearn/audio/encoder_asr_test.py b/axlearn/audio/encoder_asr_test.py
@@ -16,6 +16,7 @@
 )
 from axlearn.audio.test_utils import fake_audio
 from axlearn.common.attention import RepeatedTransformerLayer
+from axlearn.common.convolution import Conv1DWithPadding
 from axlearn.common.kv_cache.sliding_window_kv_cache import enable_sliding_window_attention
 from axlearn.common.module import functional as F
 from axlearn.common.test_utils import TestCase
@@ -263,6 +264,121 @@ def test_transformer(self, is_training: bool) -> None:
             output_collections.summaries["activations/speech_context_norm"].weight, weights
         )
 
+    @parameterized.parameters([True, False])
+    @pytest.mark.fp64
+    def test_conformer_without_pos_emb(self, is_training: bool):
+        """Tests SpeechContextNetwork with RepeatedConformerLayer when pos_emb is None."""
+        input_dim, output_dim, dropout_rate, num_layers = 32, 16, 0.2, 2
+
+        cfg = SpeechContextNetwork.default_config().set(
+            input_dim=input_dim, output_dim=output_dim, dtype=jnp.float64
+        )
+        cfg.dropout.rate = dropout_rate
+        cfg.context.num_layers = num_layers
+        cfg.context.layer.self_attention.attention.num_heads = 4
+        cfg.context.layer.lconv.dropout.rate = dropout_rate
+        cfg.pos_emb = None
+
+        prng_key = jax.random.PRNGKey(123)
+        prng_key, init_key, input_key, length_key = jax.random.split(prng_key, num=4)
+        layer = cfg.set(name="test").instantiate(parent=None)
+        layer_params = layer.initialize_parameters_recursively(init_key)
+
+        # pos_emb should be absent from parameters when disabled.
+        self.assertNotIn("pos_emb", layer.children)
+        self.assertNotIn("pos_emb", layer_params)
+
+        # Generate inputs.
+        batch_size, seq_len = 4, 10
+        inputs = jnp.tile(
+            jax.random.normal(input_key, [batch_size // 2, seq_len, input_dim]), [2, 1, 1]
+        )
+        lengths = jnp.tile(
+            jax.random.randint(length_key, shape=[batch_size // 2, 1], minval=0, maxval=seq_len),
+            [2, 1],
+        )
+        segment_ids = (jnp.arange(seq_len)[None, :] < lengths).astype(jnp.int32)
+        padding_data = jax.random.normal(jax.random.PRNGKey(135), inputs.shape)
+        inputs = jnp.where(segment_ids[..., None] == 0, padding_data, inputs)
+
+        output_batch, _ = F(
+            layer,
+            inputs=dict(inputs=inputs, segment_ids=segment_ids),
+            is_training=is_training,
+            prng_key=prng_key,
+            state=layer_params,
+        )
+        outputs, output_segment_ids = output_batch["outputs"], output_batch["segment_ids"]
+        self.assertSequenceEqual(outputs.shape, (batch_size, seq_len, output_dim))
+        self.assertTrue(jnp.all(output_segment_ids == segment_ids))
+
+        # If is_training, outputs differ due to dropout; otherwise identical despite padding noise.
+        self.assertEqual(not is_training, bool(jnp.allclose(outputs[:2], outputs[2:])))
+
+    @parameterized.parameters([2, 3, 4, 5])
+    def test_post_downsample(self, strides) -> None:
+        """Tests SpeechContextNetwork with post_downsample applied after the context layers.
+
+        Args:
+            strides: Stride (also used as the window size) of the post_downsample convolution.
+        """
+        is_training = True
+        input_dim, output_dim, dropout_rate, num_layers = 32, 16, 0.2, 2
+        num_heads = 8
+        hidden_dim = 4 * input_dim
+
+        cfg = SpeechContextNetwork.default_config().set(
+            input_dim=input_dim, output_dim=output_dim, dtype=jnp.float64
+        )
+        cfg.dropout.rate = dropout_rate
+        cfg.context = RepeatedTransformerLayer.default_config().set(num_layers=num_layers)
+        attention = cfg.context.layer.self_attention.attention
+        attention.num_heads = num_heads
+        attention = enable_sliding_window_attention(attention, sliding_window_size=3)
+        cfg.context.layer.self_attention.attention = attention
+        # Dropout in transformer
+        cfg.context.layer.self_attention.dropout.rate = dropout_rate
+        cfg.context.layer.feed_forward.set(
+            hidden_dim=hidden_dim,
+        )
+
+        # Initialize layer parameters.
+        prng_key = jax.random.PRNGKey(123)
+        prng_key, init_key, input_key, length_key = jax.random.split(prng_key, num=4)
+
+        # Generate inputs.
+        batch_size, seq_len = 4, 10
+        inputs = jnp.tile(
+            jax.random.normal(input_key, [batch_size // 2, seq_len, input_dim]), [2, 1, 1]
+        )
+        lengths = jnp.tile(
+            jax.random.randint(length_key, shape=[batch_size // 2, 1], minval=0, maxval=seq_len),
+            [2, 1],
+        )
+        segment_ids = (jnp.arange(seq_len)[None, :] < lengths).astype(jnp.int32)
+        padding_data = jax.random.normal(jax.random.PRNGKey(135), inputs.shape)
+        inputs = jnp.where(segment_ids[..., None] == 0, padding_data, inputs)
+
+        cfg.post_downsample = Conv1DWithPadding.default_config().set(
+            window=strides, strides=strides, padding=((strides - 1, 0),)
+        )
+        layer = cfg.set(name="test").instantiate(parent=None)
+        layer_params = layer.initialize_parameters_recursively(init_key)
+        output_batch, _ = F(
+            layer,
+            inputs=dict(inputs=inputs, segment_ids=segment_ids),
+            is_training=is_training,
+            prng_key=prng_key,
+            state=layer_params,
+        )
+        outputs, output_segment_ids = output_batch["outputs"], output_batch["segment_ids"]
+        self.assertSequenceEqual(
+            outputs.shape, (batch_size, (seq_len + strides - 1) // strides, output_dim)
+        )
+        self.assertSequenceEqual(
+            output_segment_ids.shape, (batch_size, (seq_len + strides - 1) // strides)
+        )
+
 
 class ASREncoderTest(TestCase):
     """Tests ASREncoder."""
diff --git a/axlearn/audio/subsamplers.py b/axlearn/audio/subsamplers.py
@@ -33,7 +33,7 @@ class Config(BaseLayer.Config):
         # Output channel dim.
         output_dim: Required[int] = REQUIRED
         # Hidden dim of the conv layers. If None, defaults to output_dim.
-        hidden_dim: Optional[int] = None
+        hidden_dim: int | list[int] | None = None
         # Configures both of the convolutions.
         conv: Conv2DWith1DPadding.Config = Conv2DWith1DPadding.default_config().set(
             window=(3, 3), strides=(2, 2), padding=((1, 1), (1, 1))
@@ -46,27 +46,41 @@ class Config(BaseLayer.Config):
         # activation to only one convolution).
         activation: Optional[Union[Optional[str], tuple[Optional[str], Optional[str]]]] = None
 
+    @classmethod
+    def get_hidden_dim_list(cls, cfg: Config) -> list[int]:
+        if isinstance(cfg.hidden_dim, int):
+            hidden_dim = [cfg.hidden_dim]
+        elif cfg.hidden_dim is None:
+            hidden_dim = [cfg.output_dim]
+        else:
+            hidden_dim = list(cfg.hidden_dim)
+        return hidden_dim
+
     def __init__(self, cfg: Config, *, parent: Optional[Module]):
         super().__init__(cfg, parent=parent)
         cfg = self.config
 
         activation = cfg.activation
+        hidden_dim = [cfg.input_dim] + self.get_hidden_dim_list(cfg) + [cfg.output_dim]
+        self.num_layers = len(hidden_dim) - 1
         if not isinstance(activation, (list, tuple)):
-            activation = (activation, activation)
-        if len(activation) != 2 or not all(x is None or isinstance(x, str) for x in activation):
+            activation = [activation] * self.num_layers
+        if len(activation) != self.num_layers or not all(
+            x is None or isinstance(x, str) for x in activation
+        ):
             raise ValueError(
-                "Expected cfg.activation to be None, a string, or pair of string | None, "
+                "Expected cfg.activation to be None, a string, or list/tuple of string | None, "
                 f"got: {cfg.activation}"
             )
         self._activation = [None if act is None else get_activation_fn(act) for act in activation]
 
-        hidden_dim = cfg.hidden_dim or cfg.output_dim
-        self._add_child("conv1", cfg.conv.set(input_dim=cfg.input_dim, output_dim=hidden_dim))
-        self._add_child("conv2", cfg.conv.set(input_dim=hidden_dim, output_dim=cfg.output_dim))
-
+        for i in range(1, len(hidden_dim)):
+            self._add_child(
+                f"conv{i}", cfg.conv.set(input_dim=hidden_dim[i - 1], output_dim=hidden_dim[i])
+            )
         if cfg.norm:
-            self._add_child("norm1", cfg.norm.set(input_dim=hidden_dim))
-            self._add_child("norm2", cfg.norm.set(input_dim=cfg.output_dim))
+            for i in range(1, len(hidden_dim)):
+                self._add_child(f"norm{i}", cfg.norm.set(input_dim=hidden_dim[i]))
 
     def output_shape(self, *, input_shape: Sequence[Optional[int]]):
         """Computes the output shape after subsampling.
@@ -90,9 +104,9 @@ def output_shape(self, *, input_shape: Sequence[Optional[int]]):
                 f"input_shape[-1] = {input_shape[-1]} does not match "
                 f"cfg.input_dim = {cfg.input_dim}."
             )
-        conv1_shape = self.conv1.output_shape(input_shape=input_shape)
-        conv2_shape = self.conv2.output_shape(input_shape=conv1_shape)
-        return conv2_shape
+        for i in range(1, self.num_layers + 1):
+            input_shape = self._children[f"conv{i}"].output_shape(input_shape=input_shape)
+        return input_shape
 
     def forward(self, inputs: Tensor, *, segment_ids: Tensor) -> dict[str, Tensor]:
         """Subsamples the speech.
@@ -112,20 +126,14 @@ def forward(self, inputs: Tensor, *, segment_ids: Tensor) -> dict[str, Tensor]:
         self._add_activation_summary(
             name="subsampler_inputs", activations=inputs, activation_paddings=paddings
         )
-        x, paddings = self.conv1(inputs, paddings=paddings)
-        segment_ids = self.conv1.conv_paddings(segment_ids)
-        if cfg.norm:
-            x = self.norm1(x, segment_ids=segment_ids)
-        if self._activation[0]:
-            x = self._activation[0](x)
-
-        x, paddings = self.conv2(x, paddings=paddings)
-        segment_ids = self.conv2.conv_paddings(segment_ids)
-        if cfg.norm:
-            x = self.norm2(x, segment_ids=segment_ids)
-        if self._activation[1]:
-            x = self._activation[1](x)
-
+        x = inputs
+        for i in range(1, self.num_layers + 1):
+            x, paddings = self._children[f"conv{i}"](x, paddings=paddings)
+            segment_ids = self._children[f"conv{i}"].conv_paddings(segment_ids)
+            if cfg.norm:
+                x = self._children[f"norm{i}"](x, segment_ids=segment_ids)
+            if self._activation[i - 1]:
+                x = self._activation[i - 1](x)
         self._add_activation_summary(
             name="subsampler_outputs", activations=x, activation_paddings=paddings
         )
diff --git a/axlearn/audio/subsamplers_test.py b/axlearn/audio/subsamplers_test.py
@@ -21,8 +21,11 @@ class ConvSubSamplerTest(TestCase):
     """Tests ConvSubSampler."""
 
     @parameterized.parameters(
-        dict(activation=("nn.tanh", "nn.relu", "nn.silu"), expected=ValueError("pair of string")),
-        dict(activation=("nn.tanh",), expected=ValueError("pair of string")),
+        dict(
+            activation=("nn.tanh", "nn.relu", "nn.silu"),
+            expected=ValueError("list/tuple of string"),
+        ),
+        dict(activation=("nn.tanh",), expected=ValueError("list/tuple of string")),
         dict(activation="nn.tanh"),  # Single value is broadcasted.
         dict(activation=("nn.tanh", None)),  # Some of the values can be None.
         dict(activation=(None, None)),  # Some of the values can be None.
@@ -189,6 +192,86 @@ def test_segment_ids(
         self.assertEqual(tuple(subsampled_shape), outputs["outputs"].shape)
         self.assertEqual(tuple(subsampled_shape)[:2], outputs["segment_ids"].shape)
 
+    @parameterized.parameters(
+        dict(
+            window=5,
+            stride=2,
+            conv_padding=(1, 1),
+            hidden_dim=[12, 16],
+            output_dim=8,
+            activation=("nn.tanh", None, None),
+        ),
+        dict(
+            window=3,
+            stride=2,
+            conv_padding=(1, 0),
+            hidden_dim=[12, 16, 32],
+            output_dim=8,
+            activation=("nn.tanh", None, None, None),
+        ),
+    )
+    def test_multi_layers(
+        self,
+        window: int,
+        stride: int,
+        conv_padding: tuple[int, int],
+        output_dim: int,
+        hidden_dim: Optional[int] = None,
+        activation: Optional[Union[Optional[str], tuple[Optional[str], Optional[str]]]] = None,
+    ):
+        """Tests parameter shapes and the output shape with more than two conv layers."""
+        batch_size, num_frames, num_filters, input_dim = 4, 10, 80, 1
+        cfg = ConvSubSampler.default_config().set(
+            input_dim=input_dim,
+            output_dim=output_dim,
+            hidden_dim=hidden_dim,
+            activation=activation,
+        )
+        cfg.conv.window = (window, window)
+        cfg.conv.strides = (stride, stride)
+        cfg.conv.padding = (conv_padding, conv_padding)
+        cfg.norm = BatchNorm.default_config()
+
+        # Initialize layer parameters.
+        layer = cfg.set(name="test").instantiate(parent=None)
+        prng_key = jax.random.PRNGKey(123)
+        prng_key, init_key, data_key = jax.random.split(prng_key, num=3)
+        layer_params = layer.initialize_parameters_recursively(init_key)
+
+        hidden_dim = [cfg.input_dim] + hidden_dim + [cfg.output_dim]
+        self.assertEqual(
+            {
+                f"conv{i + 1}": dict(
+                    weight=(window, window, hidden_dim[i], hidden_dim[i + 1]),
+                    bias=(hidden_dim[i + 1],),
+                )
+                for i in range(len(hidden_dim) - 1)
+            }
+            | {
+                f"norm{i + 1}": dict(
+                    bias=(hidden_dim[i + 1],),
+                    moving_mean=(hidden_dim[i + 1],),
+                    moving_variance=(hidden_dim[i + 1],),
+                    scale=(hidden_dim[i + 1],),
+                )
+                for i in range(len(hidden_dim) - 1)
+            },
+            utils.shapes(layer_params),
+        )
+
+        inputs_shape = [batch_size, num_frames, num_filters, input_dim]
+        inputs = jax.random.normal(key=data_key, shape=inputs_shape) * 10.0
+        segment_ids = jnp.ones([batch_size, num_frames])
+        outputs, _ = F(
+            layer,
+            inputs=dict(inputs=inputs, segment_ids=segment_ids),
+            is_training=True,
+            prng_key=prng_key,
+            state=layer_params,
+        )
+        expected_shape = layer.output_shape(input_shape=tuple(inputs.shape))
+        self.assertEqual(tuple(expected_shape), outputs["outputs"].shape)
+
     @parameterized.parameters(jnp.float32, jnp.bfloat16)
     def test_activation_summaries(self, dtype):
         """Tests that activation summaries behave as expected."""
diff --git a/axlearn/common/conformer.py b/axlearn/common/conformer.py