Add audio codec model tests

rlangman · rlangman · commit 4ee0cfed2b67 · 2026-04-08T11:28:05.000-07:00
Signed-off-by: Ryan <rlangman@nvidia.com>
diff --git a/examples/tts/conf/audio_codec/acoustic_codec_16000.yaml b/examples/tts/conf/audio_codec/acoustic_codec_16000.yaml
@@ -0,0 +1,193 @@
+name: AudioCodec
+
+max_epochs: ??? # "???" = no default; must be supplied (e.g. max_epochs=1 on the command line)
+# Adjust batch size based on GPU memory
+batch_size: 16
+# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch.
+# If null, then weighted sampling is disabled.
+weighted_sampling_steps_per_epoch: null
+
+# Dataset metadata for each manifest
+# https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/data/vocoder_dataset.py#L39-L41
+train_ds_meta: ???
+val_ds_meta: ???
+
+log_ds_meta: ???
+log_dir: ???
+
+semantic_codec_path: ??? # passed through to model.semantic_codec_path
+
+# Modify these values based on your sample rate
+sample_rate: 16000
+samples_per_frame: 640
+train_n_samples: 12800 # 20 frames of 640 samples
+# The product of down_sample_rates and the product of up_sample_rates should each equal samples_per_frame.
+# For example, 2 * 5 * 8 * 8 = 640.
+down_sample_rates: [2, 5, 8, 8]
+up_sample_rates: [8, 8, 5, 2]
+
+num_codebooks: 8
+encoder_out_dim: 42
+decoder_input_dim: 48 # equals num_codebooks (8) times the 6 entries of model.vector_quantizer.num_levels_per_group
+
+model:
+
+  semantic_codec_path: ${semantic_codec_path}
+
+  max_epochs: ${max_epochs}
+  steps_per_epoch: ${weighted_sampling_steps_per_epoch}
+
+  sample_rate: ${sample_rate}
+  samples_per_frame: ${samples_per_frame}
+
+  mel_loss_l1_scale: 10.0
+  mel_loss_l2_scale: 0.0
+  stft_loss_scale: 10.0
+  time_domain_loss_scale: 0.0
+  commit_loss_scale: 0.0
+
+  # How often the discriminator is updated: disc_updates_per_period updates in every disc_update_period training steps.
+  # For example, 1 and 2 update the discriminator half of the time (1 update for every 2 batches).
+  disc_updates_per_period: 1
+  disc_update_period: 2
+
+  # All resolutions for mel reconstruction loss, ordered [num_fft, hop_length, window_length]
+  loss_resolutions: [
+    [32, 8, 32], [64, 16, 64], [128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024]
+  ]
+  mel_loss_dims: [5, 10, 20, 40, 80, 160] # 6 values, matching the 6 loss_resolutions entries
+  mel_loss_log_guard: 1.0
+  stft_loss_log_guard: 1.0
+  feature_loss_type: absolute
+
+  train_ds:
+    dataset:
+      _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset
+      dataset_meta: ${train_ds_meta}
+      weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch}
+      sample_rate: ${sample_rate}
+      n_samples: ${train_n_samples} # top-level train_n_samples (12800 samples = 0.8 s at 16000 Hz)
+      min_duration: 0.4 # seconds
+      max_duration: null
+
+    dataloader_params:
+      batch_size: ${batch_size}
+      drop_last: true
+      num_workers: 4
+
+  validation_ds:
+    dataset:
+      _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset
+      sample_rate: ${sample_rate}
+      n_samples: null # unlike train_ds, no fixed sample count is set for validation
+      min_duration: null
+      max_duration: null
+      trunc_duration: 10.0 # Only use the first 10 seconds of audio for computing validation loss
+      dataset_meta: ${val_ds_meta}
+
+    dataloader_params:
+      batch_size: 4 # fixed here; not tied to the top-level batch_size
+      num_workers: 2
+
+  # Configures how audio samples are generated and saved during training.
+  # Remove this section to disable logging.
+  log_config:
+    log_dir: ${log_dir}
+    log_epochs: [10, 50] # NOTE(review): presumably extra epochs to log at, besides epoch_frequency - confirm
+    epoch_frequency: 100
+    log_tensorboard: false
+    log_wandb: false
+
+    generators:
+      - _target_: nemo.collections.tts.parts.utils.callbacks.AudioCodecArtifactGenerator
+        log_audio: true # the only artifact flag enabled for this generator
+        log_encoding: false
+        log_dequantized: false
+
+    dataset: # same settings as model.validation_ds.dataset except for dataset_meta
+      _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset
+      sample_rate: ${sample_rate}
+      n_samples: null
+      min_duration: null
+      max_duration: null
+      trunc_duration: 10.0 # Only log the first 10 seconds of generated audio.
+      dataset_meta: ${log_ds_meta}
+
+    dataloader_params:
+      batch_size: 4
+      num_workers: 2
+
+  audio_encoder:
+    _target_: nemo.collections.tts.modules.audio_codec_modules.HiFiGANEncoder
+    down_sample_rates: ${down_sample_rates}
+    encoded_dim: ${encoder_out_dim} # 42; NOTE(review): 6 short of decoder input_dim (48), presumably filled by the semantic codec - confirm
+    base_channels: 48
+    activation: "lrelu"
+
+  audio_decoder:
+    _target_: nemo.collections.tts.modules.audio_codec_modules.HiFiGANDecoder
+    up_sample_rates: ${up_sample_rates}
+    input_dim: ${decoder_input_dim}
+    base_channels: 768
+    activation: "half_snake"
+    output_activation: "clamp"
+
+  vector_quantizer:
+    _target_: nemo.collections.tts.modules.audio_codec_modules.GroupFiniteScalarQuantizer
+    num_groups: ${num_codebooks}
+    num_levels_per_group: [4, 4, 4, 4, 4, 4] # 6 entries; 8 groups * 6 = 48 = decoder_input_dim
+
+  discriminator:
+    _target_: nemo.collections.tts.modules.audio_codec_modules.Discriminator
+    discriminators:
+      - _target_: nemo.collections.tts.modules.audio_codec_modules.MultiPeriodDiscriminator # no arguments given here
+      - _target_: nemo.collections.tts.modules.audio_codec_modules.MultiResolutionDiscriminatorSTFT
+        resolutions: [[512, 128, 512], [1024, 256, 1024]] # the two largest entries of loss_resolutions
+        stft_bands: [[0.0, 0.1], [0.1, 0.25], [0.25, 0.5], [0.5, 0.75], [0.75, 1.0]]
+
+  generator_loss:
+    _target_: nemo.collections.tts.losses.audio_codec_loss.GeneratorSquaredLoss
+
+  discriminator_loss:
+    _target_: nemo.collections.tts.losses.audio_codec_loss.DiscriminatorSquaredLoss
+
+  optim:
+    _target_: torch.optim.Adam
+    lr: 2.0e-4 # written with a decimal point so YAML 1.1 loaders also read it as a float, not a string
+    betas: [0.8, 0.99]
+
+    sched:
+      name: ExponentialLR
+      gamma: 0.998
+
+trainer:
+  num_nodes: 1
+  devices: -1
+  accelerator: gpu
+  strategy: ddp_find_unused_parameters_true
+  precision: 16
+  max_epochs: ${max_epochs} # same top-level value that model.max_epochs interpolates
+  accumulate_grad_batches: 1
+  enable_checkpointing: false # Provided by exp_manager
+  logger: false # Provided by exp_manager
+  log_every_n_steps: 100
+  check_val_every_n_epoch: 10
+  benchmark: false
+
+exp_manager:
+  exp_dir: null
+  name: ${name} # interpolates the top-level name (AudioCodec)
+  create_tensorboard_logger: false
+  create_wandb_logger: false
+  wandb_logger_kwargs:
+    name: null
+    project: null
+  create_checkpoint_callback: true
+  checkpoint_callback_params:
+    monitor: val_loss
+    mode: min
+    save_top_k: 5
+    save_best_model: true
+    always_save_nemo: true
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: false
diff --git a/tests/collections/tts/models/test_audio_codec.py b/tests/collections/tts/models/test_audio_codec.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pytest
+import torch
+from omegaconf import DictConfig
+
+from nemo.collections.tts.models import AudioCodecModel
+
+
+def create_codec_config():
+    audio_encoder = {
+        'cls': 'nemo.collections.tts.modules.audio_codec_modules.MultiResolutionSTFTEncoder',
+        'params': {
+            'out_dim': 40,
+            'resolutions': [[960, 240, 960], [1920, 480, 1920]],
+            'resolution_filter_list': [256, 512],
+        },
+    }
+    audio_decoder = {
+        'cls': 'nemo.collections.tts.modules.audio_codec_modules.ResNetDecoder',
+        'params': {
+            'input_dim': 40,
+            'input_filters': 512,
+            'n_hidden_layers': 6,
+            'hidden_filters': 512,
+            'pre_up_sample_rates': [],
+            'pre_up_sample_filters': [],
+            'resblock_up_sample_rates': [10, 8, 6],
+            'resblock_up_sample_filters': [256, 128, 32],
+        },
+    }
+    vector_quantizer = {
+        'cls': 'nemo.collections.tts.modules.audio_codec_modules.GroupFiniteScalarQuantizer',
+        'params': {
+            'num_groups': 8,
+            'num_levels_per_group': [4, 4, 4, 4, 4],
+        },
+    }
+    generator_loss = {
+        'cls': 'nemo.collections.tts.losses.audio_codec_loss.GeneratorSquaredLoss',
+    }
+    discriminator_loss = {
+        'cls': 'nemo.collections.tts.losses.audio_codec_loss.DiscriminatorSquaredLoss',
+    }
+
+    model_cfg = DictConfig(
+        {
+            'sample_rate': 24000,
+            'samples_per_frame': 480,
+            'loss_resolutions': [[960, 240, 960], [1920, 480, 1920]],
+            'mel_loss_dims': [160, 320],
+            'commit_loss_scale': 0.0,
+            'audio_encoder': DictConfig(audio_encoder),
+            'audio_decoder': DictConfig(audio_decoder),
+            'vector_quantizer': DictConfig(vector_quantizer),
+            'generator_loss': DictConfig(generator_loss),
+            'discriminator_loss': DictConfig(discriminator_loss),
+        }
+    )
+    return model_cfg
+
+
+@pytest.fixture()
+def codec_model():
+    model_cfg = create_codec_config()
+    codec_model = AudioCodecModel(cfg=model_cfg)
+    return codec_model
+
+
+@pytest.fixture()
+def acoustic_codec_model():
+    semantic_model_cfg = create_codec_config()
+    semantic_model_cfg.vector_quantizer.params.num_groups = 1
+    semantic_model_cfg.audio_encoder.params.out_dim = 5
+    semantic_model_cfg.audio_decoder.params.input_dim = 5
+
+    acoustic_model_cfg = create_codec_config()
+    acoustic_model_cfg.semantic_codec = semantic_model_cfg
+    acoustic_model_cfg.audio_encoder.params.out_dim = 35
+    acoustic_codec_model = AudioCodecModel(cfg=acoustic_model_cfg)
+
+    return acoustic_codec_model
+
+
+class TestAudioCodecModel:
+    @pytest.mark.unit
+    def test_forward(self, codec_model):
+        batch_size = 2
+        audio = torch.randn(size=(batch_size, 20000))
+        audio_len = torch.randint(size=[batch_size], low=10000, high=20000)
+        output_audio, output_audio_len = codec_model.forward(
+            audio=audio, audio_len=audio_len, sample_rate=codec_model.sample_rate
+        )
+        assert output_audio.shape[0] == batch_size
+        assert output_audio.shape[1] == output_audio_len.max()
+
+    @pytest.mark.unit
+    def test_forward_with_acoustic_codec(self, acoustic_codec_model):
+        batch_size = 3
+        audio = torch.randn(size=(batch_size, 20000))
+        audio_len = torch.randint(size=[batch_size], low=10000, high=20000)
+        output_audio, output_audio_len = acoustic_codec_model.forward(
+            audio=audio, audio_len=audio_len, sample_rate=acoustic_codec_model.sample_rate
+        )
+        assert output_audio.shape[0] == batch_size
+        assert output_audio.shape[1] == output_audio_len.max()
+
+    @pytest.mark.unit
+    def test_encode_and_decode(self, codec_model):
+        batch_size = 4
+        audio = torch.randn(size=(batch_size, 20000))
+        audio_len = torch.randint(size=[batch_size], low=10000, high=20000)
+
+        tokens, tokens_len = codec_model.encode(audio=audio, audio_len=audio_len, sample_rate=codec_model.sample_rate)
+        assert tokens.shape[0] == batch_size
+        assert tokens.shape[2] == tokens_len.max()
+
+        output_audio, output_audio_len = codec_model.decode(tokens=tokens, tokens_len=tokens_len)
+        assert output_audio.shape[0] == batch_size
+        assert output_audio.shape[1] == output_audio_len.max()
+
+    @pytest.mark.unit
+    def test_encode_and_decode_with_acoustic_codec(self, acoustic_codec_model):
+        batch_size = 5
+        audio = torch.randn(size=(batch_size, 20000))
+        audio_len = torch.randint(size=[batch_size], low=10000, high=20000)
+
+        tokens, tokens_len = acoustic_codec_model.encode(
+            audio=audio, audio_len=audio_len, sample_rate=acoustic_codec_model.sample_rate
+        )
+        assert tokens.shape[0] == batch_size
+        assert tokens.shape[2] == tokens_len.max()
+
+        output_audio, output_audio_len = acoustic_codec_model.decode(tokens=tokens, tokens_len=tokens_len)
+        assert output_audio.shape[0] == batch_size
+        assert output_audio.shape[1] == output_audio_len.max()
diff --git a/tests/functional_tests/L2_TTS_Fast_dev_runs_AcousticCodec.sh b/tests/functional_tests/L2_TTS_Fast_dev_runs_AcousticCodec.sh
@@ -0,0 +1,33 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/audio_codec.py \
+    --config-name acoustic_codec_16000.yaml \
+    semantic_codec_path="/home/TestData/tts/TestSemanticCodec.nemo" \
+    +train_ds_meta.an4.manifest_path="/home/TestData/an4_dataset/an4_train_context_v1.json" \
+    +train_ds_meta.an4.audio_dir="/" \
+    +train_ds_meta.an4.sample_weight=1.0 \
+    +val_ds_meta.an4.manifest_path="/home/TestData/an4_dataset/an4_val_context_v1.json" \
+    +val_ds_meta.an4.audio_dir="/" \
+    +log_ds_meta.an4.manifest_path="/home/TestData/an4_dataset/an4_val_context_v1.json" \
+    +log_ds_meta.an4.audio_dir="/" \
+    log_dir="/tmp/audio_codec_training_output" \
+    max_epochs=1 \
+    batch_size=4 \
+    weighted_sampling_steps_per_epoch=10 \
+    +trainer.limit_val_batches=1 \
+    trainer.devices="[0]" \
+    trainer.strategy=auto \
+    model.train_ds.dataloader_params.num_workers=0 \
+    model.validation_ds.dataloader_params.num_workers=0 \
+    '~trainer.check_val_every_n_epoch' # '~' removes the key; quoted so the shell never attempts tilde expansion
diff --git a/tests/functional_tests/L2_TTS_Fast_dev_runs_AudioCodec.sh b/tests/functional_tests/L2_TTS_Fast_dev_runs_AudioCodec.sh