TexasInstruments
diff --git a/‎…es/motor_bearing_fault/config_MSPM0.yaml‎ ‎…e_fault_classification/config_MSPM0.yaml‎tinyml-modelzoo/examples/motor_bearing_fault/config_MSPM0.yaml renamed to tinyml-modelzoo/examples/fan_blade_fault_classification/config_MSPM0.yaml b/‎…es/motor_bearing_fault/config_MSPM0.yaml‎ ‎…e_fault_classification/config_MSPM0.yaml‎tinyml-modelzoo/examples/motor_bearing_fault/config_MSPM0.yaml renamed to tinyml-modelzoo/examples/fan_blade_fault_classification/config_MSPM0.yaml
diff --git a/‎tinyml-modelzoo/examples/google_speech_command/config_MSPM0.yaml‎
Lines changed: 19 additions & 0 deletions b/‎tinyml-modelzoo/examples/google_speech_command/config_MSPM0.yaml‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎tinyml-modelzoo/examples/google_speech_command/generate_dataset.py‎
Lines changed: 132 additions & 0 deletions b/‎tinyml-modelzoo/examples/google_speech_command/generate_dataset.py‎
Lines changed: 132 additions & 0 deletions
diff --git a/‎tinyml-modelzoo/examples/google_speech_command/readme.md‎
Lines changed: 166 additions & 0 deletions b/‎tinyml-modelzoo/examples/google_speech_command/readme.md‎
Lines changed: 166 additions & 0 deletions
diff --git a/‎tinyml-modelzoo/tinyml_modelzoo/device_info/run_info.py‎
Lines changed: 10 additions & 1 deletion b/‎tinyml-modelzoo/tinyml_modelzoo/device_info/run_info.py‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎tinyml-modelzoo/tinyml_modelzoo/models/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎tinyml-modelzoo/tinyml_modelzoo/models/__init__.py‎
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1,19 @@
+common:
+  target_module: audio
+  task_type: audio_classification
+  target_device: MSPM0G5187
+dataset:
+  dataset_name: google_speech_commands_12class
+  input_data_path: https://software-dl.ti.com/C2000/esd/mcu_ai/01_04_00/datasets/google_speech_commands_12class.zip
+data_processing_feature_extraction:
+  feature_extraction_name: GoogleSpeechCommands_MFCC_Default
+training:
+  model_name: DSCNN_NPU
+  batch_size: 64
+  training_epochs: 20
+  num_gpus: 0
+  quantization: 2
+  learning_rate: 0.1
+  weight_decay: 1e-5
+testing: {}
+compilation: {}
@@ -0,0 +1,132 @@
+import os
+import shutil
+from pathlib import Path
+
+import torchaudio
+from scipy.io import wavfile
+from pydub import AudioSegment
+import numpy as np
+from tqdm import tqdm
+
+
+# The ten keyword classes that keep their own label in the output dataset.
+KNOWN_LABELS = [
+    "down", "go", "left", "no", "off", "on",
+    "right", "stop", "up", "yes",
+]
+
+# Full 12-class label set: the keywords plus the catch-all and silence classes.
+FINAL_LABELS = KNOWN_LABELS + ["_unknown_", "_silence_"]
+
+# Name of the raw-dataset folder that holds the background-noise recordings.
+BACKGROUND_NOISE_DIR_NAME = "_background_noise_"
+# Sample rate (Hz): sets the silence-window length and the rate written to
+# the generated silence clips.
+SAMPLE_RATE = 16000
+
+
+def copy_wav(src_path: Path, dst_path: Path):
+    """Copy ``src_path`` to ``dst_path``, creating the destination folder first.
+
+    Args:
+        src_path: Existing file to copy.
+        dst_path: Full path of the copy; missing parent folders are created.
+    """
+    destination_folder = dst_path.parent
+    destination_folder.mkdir(parents=True, exist_ok=True)
+    shutil.copy(src_path, dst_path)
+
+
+def create_silence_samples(background_noise_dir: Path, silence_dir: Path):
+    """Cut every background-noise recording into overlapping one-second clips.
+
+    Each ``*.wav`` file directly inside ``background_noise_dir`` is loaded with
+    pydub, and windows of ``SAMPLE_RATE`` samples are taken every
+    ``SAMPLE_RATE // 2`` samples (50% overlap). Every window is written to
+    ``silence_dir`` as a 16-bit WAV named ``<source stem>_<start offset>.wav``.
+
+    Args:
+        background_noise_dir: Folder containing the source noise recordings.
+        silence_dir: Destination folder; created if it does not exist.
+
+    Returns:
+        The number of clips written.
+    """
+    silence_dir.mkdir(parents=True, exist_ok=True)
+
+    hop = SAMPLE_RATE // 2
+    count = 0
+
+    for wav_path in background_noise_dir.glob("*.wav"):
+        audio = AudioSegment.from_wav(wav_path)
+        # NOTE(review): samples are used as-is, so the source is assumed to be
+        # mono audio recorded at SAMPLE_RATE - confirm before using other inputs.
+        audio_samples = np.array(audio.get_array_of_samples())
+
+        # The "+ 1" keeps the final full window (start == last_start). The
+        # previous exclusive bound dropped it, so a recording of exactly one
+        # second produced no clip at all. Recordings shorter than one second
+        # still produce nothing because the range is empty.
+        last_start = len(audio_samples) - SAMPLE_RATE
+        for start in range(0, last_start + 1, hop):
+            segment = audio_samples[start:start + SAMPLE_RATE]
+            output_path = silence_dir / f"{wav_path.stem}_{start:06d}.wav"
+            wavfile.write(output_path, SAMPLE_RATE, segment.astype(np.int16))
+            count += 1
+
+    return count
+
+
+def prepare_speechcommands_class_folders(root=".", output_dir=None, force=False):
+    """Build a one-folder-per-class copy of Speech Commands v0.02.
+
+    The dataset is downloaded through torchaudio (if needed) into
+    ``<root>/SpeechCommands``. Folders listed in ``KNOWN_LABELS`` are copied
+    under their own name, every other word folder is merged into ``_unknown_``
+    (file names are prefixed with the original label), and ``_silence_`` is
+    filled with clips cut from the background-noise folder.
+
+    Args:
+        root: Directory under which the ``SpeechCommands`` folder lives.
+        output_dir: Destination of the class folders; defaults to
+            ``<root>/SpeechCommands/classes`` when ``None``.
+        force: When the destination already exists, delete and rebuild it
+            instead of returning it untouched.
+
+    Returns:
+        Path of the class-folder dataset.
+    """
+    root = Path(root)
+
+    print("Downloading SpeechCommands if needed...")
+
+    torchaudio.datasets.SPEECHCOMMANDS(
+        root=str(root),
+        url="speech_commands_v0.02",
+        folder_in_archive="SpeechCommands",
+        download=True,
+    )
+    print("Download finished.")
+
+    dataset_dir = root / "SpeechCommands"
+    raw_dir = dataset_dir / "speech_commands_v0.02"
+    output_root = dataset_dir / "classes" if output_dir is None else Path(output_dir)
+
+    # An existing output is reused as-is unless the caller asked for a rebuild.
+    if output_root.exists():
+        if not force:
+            print(f"Output already exists: {output_root}")
+            return output_root
+        shutil.rmtree(output_root)
+
+    for class_name in FINAL_LABELS:
+        (output_root / class_name).mkdir(parents=True, exist_ok=True)
+
+    print("Creating class-folder dataset...")
+
+    copied_count = 0
+    unknown_count = 0
+
+    for source_dir in tqdm(list(raw_dir.iterdir()), desc="Processing labels"):
+        if not source_dir.is_dir():
+            continue
+
+        source_label = source_dir.name
+        # Background noise is not a word class; it is handled separately below.
+        if source_label == BACKGROUND_NOISE_DIR_NAME:
+            continue
+
+        is_keyword = source_label in KNOWN_LABELS
+        class_dir = output_root / (source_label if is_keyword else "_unknown_")
+
+        for wav_path in source_dir.glob("*.wav"):
+            if is_keyword:
+                dst_name = wav_path.name
+            else:
+                # Prefix with the original word so files from different
+                # folders cannot overwrite each other inside _unknown_.
+                dst_name = f"{source_label}_{wav_path.name}"
+                unknown_count += 1
+
+            copy_wav(wav_path, class_dir / dst_name)
+            copied_count += 1
+
+    background_noise_dir = raw_dir / BACKGROUND_NOISE_DIR_NAME
+    if background_noise_dir.exists():
+        print("Creating _silence_ samples from background noise...")
+        silence_count = create_silence_samples(
+            background_noise_dir=background_noise_dir,
+            silence_dir=output_root / "_silence_",
+        )
+    else:
+        silence_count = 0
+
+    print("\nDone.")
+    print(f"Output dataset: {output_root.resolve()}")
+    print(f"Known/unknown wavs copied: {copied_count}")
+    print(f"Unknown-class wavs: {unknown_count}")
+    print(f"Silence wavs created: {silence_count}")
+
+    print("\nFinal structure:")
+    print(f"{output_root}/")
+    for class_name in FINAL_LABELS:
+        wav_total = sum(1 for _ in (output_root / class_name).glob('*.wav'))
+        print(f"  {class_name}/  {wav_total} wavs")
+
+    return output_root
+
+
+if __name__ == "__main__":
+    # Script entry point: use the current directory as the dataset root, the
+    # default output location, and keep any output folder that already exists.
+    prepare_speechcommands_class_folders(root=".", output_dir=None, force=False)
@@ -0,0 +1,166 @@
+ # Google Speech Commands 12-Class Dataset
+
+  A TensorLab-compatible 12-class variant of the Google Speech Commands dataset for keyword spotting and audio classification experiments using TinyML models such as DSCNN.
+
+  ---
+
+  ## Dataset Source
+
+  Downloaded automatically via TorchAudio:
+
+  ```python
+  torchaudio.datasets.SPEECHCOMMANDS(
+      root=root,
+      url="speech_commands_v0.02",
+      folder_in_archive="SpeechCommands",
+      download=True,
+  )
+ ```
+  ## Classes
+
+```
+  ┌────────────────┬──────────────────────────────────────────────────────────┐
+  │      Type      │                          Labels                          │
+  ├────────────────┼──────────────────────────────────────────────────────────┤
+  │ Known keywords │ down, go, left, no, off, on, right, stop, up, yes        │
+  ├────────────────┼──────────────────────────────────────────────────────────┤
+  │ Unknown        │ _unknown_ - all non-keyword words (e.g. bird, cat, tree) │
+  ├────────────────┼──────────────────────────────────────────────────────────┤
+  │ Silence        │ _silence_ - 1-second clips from _background_noise_/      │
+  └────────────────┴──────────────────────────────────────────────────────────┘
+```
+
+  ## Quick Start
+
+  - Install dependencies
+  
+  ```bash
+  python -m pip install torch torchaudio scipy pydub numpy tqdm
+```
+
+  - Generate the dataset
+
+  ```bash
+  python generate_dataset.py
+  ```
+
+  ## Output Structure
+
+```
+  SpeechCommands/
+  ├── speech_commands_v0.02/       # Original downloaded dataset
+  │   ├── down/
+  │   ├── go/
+  │   └── _background_noise_/
+  └── classes/                     # TensorLab-ready dataset
+      ├── down/
+      ├── go/
+      ├── left/
+      ├── no/
+      ├── off/
+      ├── on/
+      ├── right/
+      ├── stop/
+      ├── up/
+      ├── yes/
+      ├── _silence_/
+      └── _unknown_/
+```
+
+  What generate_dataset.py Does:
+
+  1. Downloads Google Speech Commands v0.02 via TorchAudio
+  2. Copies the 10 keyword classes into their own folders
+  3. Maps all other word classes into _unknown_ (prefixing the original label to avoid filename collisions)
+  4. Splits _background_noise_/ audio into 1-second clips for _silence_
+  5. Saves everything under SpeechCommands/classes/
+
+
+
+  ## How _silence_ Samples Are Generated
+
+  Silence samples are not recorded speech - they are synthetic clips cut from the background noise audio files in `_background_noise_/`.
+
+  **Source files:** all `.wav` files inside `SpeechCommands/speech_commands_v0.02/_background_noise_/`
+
+  **Process (`create_silence_samples`):**
+
+  1. Each background noise file is loaded in full via `pydub.AudioSegment`
+  2. Raw PCM samples are extracted as a NumPy array
+  3. The array is sliced into 1-second windows using a sliding loop with 50% overlap.
+  4. Each segment is written to classes/_silence_/ as a 16-bit .wav file
+
+  ## Recommended preset
+
+```
+  GoogleSpeechCommands_MFCC_Default = dict(
+      data_processing_feature_extraction=dict(
+          sampling_rate=16000,
+          audio_duration_ms=1000,
+          audio_feature="MFCC",
+          n_mfcc=10,
+          n_mels=40,
+          frame_length_ms=30,
+          frame_step_ms=20,
+          normalize_audio=True,
+          mono=True,
+          variables=1,
+          feat_ext_transform=["MFCC"],
+          data_proc_transforms=[],
+      ),
+      common=dict(
+          task_type=TASK_TYPE_AUDIO_CLASSIFICATION,
+      ),
+  )
+```
+
+
+  ## MFCC Feature Extraction
+
+  MFCCs (Mel Frequency Cepstral Coefficients) compactly represent the frequency characteristics of speech, making them
+  well-suited for keyword spotting.
+
+```
+  ┌───────────────────┬──────────┐
+  │     Parameter     │  Value   │
+  ├───────────────────┼──────────┤
+  │ Sampling rate     │ 16000 Hz │
+  ├───────────────────┼──────────┤
+  │ Audio duration    │ 1000 ms  │
+  ├───────────────────┼──────────┤
+  │ Frame length      │ 30 ms    │
+  ├───────────────────┼──────────┤
+  │ Frame step        │ 20 ms    │
+  ├───────────────────┼──────────┤
+  │ MFCC coefficients │ 10       │
+  ├───────────────────┼──────────┤
+  │ Mel bins          │ 40       │
+  └───────────────────┴──────────┘
+```
+  Output feature shape: [N, 1, 49, 10]
+  (batch size × 1 channel × 49 time frames × 10 MFCC coefficients)
+
+ 
+  ## DSCNN Model
+
+  The recommended model is DSCNN (Depthwise Separable Convolutional Neural Network), designed for efficient TinyML
+  inference on the target device's NPU (neural processing unit).
+
+  ### Architecture
+
+```
+  Conv10x4 / stride 2
+  Dropout
+  Depthwise3x3 + Pointwise1x1  ×4
+  Dropout
+  AdaptiveAvgPool
+  Fully Connected (→ 12 classes)
+```
+
+  Filters: 64 | Output classes: 12
+
+  ### Why DSCNN
+
+  A standard convolution performs spatial filtering and channel mixing in one operation. DSCNN splits this into:
+
+  - Depthwise conv - spatial filtering independently per channel
+  - Pointwise conv - 1×1 convolution for channel mixing
+
+  This reduces computation and model size while maintaining strong keyword spotting accuracy, making it suitable for
+  embedded deployment.
+
@@ -1364,7 +1364,16 @@
         'MSPM0G3519': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'},
         'MSPM0G5187': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'}
     },
-
+    'MobileNetV2_58k_NPU': {
+        'MSPM0G3507': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'},
+        'MSPM0G3519': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'},
+        'MSPM0G5187': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'}
+    },     
+    'DSCNN_NPU': {
+        'MSPM0G3507': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'},
+        'MSPM0G3519': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'},
+        'MSPM0G5187': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'}
+    },
     # NPU-Optimized Generic Classification Models
     'CLS_100_NPU': {
         'F280013': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'},
 
@@ -57,6 +57,7 @@
     'forecasting',
     'feature_extraction',
     'image',
+    'audio'
 ]
 
 # Central model registry - built dynamically
Original file line number	Diff line number	Diff line change
`@@ -57,6 +57,7 @@`
`57`	`57`	`'forecasting',`
`58`	`58`	`'feature_extraction',`
`59`	`59`	`'image',`
	`60`	`+ 'audio'`
`60`	`61`	`]`
`61`	`62`
`62`	`63`	`# Central model registry - built dynamically`