Skip to content

Commit 7efceae

Browse files
Merge commit '2ba517893461e398715037dbaa8283a73e7a0690'
2 parents 4cc4ff7 + 2ba5178 commit 7efceae

8 files changed

Lines changed: 550 additions & 2 deletions

File tree

tinyml-modelzoo/examples/motor_bearing_fault/config_MSPM0.yaml renamed to tinyml-modelzoo/examples/fan_blade_fault_classification/config_MSPM0.yaml

File renamed without changes.
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Keyword-spotting example: Google Speech Commands (12 classes) with DSCNN_NPU
# on MSPM0G5187.
common:
    target_module: audio
    task_type: audio_classification
    target_device: MSPM0G5187
dataset:
    dataset_name: google_speech_commands_12class
    input_data_path: https://software-dl.ti.com/C2000/esd/mcu_ai/01_04_00/datasets/google_speech_commands_12class.zip
data_processing_feature_extraction:
    feature_extraction_name: GoogleSpeechCommands_MFCC_Default
training:
    model_name: DSCNN_NPU
    batch_size: 64
    training_epochs: 20
    num_gpus: 0
    quantization: 2
    learning_rate: 0.1
    # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
    # resolve a bare `1e-5` to a string instead of a float.
    weight_decay: 1.0e-5
testing: {}
compilation: {}
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import os
2+
import shutil
3+
from pathlib import Path
4+
5+
import torchaudio
6+
from scipy.io import wavfile
7+
from pydub import AudioSegment
8+
import numpy as np
9+
from tqdm import tqdm
10+
11+
12+
# The ten target keywords. Any other word folder in the raw dataset is mapped
# to the "_unknown_" class by prepare_speechcommands_class_folders.
KNOWN_LABELS = [
    "down", "go", "left", "no", "off", "on",
    "right", "stop", "up", "yes",
]

# Every class folder created in the output dataset: keywords + two catch-alls.
FINAL_LABELS = KNOWN_LABELS + ["_unknown_", "_silence_"]

# Folder inside the raw dataset that holds the background-noise recordings.
BACKGROUND_NOISE_DIR_NAME = "_background_noise_"
# Sample rate stamped on generated clips; also the clip length in samples
# (i.e. one second of audio per clip).
SAMPLE_RATE = 16000
21+
22+
23+
def copy_wav(src_path: Path, dst_path: Path):
    """Copy ``src_path`` to ``dst_path``, creating the destination folder first.

    Args:
        src_path: Existing file to copy.
        dst_path: Full destination file path; missing parent directories are
            created, and an existing file at this path is overwritten.
    """
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(src_path, dst_path)
26+
27+
28+
def create_silence_samples(background_noise_dir: Path, silence_dir: Path):
    """Cut every background-noise recording into one-second ``_silence_`` clips.

    Each ``*.wav`` directly inside ``background_noise_dir`` is loaded with
    pydub, normalised to mono / 16-bit / ``SAMPLE_RATE`` and sliced into
    windows of ``SAMPLE_RATE`` samples that advance by half a window (50%
    overlap). Every window is written to ``silence_dir`` as
    ``<source stem>_<start sample, zero-padded to 6 digits>.wav``.

    Args:
        background_noise_dir: Folder containing the source noise ``.wav`` files.
        silence_dir: Destination folder; created if it does not exist.

    Returns:
        The number of clips written.
    """
    silence_dir.mkdir(parents=True, exist_ok=True)

    hop = SAMPLE_RATE // 2
    count = 0

    for wav_path in background_noise_dir.glob("*.wav"):
        audio = AudioSegment.from_wav(wav_path)
        # Clips are written with a SAMPLE_RATE header as int16, and windows are
        # counted in samples, so bring the audio to that exact format first.
        # Files already in that format (expected for the stock dataset -
        # TODO confirm) pass through unchanged.
        audio = (
            audio.set_channels(1)
            .set_frame_rate(SAMPLE_RATE)
            .set_sample_width(2)
        )
        audio_samples = np.array(audio.get_array_of_samples())

        # "+ 1" keeps the last full window when the recording length lines up
        # with the hop (e.g. a recording of exactly SAMPLE_RATE samples).
        last_start_exclusive = len(audio_samples) - SAMPLE_RATE + 1
        for start in range(0, last_start_exclusive, hop):
            segment = audio_samples[start:start + SAMPLE_RATE]
            output_path = silence_dir / f"{wav_path.stem}_{start:06d}.wav"
            wavfile.write(output_path, SAMPLE_RATE, segment.astype(np.int16))
            count += 1

    return count
44+
45+
46+
def prepare_speechcommands_class_folders(root=".", output_dir=None, force=False):
    """Build a 12-class, folder-per-class copy of Speech Commands v0.02.

    The dataset is fetched through ``torchaudio.datasets.SPEECHCOMMANDS`` into
    ``root``. Folders named in ``KNOWN_LABELS`` are copied as-is, every other
    word folder is merged into ``_unknown_`` (file names are prefixed with the
    original label so they cannot collide), and ``_silence_`` is filled with
    clips cut from the background-noise recordings.

    Args:
        root: Directory passed to torchaudio as the dataset root.
        output_dir: Destination for the class folders. Defaults to
            ``<root>/SpeechCommands/classes`` when ``None``.
        force: If the destination already exists, delete and rebuild it when
            true; otherwise leave it untouched and return immediately.

    Returns:
        ``Path`` of the output dataset directory.
    """
    root = Path(root)

    print("Downloading SpeechCommands if needed...")

    # NOTE: this runs before the "output already exists" check below, so the
    # torchaudio dataset constructor is invoked on every call.
    torchaudio.datasets.SPEECHCOMMANDS(
        root=str(root),
        url="speech_commands_v0.02",
        folder_in_archive="SpeechCommands",
        download=True,
    )
    print("Download finished.")
    raw_dir = root / "SpeechCommands" / "speech_commands_v0.02"

    if output_dir is None:
        output_root = root / "SpeechCommands" / "classes"
    else:
        output_root = Path(output_dir)

    if output_root.exists():
        if force:
            shutil.rmtree(output_root)
        else:
            print(f"Output already exists: {output_root}")
            return output_root

    for label in FINAL_LABELS:
        (output_root / label).mkdir(parents=True, exist_ok=True)

    print("Creating class-folder dataset...")

    copied_count = 0
    unknown_count = 0

    # Materialise the listing so tqdm knows the total number of entries.
    for label_dir in tqdm(list(raw_dir.iterdir()), desc="Processing labels"):
        if not label_dir.is_dir():
            continue

        label = label_dir.name

        # Background noise is not a word class; it is turned into _silence_
        # clips further down.
        if label == BACKGROUND_NOISE_DIR_NAME:
            continue

        target_label = label if label in KNOWN_LABELS else "_unknown_"

        for wav_path in label_dir.glob("*.wav"):
            if target_label == "_unknown_":
                # Several source folders land in one output folder, so keep
                # the original label in the file name to avoid collisions.
                dst_name = f"{label}_{wav_path.name}"
                unknown_count += 1
            else:
                dst_name = wav_path.name

            dst_path = output_root / target_label / dst_name
            copy_wav(wav_path, dst_path)
            copied_count += 1

    background_noise_dir = raw_dir / BACKGROUND_NOISE_DIR_NAME
    silence_count = 0

    if background_noise_dir.exists():
        print("Creating _silence_ samples from background noise...")
        silence_count = create_silence_samples(
            background_noise_dir=background_noise_dir,
            silence_dir=output_root / "_silence_",
        )

    print("\nDone.")
    print(f"Output dataset: {output_root.resolve()}")
    print(f"Known/unknown wavs copied: {copied_count}")
    print(f"Unknown-class wavs: {unknown_count}")
    print(f"Silence wavs created: {silence_count}")

    print("\nFinal structure:")
    print(f"{output_root}/")
    for label in FINAL_LABELS:
        count = len(list((output_root / label).glob('*.wav')))
        print(f" {label}/ {count} wavs")

    return output_root
125+
126+
127+
# Script entry point: build the dataset under ./SpeechCommands/classes and keep
# an existing output directory untouched.
if __name__ == "__main__":
    prepare_speechcommands_class_folders(
        root=".",
        output_dir=None,
        force=False,
    )
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# Google Speech Commands 12-Class Dataset
2+
3+
A TensorLab-compatible 12-class variant of the Google Speech Commands dataset for keyword spotting and audio classification experiments using TinyML models such as DSCNN.
4+
---
5+
6+
## Dataset Source
7+
8+
Downloaded automatically via TorchAudio:
9+
10+
```python
11+
torchaudio.datasets.SPEECHCOMMANDS(
12+
root=root,
13+
url="speech_commands_v0.02",
14+
folder_in_archive="SpeechCommands",
15+
download=True,
16+
)
17+
```
18+
## Classes
19+
20+
```
21+
┌────────────────┬──────────────────────────────────────────────────────────┐
22+
│ Type │ Labels │
23+
├────────────────┼──────────────────────────────────────────────────────────┤
24+
│ Known keywords │ down, go, left, no, off, on, right, stop, up, yes │
25+
├────────────────┼──────────────────────────────────────────────────────────┤
26+
│ Unknown │ _unknown_ - all non-keyword words (e.g. bird, cat, tree) │
27+
├────────────────┼──────────────────────────────────────────────────────────┤
28+
│ Silence │ _silence_ - 1-second clips from _background_noise_/ │
29+
└────────────────┴──────────────────────────────────────────────────────────┘
30+
```
31+
32+
## Quick Start
33+
34+
- Install dependencies
35+
36+
```bash
37+
python -m pip install torch torchaudio scipy pydub numpy tqdm
38+
```
39+
40+
- Generate the dataset
41+
42+
```bash
43+
python generate_dataset.py
44+
```
45+
46+
## Output Structure
47+
48+
```
49+
SpeechCommands/
50+
├── speech_commands_v0.02/ # Original downloaded dataset
51+
│ ├── down/
52+
│ ├── go/
53+
│ └── _background_noise_/
54+
└── classes/ # TensorLab-ready dataset
55+
├── down/
56+
├── go/
57+
├── left/
58+
├── no/
59+
├── off/
60+
├── on/
61+
├── right/
62+
├── stop/
63+
├── up/
64+
├── yes/
65+
├── _silence_/
66+
└── _unknown_/
67+
```
68+
69+
What generate_dataset.py Does:
70+
71+
1. Downloads Google Speech Commands v0.02 via TorchAudio
72+
2. Copies the 10 keyword classes into their own folders
73+
3. Maps all other word classes into _unknown_ (prefixing the original label to avoid filename collisions)
74+
4. Splits _background_noise_/ audio into 1-second clips for _silence_
75+
5. Saves everything under SpeechCommands/classes/
76+
77+
78+
79+
## How _silence_ Samples Are Generated
80+
Silence samples are not recorded speech - they are synthetic clips cut from the background noise audio files in `_background_noise_/`.
81+
**Source files:** all `.wav` files inside `SpeechCommands/speech_commands_v0.02/_background_noise_/`
82+
83+
**Process (`create_silence_samples`):**
84+
85+
1. Each background noise file is loaded in full via `pydub.AudioSegment`
86+
2. Raw PCM samples are extracted as a NumPy array
87+
3. The array is sliced into 1-second windows using a sliding loop with 50% overlap.
88+
4. Each segment is written to classes/_silence_/ as a 16-bit .wav file
89+
90+
## Recommended preset
91+
92+
```
93+
GoogleSpeechCommands_MFCC_Default = dict(
94+
data_processing_feature_extraction=dict(
95+
sampling_rate=16000,
96+
audio_duration_ms=1000,
97+
audio_feature="MFCC",
98+
n_mfcc=10,
99+
n_mels=40,
100+
frame_length_ms=30,
101+
frame_step_ms=20,
102+
normalize_audio=True,
103+
mono=True,
104+
variables=1,
105+
feat_ext_transform=["MFCC"],
106+
data_proc_transforms=[],
107+
),
108+
common=dict(
109+
task_type=TASK_TYPE_AUDIO_CLASSIFICATION,
110+
),
111+
)
112+
```
113+
114+
115+
## MFCC Feature Extraction
116+
117+
MFCCs (Mel Frequency Cepstral Coefficients) compactly represent the frequency characteristics of speech, making them
118+
well-suited for keyword spotting.
119+
120+
```
121+
┌───────────────────┬──────────┐
122+
│ Parameter │ Value │
123+
├───────────────────┼──────────┤
124+
│ Sampling rate │ 16000 Hz │
125+
├───────────────────┼──────────┤
126+
│ Audio duration │ 1000 ms │
127+
├───────────────────┼──────────┤
128+
│ Frame length │ 30 ms │
129+
├───────────────────┼──────────┤
130+
│ Frame step │ 20 ms │
131+
├───────────────────┼──────────┤
132+
│ MFCC coefficients │ 10 │
133+
├───────────────────┼──────────┤
134+
│ Mel bins │ 40 │
135+
└───────────────────┴──────────┘
136+
```
137+
Output feature shape: [N, 1, 49, 10]
138+
(batch size × 1 channel × 49 time frames × 10 MFCC coefficients)
139+
140+
141+
## DSCNN Model
142+
143+
The recommended model is DSCNN (Depthwise Separable Convolutional Neural Network), designed for efficient TinyML
144+
inference on our NPU.
145+
146+
Architecture
147+
148+
Conv10x4 / stride 2
149+
Dropout
150+
Depthwise3x3 + Pointwise1x1 ×4
151+
Dropout
152+
AdaptiveAvgPool
153+
Fully Connected (→ 12 classes)
154+
155+
Filters: 64 | Output classes: 12
156+
157+
### Why DSCNN
158+
159+
A standard convolution performs spatial filtering and channel mixing in one operation. DSCNN splits this into:
160+
161+
- Depthwise conv - spatial filtering independently per channel
162+
- Pointwise conv - 1×1 convolution for channel mixing
163+
164+
This reduces computation and model size while maintaining strong keyword spotting accuracy, making it suitable for
165+
embedded deployment.
166+

tinyml-modelzoo/tinyml_modelzoo/device_info/run_info.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1364,7 +1364,16 @@
13641364
'MSPM0G3519': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'},
13651365
'MSPM0G5187': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'}
13661366
},
1367-
1367+
'MobileNetV2_58k_NPU': {
1368+
'MSPM0G3507': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'},
1369+
'MSPM0G3519': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'},
1370+
'MSPM0G5187': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'}
1371+
},
1372+
'DSCNN_NPU': {
1373+
'MSPM0G3507': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'},
1374+
'MSPM0G3519': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'},
1375+
'MSPM0G5187': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'}
1376+
},
13681377
# NPU-Optimized Generic Classification Models
13691378
'CLS_100_NPU': {
13701379
'F280013': {'flash': 'TBD', 'inference_time_us': 'TBD', 'sram': 'TBD'},

tinyml-modelzoo/tinyml_modelzoo/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
'forecasting',
5858
'feature_extraction',
5959
'image',
60+
'audio'
6061
]
6162

6263
# Central model registry - built dynamically

0 commit comments

Comments
 (0)