@@ -136,6 +136,132 @@ def __init__(
136136 self .estimate_f0_and_loudness = estimate_f0_and_loudness
137137
138138
def _emit_base_example(ex, split):
  """Converts one input `tf.train.Example` into a keyed TFDS example dict."""
  beam = tfds.core.lazy_imports.apache_beam
  beam.metrics.Metrics.counter(split, "base-examples").inc()

  feature_map = ex.features.feature

  def text(key):
    # Decode the first bytes entry of a feature as a unicode string.
    return tf.compat.as_text(feature_map[key].bytes_list.value[0])

  def first_int(key):
    # First int64 entry of a feature.
    return feature_map[key].int64_list.value[0]

  note_id = feature_map["note_str"].bytes_list.value[0]
  quality_values = feature_map["qualities"].int64_list.value
  example = {
      "id": note_id,
      "audio": np.array(
          feature_map["audio"].float_list.value, dtype=np.float32
      ),
      "pitch": first_int("pitch"),
      "velocity": first_int("velocity"),
      "instrument": {
          "label": text("instrument_str"),
          "family": text("instrument_family_str"),
          "source": text("instrument_source_str"),
      },
      # Qualities arrive as a positional int64 vector; key them by name.
      "qualities": {
          name: quality_values[idx] for idx, name in enumerate(_QUALITIES)
      },
  }
  return note_id, example
166+
167+
def _in_split(id_ex, split_ids, split):
  """Returns True iff the example belongs to `split`.

  An empty/falsy `split_ids` means "no filter": every example is kept.
  """
  _, example = id_ex
  keep = (not split_ids) or tf.compat.as_text(example["id"]) in split_ids
  if keep:
    beam = tfds.core.lazy_imports.apache_beam
    beam.metrics.Metrics.counter(split, "in-split").inc()
  return keep
175+
176+
def _estimate_f0(id_ex, split):
  """Estimates the fundamental frequency using CREPE and adds it to the example.

  Args:
    id_ex: `(id, example)` tuple, where `example["audio"]` is a 1-D float32
      waveform sampled at `_AUDIO_RATE`.
    split: Split name, used only to label the Beam metrics counter.

  Returns:
    `(id, example)` with a new `"f0"` dict holding float32 `"hz"`, `"midi"`
    and `"confidence"` arrays, one value per `_F0_AND_LOUDNESS_RATE` frame.

  Raises:
    ValueError: If the required end-padding is not a whole number of samples
      (i.e. the audio/frame-rate combination is inconsistent).
  """
  id_, ex = id_ex
  beam = tfds.core.lazy_imports.apache_beam
  beam.metrics.Metrics.counter(split, "estimate-f0").inc()

  audio = ex["audio"]

  # Copied from magenta/ddsp/spectral_ops.py
  # Pad end so that `num_frames = _NUM_SECS * _F0_AND_LOUDNESS_RATE`.
  hop_size = _AUDIO_RATE / _F0_AND_LOUDNESS_RATE
  n_samples = len(audio)
  n_frames = _NUM_SECS * _F0_AND_LOUDNESS_RATE
  n_samples_padded = (n_frames - 1) * hop_size + _CREPE_FRAME_SIZE
  n_padding = n_samples_padded - n_samples
  # Validate explicitly rather than with `assert`, which is stripped when
  # Python runs with optimizations (`-O`).
  if n_padding % 1 != 0:
    raise ValueError(
        "Padding must be a whole number of samples, got %s." % n_padding
    )
  audio = np.pad(audio, (0, int(n_padding)), mode="constant")
  crepe_step_size = 1000 / _F0_AND_LOUDNESS_RATE  # milliseconds

  _, f0_hz, f0_confidence, _ = tfds.core.lazy_imports.crepe.predict(
      audio,
      sr=_AUDIO_RATE,
      viterbi=True,
      step_size=crepe_step_size,
      center=False,
      verbose=0,
  )
  f0_midi = tfds.core.lazy_imports.librosa.core.hz_to_midi(f0_hz)
  # Set -infs introduced by hz_to_midi (log of 0 Hz, i.e. silence) to 0.
  f0_midi[f0_midi == -np.inf] = 0
  # Set nans to 0 in confidence.
  f0_confidence = np.nan_to_num(f0_confidence)
  # Copy before mutating: Beam elements must not be modified in place.
  ex = dict(ex)
  ex["f0"] = {
      "hz": f0_hz.astype(np.float32),
      "midi": f0_midi.astype(np.float32),
      "confidence": f0_confidence.astype(np.float32),
  }
  return id_, ex
216+
217+
def _calc_loudness(id_ex, split):
  """Compute loudness, add to example (ref is white noise, amplitude=1).

  Takes an `(id, example)` tuple, computes an A-weighted log-magnitude
  loudness curve from `example["audio"]` (one value per
  `_F0_AND_LOUDNESS_RATE` frame), and returns `(id, example)` with a new
  `"loudness": {"db": float32 array}` entry. The input example dict is not
  mutated; a shallow copy is returned.
  """
  id_, ex = id_ex
  beam = tfds.core.lazy_imports.apache_beam
  beam.metrics.Metrics.counter(split, "compute-loudness").inc()

  audio = ex["audio"]

  # Copied from magenta/ddsp/spectral_ops.py
  # Get magnitudes.
  hop_size = int(_AUDIO_RATE // _F0_AND_LOUDNESS_RATE)

  # Add padding to the end so the last STFT frame is complete.
  n_samples_initial = int(audio.shape[-1])
  n_frames = int(np.ceil(n_samples_initial / hop_size))
  n_samples_final = (n_frames - 1) * hop_size + _LD_N_FFT
  pad = n_samples_final - n_samples_initial
  audio = np.pad(audio, ((0, pad),), "constant")

  librosa = tfds.core.lazy_imports.librosa
  # Transpose so rows are frames and columns are frequency bins.
  spectra = librosa.stft(
      audio, n_fft=_LD_N_FFT, hop_length=hop_size, center=False
  ).T

  # Compute power: 20*log10 of the magnitude spectrogram.
  amplitude = np.abs(spectra)
  amin = 1e-20  # Avoid log(0) instabilities.
  power_db = np.log10(np.maximum(amin, amplitude))
  power_db *= 20.0

  # Perceptual weighting: add the A-weighting curve per frequency bin.
  frequencies = librosa.fft_frequencies(sr=_AUDIO_RATE, n_fft=_LD_N_FFT)
  a_weighting = librosa.A_weighting(frequencies)[np.newaxis, :]
  loudness = power_db + a_weighting

  # Set dynamic range: shift by the reference level, then clip the floor.
  loudness -= _REF_DB
  loudness = np.maximum(loudness, -_LD_RANGE)

  # Average over frequency bins.
  mean_loudness_db = np.mean(loudness, axis=-1)

  # Copy before mutating: Beam elements must not be modified in place.
  ex = dict(ex)
  ex["loudness"] = {"db": mean_loudness_db.astype(np.float32)}
  return id_, ex
263+
264+
139265class Builder (tfds .core .BeamBasedBuilder ):
140266 """A large-scale and high-quality dataset of annotated musical notes."""
141267
@@ -230,141 +356,21 @@ def _build_pcollection(self, pipeline, tfrecord_dirs, ids, split):
230356 """Build PCollection of examples for split."""
231357 beam = tfds .core .lazy_imports .apache_beam
232358
233- def _emit_base_example (ex ):
234- """Maps an input example to a TFDS example."""
235- beam .metrics .Metrics .counter (split , "base-examples" ).inc ()
236- features = ex .features .feature
237- id_ = features ["note_str" ].bytes_list .value [0 ]
238- return id_ , {
239- "id" : id_ ,
240- "audio" : np .array (
241- features ["audio" ].float_list .value , dtype = np .float32
242- ),
243- "pitch" : features ["pitch" ].int64_list .value [0 ],
244- "velocity" : features ["velocity" ].int64_list .value [0 ],
245- "instrument" : {
246- "label" : tf .compat .as_text (
247- features ["instrument_str" ].bytes_list .value [0 ]
248- ),
249- "family" : tf .compat .as_text (
250- features ["instrument_family_str" ].bytes_list .value [0 ]
251- ),
252- "source" : tf .compat .as_text (
253- features ["instrument_source_str" ].bytes_list .value [0 ]
254- ),
255- },
256- "qualities" : {
257- q : features ["qualities" ].int64_list .value [i ]
258- for (i , q ) in enumerate (_QUALITIES )
259- },
260- }
261-
262- def _in_split (id_ex , split_ids ):
263- unused_id , ex = id_ex
264- if not split_ids or tf .compat .as_text (ex ["id" ]) in split_ids :
265- beam .metrics .Metrics .counter (split , "in-split" ).inc ()
266- return True
267- return False
268-
269- def _estimate_f0 (id_ex ):
270- """Estimate the fundamental frequency using CREPE and add to example."""
271- id_ , ex = id_ex
272- beam .metrics .Metrics .counter (split , "estimate-f0" ).inc ()
273-
274- audio = ex ["audio" ]
275-
276- # Copied from magenta/ddsp/spectral_ops.py
277- # Pad end so that `num_frames = _NUM_SECS * _F0_AND_LOUDNESS_RATE`.
278- hop_size = _AUDIO_RATE / _F0_AND_LOUDNESS_RATE
279- n_samples = len (audio )
280- n_frames = _NUM_SECS * _F0_AND_LOUDNESS_RATE
281- n_samples_padded = (n_frames - 1 ) * hop_size + _CREPE_FRAME_SIZE
282- n_padding = n_samples_padded - n_samples
283- assert n_padding % 1 == 0
284- audio = np .pad (audio , (0 , int (n_padding )), mode = "constant" )
285- crepe_step_size = 1000 / _F0_AND_LOUDNESS_RATE # milliseconds
286-
287- _ , f0_hz , f0_confidence , _ = tfds .core .lazy_imports .crepe .predict (
288- audio ,
289- sr = _AUDIO_RATE ,
290- viterbi = True ,
291- step_size = crepe_step_size ,
292- center = False ,
293- verbose = 0 ,
294- )
295- f0_midi = tfds .core .lazy_imports .librosa .core .hz_to_midi (f0_hz )
296- # Set -infs introduced by hz_to_midi to 0.
297- f0_midi [f0_midi == - np .inf ] = 0
298- # Set nans to 0 in confidence.
299- f0_confidence = np .nan_to_num (f0_confidence )
300- ex = dict (ex )
301- ex ["f0" ] = {
302- "hz" : f0_hz .astype (np .float32 ),
303- "midi" : f0_midi .astype (np .float32 ),
304- "confidence" : f0_confidence .astype (np .float32 ),
305- }
306- return id_ , ex
307-
308- def _calc_loudness (id_ex ):
309- """Compute loudness, add to example (ref is white noise, amplitude=1)."""
310- id_ , ex = id_ex
311- beam .metrics .Metrics .counter (split , "compute-loudness" ).inc ()
312-
313- audio = ex ["audio" ]
314-
315- # Copied from magenta/ddsp/spectral_ops.py
316- # Get magnitudes.
317- hop_size = int (_AUDIO_RATE // _F0_AND_LOUDNESS_RATE )
318-
319- # Add padding to the end
320- n_samples_initial = int (audio .shape [- 1 ])
321- n_frames = int (np .ceil (n_samples_initial / hop_size ))
322- n_samples_final = (n_frames - 1 ) * hop_size + _LD_N_FFT
323- pad = n_samples_final - n_samples_initial
324- audio = np .pad (audio , ((0 , pad ),), "constant" )
325-
326- librosa = tfds .core .lazy_imports .librosa
327- spectra = librosa .stft (
328- audio , n_fft = _LD_N_FFT , hop_length = hop_size , center = False
329- ).T
330-
331- # Compute power
332- amplitude = np .abs (spectra )
333- amin = 1e-20 # Avoid log(0) instabilities.
334- power_db = np .log10 (np .maximum (amin , amplitude ))
335- power_db *= 20.0
336-
337- # Perceptual weighting.
338- frequencies = librosa .fft_frequencies (sr = _AUDIO_RATE , n_fft = _LD_N_FFT )
339- a_weighting = librosa .A_weighting (frequencies )[np .newaxis , :]
340- loudness = power_db + a_weighting
341-
342- # Set dynamic range.
343- loudness -= _REF_DB
344- loudness = np .maximum (loudness , - _LD_RANGE )
345-
346- # Average over frequency bins.
347- mean_loudness_db = np .mean (loudness , axis = - 1 )
348-
349- ex = dict (ex )
350- ex ["loudness" ] = {"db" : mean_loudness_db .astype (np .float32 )}
351- return id_ , ex
352-
353359 examples = (
354360 pipeline
355361 | beam .Create ([os .path .join (dir_ , "*" ) for dir_ in tfrecord_dirs ])
356362 | beam .io .tfrecordio .ReadAllFromTFRecord (
357363 coder = beam .coders .ProtoCoder (tf .train .Example )
358364 )
359- | beam .Map (_emit_base_example )
360- | beam .Filter (_in_split , split_ids = ids )
365+ | beam .Map (_emit_base_example , split = split )
366+ | beam .Filter (_in_split , split_ids = ids , split = split )
361367 )
362368 if self .builder_config .estimate_f0_and_loudness :
363369 examples = (
364370 examples
365371 | beam .Reshuffle ()
366- | beam .Map (_estimate_f0 )
367- | beam .Map (_calc_loudness )
372+ | beam .Map (_estimate_f0 , split = split )
373+ | beam .Map (_calc_loudness , split = split )
368374 )
369375
370376 return examples
0 commit comments