@@ -78,13 +78,6 @@ def transform(self, modality, aggregation=None):
7878 return transformed_modality
7979
8080 def get_output_stats (self , input_stats ) -> RepresentationStats :
81- """
82- Estimate output shape of Spectral features.
83-
84- We compute 4 spectral feature sequences (centroid, bandwidth,
85- rolloff, flatness), each over frames of length ``hop_length``.
86- The resulting tensors have shape (num_frames, 4).
87- """
8881 num_instances = getattr (input_stats , "num_instances" , 0 )
8982
9083 # Try to infer signal length from stats
@@ -103,6 +96,13 @@ def get_output_stats(self, input_stats) -> RepresentationStats:
10396
10497 return RepresentationStats (num_instances , (num_frames , 4 ))
10598
def estimate_peak_memory_bytes(self, input_stats) -> dict:
    """Report the estimated peak memory use of this representation.

    Placeholder implementation: ``input_stats`` is not inspected and
    both figures are reported as zero.

    Returns:
        A new dict with the keys ``"cpu_peak_bytes"`` and
        ``"gpu_peak_bytes"``, each mapped to ``0``.
    """
    # TODO: replace the zero placeholders with a real estimate.
    peak_bytes = dict.fromkeys(("cpu_peak_bytes", "gpu_peak_bytes"), 0)
    return peak_bytes
105+
106106
107107@register_representation (ModalityType .AUDIO )
108108class ZeroCrossing (UnimodalRepresentation ):
@@ -130,13 +130,6 @@ def transform(self, modality, aggregation=None):
130130 return transformed_modality
131131
132132 def get_output_stats (self , input_stats ) -> RepresentationStats :
133- """
134- Estimate output shape of ZeroCrossing features.
135-
136- ``librosa.feature.zero_crossing_rate`` returns an array of shape
137- (1, num_frames), so each instance is treated as a sequence of
138- scalar features over frames.
139- """
140133 num_instances = getattr (input_stats , "num_instances" , 0 )
141134
142135 if hasattr (input_stats , "max_length" ):
@@ -152,9 +145,15 @@ def get_output_stats(self, input_stats) -> RepresentationStats:
152145 num_frames = 1 + max (int ((signal_length - 1 ) // self .hop_length ), 0 )
153146 num_frames = max (int (num_frames ), 1 )
154147
155- # shape (num_frames, 1): one scalar feature per frame
156148 return RepresentationStats (num_instances , (num_frames , 1 ))
157149
def estimate_peak_memory_bytes(self, input_stats) -> dict:
    """Report the estimated peak memory use of the ZeroCrossing transform.

    Placeholder implementation: ``input_stats`` is ignored and zero is
    reported for both the CPU and the GPU figure.

    Returns:
        A new dict with the keys ``"cpu_peak_bytes"`` and
        ``"gpu_peak_bytes"``, each mapped to ``0``.
    """
    # TODO: replace the zero placeholders with a real estimate.
    no_estimate = 0
    return dict(cpu_peak_bytes=no_estimate, gpu_peak_bytes=no_estimate)
156+
158157
159158@register_representation (ModalityType .AUDIO )
160159class RMSE (UnimodalRepresentation ):
@@ -183,12 +182,6 @@ def transform(self, modality, aggregation=None):
183182 return transformed_modality
184183
185184 def get_output_stats (self , input_stats ) -> RepresentationStats :
186- """
187- Estimate output shape of RMSE features.
188-
189- ``librosa.feature.rms`` returns an array of shape (1, num_frames),
190- so each instance is a sequence of scalar RMS values over frames.
191- """
192185 num_instances = getattr (input_stats , "num_instances" , 0 )
193186
194187 if hasattr (input_stats , "max_length" ):
@@ -201,12 +194,18 @@ def get_output_stats(self, input_stats) -> RepresentationStats:
201194 if signal_length <= 0 :
202195 num_frames = 1
203196 else :
204- # librosa.rms uses frame_length and hop_length; approximate
205197 num_frames = 1 + max (int ((signal_length - 1 ) // self .hop_length ), 0 )
206198 num_frames = max (int (num_frames ), 1 )
207199
208200 return RepresentationStats (num_instances , (num_frames , 1 ))
209201
def estimate_peak_memory_bytes(self, input_stats) -> dict:
    """Report the estimated peak memory use of the RMSE transform.

    Placeholder implementation: ``input_stats`` is ignored and zero is
    reported for both the CPU and the GPU figure.

    Returns:
        A new dict with the keys ``"cpu_peak_bytes"`` and
        ``"gpu_peak_bytes"``, each mapped to ``0``.
    """
    # TODO: replace the zero placeholders with a real estimate.
    estimate = {}
    estimate["cpu_peak_bytes"] = 0
    estimate["gpu_peak_bytes"] = 0
    return estimate
208+
210209
211210@register_representation (ModalityType .AUDIO )
212211class Pitch (UnimodalRepresentation ):
@@ -253,3 +252,10 @@ def get_output_stats(self, input_stats) -> RepresentationStats:
253252 num_frames = max (int (num_frames ), 1 )
254253
255254 return RepresentationStats (num_instances , (num_frames , 1 ))
255+
def estimate_peak_memory_bytes(self, input_stats) -> dict:
    """Report the estimated peak memory use of the Pitch transform.

    Placeholder implementation: ``input_stats`` is ignored and zero is
    reported for both the CPU and the GPU figure.

    Returns:
        A new dict with the keys ``"cpu_peak_bytes"`` and
        ``"gpu_peak_bytes"``, each mapped to ``0``.
    """
    # TODO: replace the zero placeholders with a real estimate.
    device_keys = ("cpu_peak_bytes", "gpu_peak_bytes")
    return {key: 0 for key in device_keys}
0 commit comments