Skip to content

Commit 942746f

Browse files
MaelleTtrtMaëlle TORTEROTOTmathieudpnt
authored
Fix aplose2raven (#272)
* Partial fix of raven time related issue for duty cycled data * apply duty cycle corr * fix test_utils * fix aplose2raven and adapt test_utils * ruff fix * fix aplose2raven for det_end in OFF duty cycle phase * rename variables * improve comment section * quick fix * adapt varible names in test * fix - case: detection in last audio --------- Co-authored-by: Maëlle TORTEROTOT <maelle.torterotot@ensta.fr> Co-authored-by: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com>
1 parent df8dea8 commit 942746f

2 files changed

Lines changed: 110 additions & 56 deletions

File tree

src/osekit/utils/formatting_utils.py

Lines changed: 107 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
import numpy as np
44
from pandas import DataFrame, Timedelta, Timestamp
55

6+
67
def aplose2raven(
78
aplose_result: DataFrame,
8-
audio_datetimes: list[Timestamp],
9-
audio_durations: list[float],
9+
list_audio_begin_time: list[Timestamp],
10+
audio_durations: list[Timedelta],
1011
) -> DataFrame:
1112
r"""Format an APLOSE result DataFrame to a Raven result DataFrame.
1213
@@ -19,89 +20,141 @@ def aplose2raven(
1920
aplose_result: Dataframe,
2021
APLOSE formatted result DataFrame.
2122
22-
audio_datetimes: list[pd.Timestamp]
23-
list of tz-aware timestamps from considered audio files.
23+
list_audio_begin_time: list[Timestamp]
24+
list of tz-aware begin timestamps of the considered audio files.
2425
25-
audio_durations: list[float]
26-
list of all considered audio file durations in seconds.
26+
audio_durations: list[Timedelta]
27+
list of all considered audio file durations.
2728
2829
Returns
2930
-------
3031
Raven formatted DataFrame.
3132
3233
Example of use
3334
--------------
34-
aplose_file = Path("path/to/aplose/result/file")
35-
timestamp_list = list(filenames)
36-
duration_list = list(durations)
37-
38-
aplose_result = (
39-
pd.read_csv(aplose_file, parse_dates=["start_datetime", "end_datetime"])
40-
.sort_values("start_datetime")
41-
.reset_index(drop=True)
42-
)
43-
raven_result = aplose2raven(aplose_result, filename_list, duration_list)
35+
>>> from pathlib import Path
36+
>>> from pandas import read_csv
37+
>>> from osekit.core_api.audio_dataset import AudioDataset
38+
>>> from osekit.utils.formatting_utils import aplose2raven
39+
40+
>>> dataset_folder = Path(r"path\to\audio\folder")
41+
>>> dataset = AudioDataset.from_folder(dataset_folder,
42+
>>> strptime_format="strptime_format",
43+
>>> timezone='utc',
44+
>>> )
4445
45-
# export to Raven format: tab-separated files with a txt extension
46-
raven_result.to_csv('path/to/result/file.txt', sep='\t', index=False)
46+
>>> begin_list = sorted([f.begin for f in list(dataset.files)])
47+
>>> duration_list = [f.duration for f in sorted(dataset.files, key=lambda f: f.begin)]
48+
49+
>>> csv = Path(r"path\to\result\csv")
50+
>>> df = read_csv(csv,
51+
>>> parse_dates=["start_datetime", "end_datetime"]
52+
>>> ).sort_values("start_datetime")
53+
>>> .reset_index(drop=True)
54+
55+
>>> df_raven = aplose2raven(df, begin_list, duration_list)
56+
>>> df_raven.to_csv('path/to/result/file.txt', sep='\t', index=False)
4757
4858
"""
49-
# index of the corresponding wav file for each detection
59+
# index of the corresponding audio file for each detection
5060
index_detection = (
51-
np.searchsorted(audio_datetimes, aplose_result["start_datetime"], side="right")
61+
np.searchsorted(list_audio_begin_time,
62+
aplose_result["start_datetime"],
63+
side="right"
64+
)
5265
- 1
5366
)
5467

55-
# Add beg datetime of the wavfile
56-
aplose_result["wav_timestamp"] = [audio_datetimes[i] for i in index_detection]
57-
58-
# time differences between consecutive datetimes and add wav_duration
59-
filename_diff = [td.total_seconds() for td in np.diff(audio_datetimes).tolist()]
60-
adjust = [0]
61-
adjust.extend([t1 - t2 for (t1, t2) in zip(audio_durations[:-1], filename_diff, strict=False)])
62-
cumsum_adjust = list(np.cumsum(adjust))
68+
"""
69+
The following time adjustment is necessary because Raven does not account
70+
for the duty cycle, nor for any potential offset between the end of one
71+
file and the start of the next. To ensure that detection timestamps in
72+
APLOSE format align with the spectrograms displayed by Raven, a correction
73+
of the number of seconds is required, since the software only uses the
74+
elapsed time from the beginning of the first file to generate the bounding boxes.
75+
"""
6376

64-
# adjusted datetimes to match Raven annoying functioning
65-
begin_datetime_adjusted = []
66-
end_datetime_adjusted = []
67-
for (beg_det, end_det, beg_wav, ind) in (zip(aplose_result["start_datetime"], aplose_result["end_datetime"],
68-
aplose_result["wav_timestamp"], index_detection, strict=False)):
77+
# Add the begin time of the audio file corresponding to each detection
78+
aplose_result["wav_timestamp"] = [list_audio_begin_time[i] for i in index_detection]
79+
80+
# Compute the time gaps between consecutive audio file begin times
81+
audio_begin_timegap = list(np.diff(list_audio_begin_time).tolist())
82+
83+
# Adjustment values: difference between each file's duration
84+
# and the gap until the next file.
85+
# (Required to account for potential gaps/overlaps between files)
86+
adjustment_values = [Timedelta(0)]
87+
adjustment_values.extend(
88+
[t1 - t2 for (t1, t2) in zip(audio_durations[:-1],
89+
audio_begin_timegap, strict=False)
90+
]
91+
)
92+
93+
# Cumulative adjustment (as Timedelta), to realign all detection timestamps consistently
94+
cumsum_adjust = list(np.cumsum(adjustment_values))
95+
96+
detection_begin_datetime_adjusted = []
97+
detection_end_datetime_adjusted = []
98+
for i in range(len(aplose_result)):
99+
detection_begin_time = aplose_result["start_datetime"].iloc[i]
100+
detection_end_time = aplose_result["end_datetime"].iloc[i]
101+
audio_begin_time = aplose_result["wav_timestamp"].iloc[i]
102+
ind = index_detection[i]
69103
"""
70-
For duty cycled data, if the aplose_result detections were reshaped (eg : to 60-second duration),
104+
For duty cycled data, if aplose_result detections were reshaped (eg to 60s duration),
71105
the start or end of the detection might virtually be located in an OFF duty cycle phase.
72106
This would cause issues in Raven, because the OFF parts are not represented,
73-
and the detection start will be located on the previous wav file.
74-
The following 'if' conditions apply the appropriate correction to make the Raven box (1)starts or (2) ends
75-
at the appropriate timing in Raven (ie at the begining or end of a wav file).
107+
and the detection start will be located on the previous audio file.
108+
The 2 following 'if' conditions apply the appropriate correction
109+
to make the Raven box (1) start or (2) end
110+
at the appropriate timing in Raven (ie at the beginning or end of an audio file).
76111
"""
77112

78-
if (beg_wav + Timedelta(seconds=audio_durations[ind])) < beg_det < (beg_wav + Timedelta(seconds = filename_diff[ind])):
79-
corr_dur = (audio_datetimes[ind + 1] - beg_det).total_seconds()
80-
begin_datetime_adjusted.append(beg_det + Timedelta(seconds=cumsum_adjust[ind + 1]) + Timedelta(seconds=corr_dur))
81-
end_datetime_adjusted.append(end_det + Timedelta(seconds=cumsum_adjust[ind + 1]))
82-
elif (beg_wav + Timedelta(seconds=audio_durations[ind])) < end_det < (beg_wav + Timedelta(seconds = filename_diff[ind])):
83-
begin_datetime_adjusted.append(
84-
beg_det + Timedelta(seconds=cumsum_adjust[ind])
113+
audio_begin_time_adjusted = audio_begin_time + audio_durations[ind]
114+
115+
if ind < len(audio_begin_timegap):
116+
next_audio_begin_time_adjusted = audio_begin_time + audio_begin_timegap[ind]
117+
else:
118+
next_audio_begin_time_adjusted += audio_durations[ind]
119+
120+
121+
if audio_begin_time_adjusted < detection_begin_time < next_audio_begin_time_adjusted:
122+
correction_duration = (list_audio_begin_time[ind + 1] - detection_begin_time)
123+
detection_begin_datetime_adjusted.append(detection_begin_time
124+
+ cumsum_adjust[ind + 1]
125+
+ correction_duration
126+
)
127+
detection_end_datetime_adjusted.append(detection_end_time
128+
+ cumsum_adjust[ind + 1]
129+
)
130+
elif audio_begin_time_adjusted < detection_end_time < next_audio_begin_time_adjusted:
131+
detection_begin_datetime_adjusted.append(
132+
detection_begin_time + cumsum_adjust[ind]
85133
)
86-
corr_dur = (end_det-beg_det).total_seconds() - ((beg_wav + Timedelta(seconds=audio_durations[ind])) -beg_det).total_seconds()
87-
end_datetime_adjusted.append(end_det + Timedelta(seconds=cumsum_adjust[ind]) - Timedelta(seconds=corr_dur))
134+
correction_duration = ((detection_end_time-detection_begin_time) -
135+
((audio_begin_time + audio_durations[ind])
136+
- detection_begin_time))
137+
detection_end_datetime_adjusted.append(detection_end_time +
138+
cumsum_adjust[ind] -
139+
correction_duration)
88140

89141
else:
90-
# Else, apply normal raven time correction
91-
begin_datetime_adjusted.append(
92-
beg_det + Timedelta(seconds=cumsum_adjust[ind])
142+
# Else, apply normal Raven time correction
143+
detection_begin_datetime_adjusted.append(
144+
detection_begin_time + cumsum_adjust[ind]
93145
)
94-
end_datetime_adjusted.append(
95-
end_det + Timedelta(seconds=cumsum_adjust[ind])
146+
detection_end_datetime_adjusted.append(
147+
detection_end_time + cumsum_adjust[ind]
96148
)
97149

98-
# Convert the datetimes to seconds from the start of first wav (raven format)
150+
# Convert the datetimes to seconds from the start of first audio (raven format)
99151
begin_time_adjusted = [
100-
(d - audio_datetimes[0]).total_seconds() for d in begin_datetime_adjusted
152+
(d - list_audio_begin_time[0]).total_seconds() for d in detection_begin_datetime_adjusted
101153
]
102154
end_time_adjusted = [
103-
(d - audio_datetimes[0]).total_seconds() for d in end_datetime_adjusted
155+
(d - list_audio_begin_time[0]).total_seconds() for d in detection_end_datetime_adjusted
104156
]
157+
105158
# Build corrected Raven selection table
106159
raven_result = DataFrame()
107160
raven_result["Selection"] = list(range(1, len(aplose_result) + 1))

tests/test_utils.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
)
1515
from osekit.utils.formatting_utils import aplose2raven
1616
from osekit.utils.path_utils import move_tree
17+
from pandas import Timedelta
1718

1819

1920
@pytest.fixture
@@ -60,7 +61,7 @@ def audio_timestamps() -> list:
6061

6162
@pytest.fixture
6263
def audio_durations(audio_timestamps: pytest.fixture) -> list:
63-
return [30] * len(audio_timestamps)
64+
return [Timedelta("30s")] * len(audio_timestamps)
6465

6566

6667
@pytest.mark.unit
@@ -71,7 +72,7 @@ def test_aplose2raven(
7172
) -> None:
7273
raven_dataframe = aplose2raven(
7374
aplose_result=aplose_dataframe,
74-
audio_datetimes=audio_timestamps,
75+
list_audio_begin_time=audio_timestamps,
7576
audio_durations=audio_durations,
7677
)
7778

0 commit comments

Comments
 (0)