Skip to content

Commit 67fe0e1

Browse files
committed
[UPDATE] Optimize identify anomaly periods algorithm
Previously, we were creating each sliding window one by one, which took forever on large datasets. Now we use a cumulative sum approach that gives identical results but runs way faster on extremely large datasets. Signed-off-by: Kaveh Shahedi <kaveh.shahedi@ericsson.com>
1 parent 4307e98 commit 67fe0e1

1 file changed

Lines changed: 18 additions & 6 deletions

File tree

  • tmll/ml/modules/anomaly_detection/strategies

tmll/ml/modules/anomaly_detection/strategies/base.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
import numpy as np
12
import pandas as pd
23
from typing import List, Tuple
34
from abc import ABC, abstractmethod
45

56
from tmll.common.services.logger import Logger
67

8+
79
class AnomalyDetectionStrategy(ABC):
810
"""
911
Abstract class for anomaly detection strategies.
@@ -37,7 +39,7 @@ def _remove_minimum(data: pd.DataFrame) -> pd.DataFrame:
3739
:rtype: pd.DataFrame
3840
"""
3941
return data[data > data.min()]
40-
42+
4143
@staticmethod
4244
def _calculate_adaptive_window_size(data: pd.DataFrame) -> int:
4345
"""
@@ -63,18 +65,28 @@ def _identify_anomaly_periods(anomalies: pd.DataFrame, threshold: float = 0.9) -
6365
:rtype: List[Tuple[pd.Timestamp, pd.Timestamp]]
6466
"""
6567
window_size = AnomalyDetectionStrategy._calculate_adaptive_window_size(anomalies)
68+
69+
if len(anomalies.columns) > 1:
70+
anomaly_array = anomalies.any(axis=1).values
71+
else:
72+
anomaly_array = anomalies.iloc[:, 0].values
73+
74+
window_means = np.zeros(len(anomalies) - window_size + 1)
75+
cumsum = np.cumsum(np.insert(np.array(anomaly_array), 0, 0))
76+
window_sums = cumsum[window_size:] - cumsum[:-window_size]
77+
window_means = window_sums / window_size
78+
6679
anomaly_periods = []
6780
start = None
6881

69-
for i in range(len(anomalies) - window_size + 1):
70-
window = anomalies.iloc[i:i+window_size]
71-
if window.any(axis=1).mean() >= threshold and start is None:
82+
for i in range(len(window_means)):
83+
if window_means[i] >= threshold and start is None:
7284
start = anomalies.index[i]
73-
elif window.any(axis=1).mean() < threshold and start is not None:
85+
elif window_means[i] < threshold and start is not None:
7486
anomaly_periods.append((start, anomalies.index[i+window_size-1]))
7587
start = None
7688

7789
if start is not None:
7890
anomaly_periods.append((start, anomalies.index[-1]))
7991

80-
return anomaly_periods
92+
return anomaly_periods

0 commit comments

Comments
 (0)