|
| 1 | +import numpy as np |
| 2 | +import pandas as pd |
| 3 | +import numba |
| 4 | +from vtools.functions.filter import cosine_lanczos |
| 5 | + |
| 6 | +__all__ = ["get_tidal_hl", "get_tidal_amplitude", "get_tidal_hl_zerocrossing", "get_tidal_phase_diff"] |
| 7 | + |
def get_smoothed_resampled(
    df, cutoff_period="2h", resample_period="1min", interpolate_method="pchip"
):
    """Resample a datetime-indexed series to a regular grid and smooth it.

    The series is interpolated onto a regular ``resample_period`` grid with
    ``interpolate_method``, gaps in the original data are re-masked as NaN,
    and a cosine-Lanczos filter with ``cutoff_period`` removes
    high-frequency noise.

    Args:
        df (DataFrame): a single-column dataframe indexed by datetime
        cutoff_period (str, optional): cutoff period for the cosine-Lanczos filter. Defaults to '2h'.
        resample_period (str, optional): regular period to resample to. Defaults to '1min'.
        interpolate_method (str, optional): interpolation method for resampling. Defaults to 'pchip'.

    Returns:
        DataFrame: smoothed and resampled dataframe indexed by datetime
    """
    # The bfill pass is used only to locate gaps: wherever bfill leaves NaN,
    # the interpolated values are masked back out so interpolation does not
    # invent data across missing stretches.
    gap_marker = df.resample(resample_period).bfill()
    regular = df.resample(resample_period).interpolate(method=interpolate_method)
    regular[gap_marker.iloc[:, 0].isna()] = np.nan
    return cosine_lanczos(regular, cutoff_period)
| 33 | + |
| 34 | + |
@numba.jit(nopython=True)
def lmax(arr):
    """Local maximum: return the value only when the window is centered on the maximum.

    Args:
        arr (ndarray): values of one rolling window

    Returns:
        float: the center value if it is the window maximum, otherwise NaN
    """
    idx = np.argmax(arr)
    # NOTE(review): len(arr) / 2 is a float, so the equality can only hold for
    # even-length windows; odd-length windows always yield NaN — confirm
    # callers only ever use even window sizes before "fixing" this.
    if idx == len(arr) / 2:
        return arr[idx]
    else:
        # np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
        return np.nan
| 43 | + |
| 44 | + |
@numba.jit(nopython=True)
def lmin(arr):
    """Local minimum: return the value only when the window is centered on the minimum.

    Args:
        arr (ndarray): values of one rolling window

    Returns:
        float: the center value if it is the window minimum, otherwise NaN
    """
    idx = np.argmin(arr)
    # NOTE(review): len(arr) / 2 is a float, so the equality can only hold for
    # even-length windows; odd-length windows always yield NaN — confirm
    # callers only ever use even window sizes before "fixing" this.
    if idx == len(arr) / 2:
        return arr[idx]
    else:
        # np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
        return np.nan
| 53 | + |
| 54 | + |
def periods_per_window(moving_window_size: str, period_str: str) -> int:
    """Count how many sample periods fit in a moving window.

    Args:
        moving_window_size (str): moving window size as a string, e.g. "7h" for 7 hours
        period_str (str): sample period as a string, e.g. "1min" for 1 minute

    Returns:
        int: number of periods in the moving window, truncated to an integer
    """
    window = pd.Timedelta(moving_window_size)
    period = pd.to_timedelta(pd.tseries.frequencies.to_offset(period_str))
    return int(window / period)
| 73 | + |
| 74 | + |
def tidal_highs(df, moving_window_size="7h"):
    """Tidal highs (up to two highs can occur within a 25 hr period).

    Args:
        df (DataFrame): a time series with a regular frequency
        moving_window_size (str, optional): moving window size to look for highs within. Defaults to '7h'.

    Returns:
        DataFrame: an irregular time series of highs (column "max") at the resolution of df.index
    """
    freq = df.index.freqstr
    window_len = periods_per_window(moving_window_size, freq)
    # lmax emits a value only when the window is centered on its maximum;
    # shifting back by roughly half the window re-centers the rolling result
    # onto the timestamp of the actual high.
    highs = (
        df.rolling(moving_window_size, min_periods=window_len)
        .apply(lmax, raw=True)
        .shift(periods=-(window_len // 2 - 1))
        .dropna()
    )
    highs.columns = ["max"]
    return highs
| 95 | + |
| 96 | + |
def tidal_lows(df, moving_window_size="7h"):
    """Tidal lows (up to two lows can occur within a 25 hr period).

    Args:
        df (DataFrame): a time series with a regular frequency
        moving_window_size (str, optional): moving window size to look for lows within. Defaults to '7h'.

    Returns:
        DataFrame: an irregular time series of lows (column "min") at the resolution of df.index
    """
    freq = df.index.freqstr
    window_len = periods_per_window(moving_window_size, freq)
    # lmin emits a value only when the window is centered on its minimum;
    # shifting back by roughly half the window re-centers the rolling result
    # onto the timestamp of the actual low.
    lows = (
        df.rolling(moving_window_size, min_periods=window_len)
        .apply(lmin, raw=True)
        .shift(periods=-(window_len // 2 - 1))
        .dropna()
    )
    lows.columns = ["min"]
    return lows
| 117 | + |
| 118 | + |
def get_tidal_hl(
    df,
    cutoff_period="2h",
    resample_period="1min",
    interpolate_method="pchip",
    moving_window_size="7h",
):
    """Get tidal highs and lows of a single-column time series.

    The series is smoothed and resampled first, then scanned for highs and
    lows with a moving window.

    Args:
        df (DataFrame): a single-column dataframe indexed by datetime
        cutoff_period (str, optional): cutoff period for cosine lanczos filter. Defaults to '2h'.
        resample_period (str, optional): resample to regular period. Defaults to '1min'.
        interpolate_method (str, optional): interpolation for resampling. Defaults to 'pchip'.
        moving_window_size (str, optional): moving window size to look for highs/lows within. Defaults to '7h'.

    Returns:
        tuple of DataFrame: tidal high and tidal low time series
    """
    dfs = get_smoothed_resampled(df, cutoff_period, resample_period, interpolate_method)
    # Bug fix: moving_window_size was accepted but never forwarded, so
    # non-default windows were silently ignored; pass it through explicitly.
    return (
        tidal_highs(dfs, moving_window_size),
        tidal_lows(dfs, moving_window_size),
    )
| 146 | + |
| 147 | + |
get_tidal_hl_rolling = get_tidal_hl  # backward-compatible alias for older callers. #FIXME remove once references are updated
| 149 | + |
| 150 | + |
def get_tidal_amplitude(dfh, dfl):
    """Tidal amplitude given tidal highs and lows.

    Each low is paired with the most recent preceding high (forward-filled),
    and the amplitude is reported at the time of the low.

    Args:
        dfh (DataFrame): tidal highs time series (column "max")
        dfl (DataFrame): tidal lows time series (column "min")

    Returns:
        DataFrame: amplitude time series (column "amplitude"), at the times of
        the low following the high used for each amplitude value
    """
    combined = pd.concat([dfh, dfl], axis=1)
    lows_only = combined[["min"]].dropna()
    highs_carried_forward = combined[["max"]].ffill()
    paired = lows_only.join(highs_carried_forward)
    amplitude = paired["max"] - paired["min"]
    return pd.DataFrame(amplitude, columns=["amplitude"])
| 167 | + |
| 168 | + |
def get_tidal_amplitude_diff(dfamp1, dfamp2, percent_diff=False, tolerance="4h"):
    """Difference of amplitude values matched within +/- ``tolerance``.

    Each dfamp1 time is matched to the nearest dfamp2 time within the
    tolerance window before differencing.

    Args:
        dfamp1 (DataFrame): amplitude time series
        dfamp2 (DataFrame): amplitude time series
        percent_diff (bool, optional): if True report percent difference. Defaults to False.
        tolerance (str, optional): maximum match distance. Defaults to "4h".

    Returns:
        DataFrame: dfamp1-dfamp2, or (dfamp1-dfamp2)/dfamp2*100 when
        percent_diff, for values within +/- tolerance of each other
    """
    matched = pd.merge_asof(
        dfamp1,
        dfamp2,
        left_index=True,
        right_index=True,
        direction="nearest",
        tolerance=pd.Timedelta(tolerance),
    )
    first = matched.iloc[:, 0]
    second = matched.iloc[:, 1]
    diff = first - second
    if percent_diff:
        diff = 100.0 * diff / second
    return pd.DataFrame(diff, columns=["amplitude_diff"])
| 197 | + |
| 198 | + |
def get_phase_diff(df1, df2, tolerance="4h"):
    """Phase difference in minutes between matching event times of two series.

    Each event time in df2 is matched with the nearest event time in df1
    within ``tolerance``; the difference (df2 event time - df1 event time)
    is returned in minutes.

    Args:
        df1 (DataFrame): time-indexed events (e.g. tidal highs or lows)
        df2 (DataFrame): time-indexed events to match against df1
        tolerance (str, optional): maximum match distance. Defaults to "4h".

    Returns:
        Series: (df2 time - df1 time) in minutes, indexed by df2's index;
        NaN where no match lies within the tolerance
    """
    # Bug fix: the original wrote a "time" column directly into df1/df2,
    # mutating the caller's frames as a side effect. assign() returns copies.
    right = df1.assign(time=df1.index)
    left = df2.assign(time=df2.index)
    matched = pd.merge_asof(
        left,
        right,
        left_index=True,
        right_index=True,
        direction="nearest",
        tolerance=pd.Timedelta(tolerance),
    )
    # time_x comes from df2 (left side of the merge), time_y from df1.
    return (matched["time_x"] - matched["time_y"]).dt.total_seconds() / 60
| 211 | + |
| 212 | + |
def get_tidal_phase_diff(dfh2, dfl2, dfh1, dfl1, tolerance="4h"):
    """Calculates the phase difference between series-2 and series-1 tidal highs and lows.

    Scans +/- ``tolerance`` (default 4 hours) around each series-1 event to find
    the matching series-2 high or low, and reports the time offsets in minutes.

    Args:
        dfh2 (DataFrame): time series of tidal highs
        dfl2 (DataFrame): time series of tidal lows
        dfh1 (DataFrame): time series of tidal highs
        dfl1 (DataFrame): time series of tidal lows
        tolerance (str, optional): maximum match distance. Defaults to "4h".

    Returns:
        Series: phase difference in minutes, combining the high-high and
        low-low offsets on one time axis (highs take precedence where both exist)
    """
    high_phase_diff = get_phase_diff(dfh2, dfh1, tolerance)
    low_phase_diff = get_phase_diff(dfl2, dfl1, tolerance)
    # NOTE(review): get_phase_diff appears to return a Series indexed by its
    # second argument's index (dfh1/dfl1 here), so re-wrapping with
    # index=dfh1.index / index=dfl1.index should be a pure relabel — confirm
    # the lengths always match, otherwise this silently misaligns values.
    merged_diff = pd.merge(
        pd.DataFrame(high_phase_diff, index=dfh1.index),
        pd.DataFrame(low_phase_diff, index=dfl1.index),
        how="outer",
        left_index=True,
        right_index=True,
    )
    # Prefer the high-high offset; fall back to the low-low offset at
    # timestamps where no high match exists.
    return merged_diff.iloc[:, 0].fillna(merged_diff.iloc[:, 1])
| 244 | + |
| 245 | + |
def get_tidal_hl_zerocrossing(df, round_to="1min"):
    """Tidal high and low times via zero crossings of the first derivative.

    Works for all situations but is not robust in the face of noise and
    perturbations in the signal.
    """
    crossings, _ = zerocross(df)
    if round_to:
        crossings = pd.to_datetime(crossings).round(round_to)
    return crossings
| 256 | + |
| 257 | + |
def zerocross(df):
    """Locate zero crossings of the first derivative of a single-column series.

    The gradient of the series is computed and positions where it changes
    sign are identified; the crossing time is then estimated by linear
    interpolation between the two surrounding samples.

    Args:
        df (DataFrame): single-column dataframe with a DatetimeIndex

    Returns:
        tuple: (zc, indi) where zc holds the interpolated zero-crossing
        times and indi the integer positions (into df.index) just before
        each crossing
    """
    grad = np.gradient(df[df.columns[0]].values)
    # A sign change of the gradient marks a local extremum; also require the
    # following gradient sample to be nonzero so flat stretches are skipped.
    # (Bug fix: the original combined a float diff array with a boolean mask
    # via bitwise &, which raises TypeError on modern NumPy/pandas; comparing
    # the diff against 0 first keeps the intent with valid boolean operands.)
    sign_change = np.diff(np.sign(grad)) != 0
    nonzero_after = grad[1:] != 0
    indi = np.where(sign_change & nonzero_after)[0]
    # Linear interpolation of the crossing time between samples indi and indi+1.
    x = df.index
    dx = x[indi + 1] - x[indi]
    dy = grad[indi + 1] - grad[indi]
    zc = -grad[indi] * (dx / dy) + x[indi]
    return zc, indi
| 274 | + |
| 275 | + |
| 276 | +##---- FUNCTIONS CACHED BELOW THIS LINE PERHAPS TO USE LATER? ---# |
| 277 | + |
| 278 | + |
def where_changed(df):
    """Rows of ``df`` where the first column's value differs from the previous row.

    The first row is always included, since it has no predecessor.

    Args:
        df (DataFrame): dataframe whose first column is scanned for changes

    Returns:
        DataFrame: subset of df rows at which the value changed
    """
    diff = np.diff(df[df.columns[0]].values)
    # diff[i] != 0 means row i+1 differs from row i, hence the +1 shift.
    changed = np.where(diff != 0)[0] + 1
    # Bug fix: the original inserted 0 *before* adding 1, which always
    # selected row 1 and never row 0; insert after the shift so the first
    # row (zero index) is actually kept, as the original comment intended.
    changed = np.insert(changed, 0, 0)
    return df.iloc[changed, :]
| 285 | + |
| 286 | + |
def where_same(dfg, df):
    """Return dfg only where its value equals df's value at the same timestamp.

    i.e. the intersection locations with df.
    """
    side_by_side = pd.concat([dfg, df], axis=1)
    left = side_by_side.iloc[:, 0]
    right = side_by_side.iloc[:, 1]
    return left[left == right]
| 294 | + |
| 295 | + |
def limit_to_indices(df, si, ei):
    """Rows of df whose index lies strictly between si and ei (both exclusive)."""
    after_start = df.index > si
    before_end = df.index < ei
    return df[after_start & before_end]
| 298 | + |
| 299 | + |
def filter_where_na(df, dfb):
    """Remove rows of df at timestamps where dfb has NaN values."""
    reference = dfb.loc[df.index]
    valid_times = reference.dropna().index
    return df.loc[valid_times, :]
0 commit comments