Skip to content

Commit bb23b56

Browse files
EliEli
authored and committed
Changed boundary NaN behavior in blend, added new implementation of rhistinterp and added NOAA-style high-low labeling to tidalhl.py
1 parent 21c8370 commit bb23b56

4 files changed

Lines changed: 680 additions & 261 deletions

File tree

tests/test_rhistinterp.py

Lines changed: 54 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,54 @@
1+
import sys
2+
import types
3+
4+
import numpy as np
5+
import pandas as pd
6+
7+
# Provide a NumPy stand-in for the ``_monotonic_spline`` module required by
# the legacy interpolate.py import, but only when the real module cannot be
# imported.
#
# The real ``vtools`` package must never be replaced here: a bare
# ``types.ModuleType`` has no ``__path__``, so registering one under "vtools"
# or "vtools.functions" makes the submodule imports below fail with
# "... is not a package" whenever vtools has not already been imported.
try:
    import vtools.functions._monotonic_spline  # noqa: F401
except ImportError:
    mono_mod = types.ModuleType("vtools.functions._monotonic_spline")
    # Plain linear interpolation is used in place of the monotonic spline.
    mono_mod._monotonic_spline = lambda x, y, xnew: np.interp(xnew, x, y)
    # sys.modules is consulted by full dotted name before any finder runs, so
    # this entry satisfies later imports of the missing module.
    sys.modules["vtools.functions._monotonic_spline"] = mono_mod
15+
16+
from vtools.functions.rhistinterp import find_runs, rhistinterp
17+
from vtools.functions.interpolate import rhistinterp as legacy_rhistinterp
18+
19+
20+
def test_find_runs_empty():
    """A zero-length boolean mask contains no runs."""
    no_flags = np.array([], dtype=bool)
    runs = find_runs(no_flags)
    assert runs == []
23+
24+
25+
def test_find_runs():
    """Runs of True are expected as half-open ``(start, stop)`` index pairs."""
    flags = [False, True, True, False, True, False, True, True, True]
    expected = [(1, 3), (4, 5), (6, 9)]
    runs = find_runs(np.array(flags))
    assert runs == expected
28+
29+
30+
def test_rhistinterp_thresh_none_matches_legacy_series():
    """With ``thresh=None`` the new rhistinterp must equal the legacy output.

    Six monthly period values are interpolated onto a daily index and the
    result is compared with the legacy implementation using
    ``assert_series_equal``.
    """
    months = pd.period_range(start='2001-01-01', periods=6, freq='M')
    monthly = pd.Series([1.0, 3.0, 2.5, 4.0, 3.5, 2.0], index=months, name='x')

    # Daily target index from the start of the first period to the rounded,
    # day-floored end of the last period.
    first_stamp = months[0].start_time
    last_stamp = months[-1].end_time.round('s').floor('D')
    dest = pd.date_range(start=first_stamp, end=last_stamp, freq='D')

    shared = dict(p=1.5, lowbound=0.0)
    got = rhistinterp(monthly, dest, thresh=None, **shared)
    want = legacy_rhistinterp(monthly, dest, **shared)

    pd.testing.assert_series_equal(got, want)
39+
40+
41+
def test_rhistinterp_protected_middle_interval_is_constant():
    """A below-threshold middle month must come back as a constant.

    The middle period value (0.02) is below ``thresh=0.05``. Every daily
    value inside that month is expected to equal 0.02, and the whole result
    is expected to stay at or above the lower bound of 0.
    """
    months = pd.period_range(start='2001-01-01', periods=3, freq='M')
    monthly = pd.Series([1.0, 0.02, 1.0], index=months, name='x')

    span_start = months[0].start_time
    span_end = months[-1].end_time.round('s').floor('D')
    dest = pd.date_range(start=span_start, end=span_end, freq='D')

    got = rhistinterp(monthly, dest, p=1.5, lowbound=0.0, thresh=0.05)

    # Select the output stamps in the half-open window [start, end) of the
    # middle period.
    middle = months[1]
    window_open = middle.start_time
    window_close = middle.end_time.round('s')
    stamps = got.index
    inside = (stamps >= window_open) & (stamps < window_close)
    vals = got.loc[inside].to_numpy()

    assert vals.size > 0
    assert np.allclose(vals, 0.02)
    assert np.nanmin(got.to_numpy()) >= 0.0

vtools/functions/blend.py

Lines changed: 7 additions & 247 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
import numpy as np
33
from vtools import to_timedelta
44
from vtools.functions.colname_align import align_inputs_strict
5-
from vtools.data.indexing import resolve_common_freq, regular_index_from_valid_extent
5+
from vtools.data.indexing import resolve_common_freq
66

77
__all__ = ["ts_blend"]
88

@@ -12,7 +12,8 @@ def _blend_output_index(series):
1212
Determine the working index for ts_blend.
1313
1414
Blending requires inputs with a common regular frequency. The working
15-
index is the regular index spanning the valid-data extent of the inputs.
15+
index is the regular index spanning the full union extent of the inputs
16+
(earliest start to latest end across all series).
1617
1718
Parameters
1819
----------
@@ -21,7 +22,7 @@ def _blend_output_index(series):
2122
2223
Returns
2324
-------
24-
pandas.Index
25+
pandas.DatetimeIndex
2526
Regular index on which blending should be performed.
2627
2728
Raises
@@ -40,250 +41,9 @@ def _blend_output_index(series):
4041
"For irregular handoff behavior, use ts_splice."
4142
)
4243

43-
return regular_index_from_valid_extent(series, output_freq)
44-
45-
46-
def _distance_to_gap(hi_col: pd.Series, mode: str = "count") -> pd.Series:
47-
"""
48-
Distance to nearest gap (NaN) in hi_col.
49-
50-
Parameters
51-
----------
52-
hi_col : Series
53-
Higher-priority series.
54-
mode : {'count', 'freq'}
55-
'count' -> distance in # of samples (0 at gaps).
56-
'freq' -> distance as Timedelta, using hi_col.index.freq.
57-
58-
Returns
59-
-------
60-
Series
61-
Same index as hi_col, distance to nearest NaN.
62-
"""
63-
idx = hi_col.index
64-
n = len(idx)
65-
mask = hi_col.isna().to_numpy()
66-
67-
# No gaps -> everything is effectively "far away"
68-
if not mask.any():
69-
dist = np.full(n, np.inf, dtype=float)
70-
return pd.Series(dist, index=idx)
71-
72-
dist = np.full(n, np.inf, dtype=float)
73-
74-
# Forward pass: distance from the last gap
75-
last_gap = None
76-
for i in range(n):
77-
if mask[i]:
78-
dist[i] = 0.0
79-
last_gap = i
80-
elif last_gap is not None:
81-
dist[i] = float(i - last_gap)
82-
83-
# Backward pass: distance from the next gap
84-
last_gap = None
85-
for i in range(n - 1, -1, -1):
86-
if mask[i]:
87-
last_gap = i
88-
elif last_gap is not None:
89-
dist[i] = min(dist[i], float(last_gap - i))
90-
91-
dist_s = pd.Series(dist, index=idx)
92-
93-
if mode == "count":
94-
return dist_s
95-
96-
if mode == "freq":
97-
freq = idx.freq
98-
if freq is None:
99-
raise ValueError(
100-
"Time-based blending requires a regular index with .freq set."
101-
)
102-
# counts * freq → Timedelta
103-
return dist_s * to_timedelta(freq)
104-
105-
raise ValueError("mode must be 'count' or 'freq'")
106-
107-
108-
def _normalize_blend_length(blend_length, index):
109-
"""
110-
Interpret blend_length as sample count or time span.
111-
112-
Returns
113-
-------
114-
(mode, L)
115-
mode : {'count', 'freq'} or None
116-
L : numeric (count) or Timedelta
117-
"""
118-
if blend_length is None:
119-
return None, None
120-
if isinstance(blend_length, str):
121-
blend_length = blend_length.replace("H", "h")
122-
blend_length = blend_length.replace("d", "D")
123-
124-
# Integer: number of samples
125-
if isinstance(blend_length, (int, np.integer)):
126-
if blend_length <= 0:
127-
return None, None
128-
return "count", float(blend_length)
129-
130-
# Timedelta-like: e.g. '2h', '30min'
131-
td = pd.to_timedelta(blend_length)
132-
if not isinstance(index, (pd.DatetimeIndex, pd.PeriodIndex)):
133-
raise ValueError(
134-
"Time-based blend_length requires a DatetimeIndex or PeriodIndex."
135-
)
136-
if index.freq is None:
137-
raise ValueError(
138-
"Time-based blend_length requires a regular index with a .freq attribute."
139-
)
140-
if td <= pd.Timedelta(0):
141-
return None, None
142-
143-
return "freq", td
144-
145-
146-
def _blend_two(
147-
aligned_hi: pd.DataFrame,
148-
aligned_lo: pd.DataFrame,
149-
blend_mode: str,
150-
blend_L,
151-
) -> pd.DataFrame:
152-
"""
153-
Blend a lower-priority DataFrame into a higher-priority DataFrame.
154-
155-
Parameters
156-
----------
157-
aligned_hi, aligned_lo : DataFrame
158-
Same index. Higher priority is 'aligned_hi'.
159-
blend_mode : {'count', 'freq'} or None
160-
blend_L : float or Timedelta
161-
162-
Returns
163-
-------
164-
DataFrame
165-
Blended result.
166-
"""
167-
# No blending requested → just do priority overlay
168-
if blend_mode is None or blend_L is None:
169-
return aligned_hi.combine_first(aligned_lo)
170-
171-
idx = aligned_hi.index
172-
out = aligned_hi.copy()
173-
cols = sorted(set(aligned_hi.columns) | set(aligned_lo.columns))
174-
175-
for col in cols:
176-
hi_col = (
177-
aligned_hi[col]
178-
if col in aligned_hi.columns
179-
else pd.Series(index=idx, dtype=float)
180-
)
181-
lo_col = (
182-
aligned_lo[col]
183-
if col in aligned_lo.columns
184-
else pd.Series(index=idx, dtype=float)
185-
)
186-
187-
hi_nan = hi_col.isna()
188-
lo_nan = lo_col.isna()
189-
190-
# Priority baseline: hi where present, otherwise lo
191-
merged = hi_col.copy()
192-
fill_mask = hi_nan & (~lo_nan)
193-
merged[fill_mask] = lo_col[fill_mask]
194-
195-
# Distance to nearest gap in the *high-priority* series
196-
dist_to_gap = _distance_to_gap(
197-
hi_col,
198-
mode="count" if blend_mode == "count" else "freq",
199-
)
200-
201-
# Candidate points for blending on the shoulders of gaps:
202-
# - hi has data
203-
# - lo has data
204-
near_gap = (~hi_nan) & (~lo_nan)
205-
206-
if blend_mode == "count":
207-
near_gap &= (dist_to_gap > 0) & (dist_to_gap <= blend_L)
208-
if not near_gap.any():
209-
out[col] = merged
210-
continue
211-
d = dist_to_gap[near_gap].astype(float)
212-
t = (blend_L - d) / blend_L
213-
else: # 'freq' mode (Timedelta)
214-
near_gap &= (dist_to_gap > pd.Timedelta(0)) & (dist_to_gap <= blend_L)
215-
if not near_gap.any():
216-
out[col] = merged
217-
continue
218-
d = dist_to_gap[near_gap]
219-
t = 1.0 - (d / blend_L)
220-
221-
t = t.clip(lower=0.0, upper=1.0)
222-
223-
# Kernel: lower-priority gets up to 0.5 weight at the gap edge,
224-
# tapering to 0 at distance >= blend_L.
225-
w_lo = 0.5 * t
226-
w_hi = 1.0 - w_lo
227-
228-
hi_vals = hi_col[near_gap].astype(float)
229-
lo_vals = lo_col[near_gap].astype(float)
230-
231-
blended_vals = (
232-
w_hi.to_numpy() * hi_vals.to_numpy() + w_lo.to_numpy() * lo_vals.to_numpy()
233-
)
234-
235-
# IMPORTANT: use .loc with a boolean mask, not .at, so we never hit
236-
# DataFrame._set_value with a non-scalar index.
237-
merged.loc[near_gap] = blended_vals
238-
239-
out[col] = merged
240-
241-
return out
242-
243-
244-
import pandas as pd
245-
import numpy as np
246-
from vtools import to_timedelta
247-
from vtools.functions.colname_align import align_inputs_strict
248-
from vtools.data.indexing import resolve_common_freq, regular_index_from_valid_extent
249-
250-
__all__ = ["ts_blend"]
251-
252-
253-
def _blend_output_index(series):
254-
"""
255-
Determine the working index for ts_blend.
256-
257-
Blending requires inputs with a common regular frequency. The working
258-
index is the regular index spanning the valid-data extent of the inputs.
259-
260-
Parameters
261-
----------
262-
series : sequence of pandas.Series or pandas.DataFrame
263-
Input time series.
264-
265-
Returns
266-
-------
267-
pandas.Index
268-
Regular index on which blending should be performed.
269-
270-
Raises
271-
------
272-
ValueError
273-
If a common regular frequency cannot be established.
274-
"""
275-
output_freq = resolve_common_freq(
276-
[s.index for s in series],
277-
preserve_freq=True,
278-
)
279-
280-
if output_freq is None:
281-
raise ValueError(
282-
"ts_blend requires inputs with a common regular frequency. "
283-
"For irregular handoff behavior, use ts_splice."
284-
)
285-
286-
return regular_index_from_valid_extent(series, output_freq)
44+
start = min(s.index.min() for s in series)
45+
end = max(s.index.max() for s in series)
46+
return pd.date_range(start, end, freq=output_freq)
28747

28848

28949
def _distance_to_gap(hi_col: pd.Series, mode: str = "count") -> pd.Series:

0 commit comments

Comments
 (0)