Skip to content

Commit b0bddc9

Browse files
EliEli
authored and committed
Added transition and improved extrapolation including notebooks and tests. Some improvements to merge notebook and gap visualization.
1 parent 589828a commit b0bddc9

9 files changed

Lines changed: 930 additions & 411 deletions

File tree

docs/doctrees/nbsphinx/notebooks/merge_splice.ipynb

Lines changed: 15 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55
"id": "436d5ccf",
66
"metadata": {},
77
"source": [
8-
"# Understanding `ts_merge` and `ts_splice`\n",
8+
"# Merging and Splicing Time Series\n",
99
"This tutorial demonstrates the usage and difference between `ts_merge` and `ts_splice`, two methods for folding together time series into a combined data structure.\n",
1010
"\n",
1111
"- **`ts_merge`** blends multiple time series together based on priority, filling missing values. It potentially uses all the input series at all timestamps.\n",
1212
"- **`ts_splice`** stitches together time series in sequential time **blocks** without mixing values.\n",
1313
"\n",
14-
"We will describe the effect on regularly sampled series (which have the `freq` attribute) and on irregular. We will also explore the **`names`** argument, which controls how columns are selected or renamed in the merging/splicing process.\n",
14+
"We will describe the effect on regularly sampled series (which have the `freq` attribute) and on irregular ones. We will also explore the **`names`** argument, which controls how columns are selected or renamed in the merging/splicing process. There is a file-level command line tool for this as well in the `dms_datastore` package.\n",
1515
"\n",
1616
"## Prioritized filling on regular series\n",
1717
"Let's begin by showing how `ts_merge` and `ts_splice` fold together two regular but gappy series \n",
@@ -22,95 +22,20 @@
2222
},
2323
{
2424
"cell_type": "code",
25-
"execution_count": 53,
25+
"execution_count": 1,
2626
"id": "e52fb077",
2727
"metadata": {},
2828
"outputs": [
2929
{
30-
"name": "stdout",
31-
"output_type": "stream",
32-
"text": [
33-
"Series 1 (Primary):\n"
34-
]
35-
},
36-
{
37-
"data": {
38-
"text/plain": [
39-
"2023-01-01 1.0\n",
40-
"2023-01-02 NaN\n",
41-
"2023-01-03 3.0\n",
42-
"2023-01-04 NaN\n",
43-
"2023-01-05 5.0\n",
44-
"2023-01-06 6.0\n",
45-
"2023-01-07 NaN\n",
46-
"2023-01-08 8.0\n",
47-
"2023-01-09 9.0\n",
48-
"2023-01-10 10.0\n",
49-
"Freq: D, Name: A, dtype: float64"
50-
]
51-
},
52-
"metadata": {},
53-
"output_type": "display_data"
54-
},
55-
{
56-
"name": "stdout",
57-
"output_type": "stream",
58-
"text": [
59-
"\n",
60-
"Series 2 (Secondary - Fills Gaps):\n"
61-
]
62-
},
63-
{
64-
"data": {
65-
"text/plain": [
66-
"2023-01-01 NaN\n",
67-
"2023-01-02 2.0\n",
68-
"2023-01-03 NaN\n",
69-
"2023-01-04 4.0\n",
70-
"2023-01-05 NaN\n",
71-
"2023-01-06 NaN\n",
72-
"2023-01-07 7.0\n",
73-
"2023-01-08 NaN\n",
74-
"2023-01-09 NaN\n",
75-
"2023-01-10 NaN\n",
76-
"2023-01-11 3.0\n",
77-
"2023-01-12 4.0\n",
78-
"Freq: D, Name: A, dtype: float64"
79-
]
80-
},
81-
"metadata": {},
82-
"output_type": "display_data"
83-
},
84-
{
85-
"name": "stdout",
86-
"output_type": "stream",
87-
"text": [
88-
"\n",
89-
"Series 3 (Tertiary - Fills Gaps):\n"
30+
"ename": "NameError",
31+
"evalue": "name 'pd' is not defined",
32+
"output_type": "error",
33+
"traceback": [
34+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
35+
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
36+
"Cell \u001b[1;32mIn[1], line 4\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# ========================================\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# 1️⃣ Creating Regular Time Series (1D Frequency with Missing Data)\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# ========================================\u001b[39;00m\n\u001b[1;32m----> 4\u001b[0m idx1 \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241m.\u001b[39mdate_range(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m2023-01-01\u001b[39m\u001b[38;5;124m\"\u001b[39m, periods\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10\u001b[39m, freq\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m1D\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 5\u001b[0m idx2 \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mdate_range(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m2023-01-01\u001b[39m\u001b[38;5;124m\"\u001b[39m, periods\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m12\u001b[39m, freq\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m1D\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 6\u001b[0m idx3 \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mdate_range(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m2022-12-31\u001b[39m\u001b[38;5;124m\"\u001b[39m, periods\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m14\u001b[39m, freq\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m1D\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
37+
"\u001b[1;31mNameError\u001b[0m: name 'pd' is not defined"
9038
]
91-
},
92-
{
93-
"data": {
94-
"text/plain": [
95-
"2022-12-31 1000.0\n",
96-
"2023-01-01 1001.0\n",
97-
"2023-01-02 1002.0\n",
98-
"2023-01-03 NaN\n",
99-
"2023-01-04 1004.0\n",
100-
"2023-01-05 NaN\n",
101-
"2023-01-06 NaN\n",
102-
"2023-01-07 1007.0\n",
103-
"2023-01-08 NaN\n",
104-
"2023-01-09 NaN\n",
105-
"2023-01-10 NaN\n",
106-
"2023-01-11 1005.0\n",
107-
"2023-01-12 1006.0\n",
108-
"2023-01-13 1007.0\n",
109-
"Freq: D, Name: A, dtype: float64"
110-
]
111-
},
112-
"metadata": {},
113-
"output_type": "display_data"
11439
}
11540
],
11641
"source": [
@@ -148,7 +73,7 @@
14873
},
14974
{
15075
"cell_type": "code",
151-
"execution_count": 54,
76+
"execution_count": null,
15277
"id": "5dd08914",
15378
"metadata": {},
15479
"outputs": [
@@ -291,7 +216,7 @@
291216
},
292217
{
293218
"cell_type": "code",
294-
"execution_count": 61,
219+
"execution_count": null,
295220
"id": "9a1d0dae",
296221
"metadata": {},
297222
"outputs": [
@@ -429,7 +354,7 @@
429354
},
430355
{
431356
"cell_type": "code",
432-
"execution_count": 63,
357+
"execution_count": null,
433358
"id": "35cfc422",
434359
"metadata": {},
435360
"outputs": [
@@ -750,7 +675,7 @@
750675
},
751676
{
752677
"cell_type": "code",
753-
"execution_count": 62,
678+
"execution_count": null,
754679
"id": "c6949e66",
755680
"metadata": {},
756681
"outputs": [

docsrc/notebooks/transition.ipynb

Lines changed: 178 additions & 0 deletions
Large diffs are not rendered by default.

tests/test_extrapolation.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
2+
import pandas as pd
3+
import numpy as np
4+
import pytest
5+
from pandas.testing import assert_series_equal, assert_frame_equal
6+
7+
import pandas as pd
8+
import numpy as np
9+
import pytest
10+
from pandas.testing import assert_series_equal
11+
from vtools import extrapolate_ts
12+
13+
# Core test cases
14+
def test_constant_forward():
    """Forward extension with method='constant' pads the new tail with `val`."""
    src = pd.Series([1, 2, 3], index=pd.date_range("2020-01-01", periods=3, freq="d"))
    got = extrapolate_ts(src, end="2020-01-05", method="constant", val=10)
    want_index = pd.date_range("2020-01-01", periods=5, freq="d")
    want = pd.Series([1, 2, 3, 10, 10], index=want_index)
    assert_series_equal(got, want)
19+
20+
def test_constant_backward():
    """Backward extension with method='constant' pads the new head with `val`."""
    src = pd.Series([4, 5, 6], index=pd.date_range("2020-01-03", periods=3, freq="d"))
    got = extrapolate_ts(src, start="2020-01-01", method="constant", val=0)
    want_index = pd.date_range("2020-01-01", periods=5, freq="d")
    want = pd.Series([0, 0, 4, 5, 6], index=want_index)
    assert_series_equal(got, want)
25+
26+
def test_taper_forward():
    """Forward taper ramps linearly from the last value down to `val`."""
    src = pd.Series([5], index=pd.date_range("2020-01-01", periods=1, freq="D"))
    got = extrapolate_ts(src, end="2020-01-04", method="taper", val=0)
    want_index = pd.date_range("2020-01-01", periods=4, freq="D")
    want = pd.Series([5, 3.3333, 1.6667, 0], index=want_index)
    # Compare to 4 decimals; the ramp values are irrational fractions.
    assert_series_equal(got.round(4), want.round(4), check_dtype=False)
31+
32+
def test_taper_backward():
    """Backward taper ramps linearly from `val` up to the first value."""
    src = pd.Series([5], index=pd.date_range("2020-01-04", periods=1, freq="D"))
    got = extrapolate_ts(src, start="2020-01-01", method="taper", val=0)
    want_index = pd.date_range("2020-01-01", periods=4, freq="D")
    want = pd.Series([0, 1.6667, 3.3333, 5], index=want_index)
    # Compare to 4 decimals; the ramp values are irrational fractions.
    assert_series_equal(got.round(4), want.round(4), check_dtype=False)
37+
38+
def test_linear_slope_bidirectional():
    """linear_slope extends the fitted slope in both directions at once."""
    src = pd.Series([2, 4], index=pd.date_range("2020-01-02", periods=2, freq="D"))
    got = extrapolate_ts(src, start="2020-01-01", end="2020-01-04", method="linear_slope")
    want_index = pd.date_range("2020-01-01", periods=4, freq="D")
    want = pd.Series([0, 2, 4, 6], index=want_index)
    assert_series_equal(got.round(4), want.round(4), check_dtype=False)
43+
44+
# Contract violations
45+
def test_ffill_before_start_error():
    """ffill cannot fill backward, so a start before the series must raise."""
    src = pd.Series([1, 2, 3], index=pd.date_range("2020-01-03", periods=3, freq="D"))
    with pytest.raises(ValueError, match="ffill.*before start"):
        extrapolate_ts(src, start="2020-01-01", method="ffill")
49+
50+
def test_bfill_after_end_error():
    """bfill cannot fill forward, so an end past the series must raise."""
    src = pd.Series([1, 2, 3], index=pd.date_range("2020-01-01", periods=3, freq="D"))
    with pytest.raises(ValueError, match="bfill.*after end"):
        extrapolate_ts(src, end="2020-01-05", method="bfill")
54+
55+
def test_taper_without_val():
    """method='taper' has no target without `val`, so omitting it must raise."""
    src = pd.Series([1], index=pd.date_range("2020-01-01", periods=1, freq="D"))
    with pytest.raises(ValueError, match="requires 'val'"):
        extrapolate_ts(src, end="2020-01-02", method="taper")
59+
60+
def test_linear_with_val_error():
    """linear_slope derives its values from the data, so passing `val` must raise."""
    src = pd.Series([1, 2], index=pd.date_range("2020-01-01", periods=2, freq="d"))
    with pytest.raises(ValueError, match="does not use 'val'"):
        extrapolate_ts(src, start="2019-12-30", end="2020-01-03", method="linear_slope", val=99)
64+
65+
66+
def test_short_linear_error():
    """A slope needs two points; a single-sample series must raise."""
    src = pd.Series([1], index=pd.date_range("2020-01-01", periods=1, freq="D"))
    with pytest.raises(ValueError, match="2 data points.*required"):
        extrapolate_ts(src, start="2019-12-30", method="linear_slope")
70+
71+
# Regression test from thread
72+
def test_dtype_preservation():
    """Regression: ffill extension must keep the original int64 dtype."""
    src = pd.Series([1, 2], dtype="int64", index=pd.date_range("2020-01-02", periods=2, freq="D"))
    got = extrapolate_ts(src, end="2020-01-04", method="ffill")
    want_index = pd.date_range("2020-01-02", periods=3, freq="D")
    want = pd.Series([1, 2, 2], index=want_index, dtype="int64")
    assert_series_equal(got, want)
77+
78+
def test_fill_preserves_original():
    """Constant fill only touches the extension, never the original samples."""
    base = pd.Series([1, 2, 3], index=pd.date_range("2020-01-01", periods=3, freq="D"))
    extended = extrapolate_ts(base, end="2020-01-05", method="constant", val=5)
    # Original sample untouched; extension carries the fill value.
    assert extended.loc["2020-01-01"] == 1
    assert extended.loc["2020-01-04"] == 5
83+
84+
85+
86+
def generate_series(start, periods, freq, values=None):
    """Build a Series on a regular DatetimeIndex for tests.

    Parameters
    ----------
    start : str or Timestamp
        First timestamp of the index.
    periods : int
        Number of samples.
    freq : str
        Pandas frequency string (e.g. "15min", "h", "d").
    values : array-like, optional
        Data values; defaults to ``0 .. periods - 1``.

    Returns
    -------
    pandas.Series
    """
    if values is None:
        values = np.arange(periods)
    return pd.Series(values, index=pd.date_range(start=start, periods=periods, freq=freq))
91+
92+
93+
def test_taper_reaches_requested_end():
    """Taper extrapolation extends to exactly the requested end timestamp
    for several sampling frequencies.

    Renamed from ``test_taper_across_frequencies``: a second function with
    that exact name appears later in this module and shadowed this one, so
    pytest never collected or ran it. The distinct name makes both tests run.
    """
    freqs = ["15min", "h", "d"]
    for freq in freqs:
        ts = generate_series("2020-01-01", periods=2, freq=freq, values=[10, 20])
        interval = ts.index[1] - ts.index[0]
        end_time = ts.index[-1] + 3 * interval
        result = extrapolate_ts(ts, end=end_time, method="taper", val=0.0)
        # The extension must land exactly on the requested end.
        assert result.index[-1] == end_time
101+
102+
103+
def test_taper_across_frequencies():
    """Taper extension must preserve the input's sampling frequency."""
    by_freq = {}
    for freq in ["15min", "h", "d"]:
        series = generate_series("2020-01-01", periods=2, freq=freq, values=[10, 10])  # Ensure 2+ points
        delta = series.index[1] - series.index[0]
        target_end = series.index[-1] + 3 * delta
        extended = extrapolate_ts(series, end=target_end, method="taper", val=0.0)
        assert extended.index.freqstr.lower() == freq
        by_freq[freq] = extended
112+

vtools/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from vtools.data.vtime import *
1212
from vtools.data.timeseries import *
1313

14+
from vtools.functions.transition import *
1415
from vtools.functions.climatology import *
1516
from vtools.functions.interannual import *
1617
from vtools.functions.filter import *

vtools/data/gap.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,68 @@ def gap_distance(ts, disttype="count", to = "good"):
203203
raise ValueError("invalid input disttype, must be count or freq")
204204

205205

206+
import pandas as pd
207+
import numpy as np
208+
209+
def describe_series_gaps(s: pd.Series, name: str, context: int = 2):
    """
    Print gaps in a single Series s, showing `context` non-null points
    before and after each gap, with an ellipsis marker in between.
    """
    is_missing = s.isna().to_numpy()
    stamps = s.index.to_numpy()

    if not is_missing.any():
        print(f"{name}: no missing values\n")
        return

    # Differencing the 0/1 mask (padded with zeros on both ends) marks a
    # gap opening with +1 and a gap closing with -1.
    edges = np.diff(is_missing.astype(int), prepend=0, append=0)
    gap_starts = np.where(edges == 1)[0]
    gap_ends = np.where(edges == -1)[0] - 1

    n = len(is_missing)
    for gap_no, (lo, hi) in enumerate(zip(gap_starts, gap_ends), 1):
        print(f"\n{name} — gap #{gap_no}:")
        print(f" from {stamps[lo]} to {stamps[hi]} ({hi - lo + 1} samples missing)")

        # Up to `context` valid points immediately before the gap,
        # collected walking backward, then printed oldest-first.
        before = []
        pos = lo - 1
        while pos >= 0 and len(before) < context:
            if not is_missing[pos]:
                before.append(pos)
            pos -= 1
        for p in reversed(before):
            print(f" → {stamps[p]} : {s.iloc[p]}")

        # ellipsis marker between the two context runs
        print(" ... [ missing block ] ...")

        # Up to `context` valid points immediately after the gap, in order.
        after = []
        pos = hi + 1
        while pos < n and len(after) < context:
            if not is_missing[pos]:
                after.append(pos)
            pos += 1
        for p in after:
            print(f" ← {stamps[p]} : {s.iloc[p]}")
        print()
255+
256+
def describe_null(dset, name, context=2):
    """
    If dset is a DataFrame, run describe_series_gaps on each column.
    If it's a Series, just run it once.
    """
    if not isinstance(dset, pd.DataFrame):
        # Single series: report it directly under the given name.
        describe_series_gaps(dset, name, context=context)
        return
    # DataFrame: report each column, qualified as "<name>.<column>".
    for col in dset.columns:
        describe_series_gaps(dset[col], f"{name}.{col}", context=context)
266+
267+
206268

207269
def example_gap():
208270
import numpy as np

0 commit comments

Comments
 (0)