Skip to content

Commit fa1598d

Browse files
EliEli
authored and committed
Gap visualization tool
1 parent 998343b commit fa1598d

1 file changed

Lines changed: 185 additions & 0 deletions

File tree

vtools/data/vis_gap.py

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
2+
3+
import numpy as np
4+
import pandas as pd
5+
import matplotlib.pyplot as plt
6+
import matplotlib.dates as mdates
7+
from datetime import timedelta
8+
9+
10+
# Public API of this module. The dunder spelling is what ``from ... import *``
# honours; a plain ``all`` would only shadow the builtin of the same name.
__all__ = ["plot_missing_data"]
11+
12+
def generate_sample_data():
    """
    Build a demonstration DataFrame of gappy time series.

    The frame has 12 columns (``Series01`` .. ``Series12``) on a 15-minute
    index running from 2010-01-01 through 2012-01-01 inclusive. Values come
    from NumPy's global (unseeded) normal generator; gap placement uses a
    generator seeded with 0, so the pattern of gaps is repeatable.

    Gaps written, in order:
    - per series, 20 long gaps (1 to 200 samples) interleaved with 20 short
      gaps (1 to 4 samples) at random positions;
    - in 6 randomly chosen series, a trailing gap of 1 to 192 samples
      (up to 2 days);
    - in the first series, the whole first year.

    Finally the column at position 7 is set to 0 everywhere except its last
    400 rows and the column at position 8 is set to 0 everywhere, which
    overwrites any gaps previously placed in those ranges.

    Returns
    -------
    pandas.DataFrame
        The generated frame with columns in ``Series01`` .. ``Series12`` order.
    """
    t_first = pd.Timestamp("2010-01-01")
    t_last = pd.Timestamp("2012-01-01")
    stamps = pd.date_range(t_first, t_last, freq='15min')
    n_rows = len(stamps)
    names = [f"Series{k:02d}" for k in range(1, 13)]

    # Random values for every cell; gaps are punched into this afterwards.
    frame = pd.DataFrame(np.random.randn(n_rows, 12), index=stamps, columns=names)

    # Seeded generator so the gap layout is the same on every run.
    rng = np.random.default_rng(seed=0)

    def punch_gap(col_pos, length_stop):
        # Draw the start first, then the length, and blank that run of rows
        # (clipped at the end of the frame).
        first = rng.integers(0, n_rows - 1)
        length = rng.integers(1, length_stop)
        frame.iloc[first:min(first + length, n_rows), col_pos] = np.nan

    # Scattered gaps: each pass adds one long gap followed by one short gap.
    for col_pos in range(frame.shape[1]):
        for _ in range(20):
            punch_gap(col_pos, 201)  # length 1..200 samples
            punch_gap(col_pos, 5)    # length 1..4 samples

    # Trailing gap (up to 2 days) in 6 randomly selected series.
    trailing_cols = rng.choice(frame.columns, size=6, replace=False)
    samples_per_day = int((24 * 60) / 15)
    for name in trailing_cols:
        tail_len = rng.integers(1, 2 * samples_per_day + 1)
        frame.iloc[-tail_len:, frame.columns.get_loc(name)] = np.nan

    # The first series has no data at all during the first year.
    first_year = stamps < (t_first + pd.DateOffset(years=1))
    frame.loc[first_year, names[0]] = np.nan

    # Constant-zero overwrites: position 7 keeps only its last 400 rows,
    # position 8 becomes entirely zero (and therefore gap-free).
    frame.iloc[:n_rows - 400, 7] = 0.
    frame.iloc[:, 8] = 0.
    return frame[names]
59+
60+
def plot_missing_data(df, ax, min_gap_duration, overall_start, overall_end,
                      sample_interval=None):
    """
    Plot missing data onto the provided axis with a given minimum gap duration.

    Each column of ``df`` gets one horizontal lane: a light background bar
    spanning ``overall_start`` .. ``overall_end`` with every run of NaN values
    drawn on top. Runs shorter than ``min_gap_duration`` are widened
    symmetrically (clipped to the overall span) so they stay visible.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect; the index is used as the time axis.
    ax : matplotlib.axes.Axes
        Axes to draw on. It is cleared first.
    min_gap_duration : datetime.timedelta or pandas.Timedelta
        Smallest width a gap is drawn with.
    overall_start, overall_end : pandas.Timestamp
        Extent of the background bar and clipping bounds for widened gaps.
    sample_interval : pandas.Timedelta, optional
        Spacing between consecutive samples, added to the last missing
        timestamp of a run to get the run's end. Defaults to 15 minutes.

    Notes
    -----
    A gap is coloured as a boundary gap only when the missing run itself
    reaches ``overall_start`` or ``overall_end``; an interior gap that merely
    gets widened up to the edge keeps the ordinary gap colour.
    """
    if sample_interval is None:
        sample_interval = pd.Timedelta(minutes=15)

    overall_start_num = mdates.date2num(overall_start)
    overall_end_num = mdates.date2num(overall_end)

    # Colors for the bars
    overall_color = 'skyblue'
    gap_color = 'orange'
    boundary_gap_color = 'indianred'

    bar_height = 0.8  # thickness for each horizontal bar

    # Clear current content on ax
    ax.cla()

    # Prepare lists for y-ticks and labels with annotations.
    y_ticks = []
    y_labels = []

    # Loop over each series (each column in the DataFrame)
    for i, col in enumerate(df.columns):
        lane = (i - bar_height / 2, bar_height)

        # Draw a light blue bar covering the entire time span for this series.
        ax.broken_barh([(overall_start_num, overall_end_num - overall_start_num)],
                       lane, facecolors=overall_color, alpha=0.6)

        mask = df[col].isna()

        # Collect the spans per colour so each colour needs one draw call.
        spans = {gap_color: [], boundary_gap_color: []}
        for gap_start, gap_end in _missing_runs(mask, sample_interval):
            # Decide on the colour from the real extent, before any widening.
            touches_boundary = (gap_start <= overall_start
                                or gap_end >= overall_end)

            # Expand gap if too short
            actual_gap = gap_end - gap_start
            if actual_gap < min_gap_duration:
                pad = (min_gap_duration - actual_gap) / 2
                gap_start = max(gap_start - pad, overall_start)
                gap_end = min(gap_end + pad, overall_end)

            gap_start_num = mdates.date2num(gap_start)
            gap_end_num = mdates.date2num(gap_end)
            color = boundary_gap_color if touches_boundary else gap_color
            spans[color].append((gap_start_num, gap_end_num - gap_start_num))

        for color, xranges in spans.items():
            if xranges:
                ax.broken_barh(xranges, lane, facecolors=color)

        y_ticks.append(i)
        # Compute percentage of missing data for annotation.
        perc = mask.mean() * 100
        if perc == 0:
            label = f"{col} (0%)"
        elif 0 < perc < 0.01:
            label = f"{col} (<0.01%)"
        else:
            label = f"{col} ({perc:.2f}%)"
        y_labels.append(label)

    # Format axes
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_labels)
    ax.xaxis_date()
    ax.set_xlabel("Time")
    ax.set_title(f"Missing Data Visualization (Min gap = {min_gap_duration})")
    ax.figure.autofmt_xdate()
    ax.figure.canvas.draw_idle()


def _missing_runs(mask, sample_interval):
    """Return ``(start, end)`` for each run of consecutive True values in ``mask``."""
    if not mask.any():
        return []
    # A new run id starts wherever the flag differs from the previous row.
    run_ids = (mask != mask.shift()).cumsum()
    # Keeping only the True rows leaves exactly one group per missing run.
    return [
        (run.index[0], run.index[-1] + sample_interval)
        for _, run in mask[mask].groupby(run_ids[mask])
    ]
138+
139+
def interactive_gap_plot(df):
    """
    Create an interactive plot that updates the minimum gap duration based on the current x-axis view.

    The chart is first drawn with a minimum gap width of 20 hours. Whenever
    the x-limits change, the width is re-derived from the visible span:
    - >=20 years view: min gap = 1 day
    - >=10 years view: min gap = 12 hours
    - Otherwise: min gap = 1 hour
    and the chart is redrawn only if that width differs from the one shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to visualise; the index is used as the time axis and is taken
        to be sampled every 15 minutes when computing the overall end.

    Notes
    -----
    Blocks in ``plt.show()`` until the window is closed.
    """
    # Overall full time range (fixed)
    overall_start = df.index[0]
    overall_end = df.index[-1] + pd.Timedelta(minutes=15)

    # Create figure and initial plot with a default min_gap_duration.
    default_min_gap = timedelta(hours=20)  # starting default
    fig, ax = plt.subplots(figsize=(12, 6))
    plot_missing_data(df, ax, default_min_gap, overall_start, overall_end)

    # Mutable state shared with the callback below.
    state = {"min_gap": default_min_gap, "redrawing": False}

    def on_xlim_change(event_ax):
        # Ignore limit changes caused by our own redraw.
        if state["redrawing"]:
            return

        # Remember the view the user asked for; the redraw clears the axes.
        x_lo, x_hi = event_ax.get_xlim()
        y_view = event_ax.get_ylim()

        # Matplotlib date numbers are expressed in days.
        years_view = abs(x_hi - x_lo) / 365.25

        # Adjust min_gap_duration based on view span
        if years_view >= 20:
            new_min_gap = timedelta(days=1)
        elif years_view >= 10:
            new_min_gap = timedelta(hours=12)
        else:
            new_min_gap = timedelta(hours=1)

        # Nothing to do when the chart already uses this width.
        if new_min_gap == state["min_gap"]:
            return

        state["redrawing"] = True
        try:
            # Redraw the missing data visualization with the new min_gap_duration.
            plot_missing_data(df, ax, new_min_gap, overall_start, overall_end)
            # Put the requested view back without notifying listeners again.
            event_ax.set_xlim(x_lo, x_hi, emit=False)
            event_ax.set_ylim(y_view, emit=False)
            state["min_gap"] = new_min_gap
        finally:
            # Clearing the axes during the redraw discards registered
            # callbacks, so the handler has to be connected again.
            event_ax.callbacks.connect('xlim_changed', on_xlim_change)
            state["redrawing"] = False

    # Connect the x-axis limits change event to our callback.
    ax.callbacks.connect('xlim_changed', on_xlim_change)

    plt.tight_layout()
    plt.show()
181+
182+
# --- Main ---
183+
if __name__ == '__main__':
    # Demo entry point: build the synthetic data set and open the
    # interactive gap chart for it.
    df_sample = generate_sample_data()
    interactive_gap_plot(df_sample)

0 commit comments

Comments
 (0)