|
| 1 | + |
| 2 | + |
| 3 | +import numpy as np |
| 4 | +import pandas as pd |
| 5 | +import matplotlib.pyplot as plt |
| 6 | +import matplotlib.dates as mdates |
| 7 | +from datetime import timedelta |
| 8 | + |
| 9 | + |
# Public API of this module. The name must be ``__all__``; a plain ``all``
# would only shadow the builtin and would not affect ``from module import *``.
__all__ = ["plot_missing_data"]
| 11 | + |
def generate_sample_data(seed=0):
    """
    Generate a reproducible sample DataFrame of gappy time series.

    The frame has 12 columns (``Series01`` .. ``Series12``) on a 15-minute
    index running from 2010-01-01 to 2012-01-01 (inclusive). It is filled
    with standard-normal values and then modified as follows:

    - 20 long gaps (1 to 200 intervals) and 20 short gaps (1 to 5 intervals)
      per series, at random positions.
    - In 6 randomly chosen series, a gap at the end (up to 2 days missing).
    - In the first series, a gap covering the first year.
    - The eighth series is set to 0 except for its last 400 samples, and the
      ninth series is set to 0 everywhere. This happens last, so it also
      overwrites any gaps placed in those ranges.

    Parameters
    ----------
    seed : int or None, optional
        Seed for the random generator that drives both the values and the
        gap placement. The default (0) makes repeated calls return identical
        frames; ``None`` asks NumPy for fresh entropy.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed by a ``DatetimeIndex``, with NaN marking gaps.
    """
    freq = '15min'
    start = pd.Timestamp("2010-01-01")
    end = pd.Timestamp("2012-01-01")
    index = pd.date_range(start, end, freq=freq)
    n = len(index)
    n_series = 12
    cols = [f"Series{i + 1:02d}" for i in range(n_series)]

    # A single seeded generator drives values and gaps, so the whole frame
    # (not only the gap layout) is reproducible.
    rng = np.random.default_rng(seed=seed)
    values = rng.standard_normal((n, n_series))

    # Punch the gaps directly into the array; slicing clamps at the end of
    # the data, so a gap starting near the end is simply truncated.
    for j in range(n_series):
        for _ in range(20):
            # One long and one short gap per iteration.
            for max_len in (200, 5):
                start_idx = rng.integers(0, n - 1)
                # ``high`` is exclusive, hence the +1 to include max_len.
                gap_length = rng.integers(1, max_len + 1)
                values[start_idx:start_idx + gap_length, j] = np.nan

    # For 6 randomly chosen series, blank out the tail (up to 2 days).
    intervals_per_day = (24 * 60) // 15
    for j in rng.choice(n_series, size=6, replace=False):
        gap_length = rng.integers(1, 2 * intervals_per_day + 1)
        values[-gap_length:, j] = np.nan

    # For the first series, remove the first year of data.
    first_year = index < (start + pd.DateOffset(years=1))
    values[first_year, 0] = np.nan

    # Constant-valued series: the 8th keeps only its last 400 samples as
    # generated, the 9th is zero throughout.
    # NOTE(review): presumably test fixtures for (almost) gap-free series.
    values[:n - 400, 7] = 0.0
    values[:, 8] = 0.0

    return pd.DataFrame(values, index=index, columns=cols)
| 59 | + |
def _missing_runs(flags):
    """Return ``(first, last)`` position pairs for each run of True in *flags*."""
    padded = np.concatenate(([False], flags, [False]))
    edges = np.diff(padded.astype(np.int8))
    starts = np.flatnonzero(edges == 1)
    # A falling edge sits one position past the last True of the run.
    ends = np.flatnonzero(edges == -1) - 1
    return list(zip(starts.tolist(), ends.tolist()))


def _missing_label(name, perc):
    """Build the y-tick label ``"<name> (<percent missing>)"`` for one series."""
    if perc == 0:
        return f"{name} (0%)"
    if 0 < perc < 0.01:
        return f"{name} (<0.01%)"
    return f"{name} ({perc:.2f}%)"


def plot_missing_data(df, ax, min_gap_duration, overall_start, overall_end,
                      sample_interval=None):
    """
    Plot missing data onto the provided axis with a given minimum gap duration.

    Each column of *df* gets one horizontal bar spanning
    ``overall_start``..``overall_end``; every run of NaN values is drawn on
    top of it. Gaps that touch either end of the overall span are drawn in a
    different colour from interior gaps. The y-tick labels carry the
    percentage of missing samples per series.

    The axes are cleared first (``ax.cla()``), so anything previously drawn
    on them is discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect, one series per column, indexed by timestamps.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    min_gap_duration : timedelta-like
        Gaps shorter than this are widened symmetrically to this duration
        (clipped to the overall span) so they remain visible.
    overall_start, overall_end : timestamp-like
        Extent of the background bar and clipping range for widened gaps.
    sample_interval : timedelta-like, optional
        Duration represented by one sample; a gap ends this long after its
        last missing timestamp. Defaults to 15 minutes.
    """
    if sample_interval is None:
        sample_interval = pd.Timedelta(minutes=15)

    overall_start_num = mdates.date2num(overall_start)
    overall_end_num = mdates.date2num(overall_end)
    full_span = [(overall_start_num, overall_end_num - overall_start_num)]

    # Colors for the bars
    overall_color = 'skyblue'
    gap_color = 'orange'
    boundary_gap_color = 'indianred'

    bar_height = 0.8  # thickness for each horizontal bar

    # Clear current content on ax
    ax.cla()

    y_ticks = []
    y_labels = []

    for row, col in enumerate(df.columns):
        y_range = (row - bar_height / 2, bar_height)

        # Background bar covering the entire time span for this series.
        ax.broken_barh(full_span, y_range, facecolors=overall_color, alpha=0.6)

        series = df[col]
        mask = series.isna()

        for first_pos, last_pos in _missing_runs(mask.to_numpy()):
            gap_start = series.index[first_pos]
            # The last missing sample still covers one full interval.
            gap_end = series.index[last_pos] + sample_interval

            # Widen gaps that would otherwise be too thin to see.
            actual_gap = gap_end - gap_start
            if actual_gap < min_gap_duration:
                extra = min_gap_duration - actual_gap
                gap_start = max(gap_start - extra / 2, overall_start)
                gap_end = min(gap_end + extra / 2, overall_end)

            # Use a distinct color if the gap touches either end.
            if gap_start <= overall_start or gap_end >= overall_end:
                current_gap_color = boundary_gap_color
            else:
                current_gap_color = gap_color

            gap_start_num = mdates.date2num(gap_start)
            gap_end_num = mdates.date2num(gap_end)
            ax.broken_barh([(gap_start_num, gap_end_num - gap_start_num)],
                           y_range,
                           facecolors=current_gap_color)

        y_ticks.append(row)
        # Percentage of missing samples, used to annotate the tick label.
        y_labels.append(_missing_label(col, mask.mean() * 100))

    # Format axes
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_labels)
    ax.xaxis_date()
    ax.set_xlabel("Time")
    ax.set_title(f"Missing Data Visualization (Min gap = {min_gap_duration})")
    ax.figure.autofmt_xdate()
    ax.figure.canvas.draw_idle()
| 138 | + |
def interactive_gap_plot(df):
    """
    Create an interactive plot that updates the minimum gap duration based on the current x-axis view.

    The mapping used here is:
    - >=20 years view: min gap = 1 day
    - >=10 years view: min gap = 12 hours
    - Otherwise:       min gap = 1 hour

    The initial plot uses a minimum gap of 20 hours. The figure is shown with
    ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Time-indexed data, one series per column. The overall span ends 15
        minutes after the last index value. An empty index raises
        ``IndexError``.
    """
    # Overall full time range (fixed)
    overall_start = df.index[0]
    overall_end = df.index[-1] + pd.Timedelta(minutes=15)

    # Create figure and initial plot with a default min_gap_duration.
    default_min_gap = timedelta(hours=20)  # starting default
    fig, ax = plt.subplots(figsize=(12, 6))
    plot_missing_data(df, ax, default_min_gap, overall_start, overall_end)

    # Mutable state shared with the callback.
    state = {"min_gap": default_min_gap, "redrawing": False}

    def on_xlim_change(event_ax):
        # Redrawing changes the limits itself; ignore those nested events.
        if state["redrawing"]:
            return

        # Determine current view duration
        x0, x1 = event_ax.get_xlim()
        view_duration = mdates.num2date(x1) - mdates.num2date(x0)
        years_view = view_duration.total_seconds() / (365.25 * 24 * 3600)

        # Adjust min_gap_duration based on view span
        if years_view >= 20:
            new_min_gap = timedelta(days=1)
        elif years_view >= 10:
            new_min_gap = timedelta(hours=12)
        else:
            new_min_gap = timedelta(hours=1)

        # Nothing to do while the view stays within the same bucket.
        if new_min_gap == state["min_gap"]:
            return

        state["redrawing"] = True
        try:
            # Redraw the missing data visualization with the new min_gap_duration.
            plot_missing_data(df, ax, new_min_gap, overall_start, overall_end)
            # The redraw clears the axes, which resets the view limits and
            # drops connected callbacks: put the user's view back and
            # re-register this handler so later zooms are still seen.
            ax.set_xlim(x0, x1, emit=False)
            ax.callbacks.connect('xlim_changed', on_xlim_change)
            state["min_gap"] = new_min_gap
        finally:
            state["redrawing"] = False
        ax.figure.canvas.draw_idle()

    # Connect the x-axis limits change event to our callback.
    ax.callbacks.connect('xlim_changed', on_xlim_change)

    plt.tight_layout()
    plt.show()
| 181 | + |
# --- Main ---
# Script entry point: build the sample frame and open the interactive plot.
if __name__ == '__main__':
    df_sample = generate_sample_data()
    interactive_gap_plot(df_sample)
0 commit comments