Add chunked telemetry fetch with adaptive queries

haoruizhou · haoruizhou · commit 7f70f4b8a28a · 2026-03-14T22:26:27.000-04:00
Introduce fetch_telemetry_chunked to fetch telemetry over large time ranges by splitting the range into fixed-size time chunks and using adaptive_query to recursively halve a chunk whenever the server's per-query file limit is hit. Supports optional resampling, movement filtering, parallel execution of top-level chunks via run_chunks_parallel, and progress output. Export the new function from the package __init__ and add .DS_Store to .gitignore. Also import the required utilities (adaptive_query, run_chunks_parallel) and typing names (List, Optional).

Add sequential execution for single-worker chunks
diff --git a/.gitignore b/.gitignore
@@ -15,3 +15,4 @@ build/
 coverage.xml
 htmlcov/
 /examples/__pycache__
+.DS_Store
diff --git a/src/slicks/__init__.py b/src/slicks/__init__.py
@@ -1,4 +1,4 @@
-from .fetcher import fetch_telemetry, bulk_fetch_season, list_target_sensors, get_influx_client
+from .fetcher import fetch_telemetry, fetch_telemetry_chunked, bulk_fetch_season, list_target_sensors, get_influx_client
 from .discovery import discover_sensors
 from .movement_detector import detect_movement_ratio, get_movement_segments, filter_data_in_movement
 from .config import connect_influxdb3
diff --git a/src/slicks/fetcher.py b/src/slicks/fetcher.py
@@ -1,9 +1,10 @@
 import os
 from datetime import datetime, timedelta
+from typing import List, Optional
 import pandas as pd
 from influxdb_client_3 import InfluxDBClient3
 from . import config
-from .query_utils import quote_table
+from .query_utils import quote_table, adaptive_query, run_chunks_parallel
 from .movement_detector import filter_data_in_movement
 
 
@@ -107,6 +108,135 @@ def fetch_telemetry(start_time, end_time, signals=None, client=None, filter_move
         print(f"Error fetching data: {e}")
         return None
 
+def fetch_telemetry_chunked(
+    start_time: datetime,
+    end_time: datetime,
+    signals=None,
+    client=None,
+    filter_movement: bool = True,
+    resample: Optional[str] = "1s",
+    chunk_size: timedelta = timedelta(hours=6),
+    max_workers: int = 1,
+    show_progress: bool = True,
+) -> Optional[pd.DataFrame]:
+    """
+    Fetch telemetry with automatic time-splitting when InfluxDB's per-query
+    file limit is exceeded.
+
+    Takes the same leading arguments as ``fetch_telemetry`` but routes every
+    query through ``adaptive_query``: if a time window hits the server's
+    parquet-file cap the range is recursively halved until each sub-query
+    succeeds, then results are concatenated.  Suitable for ranges that span
+    many test sessions.
+
+    Args:
+        start_time: Start of the query range (inclusive).  Naive datetimes are
+                    treated as UTC; tz-aware datetimes are converted to UTC.
+        end_time:   End of the query range (exclusive).  Same timezone rules as
+                    ``start_time``.
+        signals:    Sensor name or list of names (defaults to config.SIGNALS).
+        client:     Existing InfluxDBClient3 instance.  Only used when
+                    ``max_workers`` is 1; if None, a client is created for the
+                    call and closed afterwards.  With ``max_workers`` > 1 each
+                    chunk gets its own client from ``get_influx_client``.
+        filter_movement: Apply movement-detection filtering to the final result.
+        resample:   Pandas frequency string, e.g. "1s", "100ms", or None for raw.
+        chunk_size: Initial time window per adaptive-query call.  Each chunk is
+                    split further on file-limit errors. Default: 6 hours.
+        max_workers: Parallel workers for top-level chunks (1 = sequential).
+        show_progress: Print progress messages.
+
+    Returns:
+        Combined DataFrame with DatetimeIndex, or None if no data found (or if
+        ``signals`` is empty).
+
+    Raises:
+        ValueError: If ``chunk_size`` is zero or negative.
+    """
+    if signals is None:
+        signals = config.SIGNALS
+    if isinstance(signals, str):
+        signals = [signals]
+    if not signals:
+        return None
+
+    # A non-positive step would never advance past end_time in the chunking
+    # loop below, so reject it up front instead of looping forever.
+    if chunk_size <= timedelta(0):
+        raise ValueError("chunk_size must be a positive timedelta")
+
+    # Double any embedded single quote so a signal name cannot terminate the
+    # SQL string literal it is spliced into.
+    signal_list = "', '".join(name.replace("'", "''") for name in signals)
+    schema = config.INFLUX_SCHEMA or "iox"
+    table = config.INFLUX_TABLE or config.INFLUX_DB
+    table_ref = quote_table(schema, table)
+
+    def _fmt(dt: datetime) -> str:
+        """Format datetime as a UTC ISO string for SQL.
+
+        Naive datetimes are assumed to already be UTC.  Aware datetimes are
+        shifted to UTC first; formatting their local wall clock and appending
+        "Z" would silently query the wrong window.
+        """
+        offset = dt.utcoffset()
+        if offset is not None:
+            dt = dt.replace(tzinfo=None) - offset
+        return dt.strftime("%Y-%m-%dT%H:%M:%S") + "Z"
+
+    def _fetch_chunk(cli: InfluxDBClient3, t0: datetime, t1: datetime) -> List[pd.DataFrame]:
+        """Fetch one time window; return list-of-DataFrame for adaptive_query."""
+        query = (
+            f"SELECT time, \"signalName\", \"sensorReading\" "
+            f"FROM {table_ref} "
+            f"WHERE \"signalName\" IN ('{signal_list}') "
+            f"AND time >= '{_fmt(t0)}' AND time < '{_fmt(t1)}' "
+            f"ORDER BY time ASC"
+        )
+        raw = cli.query(query=query, mode="pandas")
+        if raw.empty:
+            return []
+        # Long -> wide: one column per signal, averaging readings that share
+        # a timestamp.
+        df = raw.pivot_table(
+            index="time",
+            columns="signalName",
+            values="sensorReading",
+            aggfunc="mean",
+        )
+        return [df]
+
+    def _fetch_adaptive(cli: InfluxDBClient3, t0: datetime, t1: datetime) -> List[pd.DataFrame]:
+        """Run ``_fetch_chunk`` over [t0, t1) through ``adaptive_query``."""
+        return adaptive_query(
+            client=cli,
+            t0=t0,
+            t1=t1,
+            primary_fn=_fetch_chunk,
+            min_span=timedelta(minutes=1),
+        )
+
+    # Split full range into top-level half-open chunks [t0, t1), then use
+    # adaptive_query per chunk.
+    chunks: List[tuple] = []
+    t = start_time
+    while t < end_time:
+        chunks.append((t, min(t + chunk_size, end_time)))
+        t += chunk_size
+
+    if show_progress:
+        print(f"Fetching {len(chunks)} chunk(s) from {start_time.date()} to {end_time.date()}...")
+
+    all_dfs: List[pd.DataFrame] = []
+
+    if max_workers > 1:
+        # Each chunk gets its own client from the factory, so the ``client``
+        # argument is not used (and none is created) on this path.
+        all_dfs = run_chunks_parallel(
+            client_factory=get_influx_client,
+            chunks=chunks,
+            query_fn=_fetch_adaptive,
+            max_workers=max_workers,
+        )
+    else:
+        # Only close a client this call created; a caller-supplied client
+        # stays open for the caller to reuse.
+        owns_client = client is None
+        if owns_client:
+            client = get_influx_client()
+        try:
+            for i, (t0, t1) in enumerate(chunks):
+                if show_progress:
+                    print(f"  chunk {i + 1}/{len(chunks)}: {t0} → {t1}")
+                all_dfs.extend(_fetch_adaptive(client, t0, t1))
+        finally:
+            if owns_client:
+                # Best-effort cleanup: a failing close must not mask the
+                # query result or the original exception.
+                try:
+                    client.close()
+                except Exception:
+                    pass
+
+    if not all_dfs:
+        if show_progress:
+            print("No data found.")
+        return None
+
+    df = pd.concat(all_dfs).sort_index()
+    # Remove duplicate timestamps from chunk boundaries
+    df = df[~df.index.duplicated(keep="first")]
+
+    if resample:
+        df = df.resample(resample).mean().dropna(how="all")
+
+    if filter_movement:
+        df = filter_data_in_movement(df)
+
+    if show_progress:
+        print(f"Fetched {len(df)} rows.")
+    return df
+
+
 def bulk_fetch_season(start_date, end_date, output_file="telemetry_season.csv"):
     """
     Fetch data day-by-day.
diff --git a/src/slicks/query_utils.py b/src/slicks/query_utils.py
@@ -134,6 +134,26 @@ def run_chunks_parallel(
     if not chunks:
         return []
 
+    # Sequential path — avoids nested ThreadPoolExecutor when called from inside
+    # another executor thread (e.g. asyncio run_in_executor). Creating gRPC clients
+    # inside nested thread pools causes "bad value(s) in fds_to_keep" on macOS.
+    if max_workers == 1:
+        ordered: List[T] = []
+        for idx, (t0, t1) in enumerate(chunks):
+            client = client_factory()
+            try:
+                ordered.extend(query_fn(client, t0, t1))
+            except PermanentQueryError:
+                raise
+            finally:
+                try:
+                    client.close()
+                except Exception:
+                    pass
+            if on_chunk_done:
+                on_chunk_done(idx)
+        return ordered
+
     results: dict[int, List[T]] = {}
     lock = threading.Lock()
 
diff --git a/src/slicks/scanner.py b/src/slicks/scanner.py
@@ -325,6 +325,7 @@ def scan_data_availability(
     bin_size: str = "hour",
     include_counts: bool = True,
     show_progress: bool = True,
+    max_workers: int = 4,
 ) -> ScanResult:
     """
     Scan the database for data availability windows.
@@ -397,6 +398,7 @@ def scan_data_availability(
             initial_chunk_days=initial_chunk_days,
             show_progress=show_progress,
             total_chunks=total_chunks,
+            max_workers=max_workers,
         ))
     except PermanentQueryError as e:
         raise RuntimeError(
@@ -439,6 +441,7 @@ def _fetch_bins_adaptive(
     initial_chunk_days: int = 31,
     show_progress: bool = True,
     total_chunks: int = 1,
+    max_workers: int = 4,
 ) -> Iterable[Tuple[datetime, int]]:
     """Iterate over bucket start times with counts using parallel adaptive chunking."""
 
@@ -541,7 +544,7 @@ def on_chunk_done(idx: int) -> None:
             client_factory=_make_client,
             chunks=chunks,
             query_fn=process_chunk,
-            max_workers=4,
+            max_workers=max_workers,
             on_chunk_done=on_chunk_done,
         )
     finally:

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-from .fetcher import fetch_telemetry, bulk_fetch_season, list_target_sensors, get_influx_client`
	`1`	`+from .fetcher import fetch_telemetry, fetch_telemetry_chunked, bulk_fetch_season, list_target_sensors, get_influx_client`
`2`	`2`	`from .discovery import discover_sensors`
`3`	`3`	`from .movement_detector import detect_movement_ratio, get_movement_segments, filter_data_in_movement`
`4`	`4`	`from .config import connect_influxdb3`