Commit 369771e

Merge pull request #66 from UniversalScientificTechnologies/update-parsers
airdos04c parser time and memory optimization
2 parents: 05e06d0 + 20ce4ab

2 files changed: 52 additions & 67 deletions

backend/DOSPORTAL/services/parsing/parsers/airdos_04c.py

Lines changed: 49 additions & 67 deletions
@@ -1,5 +1,6 @@
 from typing import BinaryIO

+import numpy as np
 import pandas as pd

 from ..contracts import ParsedUnifiedData
@@ -45,109 +46,97 @@ def parse(
     def _parse_file(self, file_obj: BinaryIO, raw_header: str) -> dict[str, object]:
         file_obj.seek(0)

-        # ── Pass 1: collect raw blocks ────────────────────────────────────────
-        blocks: list[dict[str, object]] = []
-        current_events: list[int] = []
+        bin_width = (self.ADC_MAX - self.ADC_MIN) / self.N_HIGH_BINS
+        total_channels = self.LOW_CHANNELS + self.N_HIGH_BINS
+
+        time_list: list[float] = []
+        particle_count_list: list[int] = []
+        low_ch_list: list[list[int]] = []
+        high_hist_list: list[list[int]] = []
+        current_hist: list[int] = []
         in_block = False
         skipped_rows = 0
         start_unix_s: int | None = None
+        unix_offset: float | None = None

         for raw_line in file_obj:
-            line = self._decode_line(raw_line)
-            if not line or line.startswith("#"):
+            raw_line = raw_line.rstrip()
+            if not raw_line or raw_line.startswith(b"#"):
                 continue

-            if line.startswith("$TIME,") and start_unix_s is None:
-                tokens = line.split(",")
+            if raw_line.startswith(b"$TIME,"):
+                tokens = raw_line.split(b",")
                 if len(tokens) >= 4:
-                    start_unix_s = self._safe_int(tokens[3])
-
-            elif line.startswith("$START,"):
-                tokens = line.split(",")
+                    device_s = self._safe_float(tokens[1].decode())
+                    unix_s = self._safe_float(tokens[3].decode())
+                    if device_s is not None and unix_s is not None:
+                        unix_offset = unix_s - device_s
+                    if start_unix_s is None and unix_s is not None:
+                        start_unix_s = int(unix_s)
+
+            elif raw_line.startswith(b"$START,"):
+                tokens = raw_line.split(b",")
                 if len(tokens) >= 3:
                     in_block = True
-                    current_events = []
+                    current_hist = [0] * self.N_HIGH_BINS
                 else:
                     skipped_rows += 1

-            elif line.startswith("$E,"):
+            elif raw_line.startswith(b"$E,"):
                 if not in_block:
                     skipped_rows += 1
                     continue
-                tokens = line.split(",")
+                tokens = raw_line.split(b",")
                 if len(tokens) >= 3:
-                    adc = self._safe_int(tokens[2])
+                    adc = self._safe_int(tokens[2].decode())
                     if adc is not None and adc >= self.ADC_MIN:
-                        current_events.append(adc)
+                        current_hist[min(int((adc - self.ADC_MIN) / bin_width), self.N_HIGH_BINS - 1)] += 1
                     else:
                         skipped_rows += 1
                 else:
                     skipped_rows += 1

-            elif line.startswith("$STOP,"):
+            elif raw_line.startswith(b"$STOP,"):
                 if not in_block:
                     skipped_rows += 1
                     continue
                 in_block = False

                 # $STOP,<count>,<tm>.<tm_s100>,<systime>,<events_count>,<h0>,<h1>,<h2>,<h3>
-                tokens = line.split(",")
+                tokens = raw_line.split(b",")
                 if len(tokens) < 9:
                     skipped_rows += 1
                     continue

-                time_s = self._safe_float(tokens[2])
-                events_count = self._safe_int(tokens[4])
-                h = [self._safe_int(tokens[5 + i]) for i in range(self.LOW_CHANNELS)]
+                time_s = self._safe_float(tokens[2].decode())
+                events_count = self._safe_int(tokens[4].decode())
+                h = [self._safe_int(tokens[5 + i].decode()) for i in range(self.LOW_CHANNELS)]

                 if time_s is None or events_count is None:
                     skipped_rows += 1
                     continue

-                blocks.append(
-                    {
-                        "time_ms": time_s * 1000.0,
-                        "low_channels": [v if v is not None else 0 for v in h],
-                        "high_events": list(current_events),
-                        "events_count": events_count,
-                    }
-                )
+                low_ch = [v if v is not None else 0 for v in h]
+                time_list.append((time_s + unix_offset) * 1000.0 if unix_offset is not None else time_s * 1000.0)
+                particle_count_list.append(events_count + sum(low_ch))
+                low_ch_list.append(low_ch)
+                high_hist_list.append(current_hist)

-        if not blocks:
+        if not time_list:
             raise ParsingError("No valid measurement blocks found in AIRDOS04C log file")

-        # ── Fixed high-energy bins: ADC_MIN..ADC_MAX → N_HIGH_BINS bins ────────
-        bin_width = (self.ADC_MAX - self.ADC_MIN) / self.N_HIGH_BINS
-
-        def adc_to_bin(adc: int) -> int:
-            return min(int((adc - self.ADC_MIN) / bin_width), self.N_HIGH_BINS - 1)
-
-        # ── Pass 2: build histogram per block ────────────────────────────────
-        rows: list[dict[str, int | float]] = []
-
-        for index, block in enumerate(blocks):
-            high_hist = [0] * self.N_HIGH_BINS
-            for adc in block["high_events"]:  # type: ignore[union-attr]
-                high_hist[adc_to_bin(adc)] += 1
-
-            low_ch: list[int] = block["low_channels"]  # type: ignore[assignment]
-            row: dict[str, int | float] = {
-                "id": index,
-                "time_ms": float(block["time_ms"]),  # type: ignore[arg-type]
-                "particle_count": int(block["events_count"]) + sum(low_ch),  # type: ignore[arg-type]
-            }
-            for i, count in enumerate(low_ch):
-                row[f"channel_{i}"] = count
-            for i, count in enumerate(high_hist):
-                row[f"channel_{self.LOW_CHANNELS + i}"] = count
+        n = len(time_list)
+        channel_names = [f"channel_{i}" for i in range(total_channels)]

-            rows.append(row)
+        channel_arr = np.empty((n, total_channels), dtype=np.int32)
+        for i in range(n):
+            channel_arr[i, :self.LOW_CHANNELS] = low_ch_list[i]
+            channel_arr[i, self.LOW_CHANNELS:] = high_hist_list[i]

-        df = pd.DataFrame(rows)
-        df["time_ms"] = df["time_ms"] - df["time_ms"].min()
-
-        total_channels = self.LOW_CHANNELS + self.N_HIGH_BINS
-        channel_names = [f"channel_{i}" for i in range(total_channels)]
+        df = pd.DataFrame(channel_arr, columns=channel_names)
+        df.insert(0, "particle_count", np.array(particle_count_list, dtype=np.int32))
+        df.insert(0, "time_ms", np.array(time_list, dtype=np.float64))
+        df.insert(0, "id", np.arange(n, dtype=np.int32))

         # Bin edges: ADC value at the start of each high-energy bin
         bin_edges = [self.ADC_MIN + i * bin_width for i in range(self.N_HIGH_BINS + 1)]
@@ -163,7 +152,6 @@ def adc_to_bin(adc: int) -> int:
                 float(df["time_ms"].min()),
                 float(df["time_ms"].max()),
             ],
-            "channel_columns": channel_names,
             "high_energy_bin_edges": bin_edges,
             "high_energy_adc_max": self.ADC_MAX,
             "start_unix_s": int(start_unix_s) if start_unix_s is not None else None,
@@ -172,12 +160,6 @@ def adc_to_bin(adc: int) -> int:
         file_obj.seek(0)
         return {"dataframe": df, "metadata": metadata}

-    @staticmethod
-    def _decode_line(raw_line: bytes | str) -> str:
-        if isinstance(raw_line, (bytes, bytearray)):
-            return raw_line.decode("utf-8", errors="ignore").strip()
-        return str(raw_line).strip()
-
     @staticmethod
     def _safe_float(value: str) -> float | None:
         try:
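
The core of this change is that high-energy $E events are now binned into a fixed-size histogram as they are read, instead of being buffered per block and histogrammed in a second pass, so per-block memory stays proportional to N_HIGH_BINS rather than to the number of events. A minimal sketch of that on-the-fly binning, using made-up values in place of the parser's ADC_MIN / ADC_MAX / N_HIGH_BINS class attributes:

# Streaming binning as in the $E branch above; the constants below are
# hypothetical stand-ins for the parser's ADC_MIN / ADC_MAX / N_HIGH_BINS.
ADC_MIN, ADC_MAX, N_HIGH_BINS = 64, 1024, 240
bin_width = (ADC_MAX - ADC_MIN) / N_HIGH_BINS

hist = [0] * N_HIGH_BINS
for adc in (70, 512, 1023, 1024):  # synthetic ADC readings
    if adc < ADC_MIN:
        continue  # the parser counts such readings as skipped rows
    # Clamp to the last bin so adc == ADC_MAX cannot index out of range.
    hist[min(int((adc - ADC_MIN) / bin_width), N_HIGH_BINS - 1)] += 1

The clamping expression is the same one used in the parser's $E branch; binning as events arrive also avoids keeping every block's raw ADC list alive until the end of the file, which is where the memory saving comes from.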

backend/DOSPORTAL/tasks/spectral_records.py

Lines changed: 3 additions & 0 deletions
@@ -120,7 +120,10 @@ def process_spectral_record_into_spectral_file_async(spectral_record_id):
     combined = _combine_raw_log_files(raw_files)

     try:
+        import time
+        _t0 = time.perf_counter()
         parsed = parse_log_to_unified(combined)
+        print(f"Parsing done in {time.perf_counter() - _t0:.3f}s (record {record.id}, {len(raw_files)} files)")
     except Exception as e:
         logger.exception("Error parsing SpectralRecord %s", record.id)
         record.processing_status = ProcessingStatusMixin.PROCESSING_FAILED
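
The second file only wraps the parse call with a wall-clock measurement. A standalone sketch of the same time.perf_counter() pattern, with a placeholder function standing in for parse_log_to_unified:

import time

def do_work():  # placeholder for parse_log_to_unified(combined)
    return sum(i * i for i in range(1_000_000))

_t0 = time.perf_counter()
result = do_work()
print(f"Parsing done in {time.perf_counter() - _t0:.3f}s")

perf_counter() is intended for measuring short durations; only the difference between two calls is meaningful, not the absolute value.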
