Add multi-file index offsets and rework count-rate calculation

Aserhisham · Aserhisham · commit 58c8e0171b0f · 2026-01-29T17:53:11.000+01:00
diff --git a/src/sed/loader/cfel/buffer_handler.py b/src/sed/loader/cfel/buffer_handler.py
@@ -3,6 +3,8 @@
 import time
 from pathlib import Path
 
+import h5py
+import numpy as np
 import dask.dataframe as dd
 from joblib import delayed
 from joblib import Parallel
@@ -168,7 +170,48 @@ def _save_buffer_files(self, force_recreate: bool, debug: bool) -> None:
                 f"Could not extract base timestamp: {e}. "
                 "Processing files independently."
             )
+
+        # -------------------------------------------------------
+        # Calculate index offsets
+        # We need to read the 'index' channel (usually countId/NumOfEvents) to know the count.
+        # This requires a quick scan of files.
+        # -------------------------------------------------------
+        index_offsets = {}
+        current_offset = 0
+        
+        index_alias = self._config.get("index", ["countId"])[0]
+        try:
+            channel_config = self._config["channels"][index_alias]
+            dataset_key = channel_config["dataset_key"]
             
+            # Prefer serial scan for safety and simplicity, though could be parallelized
+            # For 200 files it might take a few seconds.
+            logger.info("Calculating index offsets...")
+            for file_set in file_sets:
+                try:
+                    with h5py.File(file_set["raw"], "r") as h5_file:
+                        if dataset_key in h5_file:
+                            
+                            dset = h5_file[dataset_key]
+                            # Total number of events in this file (sum of the per-entry counts).
+                            # NOTE(review): np.sum loads the whole dataset; fine while it is small.
+                            n_events = np.sum(dset)
+                            
+                            index_offsets[file_set["raw"].name] = int(current_offset)
+                            current_offset += int(n_events)
+                        else:
+                             index_offsets[file_set["raw"].name] = int(current_offset)
+                except Exception as e:
+                    logger.warning(f"Failed to read index offset from {file_set['raw'].name}: {e}")
+                    index_offsets[file_set["raw"].name] = int(current_offset)
+            
+            logger.debug(f"Total events calculated: {current_offset}")
+
+        except Exception as e:
+            logger.warning(f"Failed to calculate index offsets: {e}. Indices may reset.")
+            for fs in file_sets:
+                index_offsets[fs["raw"].name] = 0
+
         # -------------------------------------------------------
     
         n_cores = min(len(file_sets), self.n_cores)
@@ -187,6 +230,7 @@ def is_first_file(file_set) -> bool:
                     file_set,
                     is_first_file(file_set),
                     base_timestamp,
+                    index_offset=index_offsets.get(file_set["raw"].name, 0),
                 )
         else:
             # For parallel processing, we need to be careful about the order
@@ -198,18 +242,20 @@ def is_first_file(file_set) -> bool:
                     file_set,
                     is_first_file(file_set),
                     base_timestamp,
+                    index_offset=index_offsets.get(file_set["raw"].name, 0),
                 )
                 for file_set in file_sets
             )
 
-    def _save_buffer_file(self, file_set, is_first_file=True, base_timestamp=None):
+    def _save_buffer_file(self, file_set, is_first_file=True, base_timestamp=None, index_offset=0):
         """
         Saves an HDF5 file to a Parquet file using the DataFrameCreator class.
         
         Args:
             file_set: Dictionary containing file paths
             is_first_file: Whether this is the first file in a multi-file run
             base_timestamp: Base timestamp from the first file (for subsequent files)
+            index_offset: Offset to apply to the index
         """
         start_time = time.time()  # Add this line
         paths = file_set
@@ -218,9 +264,11 @@ def _save_buffer_file(self, file_set, is_first_file=True, base_timestamp=None):
             config_dataframe=self._config, 
             h5_path=paths["raw"],
             is_first_file=is_first_file,
-            base_timestamp=base_timestamp
+            base_timestamp=base_timestamp,
+            index_offset=index_offset
         )
         df = dfc.df
+
         df_timed = dfc.df_timed
 
         # Save electron resolved dataframe
diff --git a/src/sed/loader/cfel/dataframe.py b/src/sed/loader/cfel/dataframe.py
@@ -30,7 +30,8 @@ class DataFrameCreator:
     """
 
     def __init__(self, config_dataframe: dict, h5_path: Path, 
-                 is_first_file: bool = True, base_timestamp: pd.Timestamp = None) -> None:
+                 is_first_file: bool = True, base_timestamp: pd.Timestamp = None,
+                 index_offset: int = 0) -> None:
         """
         Initializes the DataFrameCreator class.
 
@@ -39,22 +40,20 @@ def __init__(self, config_dataframe: dict, h5_path: Path,
             h5_path (Path): Path to the h5 file.
             is_first_file (bool): Whether this is the first file in a multi-file run.
             base_timestamp (pd.Timestamp): Base timestamp from the first file (for subsequent files).
+            index_offset (int): Offset to apply to the index (countId) for multi-file runs.
         """
         self.h5_file = h5py.File(h5_path, "r")
         self._config = config_dataframe
         self.is_first_file = is_first_file
         self.base_timestamp = base_timestamp
+        self.index_offset = index_offset
 
         index_alias = self._config.get("index", ["countId"])[0]
-        # # all values except the last as slow data starts from start of file
-        # somehow written something else as this line is doing
-        # self.index = np.cumsum([0, *self.get_dataset_array(index_alias)])
+        
         # get cumulative counts, but drop last because slow data only covers N-1 intervals
-        self.index = np.cumsum([0, *self.get_dataset_array(index_alias)])[:-1]
-        # cumulative sum starting from the first acquisition count, No artificial 0 at the start
-        # makes identical len of TimeStamp and index, but cuts last TimeStamp
-        # self.index = np.cumsum(self.get_dataset_array(index_alias))
-        print(f"len of self.index: {len(self.index)}")
+        # Shift by index_offset so the index continues across files of a multi-file run
+        self.index = np.cumsum([0, *self.get_dataset_array(index_alias)])[:-1] + index_offset
+
 
     def get_dataset_key(self, channel: str) -> str:
         """
@@ -121,7 +120,16 @@ def df_electron(self) -> pd.DataFrame:
         if channels == []:
             return pd.DataFrame()
 
-        series = {channel: pd.Series(self.get_dataset_array(channel)) for channel in channels}
+        series = {
+            channel: pd.Series(
+                self.get_dataset_array(channel),
+                index=pd.RangeIndex(
+                    self.index_offset,
+                    self.index_offset + len(self.get_dataset_array(channel)),
+                ),
+            )
+            for channel in channels
+        }
         dataframe = pd.concat(series, axis=1)
         return dataframe.dropna()
 
@@ -241,9 +249,6 @@ def df_timestamp(self) -> pd.DataFrame:
         # ------------------------------------------------------------
         ts_alias = self._config["columns"].get("timestamp", "timeStamp")
         df = pd.DataFrame({ts_alias: unix_seconds}, index=self.index)
-        print(f"Len of TimeStamps: {len(unix_seconds)}, len of Index: {len(self.index)}")
-        pd.set_option("display.float_format", "{:.6f}".format)
-        print(df)
 
         # # # Suppose df is your timestamp DataFrame
         # print("DEBUG of df")
diff --git a/src/sed/loader/cfel/loader.py b/src/sed/loader/cfel/loader.py
@@ -451,10 +451,22 @@ def get_count_rate_ms(
         # 2) Compute point-resolved rates
         # -------------------------------
         if mode == "point":
+            bin_size = kwds.pop("bin_size", 1)
             dt = np.diff(ms_concat) * 1e-3
             if np.any(dt <= 0):
-                raise ValueError("Non-positive time step detected in millisecCounter")
+                # Handle potential duplicate timestamps or jump back (should not happen with sort)
+                dt[dt <= 0] = 1e-6 # small epsilon
             rates_point = counts_concat[1:] / dt
+            
+            if bin_size > 1:
+                # Apply rolling average for smoothing
+                rates_point = (
+                    pd.Series(rates_point)
+                    .rolling(window=bin_size, center=True, min_periods=1)
+                    .mean()
+                    .values
+                )
+            
             times_point = ms_concat[1:] * 1e-3
             return rates_point, times_point
     
@@ -463,16 +475,24 @@ def get_count_rate_ms(
         # -------------------------------
         rates_file = []
         times_file = []
-        prev_ms_max = 0.0  # global start
-    
         for idx, (ms_min, ms_max) in enumerate(file_ms_min_max):
-            # Duration = internal file window + gap since previous file
-            file_duration = (ms_max - ms_min) + (ms_min - prev_ms_max)
+            # Duration = internal file window
+            file_duration = ms_max - ms_min
             if file_duration <= 0:
-                raise ValueError(f"Non-positive duration for file {fids_resolved[idx]}")
-    
-            print(f"Total counts: {file_counts_total[idx]}")
-            print(f"File duration: {file_duration}")
+                # A file with a single sample (ms_min == ms_max) or an inverted
+                # window has no usable duration, so its rate is undefined.
+                # Instead of raising, log a warning and record NaN at the window
+                # midpoint so one bad file does not abort the whole calculation.
+                logger.warning(
+                    f"[get_count_rate_ms] File {fids_resolved[idx]} has duration <= 0 ({file_duration}). "
+                    "Skipping rate calculation for this file (set to NaN).",
+                )
+                rates_file.append(np.nan)
+                times_file.append((ms_min + ms_max) / 2 * 1e-3)
+                continue
+
+            # print(f"Total counts: {file_counts_total[idx]}")
+            # print(f"File duration: {file_duration}")
             rate = file_counts_total[idx] / (file_duration * 1e-3)
             rates_file.append(rate)
             # times_file.append(ms_max * 1e-3)  # last time in file
@@ -484,8 +504,6 @@ def get_count_rate_ms(
                 f"counts={file_counts_total[idx]}, duration={file_duration} ms, rate={rate:.2f} Hz"
             )
     
-            prev_ms_max = ms_max
-    
         return np.array(rates_file), np.array(times_file)
            
 
@@ -529,58 +547,28 @@ def get_count_rate(
         self,
         fids: Sequence[int] | None = None,
         runs: Sequence[int] | None = None,
+        **kwds,
     ) -> tuple[np.ndarray, np.ndarray]:
         """
-        Returns the count rate per file using the total number of detected events
-        and the file acquisition duration.
-        
-        This method computes:
-        - one count-rate value per file (Hz)
-        - one global time value per file, given by the midpoint of the file
-          acquisition window, measured in seconds since the scan start
-        
-        The calculation is based on metadata produced by `read_dataframe`
-        and therefore does not require loading raw event data.
-        This makes the method fast but limited to file-level resolution.
+        Returns the count rate. By default, returns high-resolution
+        point-resolved rates using the millisecond counter.
         
         Args:
             fids (Sequence[int], optional):
                 File IDs to include. Defaults to all files.
             runs (Sequence[int], optional):
                 Run IDs to include. If provided, overrides `fids`.
+            **kwds:
+                Additional arguments passed to `get_count_rate_ms`.
+                - mode: "point" (default) or "file".
         
         Returns:
             tuple[np.ndarray, np.ndarray]:
-                - count_rate : array of count rates in Hz (one per file)
+                - count_rate : array of count rates in Hz
                 - time       : array of global times in seconds since scan start
-                               (file midpoint)
-        
-        Raises:
-            KeyError:
-                If required file statistics are missing. Call `read_dataframe` first.
         """
-
-        fids_resolved = self._resolve_fids(fids=fids, runs=runs)
-    
-        ts_alias = self._config["dataframe"]["columns"].get("timestamp", "timeStamp")
-        t0 = self.metadata["file_statistics"]["timed"]["0"]["columns"][ts_alias]["min"]
-    
-        rates = []
-        times = []
-    
-        for fid in fids_resolved:
-            counts = self.metadata["file_statistics"]["electron"][str(fid)]["num_rows"]
-            ts = self.metadata["file_statistics"]["timed"][str(fid)]["columns"][ts_alias]
-    
-            dt = ts["max"] - ts["min"]
-            print(f"File duration: {dt} seconds")
-            if dt <= 0:
-                raise ValueError(f"Non-positive elapsed time for file {fid}")
-    
-            rates.append(counts / dt)
-            times.append(0.5 * (ts["min"] + ts["max"]) - t0)
-    
-        return np.asarray(rates), np.asarray(times)
+        mode = kwds.pop("mode", "point")
+        return self.get_count_rate_ms(fids=fids, mode=mode, runs=runs, **kwds)
 
     # -------------------------------
     # Time-resolved count rate (binned)
diff --git a/src/sed/loader/flash/loader.py b/src/sed/loader/flash/loader.py
@@ -223,10 +223,85 @@ def parse_metadata(self, token: str = None) -> dict:
 
     def get_count_rate(
         self,
-        fids: Sequence[int] = None,  # noqa: ARG002
-        **kwds,  # noqa: ARG002
-    ):
-        return None, None
+        fids: Sequence[int] = None,
+        **kwds,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Calculates the per-train count rate for the specified files.
+
+        Electrons are counted per unique train ID and each count is divided by
+        the nominal train period. Times are derived from the train IDs relative
+        to the first train found in the selected files, so both outputs are
+        approximations that assume a fixed repetition rate.
+
+        Args:
+            fids (Sequence[int]): A sequence of file IDs. Defaults to all files.
+            **kwds: Keyword arguments.
+                - train_period (float): Seconds between consecutive trains.
+                  Defaults to 0.1, i.e. a 10 Hz repetition rate.
+
+        Returns:
+            tuple[np.ndarray, np.ndarray]: The count rate array (Hz) and time array
+            (seconds), or ``(None, None)`` if no per-electron channel is configured
+            or the selected files contain no electron events.
+        """
+        import h5py
+        import numpy as np
+
+        if fids is None:
+            fids = range(len(self.files))
+
+        # Nominal time between consecutive trains (FLASH usually runs at 10 Hz).
+        train_period = kwds.get("train_period", 0.1)
+
+        # Get the electron channel configuration
+        per_electron_channels = get_channels(self._config["dataframe"], "per_electron")
+        if not per_electron_channels:
+            return None, None
+
+        # We need the 'index_key' (trainId) for an electron channel
+        first_channel = per_electron_channels[0]
+        channel_config = self._config["dataframe"]["channels"][first_channel]
+        index_key = channel_config["index_key"]
+
+        all_rates = []
+        all_times = []
+
+        # Reference train ID: first train of the first non-empty file. Setting it
+        # lazily (instead of only for fids[0]) keeps it defined when the first
+        # selected file contains no electron events.
+        t_start_id = None
+
+        for fid in fids:
+            with h5py.File(self.files[fid], "r") as h5:
+                # Read trainIds of all electron events
+                train_ids = np.asarray(h5[index_key])
+
+            if len(train_ids) == 0:
+                continue
+
+            # Count electrons per train; np.unique returns the IDs sorted
+            u_train_ids, counts = np.unique(train_ids, return_counts=True)
+
+            if t_start_id is None:
+                t_start_id = u_train_ids[0]
+
+            # Convert trainIds to relative seconds using the nominal period.
+            # Note: This is an approximation. A better way would be to
+            # use the actual timestamps of the trains.
+            times = (u_train_ids - t_start_id) * train_period
+
+            # Rate per trainId interval: each train is assumed to last exactly
+            # one nominal period.
+            rates = counts / train_period
+
+            all_rates.append(rates)
+            all_times.append(times)
+
+        if not all_rates:
+            return None, None
+
+        return np.concatenate(all_rates), np.concatenate(all_times)
 
     def get_elapsed_time(self, fids: Sequence[int] = None, **kwds) -> float | list[float]:  # type: ignore[override]
         """
diff --git a/tests/loader/cfel/test_metadata.py b/tests/loader/cfel/test_metadata.py