Adjustments to support multiple files

Aserhisham · Aserhisham · commit 475eb8cea904 · 2025-07-27T21:54:56.000+02:00
diff --git a/src/sed/config/lab_example_config.yaml b/src/sed/config/lab_example_config.yaml
@@ -35,6 +35,7 @@ dataframe:
 
   first_event_time_stamp_key: /ScanParam/StartTime
   ms_markers_key: /SlowData/exposure_time
+  millis_counter_key: /DLD/millisecCounter
 
   # Time and binning settings
   tof_binwidth: 2.0576131995767355E-11            # Base time-of-flight bin width in seconds
diff --git a/src/sed/core/config_model.py b/src/sed/core/config_model.py
@@ -133,6 +133,8 @@ class DataframeModel(BaseModel):
     # mpes specific settings
     first_event_time_stamp_key: Optional[str] = None
     ms_markers_key: Optional[str] = None
+    # cfel specific settings
+    millis_counter_key: Optional[str] = None
     # flash specific settings
     forward_fill_iterations: Optional[int] = None
     ubid_offset: Optional[int] = None
diff --git a/src/sed/loader/cfel/buffer_handler.py b/src/sed/loader/cfel/buffer_handler.py
@@ -4,6 +4,8 @@
 from pathlib import Path
 
 import dask.dataframe as dd
+from joblib import delayed
+from joblib import Parallel
 
 from sed.core.logging import setup_logging
 from sed.loader.cfel.dataframe import DataFrameCreator
@@ -45,14 +47,88 @@ def _validate_h5_files(self, config, h5_paths: list[Path]) -> list[Path]:
 
         return valid_h5_paths    
 
-    def _save_buffer_file(self, paths: dict[str, Path]) -> None:
-        """Creates the electron and timed buffer files from the raw H5 file."""
-        logger.debug(f"Processing file: {paths['raw'].stem}")
-        start_time = time.time()
+    def _save_buffer_files(self, force_recreate: bool, debug: bool) -> None:
+        """
+        Creates the buffer files that are missing, handling multi-file runs properly.
 
-        # Create DataFrameCreator and get get dataframe
-        dfc = DataFrameCreator(config_dataframe=self._config, h5_path=paths["raw"])
+        Args:
+            force_recreate (bool): Flag to force recreation of buffer files.
+            debug (bool): Flag to enable debug mode, which serializes the creation.
+        """
+        file_sets = self.fp.file_sets_to_process(force_recreate)
+        logger.info(f"Reading files: {len(file_sets)} new files of {len(self.fp)} total.")
+        
+        if len(file_sets) == 0:
+            return
+            
+        # Sort file sets by filename to ensure proper order
+        file_sets = sorted(file_sets, key=lambda x: x['raw'].name)
+        
+        # Get base timestamp from the first file if we have multiple files
+        base_timestamp = None
+        if len(file_sets) > 1:
+            try:
+                # Find the first file (ends with _0000)
+                first_file_set = None
+                for file_set in file_sets:
+                    if file_set['raw'].stem.endswith('_0000'):
+                        first_file_set = file_set
+                        break
+                
+                if first_file_set:
+                    # Create a temporary DataFrameCreator to extract base timestamp
+                    first_dfc = DataFrameCreator(
+                        config_dataframe=self._config, 
+                        h5_path=first_file_set['raw'],
+                        is_first_file=True
+                    )
+                    base_timestamp = first_dfc.get_base_timestamp()
+                    first_dfc.h5_file.close()  # Clean up
+                    logger.info(f"Multi-file run detected. Base timestamp: {base_timestamp}")
+            except Exception as e:
+                logger.warning(f"Could not extract base timestamp: {e}. Processing files independently.")
+                base_timestamp = None
+        
+        n_cores = min(len(file_sets), self.n_cores)
+        if n_cores > 0:
+            if debug:
+                for file_set in file_sets:
+                    is_first_file = file_set['raw'].stem.endswith('_0000')
+                    self._save_buffer_file(file_set, is_first_file, base_timestamp)
+            else:
+                # Each job receives its own is_first_file flag and the shared base
+                # timestamp, so no job relies on another one having run before it
+                from joblib import delayed, Parallel
+                
+                Parallel(n_jobs=n_cores, verbose=10)(
+                    delayed(self._save_buffer_file)(
+                        file_set, 
+                        file_set['raw'].stem.endswith('_0000'),
+                        base_timestamp
+                    ) 
+                    for file_set in file_sets
+                )
+
+    def _save_buffer_file(self, file_set, is_first_file=True, base_timestamp=None):
+        """
+        Saves an HDF5 file to a Parquet file using the DataFrameCreator class.
+        
+        Args:
+            file_set: Dictionary containing file paths
+            is_first_file: Whether this is the first file in a multi-file run
+            base_timestamp: Base timestamp from the first file (for subsequent files)
+        """
+        start_time = time.time()
+        paths = file_set
+        
+        dfc = DataFrameCreator(
+            config_dataframe=self._config, 
+            h5_path=paths["raw"],
+            is_first_file=is_first_file,
+            base_timestamp=base_timestamp
+        )
         df = dfc.df
+        df_timed = dfc.df_timed
 
         # Save electron resolved dataframe
         electron_channels = get_channels(self._config, "per_electron")
@@ -62,7 +138,6 @@ def _save_buffer_file(self, paths: dict[str, Path]) -> None:
         electron_df.to_parquet(paths["electron"])
 
         # Create and save timed dataframe
-        df_timed = dfc.df_timed
         dtypes = get_dtypes(self._config, df_timed.columns.values)
         timed_df = df_timed.astype(dtypes)
         logger.debug(f"Saving timed buffer with shape: {timed_df.shape}")
diff --git a/src/sed/loader/cfel/dataframe.py b/src/sed/loader/cfel/dataframe.py
@@ -29,16 +29,21 @@ class DataFrameCreator:
         _config (dict): The configuration dictionary for the DataFrame.
     """
 
-    def __init__(self, config_dataframe: dict, h5_path: Path) -> None:
+    def __init__(self, config_dataframe: dict, h5_path: Path, 
+                 is_first_file: bool = True, base_timestamp: pd.Timestamp = None) -> None:
         """
         Initializes the DataFrameCreator class.
 
         Args:
             config_dataframe (dict): The configuration dictionary with only the dataframe key.
             h5_path (Path): Path to the h5 file.
+            is_first_file (bool): Whether this is the first file in a multi-file run.
+            base_timestamp (pd.Timestamp): Base timestamp from the first file (for subsequent files).
         """
         self.h5_file = h5py.File(h5_path, "r")
         self._config = config_dataframe
+        self.is_first_file = is_first_file
+        self.base_timestamp = base_timestamp
 
         index_alias = self._config.get("index", ["countId"])[0]
         # all values except the last as slow data starts from start of file
@@ -83,6 +88,19 @@ def get_dataset_array(
 
         return dataset
 
+    def get_base_timestamp(self) -> pd.Timestamp:
+        """
+        Extracts the base timestamp from the first file to be used for subsequent files.
+        
+        Returns:
+            pd.Timestamp: The base timestamp from the first file.
+        """
+        if not self.is_first_file:
+            raise ValueError("get_base_timestamp() should only be called on the first file")
+        
+        first_timestamp = self.h5_file[self._config.get("first_event_time_stamp_key")][0]
+        return pd.to_datetime(first_timestamp.decode())
+
     @property
     def df_electron(self) -> pd.DataFrame:
         """
@@ -141,14 +159,72 @@ def df_train(self) -> pd.DataFrame:
     @property
     def df_timestamp(self) -> pd.DataFrame:
         """
-        Uses the first_event_time_stamp_key to get initial timestamp and the
-        ms_markers_key which is a dataset of exposure times same size as the index."""
+        First file with first_event_time_stamp_key: Uses that (plus the first millis_counter_key value, if present) as start.
+        Subsequent files: Use base_timestamp plus the first millis_counter_key value as start.
+        Both use ms_markers_key for exposure times within the file.
+        """
 
-        first_timestamp = self.h5_file[self._config.get("first_event_time_stamp_key")][
-            0
-        ]  # single value
-        ts_start = pd.to_datetime(first_timestamp.decode())
-        # actually in seconds but using milliseconds for consistency with mpes loader
+        # Try to determine which timestamp approach to use based on available data
+        first_timestamp_key = self._config.get("first_event_time_stamp_key")
+        millis_counter_key = self._config.get("millis_counter_key", "/DLD/millisecCounter")
+        
+        has_first_timestamp = (first_timestamp_key is not None and 
+                             first_timestamp_key in self.h5_file and 
+                             len(self.h5_file[first_timestamp_key]) > 0)
+        
+        has_millis_counter = (millis_counter_key in self.h5_file and 
+                            len(self.h5_file[millis_counter_key]) > 0)
+
+        # Read the millisecond counter values whenever the dataset is present
+        if has_millis_counter:
+            millis_counter_values = self.h5_file[millis_counter_key][()]
+
+        if self.is_first_file and has_first_timestamp:
+            logger.warning("DEBUG: Taking first file with scan start timestamp path")
+            # First file with scan start timestamp
+            first_timestamp = self.h5_file[first_timestamp_key][0]
+            base_ts = pd.to_datetime(first_timestamp.decode())
+            
+            # Offset the base timestamp by the first millisecond counter value if available
+            if has_millis_counter:
+                millis_counter_values = self.h5_file[millis_counter_key][()]
+                millis_min = millis_counter_values[0]   # First value
+                millis_max = millis_counter_values[-1]  # Last value
+
+                # Add the first millisecond counter value to the base timestamp
+                ts_start = base_ts + pd.Timedelta(milliseconds=millis_min)   
+
+                # Calculate what these would be as timestamps
+                ts_min_from_millis = base_ts + pd.Timedelta(milliseconds=millis_min)
+                ts_max_from_millis = base_ts + pd.Timedelta(milliseconds=millis_max)
+            else:
+                # Fallback if no millisecond counter
+                ts_start = base_ts
+        elif not self.is_first_file and self.base_timestamp is not None and has_millis_counter:
+            # Subsequent files: use base timestamp + millisecond counter offset
+            millis_counter_values = self.h5_file[millis_counter_key][()]  # Get all values
+            
+            # Get min (first) and max (last) millisecond values
+            millis_min = millis_counter_values[0]   # First value
+            millis_max = millis_counter_values[-1]  # Last value
+            
+            # Calculate timestamps for min and max
+            ts_min = self.base_timestamp + pd.Timedelta(milliseconds=millis_min)
+            ts_max = self.base_timestamp + pd.Timedelta(milliseconds=millis_max)
+            
+            logger.warning(f"DEBUG: Timestamp for min: {ts_min}")
+            logger.warning(f"DEBUG: Timestamp for max: {ts_max}")
+            
+            # Use the first value (start time) for calculating offset
+            millis_counter = millis_counter_values[0]  # First element is the start time
+            offset = pd.Timedelta(milliseconds=millis_counter)
+            ts_start = self.base_timestamp + offset
+        else:
+            logger.warning("DEBUG: Falling through to undefined ts_start - THIS IS THE PROBLEM!")
+            logger.warning(f"DEBUG: Condition 1: is_first_file={self.is_first_file} AND has_first_timestamp={has_first_timestamp} = {self.is_first_file and has_first_timestamp}")
+            logger.warning(f"DEBUG: Condition 2: not is_first_file={not self.is_first_file} AND base_timestamp is not None={self.base_timestamp is not None} AND has_millis_counter={has_millis_counter} = {not self.is_first_file and self.base_timestamp is not None and has_millis_counter}")
+
+        # Get exposure times (in seconds) for this file
         exposure_time = self.h5_file[self._config.get("ms_markers_key")][()]
 
         # Calculate cumulative exposure times
diff --git a/src/sed/loader/cfel/loader.py b/src/sed/loader/cfel/loader.py