1717import re
1818import ast
1919
# Numba acceleration (optional): when the accelerator module cannot be
# imported, define inert stand-ins so the rest of the module can test
# `NUMBA_AVAILABLE` / `is not None` instead of re-importing.
try:
    from _numba_accelerators import (
        NUMBA_AVAILABLE,
        NUMBA_MIN_ROWS,
        get_numba_info,
        numba_compute_join_indices,
        numba_scatter,
    )
except ImportError:
    NUMBA_AVAILABLE = False
    NUMBA_MIN_ROWS = 10000
    numba_scatter = None
    numba_compute_join_indices = None

    def get_numba_info():
        """Return the Numba status reported when the accelerators are absent.

        A new dict is built on every call, so callers may add keys to the
        result without affecting later calls.
        """
        return {'available': False, 'version': None}
32+
2033# =============================================================================
2134# SECTION 0: Schema & Metadata Constants
2235# =============================================================================
@@ -711,7 +724,7 @@ class AliasDataFrame:
711724 Phase 4: Uses unified _schema dict as single source of truth.
712725 """
713726
714- def __init__ (self , df , schema_id = None ):
727+ def __init__ (self , df , schema_id = None , use_numba = None ):
715728 """
716729 Initialize AliasDataFrame with unified schema structure.
717730
@@ -722,6 +735,10 @@ def __init__(self, df, schema_id=None):
722735 schema_id : str, optional
723736 User-defined identifier for this schema (e.g., "miranov_lxplus_TPC_calib_v3").
724737 Useful for parameter scans, test studies, and provenance tracking.
738+ use_numba : bool, optional
739+ Enable/disable Numba acceleration for subframe joins.
740+ If None (default), auto-detect: use Numba if available.
741+ Set to False to force pure NumPy/Pandas operations.
725742
726743 The _schema dict is the single source of truth for:
727744 - __meta__: schema version, timestamps, user-defined ID
@@ -809,6 +826,13 @@ def __init__(self, df, schema_id=None):
809826 self ._join_index_cache = {} # {sf_name: {indices, missing_mask, n_rows, subframe_id}}
810827 self ._join_cache_hits = 0
811828 self ._join_cache_misses = 0
829+
830+ # Phase 8: Numba acceleration configuration
831+ # Auto-detect if not specified: use Numba when available
832+ if use_numba is None :
833+ self ._use_numba = NUMBA_AVAILABLE
834+ else :
835+ self ._use_numba = use_numba and NUMBA_AVAILABLE
812836
813837 # =========================================================================
814838 # SECTION 1: Core DataFrame Operations & Schema Properties
@@ -855,6 +879,30 @@ def set_schema_id(self, schema_id):
855879 self .schema_id = schema_id
856880 return self
857881
@property
def numba_info(self):
    """
    Get information about Numba acceleration status.

    Returns
    -------
    dict
        A new dict on every access. It holds whatever ``get_numba_info()``
        reports (the import fallback reports ``available`` and ``version``)
        plus two keys added here:
        - enabled: bool - whether this ADF instance uses Numba
        - min_rows: int - minimum rows to use Numba (JIT overhead threshold)

    Example
    -------
    >>> adf.numba_info
    {'available': True, 'enabled': True, 'version': '0.57.0', 'min_rows': 10000}
    """
    # Copy before annotating: get_numba_info() may hand back a shared dict,
    # and the per-instance keys written below must not leak into it.
    info = dict(get_numba_info())
    info['enabled'] = self._use_numba
    info['min_rows'] = NUMBA_MIN_ROWS
    return info
905+
858906 # =========================================================================
859907 # Phase 4: Backward Compatibility Properties
860908 # =========================================================================
@@ -1765,8 +1813,8 @@ def _compute_join_indices(self, sf_name, index_cols):
17651813 """
17661814 Compute join index mapping from main DataFrame to subframe rows.
17671815
1768- Uses lightweight merge (keys only) to build index mapping without
1769- copying full subframe data .
1816+ Uses Numba JIT-compiled lookup (Phase 8b) for single-column integer keys,
1817+ falls back to lightweight merge (keys only) for complex cases .
17701818
17711819 Parameters
17721820 ----------
@@ -1789,23 +1837,49 @@ def _compute_join_indices(self, sf_name, index_cols):
17891837 - Deduplicates subframe on index_cols only (not full columns)
17901838 - Takes first match for duplicate keys (keep='first')
17911839 - Indices refer to ORIGINAL subframe rows (before deduplication)
1840+ - Phase 8b: Uses Numba for single-column integer keys when the main DataFrame has at least NUMBA_MIN_ROWS rows
17921841 """
17931842 sub_adf = self .get_subframe (sf_name )
17941843 sub_df = sub_adf .df
1844+ n_main = len (self .df )
17951845
1846+ # Phase 8b: Try Numba path for single-column integer keys
1847+ if (self ._use_numba
1848+ and numba_compute_join_indices is not None
1849+ and len (index_cols ) == 1
1850+ and n_main >= NUMBA_MIN_ROWS ):
1851+
1852+ col = index_cols [0 ]
1853+ main_keys = self .df [col ].to_numpy ()
1854+ sub_keys = sub_df [col ].to_numpy ()
1855+
1856+ # Check if keys are integer-compatible
1857+ if (np .issubdtype (main_keys .dtype , np .integer ) and
1858+ np .issubdtype (sub_keys .dtype , np .integer )):
1859+
1860+ # Use Numba index lookup
1861+ indices , missing_mask , used_numba = numba_compute_join_indices (
1862+ main_keys .astype (np .int64 ),
1863+ sub_keys .astype (np .int64 )
1864+ )
1865+
1866+ if used_numba :
1867+ return indices , missing_mask
1868+
1869+ # Fallback: Pandas merge for multi-column or non-integer keys
17961870 # Build lightweight key table with row indices into ORIGINAL subframe
17971871 # Critical: Add __sub_row__ BEFORE deduplication so indices map to original rows
1798- sub_keys = sub_df [index_cols ].copy ()
1799- sub_keys ['__sub_row__' ] = np .arange (len (sub_df ), dtype = np .int64 )
1872+ sub_keys_df = sub_df [index_cols ].copy ()
1873+ sub_keys_df ['__sub_row__' ] = np .arange (len (sub_df ), dtype = np .int64 )
18001874
18011875 # Deduplicate on index_cols only, keeping first match
1802- if sub_keys .duplicated (subset = index_cols ).any ():
1803- sub_keys = sub_keys .drop_duplicates (subset = index_cols , keep = 'first' )
1876+ if sub_keys_df .duplicated (subset = index_cols ).any ():
1877+ sub_keys_df = sub_keys_df .drop_duplicates (subset = index_cols , keep = 'first' )
18041878
18051879 # Lightweight merge: main keys -> subframe row indices
18061880 # Left merge preserves main DataFrame row order (Many-to-One join)
1807- main_keys = self .df [index_cols ]
1808- merged = main_keys .merge (sub_keys , on = index_cols , how = 'left' , sort = False )
1881+ main_keys_df = self .df [index_cols ]
1882+ merged = main_keys_df .merge (sub_keys_df , on = index_cols , how = 'left' , sort = False )
18091883
18101884 # Extract indices and missing mask
18111885 indices = merged ['__sub_row__' ].fillna (- 1 ).astype (np .int64 ).to_numpy ()
@@ -1817,7 +1891,8 @@ def _extract_subframe_values_cached(self, sf_name, sf_col, indices, missing_mask
18171891 """
18181892 Extract subframe column values using cached indices.
18191893
1820- Uses NumPy advanced indexing for fast value extraction.
1894+ Uses Numba JIT-compiled scatter (Phase 8a) when available,
1895+ falls back to NumPy advanced indexing.
18211896
18221897 Parameters
18231898 ----------
@@ -1856,11 +1931,16 @@ def _extract_subframe_values_cached(self, sf_name, sf_col, indices, missing_mask
18561931 else :
18571932 values = np .full (n , np .nan , dtype = np .float64 )
18581933
1859- # NumPy advanced indexing - fast C-level operation
1860- valid = indices >= 0
1861- values [valid ] = sub_values [indices [valid ]]
1934+ # Phase 8a: Use Numba scatter if available and worthwhile
1935+ if self ._use_numba and n >= NUMBA_MIN_ROWS and numba_scatter is not None :
1936+ # Numba scatter modifies values in-place
1937+ numba_scatter (sub_values , indices , values )
1938+ else :
1939+ # NumPy advanced indexing - fast C-level operation
1940+ valid = indices >= 0
1941+ values [valid ] = sub_values [indices [valid ]]
18621942
1863- # Apply fill configuration
1943+ # Apply fill configuration (fill policy stays in Python, outside the Numba kernel)
18641944 values = self ._apply_fill_config (sf_name , values , missing_mask , n )
18651945
18661946 return values
0 commit comments