Skip to content

Commit adac6b4

Browse files
author
miranov25
committed
feat(AliasDataFrame): Add Numba acceleration (Phase 8a + 8b foundation)
Phase 8a: Numba scatter for value extraction
- JIT-compiled numba_scatter() with f32/f64/i64 dispatch
- Parallel execution with prange for large arrays
- Scatter operation: 0.032s → 0.004s (8x faster)

Phase 8b: Numba index lookup (single-column integer keys)
- JIT-compiled numba_compute_join_indices()
- Auto-select direct addressing vs hash-based lookup
- Falls back to pandas for multi-column keys

API:
- Add use_numba parameter to constructor (None/True/False)
- Add numba_info property for status inspection
- Graceful fallback when Numba unavailable

Performance impact:
- Scatter operations: 8x faster
- Overall subframe materialization: ~10-15% improvement
- Primary benchmark uses multi-column keys (Phase 8b not exercised)

Note: Phase 8c will add multi-column key linearization for full benefit.

Requires: numba>=0.50 (optional dependency)
1 parent e8ba0c4 commit adac6b4

3 files changed

Lines changed: 902 additions & 14 deletions

File tree

UTILS/dfextensions/AliasDataFrame/AliasDataFrame.py

Lines changed: 94 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,19 @@
1717
import re
1818
import ast
1919

20+
# Numba acceleration (optional)
21+
try:
22+
from _numba_accelerators import (
23+
NUMBA_AVAILABLE, NUMBA_MIN_ROWS,
24+
numba_scatter, numba_compute_join_indices, get_numba_info
25+
)
26+
except ImportError:
27+
NUMBA_AVAILABLE = False
28+
NUMBA_MIN_ROWS = 10000
29+
numba_scatter = None
30+
numba_compute_join_indices = None
31+
get_numba_info = lambda: {'available': False, 'version': None}
32+
2033
# =============================================================================
2134
# SECTION 0: Schema & Metadata Constants
2235
# =============================================================================
@@ -711,7 +724,7 @@ class AliasDataFrame:
711724
Phase 4: Uses unified _schema dict as single source of truth.
712725
"""
713726

714-
def __init__(self, df, schema_id=None):
727+
def __init__(self, df, schema_id=None, use_numba=None):
715728
"""
716729
Initialize AliasDataFrame with unified schema structure.
717730
@@ -722,6 +735,10 @@ def __init__(self, df, schema_id=None):
722735
schema_id : str, optional
723736
User-defined identifier for this schema (e.g., "miranov_lxplus_TPC_calib_v3").
724737
Useful for parameter scans, test studies, and provenance tracking.
738+
use_numba : bool, optional
739+
Enable/disable Numba acceleration for subframe joins.
740+
If None (default), auto-detect: use Numba if available.
741+
Set to False to force pure NumPy/Pandas operations; True has no effect when Numba is unavailable.
725742
726743
The _schema dict is the single source of truth for:
727744
- __meta__: schema version, timestamps, user-defined ID
@@ -809,6 +826,13 @@ def __init__(self, df, schema_id=None):
809826
self._join_index_cache = {} # {sf_name: {indices, missing_mask, n_rows, subframe_id}}
810827
self._join_cache_hits = 0
811828
self._join_cache_misses = 0
829+
830+
# Phase 8: Numba acceleration configuration
831+
# Auto-detect if not specified: use Numba when available
832+
if use_numba is None:
833+
self._use_numba = NUMBA_AVAILABLE
834+
else:
835+
self._use_numba = use_numba and NUMBA_AVAILABLE
812836

813837
# =========================================================================
814838
# SECTION 1: Core DataFrame Operations & Schema Properties
@@ -855,6 +879,30 @@ def set_schema_id(self, schema_id):
855879
self.schema_id = schema_id
856880
return self
857881

882+
@property
def numba_info(self):
    """
    Get information about Numba acceleration status.

    Returns
    -------
    dict
        A new dict on every access, built from ``get_numba_info()`` plus
        two keys added here:
        - enabled: the value of ``self._use_numba`` for this instance
        - min_rows: the module-level ``NUMBA_MIN_ROWS`` threshold
        The remaining keys (e.g. available, version) are whatever
        ``get_numba_info()`` reports.

    Example
    -------
    >>> adf.numba_info
    {'available': True, 'enabled': True, 'version': '0.57.0', 'min_rows': 10000}
    """
    # Copy before annotating: the helper's dict may be shared or cached by
    # its module, and per-instance keys must not leak into that object.
    info = dict(get_numba_info())
    info['enabled'] = self._use_numba
    info['min_rows'] = NUMBA_MIN_ROWS
    return info
905+
858906
# =========================================================================
859907
# Phase 4: Backward Compatibility Properties
860908
# =========================================================================
@@ -1765,8 +1813,8 @@ def _compute_join_indices(self, sf_name, index_cols):
17651813
"""
17661814
Compute join index mapping from main DataFrame to subframe rows.
17671815
1768-
Uses lightweight merge (keys only) to build index mapping without
1769-
copying full subframe data.
1816+
Uses Numba JIT-compiled lookup (Phase 8b) for single-column integer keys,
1817+
falls back to lightweight merge (keys only) for complex cases.
17701818
17711819
Parameters
17721820
----------
@@ -1789,23 +1837,49 @@ def _compute_join_indices(self, sf_name, index_cols):
17891837
- Deduplicates subframe on index_cols only (not full columns)
17901838
- Takes first match for duplicate keys (keep='first')
17911839
- Indices refer to ORIGINAL subframe rows (before deduplication)
1840+
- Phase 8b: Uses Numba for single-column integer keys (main frame with >= NUMBA_MIN_ROWS rows)
17921841
"""
17931842
sub_adf = self.get_subframe(sf_name)
17941843
sub_df = sub_adf.df
1844+
n_main = len(self.df)
17951845

1846+
# Phase 8b: Try Numba path for single-column integer keys
1847+
if (self._use_numba
1848+
and numba_compute_join_indices is not None
1849+
and len(index_cols) == 1
1850+
and n_main >= NUMBA_MIN_ROWS):
1851+
1852+
col = index_cols[0]
1853+
main_keys = self.df[col].to_numpy()
1854+
sub_keys = sub_df[col].to_numpy()
1855+
1856+
# Check if keys are integer-compatible
1857+
if (np.issubdtype(main_keys.dtype, np.integer) and
1858+
np.issubdtype(sub_keys.dtype, np.integer)):
1859+
1860+
# Use Numba index lookup
1861+
indices, missing_mask, used_numba = numba_compute_join_indices(
1862+
main_keys.astype(np.int64),
1863+
sub_keys.astype(np.int64)
1864+
)
1865+
1866+
if used_numba:
1867+
return indices, missing_mask
1868+
1869+
# Fallback: pandas merge (multi-column or non-integer keys, fewer than NUMBA_MIN_ROWS rows, Numba disabled, or the Numba lookup reported used_numba=False)
17961870
# Build lightweight key table with row indices into ORIGINAL subframe
17971871
# Critical: Add __sub_row__ BEFORE deduplication so indices map to original rows
1798-
sub_keys = sub_df[index_cols].copy()
1799-
sub_keys['__sub_row__'] = np.arange(len(sub_df), dtype=np.int64)
1872+
sub_keys_df = sub_df[index_cols].copy()
1873+
sub_keys_df['__sub_row__'] = np.arange(len(sub_df), dtype=np.int64)
18001874

18011875
# Deduplicate on index_cols only, keeping first match
1802-
if sub_keys.duplicated(subset=index_cols).any():
1803-
sub_keys = sub_keys.drop_duplicates(subset=index_cols, keep='first')
1876+
if sub_keys_df.duplicated(subset=index_cols).any():
1877+
sub_keys_df = sub_keys_df.drop_duplicates(subset=index_cols, keep='first')
18041878

18051879
# Lightweight merge: main keys -> subframe row indices
18061880
# Left merge preserves main DataFrame row order (Many-to-One join)
1807-
main_keys = self.df[index_cols]
1808-
merged = main_keys.merge(sub_keys, on=index_cols, how='left', sort=False)
1881+
main_keys_df = self.df[index_cols]
1882+
merged = main_keys_df.merge(sub_keys_df, on=index_cols, how='left', sort=False)
18091883

18101884
# Extract indices and missing mask
18111885
indices = merged['__sub_row__'].fillna(-1).astype(np.int64).to_numpy()
@@ -1817,7 +1891,8 @@ def _extract_subframe_values_cached(self, sf_name, sf_col, indices, missing_mask
18171891
"""
18181892
Extract subframe column values using cached indices.
18191893
1820-
Uses NumPy advanced indexing for fast value extraction.
1894+
Uses Numba JIT-compiled scatter (Phase 8a) when available,
1895+
falls back to NumPy advanced indexing.
18211896
18221897
Parameters
18231898
----------
@@ -1856,11 +1931,16 @@ def _extract_subframe_values_cached(self, sf_name, sf_col, indices, missing_mask
18561931
else:
18571932
values = np.full(n, np.nan, dtype=np.float64)
18581933

1859-
# NumPy advanced indexing - fast C-level operation
1860-
valid = indices >= 0
1861-
values[valid] = sub_values[indices[valid]]
1934+
# Phase 8a: Use Numba scatter if available and worthwhile
1935+
if self._use_numba and n >= NUMBA_MIN_ROWS and numba_scatter is not None:
1936+
# Numba scatter modifies values in-place
1937+
numba_scatter(sub_values, indices, values)
1938+
else:
1939+
# NumPy advanced indexing - fast C-level operation
1940+
valid = indices >= 0
1941+
values[valid] = sub_values[indices[valid]]
18621942

1863-
# Apply fill configuration
1943+
# Apply fill configuration (fill policy is applied in Python, after the scatter step)
18641944
values = self._apply_fill_config(sf_name, values, missing_mask, n)
18651945

18661946
return values

0 commit comments

Comments
 (0)