feat(AliasDataFrameRDF): Phase 3 - RDataFrame integration with benchmarks

miranov25 · miranov25 · commit 5595ec550899 · 2025-12-04T14:18:50.000+01:00
Core features:
- Sparse key support for multi-key joins (np.unique vectorized)
- Auto-selection between dense/sparse based on key distribution
- 10 new sparse key tests

Benchmarks:
- benchmark_rdf.py: Compare RDataFrame vs TTree::Draw vs AliasDataFrame
- generate_synthetic_data.py --rdf: 4 subframes with ground truth
- Integrated into run_benchmark.sh (7 benchmarks total)

Results (1M rows, 10-level alias chain):
- AliasDataFrame: 0.039s (25x faster than TTree::Draw)
- TTree::Draw: 0.964s (baseline)
- RDataFrame: 1.357s (slower due to JIT overhead at this scale)

Validation: PASSED (RDF matches Python materialization)

Co-authored-by: Claude (Architect)
Reviewed-by: GPT, Gemini
diff --git a/UTILS/dfextensions/AliasDataFrame/AliasDataFrameRDF.py b/UTILS/dfextensions/AliasDataFrame/AliasDataFrameRDF.py
@@ -323,6 +323,162 @@ def extract_dependencies(expr: str, known_names: Set[str] = None) -> List[str]:
     return sorted(candidates)
 
 
+# =============================================================================
+# Sparse Key Support for Multi-Key Joins
+# =============================================================================
+
+def should_use_sparse(df, key_columns):
+    """
+    Determine if sparse key mapping should be used instead of compact linearization.
+
+    Use sparse mapping when:
+    1. Any key column contains a negative value (the compact linearization
+       ``k0 + k1*max0 + ...`` is only collision-free for keys in [0, max)), OR
+    2. Compact range exceeds int32 (2^31), OR
+    3. Compact range is >10x wasteful compared to actual unique combinations
+
+    Parameters
+    ----------
+    df : DataFrame
+        DataFrame with key columns
+    key_columns : list of str
+        Column names forming the composite key
+
+    Returns
+    -------
+    bool
+        True if sparse mapping should be used. An empty DataFrame returns
+        False because there are no keys to map.
+    """
+    if len(df) == 0:
+        # max()/min() of an empty column is NaN and cannot be converted to int.
+        return False
+
+    # Accumulate the range with Python integers: a fixed-width int64 product
+    # wraps around silently (e.g. 2^32 * 2^32 becomes 0), which would make a
+    # huge key space look compact.
+    compact_range = 1
+    for col in key_columns:
+        column = df[col]
+        if column.min() < 0:
+            # Negative keys break the dense formula, so force sparse mapping.
+            return True
+        compact_range *= int(column.max()) + 1
+
+    if compact_range > 2**31:
+        return True
+
+    # Count the key combinations that really occur, not the product of the
+    # per-column cardinalities (which is only an upper bound).
+    n_unique = len(df[list(key_columns)].drop_duplicates())
+    return compact_range > 10 * n_unique
+
+
+def compute_composite_key_dense(df, key_columns, max_values=None):
+    """
+    Linearize several key columns into one integer key per row.
+
+    The result is the mixed-radix number
+    ``k0 + k1*max0 + k2*max0*max1 + ...`` where ``max_i`` is the radix
+    (exclusive upper bound) used for key column ``i``.
+
+    Parameters
+    ----------
+    df : DataFrame
+        DataFrame holding the key columns
+    key_columns : list of str
+        Names of the columns that form the composite key, lowest digit first
+    max_values : list of int, optional
+        Radix for each key column. When omitted, each radix is taken from
+        the data as ``max(column) + 1``.
+
+    Returns
+    -------
+    np.ndarray
+        Composite keys, built from the key columns cast to int64
+    """
+    import numpy as np
+
+    if max_values is None:
+        max_values = [int(df[name].max()) + 1 for name in key_columns]
+
+    leading, trailing = key_columns[0], key_columns[1:]
+
+    # The first column is the lowest digit and enters with weight 1.
+    composite = df[leading].values.astype(np.int64)
+    stride = max_values[0]
+
+    position = 1
+    for name in trailing:
+        digit = df[name].values.astype(np.int64)
+        composite = composite + digit * stride
+        # Weight of the next digit: product of all radices seen so far.
+        stride *= max_values[position]
+        position += 1
+
+    return composite
+
+
+def compute_composite_key_sparse(main_df, sub_df, key_columns):
+    """
+    Compute composite key using vectorized unique value mapping.
+
+    Works for any key distribution (dense or sparse).
+    Uses np.unique(axis=0) for efficient vectorized computation.
+
+    Parameters
+    ----------
+    main_df : DataFrame
+        Main DataFrame with key columns
+    sub_df : DataFrame
+        Subframe DataFrame with key columns
+    key_columns : list of str
+        Column names forming the composite key
+
+    Returns
+    -------
+    main_keys : np.ndarray
+        1-D int64 composite keys for main DataFrame
+    sub_keys : np.ndarray
+        1-D int64 composite keys for subframe DataFrame
+
+    Notes
+    -----
+    Both DataFrames use the same mapping, ensuring keys match for joins.
+    Each key is the index of the row's key tuple among the unique key
+    tuples of the stacked main + sub rows.
+    Complexity: O(n log n) via np.unique, fully vectorized.
+    """
+    import numpy as np
+
+    # Combine main and sub to build shared mapping
+    main_vals = main_df[key_columns].to_numpy()
+    sub_vals = sub_df[key_columns].to_numpy()
+    all_vals = np.vstack([main_vals, sub_vals])
+
+    # Get unique rows and inverse mapping
+    _, inverse = np.unique(all_vals, axis=0, return_inverse=True)
+
+    # NumPy 2.0.0 returned a higher-dimensional inverse when an axis is
+    # given; flattening keeps the keys 1-D on every NumPy version.
+    inverse = inverse.reshape(-1)
+
+    # Split back into main and sub
+    n_main = len(main_df)
+    main_keys = inverse[:n_main].astype(np.int64)
+    sub_keys = inverse[n_main:].astype(np.int64)
+
+    return main_keys, sub_keys
+
+
+def compute_composite_key_auto(main_df, sub_df, key_columns):
+    """
+    Build join keys for two frames, picking the key scheme automatically.
+
+    The key columns of both frames are stacked and handed to
+    ``should_use_sparse``; its answer selects either the compact (dense)
+    linearization or the sparse unique-value mapping.
+
+    Parameters
+    ----------
+    main_df : DataFrame
+        Main DataFrame with key columns
+    sub_df : DataFrame
+        Subframe DataFrame with key columns
+    key_columns : list of str
+        Column names forming the composite key
+
+    Returns
+    -------
+    main_keys : np.ndarray
+        Composite keys for the main DataFrame
+    sub_keys : np.ndarray
+        Composite keys for the subframe DataFrame
+    method : str
+        'dense' or 'sparse', naming the scheme that produced the keys
+    """
+    import pandas as pd
+
+    # The decision must see the keys of both frames at once.
+    union = pd.concat(
+        [main_df[key_columns], sub_df[key_columns]],
+        ignore_index=True,
+    )
+
+    if not should_use_sparse(union, key_columns):
+        # Radices come from the union so both frames are linearized with
+        # the same multipliers and their keys stay comparable.
+        limits = [int(union[name].max()) + 1 for name in key_columns]
+        dense_main = compute_composite_key_dense(main_df, key_columns, limits)
+        dense_sub = compute_composite_key_dense(sub_df, key_columns, limits)
+        return dense_main, dense_sub, 'dense'
+
+    sparse_main, sparse_sub = compute_composite_key_sparse(main_df, sub_df, key_columns)
+    return sparse_main, sparse_sub, 'sparse'
+
+
 # =============================================================================
 # Dependency Resolution
 # =============================================================================
diff --git a/UTILS/dfextensions/AliasDataFrame/tests/test_AliasDataFrameRDF.py b/UTILS/dfextensions/AliasDataFrame/tests/test_AliasDataFrameRDF.py
@@ -157,6 +157,173 @@ def test_numpy_prefix_excluded(self):
         assert 'x' in deps
 
 
+# =============================================================================
+# Sparse Key Tests
+# =============================================================================
+
+import numpy as np
+import pandas as pd
+
+from AliasDataFrameRDF import (
+    should_use_sparse,
+    compute_composite_key_dense,
+    compute_composite_key_sparse,
+    compute_composite_key_auto,
+)
+
+
+class TestSparseKeySupport:
+    """Test sparse key mapping for multi-key joins."""
+
+    def test_should_use_sparse_small_range(self):
+        """Small contiguous range should use dense."""
+        df = pd.DataFrame({
+            'a': [0, 1, 2, 3, 4],
+            'b': [0, 1, 2, 3, 4],
+        })
+        assert not should_use_sparse(df, ['a', 'b'])
+
+    def test_should_use_sparse_large_range(self):
+        """Large range exceeding int32 should use sparse."""
+        df = pd.DataFrame({
+            'a': [0, 100000],
+            'b': [0, 100000],
+            'c': [0, 100000],
+        })
+        # 100001^3 > 2^31
+        assert should_use_sparse(df, ['a', 'b', 'c'])
+
+    def test_should_use_sparse_wasteful(self):
+        """Wasteful range (>10x unique) should use sparse."""
+        df = pd.DataFrame({
+            'a': [0, 1000],  # max 1001
+            'b': [0, 1000],  # max 1001
+        })
+        # Compact range: 1001*1001 = 1M, unique: 2, ratio > 10x
+        assert should_use_sparse(df, ['a', 'b'])
+
+    def test_dense_key_basic(self):
+        """Test dense key computation with explicit and inferred radices."""
+        df = pd.DataFrame({
+            'a': [0, 1, 2],
+            'b': [0, 1, 0],
+        })
+        # key = a + b*3
+        expected = np.array([0, 4, 2])  # 0+0*3, 1+1*3, 2+0*3
+
+        keys = compute_composite_key_dense(df, ['a', 'b'], max_values=[3, 2])
+        np.testing.assert_array_equal(keys, expected)
+
+        # Without max_values the radix of 'a' is max(a) + 1 = 3, so the
+        # result must be identical.
+        inferred = compute_composite_key_dense(df, ['a', 'b'])
+        np.testing.assert_array_equal(inferred, expected)
+
+    def test_sparse_key_with_gaps(self):
+        """Sparse keys with gaps should produce contiguous indices."""
+        main_df = pd.DataFrame({
+            'k1': [0, 100, 500],
+            'k2': [5, 10, 15],
+        })
+        sub_df = pd.DataFrame({
+            'k1': [0, 100, 500, 999],
+            'k2': [5, 10, 15, 20],
+        })
+
+        main_keys, sub_keys = compute_composite_key_sparse(main_df, sub_df, ['k1', 'k2'])
+
+        # 4 unique combos: (0,5), (100,10), (500,15) shared + (999,20) only
+        # in sub, so the keys must be exactly the contiguous range 0..3.
+        all_keys = np.concatenate([main_keys, sub_keys])
+        assert np.unique(all_keys).tolist() == [0, 1, 2, 3]
+
+        # The three shared rows appear in the same order in both frames,
+        # so their keys must agree position by position.
+        assert np.array_equal(main_keys, sub_keys[:3])
+
+    def test_sparse_key_large_values(self):
+        """Sparse keys with values exceeding int32 range."""
+        main_df = pd.DataFrame({
+            'orbit': [1_000_000_000, 2_000_000_000, 3_000_000_000],
+            'row': [0, 1, 2],
+        })
+        sub_df = pd.DataFrame({
+            'orbit': [1_000_000_000, 2_000_000_000],
+            'row': [0, 1],
+        })
+
+        main_keys, sub_keys = compute_composite_key_sparse(main_df, sub_df, ['orbit', 'row'])
+
+        # Should produce small contiguous integers
+        assert main_keys.max() < 10
+        assert sub_keys.max() < 10
+
+        # Distinct key tuples must stay distinct ...
+        assert len(np.unique(main_keys)) == 3
+        # ... and the rows shared with the subframe must map to the same key.
+        assert np.array_equal(main_keys[:2], sub_keys)
+
+    def test_sparse_key_shared_mapping(self):
+        """Main and subframe must use same key mapping."""
+        main_df = pd.DataFrame({
+            'k': [1, 2, 3],
+        })
+        sub_df = pd.DataFrame({
+            'k': [2, 3, 4],  # Overlapping + extra
+        })
+
+        main_keys, sub_keys = compute_composite_key_sparse(main_df, sub_df, ['k'])
+
+        # Overlapping values must have the same key in both frames
+        for value in (2, 3):
+            main_pos = main_df[main_df['k'] == value].index[0]
+            sub_pos = sub_df[sub_df['k'] == value].index[0]
+            assert main_keys[main_pos] == sub_keys[sub_pos], f"k={value} mapped differently"
+
+        # Values present in only one frame must not collide with the other
+        assert main_keys[0] not in sub_keys  # k=1 only in main
+        assert sub_keys[2] not in main_keys  # k=4 only in sub
+
+    def test_sparse_matches_dense_for_contiguous(self):
+        """Sparse and dense should produce equivalent joins for contiguous keys."""
+        main_df = pd.DataFrame({
+            'a': [0, 0, 1, 1, 2, 2],
+            'b': [0, 1, 0, 1, 0, 1],
+            'val': [10, 20, 30, 40, 50, 60],
+        })
+        sub_df = pd.DataFrame({
+            'a': [0, 1, 2],
+            'b': [0, 0, 0],
+            'calib': [1.0, 2.0, 3.0],
+        })
+
+        # Dense keys
+        max_values = [3, 2]
+        main_dense = compute_composite_key_dense(main_df, ['a', 'b'], max_values)
+        sub_dense = compute_composite_key_dense(sub_df, ['a', 'b'], max_values)
+
+        # Sparse keys
+        main_sparse, sub_sparse = compute_composite_key_sparse(main_df, sub_df, ['a', 'b'])
+
+        # Map each key to the subframe row that carries it
+        dense_lookup = {key: row for row, key in enumerate(sub_dense)}
+        sparse_lookup = {key: row for row, key in enumerate(sub_sparse)}
+
+        # Only rows with b == 0 have a partner in the subframe
+        expected_matches = [0, -1, 1, -1, 2, -1]
+
+        rows = enumerate(zip(main_dense, main_sparse, expected_matches))
+        for i, (dense_key, sparse_key, expected) in rows:
+            dense_match = dense_lookup.get(dense_key, -1)
+            sparse_match = sparse_lookup.get(sparse_key, -1)
+            assert dense_match == sparse_match, f"Row {i}: dense={dense_match}, sparse={sparse_match}"
+            assert dense_match == expected, f"Row {i}: matched {dense_match}, expected {expected}"
+
+    def test_auto_selects_dense_for_small(self):
+        """Auto should select dense for small contiguous keys."""
+        main_df = pd.DataFrame({'k': [0, 1, 2]})
+        sub_df = pd.DataFrame({'k': [0, 1, 2]})
+
+        main_keys, sub_keys, method = compute_composite_key_auto(main_df, sub_df, ['k'])
+        assert method == 'dense'
+
+        # A single dense key column is its own composite key
+        np.testing.assert_array_equal(main_keys, [0, 1, 2])
+        np.testing.assert_array_equal(sub_keys, [0, 1, 2])
+
+    def test_auto_selects_sparse_for_large(self):
+        """Auto should select sparse for large/wasteful keys."""
+        main_df = pd.DataFrame({'k': [0, 1_000_000_000]})
+        sub_df = pd.DataFrame({'k': [0, 1_000_000_000]})
+
+        main_keys, sub_keys, method = compute_composite_key_auto(main_df, sub_df, ['k'])
+        assert method == 'sparse'
+
+        # Identical frames must receive identical, distinct keys
+        assert np.array_equal(main_keys, sub_keys)
+        assert len(np.unique(main_keys)) == 2
+
+
 class TestGetOrderedDefines:
     """Test dependency resolution and ordering."""