@@ -165,29 +165,6 @@ let _permBuf: number[] = [];
165165 */
166166let _outBuf : number [ ] = [ ] ;
167167
168- // ─── sort-result cache ────────────────────────────────────────────────────────
169- /**
170- * When the same immutable `_values` array is sorted repeatedly (e.g. a
171- * benchmark loop over one Series), the O(n) partition pass and O(8n) scatter
172- * passes produce identical results every time. We cache the sorted AoS buffer
173- * and the NaN-position buffer after the first call and restore them on cache
174- * hits, so subsequent calls only run the O(n) gather loop + constructors.
175- *
176- * Cache key: reference equality of `vals` (the frozen `_values` array) PLUS
177- * the `ascending` flag (which controls sort order in the string fallback path).
178- * `naPosition` is NOT in the key — it only affects where NaN elements are
179- * placed in the output, which the gather loop handles correctly regardless.
180- */
181- let _cacheVals : readonly unknown [ ] | null = null ;
182- let _cacheAscending = true ;
183- let _cacheFi = 0 ;
184- let _cacheNi = 0 ;
185- let _cacheAllNumeric = true ;
186- /** Saved copy of the sorted AoS buffer (finCount × 3 uint32s). */
187- let _cacheSortedAoS : Uint32Array = new Uint32Array ( 0 ) ;
188- /** Saved copy of the NaN-position buffer (nanCount uint32s). */
189- let _cacheNanBufC : Uint32Array = new Uint32Array ( 0 ) ;
190-
191168// ─── SeriesOptions ────────────────────────────────────────────────────────────
192169
193170/** Constructor options accepted by `Series`. */
@@ -798,201 +775,160 @@ export class Series<T extends Scalar = Scalar> {
798775 const n = this . _values . length ;
799776 const vals = this . _values ;
800777
801- // ── Cache hit: skip O(n) partition + O(8n) scatter passes ────────────────
802- // When the same immutable _values array is sorted with the same ascending
803- // direction, the sorted AoS buffer and nanBuf are identical. Restore them
804- // directly and jump straight to the gather loop.
805- const cv = _cacheVals ;
806- const isCacheHit = cv !== null && vals === cv && ascending === _cacheAscending ;
807-
808778 let finCount : number ;
809779 let nanCount : number ;
810780 let allNumeric : boolean ;
811781 let nanBuf : Uint32Array ;
812782 let srcBuf : Uint32Array ;
813783 let finSlice : Uint32Array ;
814784
815- if ( isCacheHit ) {
816- finCount = _cacheFi ;
817- nanCount = _cacheNi ;
818- allNumeric = _cacheAllNumeric ;
819- nanBuf = _cacheNanBufC ;
820- srcBuf = _cacheSortedAoS ;
821- // finSlice is only used by the string fallback path; on a cache hit with
822- // allNumeric=true it is never read, so a zero-length view is fine.
823- finSlice = _finBuf . subarray ( 0 , 0 ) ;
824- } else {
825- // ── Full sort: partition, histogram, scatter ────────────────────────────
826- // Grow module-level buffers before the main loop so the partition loop
827- // can directly initialise the radix AoS buffer, saving a separate O(n) pass.
828- if ( _finBuf . length < n ) {
829- _finBuf = new Uint32Array ( n ) ;
830- _nanBuf = new Uint32Array ( n ) ;
831- _fvals = new Float64Array ( n ) ;
832- _fvalsU32 = new Uint32Array ( _fvals . buffer ) ;
833- }
834- // AoS buffers: each element uses 3 uint32 words [origRowIdx, loKey, hiKey].
835- if ( _rxA . length < n * 3 ) {
836- _rxA = new Uint32Array ( n * 3 ) ;
837- _rxB = new Uint32Array ( n * 3 ) ;
838- }
785+ // ── Full sort: partition, histogram, scatter ────────────────────────────
786+ // Grow module-level buffers before the main loop so the partition loop
787+ // can directly initialise the radix AoS buffer, saving a separate O(n) pass.
788+ if ( _finBuf . length < n ) {
789+ _finBuf = new Uint32Array ( n ) ;
790+ _nanBuf = new Uint32Array ( n ) ;
791+ _fvals = new Float64Array ( n ) ;
792+ _fvalsU32 = new Uint32Array ( _fvals . buffer ) ;
793+ }
794+ // AoS buffers: each element uses 3 uint32 words [origRowIdx, loKey, hiKey].
795+ if ( _rxA . length < n * 3 ) {
796+ _rxA = new Uint32Array ( n * 3 ) ;
797+ _rxB = new Uint32Array ( n * 3 ) ;
798+ }
839799
840- const finBuf = _finBuf ;
841- const fvals = _fvals ;
842- const fvalsU32 = _fvalsU32 ;
843- finCount = 0 ;
844- nanCount = 0 ;
845- allNumeric = true ;
846- // Stride counters: fsi = finCount * 2 (float view stride), rxBase = finCount * 3 (AoS stride).
847- // Maintained in sync with finCount for numeric elements, eliminating per-element multiplications.
848- let fsi = 0 ;
849- let rxBase = 0 ;
850-
851- // Clear histograms before the init loop so we can accumulate them inline.
852- _rxHisto . fill ( 0 ) ;
853-
854- // Single pass: partition NaN/null, initialise AoS radix entries for finite
855- // numerics, and accumulate all 8 histograms simultaneously — eliminating the
856- // separate O(n) histogram scan that the previous implementation required.
857- for ( let i = 0 ; i < n ; i ++ ) {
858- const v = vals [ i ] ;
859- if ( v === null || v === undefined || Number . isNaN ( v ) ) {
860- _nanBuf [ nanCount ] = i ;
861- nanCount += 1 ;
862- } else {
863- const j = finCount ;
864- finBuf [ j ] = i ;
865- if ( typeof v === "number" ) {
866- fvals [ j ] = v ;
867- // Read the IEEE-754 bits via the shared Uint32 view (same buffer, no copy).
868- let lo = fvalsU32 [ fsi ] ! ;
869- let hi = fvalsU32 [ fsi + 1 ] ! ;
870- // Transform floats to sortable unsigned integers:
871- // positive → XOR sign bit; negative → XOR all bits.
872- if ( hi & 0x80000000 ) {
873- lo = ~ lo >>> 0 ;
874- hi = ~ hi >>> 0 ;
875- } else {
876- hi = ( hi ^ 0x80000000 ) >>> 0 ;
877- }
878- _rxA [ rxBase ] = i ;
879- _rxA [ rxBase + 1 ] = lo ;
880- _rxA [ rxBase + 2 ] = hi ;
881- fsi += 2 ;
882- rxBase += 3 ;
883- // Accumulate all 8 histogram passes inline — no second scan needed.
884- let idx : number ;
885- idx = lo & 0xff ;
886- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
887- idx = 256 + ( ( lo >>> 8 ) & 0xff ) ;
888- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
889- idx = 512 + ( ( lo >>> 16 ) & 0xff ) ;
890- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
891- idx = 768 + ( ( lo >>> 24 ) & 0xff ) ;
892- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
893- idx = 1024 + ( hi & 0xff ) ;
894- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
895- idx = 1280 + ( ( hi >>> 8 ) & 0xff ) ;
896- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
897- idx = 1536 + ( ( hi >>> 16 ) & 0xff ) ;
898- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
899- idx = 1792 + ( ( hi >>> 24 ) & 0xff ) ;
900- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
800+ const finBuf = _finBuf ;
801+ const fvals = _fvals ;
802+ const fvalsU32 = _fvalsU32 ;
803+ finCount = 0 ;
804+ nanCount = 0 ;
805+ allNumeric = true ;
806+ // Stride counters: fsi = finCount * 2 (float view stride), rxBase = finCount * 3 (AoS stride).
807+ // Maintained in sync with finCount for numeric elements, eliminating per-element multiplications.
808+ let fsi = 0 ;
809+ let rxBase = 0 ;
810+
811+ // Clear histograms before the init loop so we can accumulate them inline.
812+ _rxHisto . fill ( 0 ) ;
813+
814+ // Single pass: partition NaN/null, initialise AoS radix entries for finite
815+ // numerics, and accumulate all 8 histograms simultaneously — eliminating the
816+ // separate O(n) histogram scan that the previous implementation required.
817+ for ( let i = 0 ; i < n ; i ++ ) {
818+ const v = vals [ i ] ;
819+ if ( v === null || v === undefined || Number . isNaN ( v ) ) {
820+ _nanBuf [ nanCount ] = i ;
821+ nanCount += 1 ;
822+ } else {
823+ const j = finCount ;
824+ finBuf [ j ] = i ;
825+ if ( typeof v === "number" ) {
826+ fvals [ j ] = v ;
827+ // Read the IEEE-754 bits via the shared Uint32 view (same buffer, no copy).
828+ let lo = fvalsU32 [ fsi ] ! ;
829+ let hi = fvalsU32 [ fsi + 1 ] ! ;
830+ // Transform floats to sortable unsigned integers:
831+ // positive → XOR sign bit; negative → XOR all bits.
832+ if ( hi & 0x80000000 ) {
833+ lo = ~ lo >>> 0 ;
834+ hi = ~ hi >>> 0 ;
901835 } else {
902- allNumeric = false ;
836+ hi = ( hi ^ 0x80000000 ) >>> 0 ;
903837 }
904- finCount = finCount + 1 ;
838+ _rxA [ rxBase ] = i ;
839+ _rxA [ rxBase + 1 ] = lo ;
840+ _rxA [ rxBase + 2 ] = hi ;
841+ fsi += 2 ;
842+ rxBase += 3 ;
843+ // Accumulate all 8 histogram passes inline — no second scan needed.
844+ let idx : number ;
845+ idx = lo & 0xff ;
846+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
847+ idx = 256 + ( ( lo >>> 8 ) & 0xff ) ;
848+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
849+ idx = 512 + ( ( lo >>> 16 ) & 0xff ) ;
850+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
851+ idx = 768 + ( ( lo >>> 24 ) & 0xff ) ;
852+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
853+ idx = 1024 + ( hi & 0xff ) ;
854+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
855+ idx = 1280 + ( ( hi >>> 8 ) & 0xff ) ;
856+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
857+ idx = 1536 + ( ( hi >>> 16 ) & 0xff ) ;
858+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
859+ idx = 1792 + ( ( hi >>> 24 ) & 0xff ) ;
860+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
861+ } else {
862+ allNumeric = false ;
905863 }
864+ finCount = finCount + 1 ;
906865 }
866+ }
907867
908- nanBuf = _nanBuf ;
909- // finSlice is only used by the string fallback path below.
910- finSlice = finBuf . subarray ( 0 , finCount ) ;
911-
912- // srcBuf — points to the AoS buffer whose [i*3] entries hold sorted original row indices.
913- srcBuf = _rxA ;
914-
915- if ( allNumeric && finCount > 0 ) {
916- // ── LSD radix sort: 8 passes × 8 bits over IEEE-754 transformed keys ──
917- // _rxA and _rxHisto are already initialised by the merged loop above.
918- // AoS layout: srcBuf[i*3]=origIdx, srcBuf[i*3+1]=loKey, srcBuf[i*3+2]=hiKey.
919-
920- // Convert each histogram to an exclusive prefix sum (cumulative offsets).
921- for ( let pass = 0 ; pass < 8 ; pass ++ ) {
922- const base = pass * 256 ;
923- let total = 0 ;
924- for ( let b = 0 ; b < 256 ; b ++ ) {
925- const c = _rxHisto [ base + b ] ! ;
926- _rxHisto [ base + b ] = total ;
927- total = total + c ;
928- }
929- }
930-
931- let dstBuf = _rxB ;
932-
933- for ( let pass = 0 ; pass < 8 ; pass ++ ) {
934- // keyOff: offset within the AoS triple for the key word this pass reads.
935- // pass 0-3 use lo (offset 1); pass 4-7 use hi (offset 2).
936- const keyOff = pass < 4 ? 1 : 2 ;
937- const shift = ( pass % 4 ) * 8 ;
938- const histoBase = pass * 256 ;
939- // Use accumulated stride counter (si += 3) to avoid i*3 multiply per element.
940- for ( let i = 0 , si = 0 ; i < finCount ; i ++ , si += 3 ) {
941- const bucket = ( srcBuf [ si + keyOff ] ! >>> shift ) & 0xff ;
942- const p = _rxHisto [ histoBase + bucket ] ! ;
943- _rxHisto [ histoBase + bucket ] = p + 1 ;
944- // All three writes land on the same cache line (3 × 4 = 12 bytes).
945- const di = p * 3 ;
946- dstBuf [ di ] = srcBuf [ si ] ! ;
947- dstBuf [ di + 1 ] = srcBuf [ si + 1 ] ! ;
948- dstBuf [ di + 2 ] = srcBuf [ si + 2 ] ! ;
949- }
950- const t = srcBuf ;
951- srcBuf = dstBuf ;
952- dstBuf = t ;
953- }
954- // After 8 passes (even), srcBuf[i*3] holds ascending sorted original indices.
955- } else if ( ! allNumeric ) {
956- // String / mixed dtype: fall back to comparator-based sort on finSlice.
957- if ( ascending ) {
958- finSlice . sort ( ( a , b ) => {
959- const av = vals [ a ] as number | string | boolean ;
960- const bv = vals [ b ] as number | string | boolean ;
961- return av < bv ? - 1 : av > bv ? 1 : 0 ;
962- } ) ;
963- } else {
964- finSlice . sort ( ( a , b ) => {
965- const av = vals [ a ] as number | string | boolean ;
966- const bv = vals [ b ] as number | string | boolean ;
967- return av > bv ? - 1 : av < bv ? 1 : 0 ;
968- } ) ;
868+ nanBuf = _nanBuf ;
869+ // finSlice is only used by the string fallback path below.
870+ finSlice = finBuf . subarray ( 0 , finCount ) ;
871+
872+ // srcBuf — points to the AoS buffer whose [i*3] entries hold sorted original row indices.
873+ srcBuf = _rxA ;
874+
875+ if ( allNumeric && finCount > 0 ) {
876+ // ── LSD radix sort: 8 passes × 8 bits over IEEE-754 transformed keys ──
877+ // _rxA and _rxHisto are already initialised by the merged loop above.
878+ // AoS layout: srcBuf[i*3]=origIdx, srcBuf[i*3+1]=loKey, srcBuf[i*3+2]=hiKey.
879+
880+ // Convert each histogram to an exclusive prefix sum (cumulative offsets).
881+ for ( let pass = 0 ; pass < 8 ; pass ++ ) {
882+ const base = pass * 256 ;
883+ let total = 0 ;
884+ for ( let b = 0 ; b < 256 ; b ++ ) {
885+ const c = _rxHisto [ base + b ] ! ;
886+ _rxHisto [ base + b ] = total ;
887+ total = total + c ;
969888 }
970889 }
971- // else: allNumeric && finCount === 0 — nothing to sort.
972890
973- // Save sorted result to cache (numeric path only).
974- // On the next call with the same vals + ascending, we skip here directly.
975- if ( allNumeric ) {
976- const saveLen = finCount * 3 ;
977- if ( _cacheSortedAoS . length < saveLen ) {
978- _cacheSortedAoS = new Uint32Array ( saveLen ) ;
979- }
980- if ( saveLen > 0 ) {
981- _cacheSortedAoS . set ( srcBuf . subarray ( 0 , saveLen ) ) ;
891+ let dstBuf = _rxB ;
892+
893+ for ( let pass = 0 ; pass < 8 ; pass ++ ) {
894+ // keyOff: offset within the AoS triple for the key word this pass reads.
895+ // pass 0-3 use lo (offset 1); pass 4-7 use hi (offset 2).
896+ const keyOff = pass < 4 ? 1 : 2 ;
897+ const shift = ( pass % 4 ) * 8 ;
898+ const histoBase = pass * 256 ;
899+ // Use accumulated stride counter (si += 3) to avoid i*3 multiply per element.
900+ for ( let i = 0 , si = 0 ; i < finCount ; i ++ , si += 3 ) {
901+ const bucket = ( srcBuf [ si + keyOff ] ! >>> shift ) & 0xff ;
902+ const p = _rxHisto [ histoBase + bucket ] ! ;
903+ _rxHisto [ histoBase + bucket ] = p + 1 ;
904+ // The three writes are contiguous (3 × 4 = 12 bytes), so they usually share one cache line.
905+ const di = p * 3 ;
906+ dstBuf [ di ] = srcBuf [ si ] ! ;
907+ dstBuf [ di + 1 ] = srcBuf [ si + 1 ] ! ;
908+ dstBuf [ di + 2 ] = srcBuf [ si + 2 ] ! ;
982909 }
983- if ( _cacheNanBufC . length < nanCount ) {
984- _cacheNanBufC = new Uint32Array ( nanCount ) ;
985- }
986- if ( nanCount > 0 ) {
987- _cacheNanBufC . set ( _nanBuf . subarray ( 0 , nanCount ) ) ;
988- }
989- _cacheFi = finCount ;
990- _cacheNi = nanCount ;
991- _cacheAllNumeric = true ;
992- _cacheVals = vals ;
993- _cacheAscending = ascending ;
910+ const t = srcBuf ;
911+ srcBuf = dstBuf ;
912+ dstBuf = t ;
913+ }
914+ // After 8 passes (even), srcBuf[i*3] holds ascending sorted original indices.
915+ } else if ( ! allNumeric ) {
916+ // String / mixed dtype: fall back to comparator-based sort on finSlice.
917+ if ( ascending ) {
918+ finSlice . sort ( ( a , b ) => {
919+ const av = vals [ a ] as number | string | boolean ;
920+ const bv = vals [ b ] as number | string | boolean ;
921+ return av < bv ? - 1 : av > bv ? 1 : 0 ;
922+ } ) ;
923+ } else {
924+ finSlice . sort ( ( a , b ) => {
925+ const av = vals [ a ] as number | string | boolean ;
926+ const bv = vals [ b ] as number | string | boolean ;
927+ return av > bv ? - 1 : av < bv ? 1 : 0 ;
928+ } ) ;
994929 }
995930 }
931+ // else: allNumeric && finCount === 0 — nothing to sort.
996932
997933 // Build the output permutation and gather values.
998934 // For the numeric path, read sorted row indices directly from srcBuf[i*3] (no
0 commit comments