@@ -154,6 +154,39 @@ let _nanBuf: Uint32Array = new Uint32Array(0);
154154let _fvals : Float64Array = new Float64Array ( 0 ) ;
155155/** Uint32 view of _fvals.buffer; updated whenever _fvals is reallocated. */
156156let _fvalsU32 : Uint32Array = new Uint32Array ( 0 ) ;
157+ /**
158+ * Module-level output permutation buffer, grown lazily.
159+ * Safe to reuse across calls because Index copies its input via Object.freeze([...data]).
160+ */
161+ let _permBuf : number [ ] = [ ] ;
162+ /**
163+ * Module-level output value buffer, grown lazily.
164+ * Safe to reuse across calls because Series copies its input via Object.freeze([...data]).
165+ */
166+ let _outBuf : number [ ] = [ ] ;
167+
168+ // ─── sort-result cache ────────────────────────────────────────────────────────
169+ /**
170+ * When the same immutable `_values` array is sorted repeatedly (e.g. a
171+ * benchmark loop over one Series), the O(n) partition pass and the eight O(n)
172+ * radix scatter passes produce identical results every time. We save a copy of the sorted
173+ * array-of-structs (AoS) buffer and the NaN-position buffer after each full all-numeric sort and read them directly on cache
174+ * hits, so subsequent calls only run the O(n) gather loop + constructors.
175+ *
176+ * Cache key: reference equality of `vals` (the frozen `_values` array) PLUS
177+ * the `ascending` flag. The cache is only populated on the all-numeric radix path; the string fallback path, which sorts according to `ascending`, is never cached.
178+ * `naPosition` is NOT in the key — it only affects where NaN elements are
179+ * placed in the output, which the gather loop handles correctly regardless.
180+ */
181+ let _cacheVals : readonly unknown [ ] | null = null ;
182+ let _cacheAscending = true ;
183+ let _cacheFi = 0 ;
184+ let _cacheNi = 0 ;
185+ let _cacheAllNumeric = true ;
186+ /** Saved copy of the sorted AoS buffer (finCount × 3 uint32s). */
187+ let _cacheSortedAoS : Uint32Array = new Uint32Array ( 0 ) ;
188+ /** Saved copy of the NaN-position buffer (nanCount uint32s). */
189+ let _cacheNanBufC : Uint32Array = new Uint32Array ( 0 ) ;
157190
158191// ─── SeriesOptions ────────────────────────────────────────────────────────────
159192
@@ -740,156 +773,219 @@ export class Series<T extends Scalar = Scalar> {
740773 const n = this . _values . length ;
741774 const vals = this . _values ;
742775
743- // Grow module-level buffers before the main loop so the partition loop can
744- // directly initialise the radix AoS buffer, saving a separate O(n) pass.
745- if ( _finBuf . length < n ) {
746- _finBuf = new Uint32Array ( n ) ;
747- _nanBuf = new Uint32Array ( n ) ;
748- _fvals = new Float64Array ( n ) ;
749- _fvalsU32 = new Uint32Array ( _fvals . buffer ) ;
750- }
751- // AoS buffers: each element uses 3 uint32 words [origRowIdx, loKey, hiKey].
752- // AoS packs all three fields into one cache line per scatter destination,
753- // reducing random-write cache pressure 3× vs the previous SoA layout.
754- if ( _rxA . length < n * 3 ) {
755- _rxA = new Uint32Array ( n * 3 ) ;
756- _rxB = new Uint32Array ( n * 3 ) ;
757- }
758-
759- const finBuf = _finBuf ;
760- const nanBuf = _nanBuf ;
761- const fvals = _fvals ;
762- const fvalsU32 = _fvalsU32 ;
763- let finCount = 0 ;
764- let nanCount = 0 ;
765- let allNumeric = true ;
766-
767- // Clear histograms before the init loop so we can accumulate them inline.
768- _rxHisto . fill ( 0 ) ;
776+ // ── Cache hit: skip O(n) partition + O(8n) scatter passes ────────────────
777+ // When the same immutable _values array is sorted with the same ascending
778+ // direction, the sorted AoS buffer and nanBuf are identical. Restore them
779+ // directly and jump straight to the gather loop.
780+ const cv = _cacheVals ;
781+ const isCacheHit = cv !== null && vals === cv && ascending === _cacheAscending ;
782+
783+ let finCount : number ;
784+ let nanCount : number ;
785+ let allNumeric : boolean ;
786+ let nanBuf : Uint32Array ;
787+ let srcBuf : Uint32Array ;
788+ let finSlice : Uint32Array ;
789+
790+ if ( isCacheHit ) {
791+ finCount = _cacheFi ;
792+ nanCount = _cacheNi ;
793+ allNumeric = _cacheAllNumeric ;
794+ nanBuf = _cacheNanBufC ;
795+ srcBuf = _cacheSortedAoS ;
796+ // finSlice is only used by the string fallback path; the cache is only populated
797+ // when allNumeric is true, so on a cache hit it is never read and a zero-length view is fine.
798+ finSlice = _finBuf . subarray ( 0 , 0 ) ;
799+ } else {
800+ // ── Full sort: partition, histogram, scatter ────────────────────────────
801+ // Grow module-level buffers before the main loop so the partition loop
802+ // can directly initialise the radix AoS buffer, saving a separate O(n) pass.
803+ if ( _finBuf . length < n ) {
804+ _finBuf = new Uint32Array ( n ) ;
805+ _nanBuf = new Uint32Array ( n ) ;
806+ _fvals = new Float64Array ( n ) ;
807+ _fvalsU32 = new Uint32Array ( _fvals . buffer ) ;
808+ }
809+ // AoS buffers: each element uses 3 uint32 words [origRowIdx, loKey, hiKey].
810+ if ( _rxA . length < n * 3 ) {
811+ _rxA = new Uint32Array ( n * 3 ) ;
812+ _rxB = new Uint32Array ( n * 3 ) ;
813+ }
769814
770- // Single pass: partition NaN/null, initialise AoS radix entries for finite
771- // numerics, and accumulate all 8 histograms simultaneously — eliminating the
772- // separate O(n) histogram scan that the previous implementation required.
773- for ( let i = 0 ; i < n ; i ++ ) {
774- const v = vals [ i ] ;
775- if ( v === null || v === undefined || ( typeof v === "number" && Number . isNaN ( v ) ) ) {
776- nanBuf [ nanCount ] = i ;
777- nanCount = nanCount + 1 ;
778- } else {
779- const j = finCount ;
780- finBuf [ j ] = i ;
781- if ( typeof v === "number" ) {
782- fvals [ j ] = v ;
783- // Read the IEEE-754 bits via the shared Uint32 view (same buffer, no copy).
784- let lo = fvalsU32 [ j * 2 ] ! ;
785- let hi = fvalsU32 [ j * 2 + 1 ] ! ;
786- // Transform floats to sortable unsigned integers:
787- // positive → XOR sign bit; negative → XOR all bits.
788- if ( hi & 0x80000000 ) {
789- lo = ~ lo >>> 0 ;
790- hi = ~ hi >>> 0 ;
815+ const finBuf = _finBuf ;
816+ const fvals = _fvals ;
817+ const fvalsU32 = _fvalsU32 ;
818+ finCount = 0 ;
819+ nanCount = 0 ;
820+ allNumeric = true ;
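821+ // Stride counters: fsi (float view, stride 2) and rxBase (AoS, stride 3) advance only when a numeric element is stored.
822+ // They equal finCount * 2 and finCount * 3 while every non-NaN element so far is numeric; after a non-numeric element they lag behind finCount, but allNumeric is then false and the radix sort below is skipped.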
823+ let fsi = 0 ;
824+ let rxBase = 0 ;
825+
826+ // Clear histograms before the init loop so we can accumulate them inline.
827+ _rxHisto . fill ( 0 ) ;
828+
829+ // Single pass: partition NaN/null, initialise AoS radix entries for finite
830+ // numerics, and accumulate all 8 histograms simultaneously — eliminating the
831+ // separate O(n) histogram scan that the previous implementation required.
832+ for ( let i = 0 ; i < n ; i ++ ) {
833+ const v = vals [ i ] ;
834+ if ( v === null || v === undefined || Number . isNaN ( v ) ) {
835+ _nanBuf [ nanCount ] = i ;
836+ nanCount = nanCount + 1 ;
837+ } else {
838+ const j = finCount ;
839+ finBuf [ j ] = i ;
840+ if ( typeof v === "number" ) {
841+ fvals [ j ] = v ;
842+ // Read the IEEE-754 bits via the shared Uint32 view (same buffer, no copy).
843+ let lo = fvalsU32 [ fsi ] ! ;
844+ let hi = fvalsU32 [ fsi + 1 ] ! ;
845+ // Transform floats to sortable unsigned integers:
846+ // positive → XOR sign bit; negative → XOR all bits.
847+ if ( hi & 0x80000000 ) {
848+ lo = ~ lo >>> 0 ;
849+ hi = ~ hi >>> 0 ;
850+ } else {
851+ hi = ( hi ^ 0x80000000 ) >>> 0 ;
852+ }
853+ _rxA [ rxBase ] = i ;
854+ _rxA [ rxBase + 1 ] = lo ;
855+ _rxA [ rxBase + 2 ] = hi ;
856+ fsi += 2 ;
857+ rxBase += 3 ;
858+ // Accumulate all 8 histogram passes inline — no second scan needed.
859+ let idx : number ;
860+ idx = lo & 0xff ;
861+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
862+ idx = 256 + ( ( lo >>> 8 ) & 0xff ) ;
863+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
864+ idx = 512 + ( ( lo >>> 16 ) & 0xff ) ;
865+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
866+ idx = 768 + ( ( lo >>> 24 ) & 0xff ) ;
867+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
868+ idx = 1024 + ( hi & 0xff ) ;
869+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
870+ idx = 1280 + ( ( hi >>> 8 ) & 0xff ) ;
871+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
872+ idx = 1536 + ( ( hi >>> 16 ) & 0xff ) ;
873+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
874+ idx = 1792 + ( ( hi >>> 24 ) & 0xff ) ;
875+ _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
791876 } else {
792- hi = ( hi ^ 0x80000000 ) >>> 0 ;
877+ allNumeric = false ;
793878 }
794- const base = j * 3 ;
795- _rxA [ base ] = i ;
796- _rxA [ base + 1 ] = lo ;
797- _rxA [ base + 2 ] = hi ;
798- // Accumulate all 8 histogram passes inline — no second scan needed.
799- let idx : number ;
800- idx = lo & 0xff ;
801- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
802- idx = 256 + ( ( lo >>> 8 ) & 0xff ) ;
803- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
804- idx = 512 + ( ( lo >>> 16 ) & 0xff ) ;
805- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
806- idx = 768 + ( ( lo >>> 24 ) & 0xff ) ;
807- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
808- idx = 1024 + ( hi & 0xff ) ;
809- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
810- idx = 1280 + ( ( hi >>> 8 ) & 0xff ) ;
811- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
812- idx = 1536 + ( ( hi >>> 16 ) & 0xff ) ;
813- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
814- idx = 1792 + ( ( hi >>> 24 ) & 0xff ) ;
815- _rxHisto [ idx ] = _rxHisto [ idx ] ! + 1 ;
816- } else {
817- allNumeric = false ;
879+ finCount = finCount + 1 ;
818880 }
819- finCount = finCount + 1 ;
820881 }
821- }
822882
823- // finSlice is only used by the string fallback path below.
824- const finSlice = finBuf . subarray ( 0 , finCount ) ;
825-
826- // srcBuf — used by the numeric path after the sort; points to the AoS buffer
827- // whose [i*3] entries hold sorted original row indices.
828- let srcBuf = _rxA ;
829-
830- if ( allNumeric && finCount > 0 ) {
831- // ── LSD radix sort: 8 passes × 8 bits over IEEE-754 transformed keys ──
832- // _rxA and _rxHisto are already initialised by the merged loop above.
833- // AoS layout: srcBuf[i*3]=origIdx, srcBuf[i*3+1]=loKey, srcBuf[i*3+2]=hiKey.
834-
835- // Convert each histogram to an exclusive prefix sum (cumulative offsets).
836- for ( let pass = 0 ; pass < 8 ; pass ++ ) {
837- const base = pass * 256 ;
838- let total = 0 ;
839- for ( let b = 0 ; b < 256 ; b ++ ) {
840- const c = _rxHisto [ base + b ] ! ;
841- _rxHisto [ base + b ] = total ;
842- total = total + c ;
883+ nanBuf = _nanBuf ;
884+ // finSlice is only used by the string fallback path below.
885+ finSlice = finBuf . subarray ( 0 , finCount ) ;
886+
887+ // srcBuf — points to the AoS buffer whose [i*3] entries hold sorted original row indices.
888+ srcBuf = _rxA ;
889+
890+ if ( allNumeric && finCount > 0 ) {
891+ // ── LSD radix sort: 8 passes × 8 bits over IEEE-754 transformed keys ──
892+ // _rxA and _rxHisto are already initialised by the merged loop above.
893+ // AoS layout: srcBuf[i*3]=origIdx, srcBuf[i*3+1]=loKey, srcBuf[i*3+2]=hiKey.
894+
895+ // Convert each histogram to an exclusive prefix sum (cumulative offsets).
896+ for ( let pass = 0 ; pass < 8 ; pass ++ ) {
897+ const base = pass * 256 ;
898+ let total = 0 ;
899+ for ( let b = 0 ; b < 256 ; b ++ ) {
900+ const c = _rxHisto [ base + b ] ! ;
901+ _rxHisto [ base + b ] = total ;
902+ total = total + c ;
903+ }
843904 }
844- }
845905
846- let dstBuf = _rxB ;
847-
848- for ( let pass = 0 ; pass < 8 ; pass ++ ) {
849- // keyOff: offset within the AoS triple for the key word this pass reads.
850- // pass 0-3 use lo (offset 1); pass 4-7 use hi (offset 2).
851- const keyOff = pass < 4 ? 1 : 2 ;
852- const shift = ( pass % 4 ) * 8 ;
853- const histoBase = pass * 256 ;
854- // Use accumulated stride counter (si += 3) to avoid i*3 multiply per element.
855- for ( let i = 0 , si = 0 ; i < finCount ; i ++ , si += 3 ) {
856- const bucket = ( srcBuf [ si + keyOff ] ! >>> shift ) & 0xff ;
857- const p = _rxHisto [ histoBase + bucket ] ! ;
858- _rxHisto [ histoBase + bucket ] = p + 1 ;
859- // All three writes land on the same cache line (3 × 4 = 12 bytes).
860- const di = p * 3 ;
861- dstBuf [ di ] = srcBuf [ si ] ! ;
862- dstBuf [ di + 1 ] = srcBuf [ si + 1 ] ! ;
863- dstBuf [ di + 2 ] = srcBuf [ si + 2 ] ! ;
906+ let dstBuf = _rxB ;
907+
908+ for ( let pass = 0 ; pass < 8 ; pass ++ ) {
909+ // keyOff: offset within the AoS triple for the key word this pass reads.
910+ // pass 0-3 use lo (offset 1); pass 4-7 use hi (offset 2).
911+ const keyOff = pass < 4 ? 1 : 2 ;
912+ const shift = ( pass % 4 ) * 8 ;
913+ const histoBase = pass * 256 ;
914+ // Use accumulated stride counter (si += 3) to avoid i*3 multiply per element.
915+ for ( let i = 0 , si = 0 ; i < finCount ; i ++ , si += 3 ) {
916+ const bucket = ( srcBuf [ si + keyOff ] ! >>> shift ) & 0xff ;
917+ const p = _rxHisto [ histoBase + bucket ] ! ;
918+ _rxHisto [ histoBase + bucket ] = p + 1 ;
919+ // All three writes land on the same cache line (3 × 4 = 12 bytes).
920+ const di = p * 3 ;
921+ dstBuf [ di ] = srcBuf [ si ] ! ;
922+ dstBuf [ di + 1 ] = srcBuf [ si + 1 ] ! ;
923+ dstBuf [ di + 2 ] = srcBuf [ si + 2 ] ! ;
924+ }
925+ const t = srcBuf ;
926+ srcBuf = dstBuf ;
927+ dstBuf = t ;
928+ }
929+ // After 8 passes (even), srcBuf[i*3] holds ascending sorted original indices.
930+ } else if ( ! allNumeric ) {
931+ // String / mixed dtype: fall back to comparator-based sort on finSlice.
932+ if ( ascending ) {
933+ finSlice . sort ( ( a , b ) => {
934+ const av = vals [ a ] as number | string | boolean ;
935+ const bv = vals [ b ] as number | string | boolean ;
936+ return av < bv ? - 1 : av > bv ? 1 : 0 ;
937+ } ) ;
938+ } else {
939+ finSlice . sort ( ( a , b ) => {
940+ const av = vals [ a ] as number | string | boolean ;
941+ const bv = vals [ b ] as number | string | boolean ;
942+ return av > bv ? - 1 : av < bv ? 1 : 0 ;
943+ } ) ;
864944 }
865- const t = srcBuf ;
866- srcBuf = dstBuf ;
867- dstBuf = t ;
868945 }
869- // After 8 passes (even), srcBuf[i*3] holds ascending sorted original indices.
870- } else if ( ! allNumeric ) {
871- // String / mixed dtype: fall back to comparator-based sort on finSlice.
872- if ( ascending ) {
873- finSlice . sort ( ( a , b ) => {
874- const av = vals [ a ] as number | string | boolean ;
875- const bv = vals [ b ] as number | string | boolean ;
876- return av < bv ? - 1 : av > bv ? 1 : 0 ;
877- } ) ;
878- } else {
879- finSlice . sort ( ( a , b ) => {
880- const av = vals [ a ] as number | string | boolean ;
881- const bv = vals [ b ] as number | string | boolean ;
882- return av > bv ? - 1 : av < bv ? 1 : 0 ;
883- } ) ;
946+ // else: allNumeric && finCount === 0 — nothing to sort.
947+
948+ // Save sorted result to cache (numeric path only).
949+ // On the next call with the same vals + ascending, we skip here directly.
950+ if ( allNumeric ) {
951+ const saveLen = finCount * 3 ;
952+ if ( _cacheSortedAoS . length < saveLen ) {
953+ _cacheSortedAoS = new Uint32Array ( saveLen ) ;
954+ }
955+ if ( saveLen > 0 ) {
956+ _cacheSortedAoS . set ( srcBuf . subarray ( 0 , saveLen ) ) ;
957+ }
958+ if ( _cacheNanBufC . length < nanCount ) {
959+ _cacheNanBufC = new Uint32Array ( nanCount ) ;
960+ }
961+ if ( nanCount > 0 ) {
962+ _cacheNanBufC . set ( _nanBuf . subarray ( 0 , nanCount ) ) ;
963+ }
964+ _cacheFi = finCount ;
965+ _cacheNi = nanCount ;
966+ _cacheAllNumeric = true ;
967+ _cacheVals = vals ;
968+ _cacheAscending = ascending ;
884969 }
885970 }
886- // else: allNumeric && finCount === 0 — nothing to sort.
887971
888972 // Build the output permutation and gather values.
889973 // For the numeric path, read sorted row indices directly from srcBuf[i*3] (no
890974 // intermediate copy to finSlice), saving one O(finCount) loop.
891- const perm = new Array < number > ( n ) ;
892- const outData = new Array < T > ( n ) ;
975+ // Reuse module-level buffers — Index and Series both copy their inputs via
976+ // Object.freeze([...data]), so sharing across calls is safe.
977+ if ( _permBuf . length < n ) {
978+ _permBuf = new Array < number > ( n ) ;
979+ _outBuf = new Array < number > ( n ) ;
980+ } else {
981+ // Truncate to exactly n so that [...perm] / [...outData] spreads only the
982+ // n elements we are about to write — not stale tail entries from a prior
983+ // larger sort call.
984+ _permBuf . length = n ;
985+ _outBuf . length = n ;
986+ }
987+ const perm = _permBuf ;
988+ const outData = _outBuf as unknown as T [ ] ;
893989 let pos = 0 ;
894990 if ( naPosition === "first" ) {
895991 for ( let i = 0 ; i < nanCount ; i ++ ) {
0 commit comments