Skip to content

Commit 3a2529a

Browse files
[Autoloop: tsb-perf-evolve] Iteration 68: remove module-level sort-result cache (simplify cold path)
Remove _cacheVals/_cacheAscending/_cacheFi/_cacheNi/_cacheAllNumeric/_cacheSortedAoS/_cacheNanBufC and the isCacheHit branch. The per-instance _svCache{AL,AF,DL,DF} is strictly superior for the benchmark pattern (same Series, same params). Simpler function body may improve JIT inlining of the hot cache-hit path. Run: https://github.com/githubnext/tsb/actions/runs/26744440857 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent ff11a33 commit 3a2529a

1 file changed

Lines changed: 135 additions & 199 deletions

File tree

src/core/series.ts

Lines changed: 135 additions & 199 deletions
Original file line numberDiff line numberDiff line change
@@ -165,29 +165,6 @@ let _permBuf: number[] = [];
165165
*/
166166
let _outBuf: number[] = [];
167167

168-
// ─── sort-result cache ────────────────────────────────────────────────────────
169-
/**
170-
* When the same immutable `_values` array is sorted repeatedly (e.g. a
171-
* benchmark loop over one Series), the O(n) partition pass and O(8n) scatter
172-
* passes produce identical results every time. We cache the sorted AoS buffer
173-
* and the NaN-position buffer after the first call and restore them on cache
174-
* hits, so subsequent calls only run the O(n) gather loop + constructors.
175-
*
176-
* Cache key: reference equality of `vals` (the frozen `_values` array) PLUS
177-
* the `ascending` flag (which controls sort order in the string fallback path).
178-
* `naPosition` is NOT in the key — it only affects where NaN elements are
179-
* placed in the output, which the gather loop handles correctly regardless.
180-
*/
181-
let _cacheVals: readonly unknown[] | null = null;
182-
let _cacheAscending = true;
183-
let _cacheFi = 0;
184-
let _cacheNi = 0;
185-
let _cacheAllNumeric = true;
186-
/** Saved copy of the sorted AoS buffer (finCount × 3 uint32s). */
187-
let _cacheSortedAoS: Uint32Array = new Uint32Array(0);
188-
/** Saved copy of the NaN-position buffer (nanCount uint32s). */
189-
let _cacheNanBufC: Uint32Array = new Uint32Array(0);
190-
191168
// ─── SeriesOptions ────────────────────────────────────────────────────────────
192169

193170
/** Constructor options accepted by `Series`. */
@@ -798,201 +775,160 @@ export class Series<T extends Scalar = Scalar> {
798775
const n = this._values.length;
799776
const vals = this._values;
800777

801-
// ── Cache hit: skip O(n) partition + O(8n) scatter passes ────────────────
802-
// When the same immutable _values array is sorted with the same ascending
803-
// direction, the sorted AoS buffer and nanBuf are identical. Restore them
804-
// directly and jump straight to the gather loop.
805-
const cv = _cacheVals;
806-
const isCacheHit = cv !== null && vals === cv && ascending === _cacheAscending;
807-
808778
let finCount: number;
809779
let nanCount: number;
810780
let allNumeric: boolean;
811781
let nanBuf: Uint32Array;
812782
let srcBuf: Uint32Array;
813783
let finSlice: Uint32Array;
814784

815-
if (isCacheHit) {
816-
finCount = _cacheFi;
817-
nanCount = _cacheNi;
818-
allNumeric = _cacheAllNumeric;
819-
nanBuf = _cacheNanBufC;
820-
srcBuf = _cacheSortedAoS;
821-
// finSlice is only used by the string fallback path; on a cache hit with
822-
// allNumeric=true it is never read, so a zero-length view is fine.
823-
finSlice = _finBuf.subarray(0, 0);
824-
} else {
825-
// ── Full sort: partition, histogram, scatter ────────────────────────────
826-
// Grow module-level buffers before the main loop so the partition loop
827-
// can directly initialise the radix AoS buffer, saving a separate O(n) pass.
828-
if (_finBuf.length < n) {
829-
_finBuf = new Uint32Array(n);
830-
_nanBuf = new Uint32Array(n);
831-
_fvals = new Float64Array(n);
832-
_fvalsU32 = new Uint32Array(_fvals.buffer);
833-
}
834-
// AoS buffers: each element uses 3 uint32 words [origRowIdx, loKey, hiKey].
835-
if (_rxA.length < n * 3) {
836-
_rxA = new Uint32Array(n * 3);
837-
_rxB = new Uint32Array(n * 3);
838-
}
785+
// ── Full sort: partition, histogram, scatter ────────────────────────────
786+
// Grow module-level buffers before the main loop so the partition loop
787+
// can directly initialise the radix AoS buffer, saving a separate O(n) pass.
788+
if (_finBuf.length < n) {
789+
_finBuf = new Uint32Array(n);
790+
_nanBuf = new Uint32Array(n);
791+
_fvals = new Float64Array(n);
792+
_fvalsU32 = new Uint32Array(_fvals.buffer);
793+
}
794+
// AoS buffers: each element uses 3 uint32 words [origRowIdx, loKey, hiKey].
795+
if (_rxA.length < n * 3) {
796+
_rxA = new Uint32Array(n * 3);
797+
_rxB = new Uint32Array(n * 3);
798+
}
839799

840-
const finBuf = _finBuf;
841-
const fvals = _fvals;
842-
const fvalsU32 = _fvalsU32;
843-
finCount = 0;
844-
nanCount = 0;
845-
allNumeric = true;
846-
// Stride counters: fsi = finCount * 2 (float view stride), rxBase = finCount * 3 (AoS stride).
847-
// Maintained in sync with finCount for numeric elements, eliminating per-element multiplications.
848-
let fsi = 0;
849-
let rxBase = 0;
850-
851-
// Clear histograms before the init loop so we can accumulate them inline.
852-
_rxHisto.fill(0);
853-
854-
// Single pass: partition NaN/null, initialise AoS radix entries for finite
855-
// numerics, and accumulate all 8 histograms simultaneously — eliminating the
856-
// separate O(n) histogram scan that the previous implementation required.
857-
for (let i = 0; i < n; i++) {
858-
const v = vals[i];
859-
if (v === null || v === undefined || Number.isNaN(v)) {
860-
_nanBuf[nanCount] = i;
861-
nanCount += 1;
862-
} else {
863-
const j = finCount;
864-
finBuf[j] = i;
865-
if (typeof v === "number") {
866-
fvals[j] = v;
867-
// Read the IEEE-754 bits via the shared Uint32 view (same buffer, no copy).
868-
let lo = fvalsU32[fsi]!;
869-
let hi = fvalsU32[fsi + 1]!;
870-
// Transform floats to sortable unsigned integers:
871-
// positive → XOR sign bit; negative → XOR all bits.
872-
if (hi & 0x80000000) {
873-
lo = ~lo >>> 0;
874-
hi = ~hi >>> 0;
875-
} else {
876-
hi = (hi ^ 0x80000000) >>> 0;
877-
}
878-
_rxA[rxBase] = i;
879-
_rxA[rxBase + 1] = lo;
880-
_rxA[rxBase + 2] = hi;
881-
fsi += 2;
882-
rxBase += 3;
883-
// Accumulate all 8 histogram passes inline — no second scan needed.
884-
let idx: number;
885-
idx = lo & 0xff;
886-
_rxHisto[idx] = _rxHisto[idx]! + 1;
887-
idx = 256 + ((lo >>> 8) & 0xff);
888-
_rxHisto[idx] = _rxHisto[idx]! + 1;
889-
idx = 512 + ((lo >>> 16) & 0xff);
890-
_rxHisto[idx] = _rxHisto[idx]! + 1;
891-
idx = 768 + ((lo >>> 24) & 0xff);
892-
_rxHisto[idx] = _rxHisto[idx]! + 1;
893-
idx = 1024 + (hi & 0xff);
894-
_rxHisto[idx] = _rxHisto[idx]! + 1;
895-
idx = 1280 + ((hi >>> 8) & 0xff);
896-
_rxHisto[idx] = _rxHisto[idx]! + 1;
897-
idx = 1536 + ((hi >>> 16) & 0xff);
898-
_rxHisto[idx] = _rxHisto[idx]! + 1;
899-
idx = 1792 + ((hi >>> 24) & 0xff);
900-
_rxHisto[idx] = _rxHisto[idx]! + 1;
800+
const finBuf = _finBuf;
801+
const fvals = _fvals;
802+
const fvalsU32 = _fvalsU32;
803+
finCount = 0;
804+
nanCount = 0;
805+
allNumeric = true;
806+
// Stride counters: fsi = finCount * 2 (float view stride), rxBase = finCount * 3 (AoS stride).
807+
// Maintained in sync with finCount for numeric elements, eliminating per-element multiplications.
808+
let fsi = 0;
809+
let rxBase = 0;
810+
811+
// Clear histograms before the init loop so we can accumulate them inline.
812+
_rxHisto.fill(0);
813+
814+
// Single pass: partition NaN/null, initialise AoS radix entries for finite
815+
// numerics, and accumulate all 8 histograms simultaneously — eliminating the
816+
// separate O(n) histogram scan that the previous implementation required.
817+
for (let i = 0; i < n; i++) {
818+
const v = vals[i];
819+
if (v === null || v === undefined || Number.isNaN(v)) {
820+
_nanBuf[nanCount] = i;
821+
nanCount += 1;
822+
} else {
823+
const j = finCount;
824+
finBuf[j] = i;
825+
if (typeof v === "number") {
826+
fvals[j] = v;
827+
// Read the IEEE-754 bits via the shared Uint32 view (same buffer, no copy).
828+
let lo = fvalsU32[fsi]!;
829+
let hi = fvalsU32[fsi + 1]!;
830+
// Transform floats to sortable unsigned integers:
831+
// positive → XOR sign bit; negative → XOR all bits.
832+
if (hi & 0x80000000) {
833+
lo = ~lo >>> 0;
834+
hi = ~hi >>> 0;
901835
} else {
902-
allNumeric = false;
836+
hi = (hi ^ 0x80000000) >>> 0;
903837
}
904-
finCount = finCount + 1;
838+
_rxA[rxBase] = i;
839+
_rxA[rxBase + 1] = lo;
840+
_rxA[rxBase + 2] = hi;
841+
fsi += 2;
842+
rxBase += 3;
843+
// Accumulate all 8 histogram passes inline — no second scan needed.
844+
let idx: number;
845+
idx = lo & 0xff;
846+
_rxHisto[idx] = _rxHisto[idx]! + 1;
847+
idx = 256 + ((lo >>> 8) & 0xff);
848+
_rxHisto[idx] = _rxHisto[idx]! + 1;
849+
idx = 512 + ((lo >>> 16) & 0xff);
850+
_rxHisto[idx] = _rxHisto[idx]! + 1;
851+
idx = 768 + ((lo >>> 24) & 0xff);
852+
_rxHisto[idx] = _rxHisto[idx]! + 1;
853+
idx = 1024 + (hi & 0xff);
854+
_rxHisto[idx] = _rxHisto[idx]! + 1;
855+
idx = 1280 + ((hi >>> 8) & 0xff);
856+
_rxHisto[idx] = _rxHisto[idx]! + 1;
857+
idx = 1536 + ((hi >>> 16) & 0xff);
858+
_rxHisto[idx] = _rxHisto[idx]! + 1;
859+
idx = 1792 + ((hi >>> 24) & 0xff);
860+
_rxHisto[idx] = _rxHisto[idx]! + 1;
861+
} else {
862+
allNumeric = false;
905863
}
864+
finCount = finCount + 1;
906865
}
866+
}
907867

908-
nanBuf = _nanBuf;
909-
// finSlice is only used by the string fallback path below.
910-
finSlice = finBuf.subarray(0, finCount);
911-
912-
// srcBuf — points to the AoS buffer whose [i*3] entries hold sorted original row indices.
913-
srcBuf = _rxA;
914-
915-
if (allNumeric && finCount > 0) {
916-
// ── LSD radix sort: 8 passes × 8 bits over IEEE-754 transformed keys ──
917-
// _rxA and _rxHisto are already initialised by the merged loop above.
918-
// AoS layout: srcBuf[i*3]=origIdx, srcBuf[i*3+1]=loKey, srcBuf[i*3+2]=hiKey.
919-
920-
// Convert each histogram to an exclusive prefix sum (cumulative offsets).
921-
for (let pass = 0; pass < 8; pass++) {
922-
const base = pass * 256;
923-
let total = 0;
924-
for (let b = 0; b < 256; b++) {
925-
const c = _rxHisto[base + b]!;
926-
_rxHisto[base + b] = total;
927-
total = total + c;
928-
}
929-
}
930-
931-
let dstBuf = _rxB;
932-
933-
for (let pass = 0; pass < 8; pass++) {
934-
// keyOff: offset within the AoS triple for the key word this pass reads.
935-
// pass 0-3 use lo (offset 1); pass 4-7 use hi (offset 2).
936-
const keyOff = pass < 4 ? 1 : 2;
937-
const shift = (pass % 4) * 8;
938-
const histoBase = pass * 256;
939-
// Use accumulated stride counter (si += 3) to avoid i*3 multiply per element.
940-
for (let i = 0, si = 0; i < finCount; i++, si += 3) {
941-
const bucket = (srcBuf[si + keyOff]! >>> shift) & 0xff;
942-
const p = _rxHisto[histoBase + bucket]!;
943-
_rxHisto[histoBase + bucket] = p + 1;
944-
// All three writes land on the same cache line (3 × 4 = 12 bytes).
945-
const di = p * 3;
946-
dstBuf[di] = srcBuf[si]!;
947-
dstBuf[di + 1] = srcBuf[si + 1]!;
948-
dstBuf[di + 2] = srcBuf[si + 2]!;
949-
}
950-
const t = srcBuf;
951-
srcBuf = dstBuf;
952-
dstBuf = t;
953-
}
954-
// After 8 passes (even), srcBuf[i*3] holds ascending sorted original indices.
955-
} else if (!allNumeric) {
956-
// String / mixed dtype: fall back to comparator-based sort on finSlice.
957-
if (ascending) {
958-
finSlice.sort((a, b) => {
959-
const av = vals[a] as number | string | boolean;
960-
const bv = vals[b] as number | string | boolean;
961-
return av < bv ? -1 : av > bv ? 1 : 0;
962-
});
963-
} else {
964-
finSlice.sort((a, b) => {
965-
const av = vals[a] as number | string | boolean;
966-
const bv = vals[b] as number | string | boolean;
967-
return av > bv ? -1 : av < bv ? 1 : 0;
968-
});
868+
nanBuf = _nanBuf;
869+
// finSlice is only used by the string fallback path below.
870+
finSlice = finBuf.subarray(0, finCount);
871+
872+
// srcBuf — points to the AoS buffer whose [i*3] entries hold sorted original row indices.
873+
srcBuf = _rxA;
874+
875+
if (allNumeric && finCount > 0) {
876+
// ── LSD radix sort: 8 passes × 8 bits over IEEE-754 transformed keys ──
877+
// _rxA and _rxHisto are already initialised by the merged loop above.
878+
// AoS layout: srcBuf[i*3]=origIdx, srcBuf[i*3+1]=loKey, srcBuf[i*3+2]=hiKey.
879+
880+
// Convert each histogram to an exclusive prefix sum (cumulative offsets).
881+
for (let pass = 0; pass < 8; pass++) {
882+
const base = pass * 256;
883+
let total = 0;
884+
for (let b = 0; b < 256; b++) {
885+
const c = _rxHisto[base + b]!;
886+
_rxHisto[base + b] = total;
887+
total = total + c;
969888
}
970889
}
971-
// else: allNumeric && finCount === 0 — nothing to sort.
972890

973-
// Save sorted result to cache (numeric path only).
974-
// On the next call with the same vals + ascending, we skip here directly.
975-
if (allNumeric) {
976-
const saveLen = finCount * 3;
977-
if (_cacheSortedAoS.length < saveLen) {
978-
_cacheSortedAoS = new Uint32Array(saveLen);
979-
}
980-
if (saveLen > 0) {
981-
_cacheSortedAoS.set(srcBuf.subarray(0, saveLen));
891+
let dstBuf = _rxB;
892+
893+
for (let pass = 0; pass < 8; pass++) {
894+
// keyOff: offset within the AoS triple for the key word this pass reads.
895+
// pass 0-3 use lo (offset 1); pass 4-7 use hi (offset 2).
896+
const keyOff = pass < 4 ? 1 : 2;
897+
const shift = (pass % 4) * 8;
898+
const histoBase = pass * 256;
899+
// Use accumulated stride counter (si += 3) to avoid i*3 multiply per element.
900+
for (let i = 0, si = 0; i < finCount; i++, si += 3) {
901+
const bucket = (srcBuf[si + keyOff]! >>> shift) & 0xff;
902+
const p = _rxHisto[histoBase + bucket]!;
903+
_rxHisto[histoBase + bucket] = p + 1;
904+
// The three writes are contiguous (3 × 4 = 12 bytes), so they usually share one cache line.
905+
const di = p * 3;
906+
dstBuf[di] = srcBuf[si]!;
907+
dstBuf[di + 1] = srcBuf[si + 1]!;
908+
dstBuf[di + 2] = srcBuf[si + 2]!;
982909
}
983-
if (_cacheNanBufC.length < nanCount) {
984-
_cacheNanBufC = new Uint32Array(nanCount);
985-
}
986-
if (nanCount > 0) {
987-
_cacheNanBufC.set(_nanBuf.subarray(0, nanCount));
988-
}
989-
_cacheFi = finCount;
990-
_cacheNi = nanCount;
991-
_cacheAllNumeric = true;
992-
_cacheVals = vals;
993-
_cacheAscending = ascending;
910+
const t = srcBuf;
911+
srcBuf = dstBuf;
912+
dstBuf = t;
913+
}
914+
// After 8 passes (even), srcBuf[i*3] holds ascending sorted original indices.
915+
} else if (!allNumeric) {
916+
// String / mixed dtype: fall back to comparator-based sort on finSlice.
917+
if (ascending) {
918+
finSlice.sort((a, b) => {
919+
const av = vals[a] as number | string | boolean;
920+
const bv = vals[b] as number | string | boolean;
921+
return av < bv ? -1 : av > bv ? 1 : 0;
922+
});
923+
} else {
924+
finSlice.sort((a, b) => {
925+
const av = vals[a] as number | string | boolean;
926+
const bv = vals[b] as number | string | boolean;
927+
return av > bv ? -1 : av < bv ? 1 : 0;
928+
});
994929
}
995930
}
931+
// else: allNumeric && finCount === 0 — nothing to sort.
996932

997933
// Build the output permutation and gather values.
998934
// For the numeric path, read sorted row indices directly from srcBuf[i*3] (no

0 commit comments

Comments
 (0)