Skip to content

Commit 51ad43c

Browse files
committed
fix(npyiter): ForEach/ExecuteGeneric/ExecuteReducing read past end without EXTERNAL_LOOP
Three symptoms of one bug in NpyIter.Execution.cs. The driver loops — ForEach, ExecuteGeneric(Single/Multi), and ExecuteReducing — pulled their per-call count from `GetInnerLoopSizePtr()`, which always returns `&_state->Shape[NDim - 1]` when the iterator isn't BUFFER'd. In EXLOOP mode that's correct: `iternext` (via ExternalLoopNext) advances `IterIndex` by `Shape[NDim - 1]` per call. But in the default non-EXLOOP non-BUFFER mode, `iternext` (via StandardNext) only advances by one element per call — `state.Advance()` increments `IterIndex` by 1. The kernel was still told `count = Shape[NDim - 1]`, so: 1. The kernel reads `Shape[NDim - 1]` elements starting at the current data pointer, which extends past the last valid element of the source array. 2. The driver then calls `iternext`, which advances the pointer by one element. 3. The next kernel call reads `Shape[NDim - 1]` elements starting one element later — again past the end — and so on. Net effect: an N-element 1-D array triggers N kernel invocations, each reading N "elements" (with massive overlap), the last ~N-1 of which read uninitialized memory. For `np.array([1, 2, NaN, 4, 5])` the returned NanSum was 46 instead of 12 because the kernel saw the array plus four trailing garbage floats added together four times over. Discovered during the Phase 2 migration when wiring the NaN reduction kernels into NpyIter. Worked around at the call sites by always passing `NpyIterGlobalFlags.EXTERNAL_LOOP`, which keeps `iternext` and `GetInnerLoopSizePtr()` in agreement. This commit fixes the bug at the source so future callers don't need the workaround. Approach: - New helper `ResolveInnerLoopCount()` returns the correct count given the current flag combination: BUFFER → `_state->BufIterEnd`; EXLOOP → `_state->Shape[NDim - 1]`; otherwise → 1. - ForEach, ExecuteGenericSingle, ExecuteGenericMulti, and ExecuteReducing use `ResolveInnerLoopCount()` instead of dereferencing `GetInnerLoopSizePtr()`.
BUFFER mode still reads the pointer per iteration because buffer fills can shrink at the tail. Both EXLOOP and non-EXLOOP paths now produce correct results. The existing Phase 2 call sites keep EXLOOP because it's the SIMD-optimal mode (one call covers the whole inner dimension), but callers who omit the flag no longer get silently-wrong output. Test impact: 6,748 / 6,748 passing on net8.0 and net10.0, and the bug-repro smoke test (NanSum over a strided 1-D array without EXTERNAL_LOOP) now returns the correct sum.
1 parent 7264173 commit 51ad43c

1 file changed

Lines changed: 75 additions & 9 deletions

File tree

src/NumSharp.Core/Backends/Iterators/NpyIter.Execution.cs

Lines changed: 75 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -129,21 +129,65 @@ public void ForEach(NpyInnerLoopFunc kernel, void* auxdata = null)
129129

130130
void** dataptrs = GetDataPtrArray();
131131
long* byteStrides = GetInnerLoopByteStrides();
132-
long* innerSize = GetInnerLoopSizePtr();
132+
long innerSize = ResolveInnerLoopCount();
133133

134134
if (IsSingleInnerLoop())
135135
{
136-
kernel(dataptrs, byteStrides, *innerSize, auxdata);
136+
kernel(dataptrs, byteStrides, innerSize, auxdata);
137137
return;
138138
}
139139

140140
var iternext = GetIterNext();
141+
142+
// Buffered fills can change size at the tail, so re-read per call.
143+
if ((_state->ItFlags & (uint)NpyIterFlags.BUFFER) != 0)
144+
{
145+
long* bufSize = GetInnerLoopSizePtr();
146+
do
147+
{
148+
kernel(dataptrs, byteStrides, *bufSize, auxdata);
149+
} while (iternext(ref *_state));
150+
return;
151+
}
152+
153+
// EXLOOP and non-EXLOOP both have a stable innerSize across iterations.
141154
do
142155
{
143-
kernel(dataptrs, byteStrides, *innerSize, auxdata);
156+
kernel(dataptrs, byteStrides, innerSize, auxdata);
144157
} while (iternext(ref *_state));
145158
}
146159

160+
/// <summary>
161+
/// Returns the number of elements the kernel processes per inner-loop
162+
/// invocation, in a way that is correct regardless of which iterator
163+
/// flags are set:
164+
///
165+
/// <list type="bullet">
166+
/// <item>BUFFER: size of the current buffer fill (callers that can
167+
/// observe per-iteration changes should re-read it from
168+
/// <see cref="GetInnerLoopSizePtr"/>).</item>
169+
/// <item>EXTERNAL_LOOP (EXLOOP): innermost coalesced shape dimension —
170+
/// the iterator advances in strides of that size.</item>
171+
/// <item>Otherwise: 1 — the iterator's <c>iternext</c> increments
172+
/// <see cref="NpyIterState.IterIndex"/> by one per call, so the
173+
/// kernel processes one element per invocation.</item>
174+
/// </list>
175+
///
176+
/// Fixes the pre-existing inconsistency where
177+
/// <see cref="GetInnerLoopSizePtr"/> on a non-BUFFER, non-EXLOOP
178+
/// iterator reported <c>Shape[NDim - 1]</c> (the innermost dimension)
179+
/// while <c>Iternext</c> only advanced by one element — causing the
180+
/// kernel to over-read past the end of the array.
181+
/// </summary>
182+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
183+
private long ResolveInnerLoopCount()
184+
{
185+
uint f = _state->ItFlags;
186+
if ((f & (uint)NpyIterFlags.BUFFER) != 0) return _state->BufIterEnd;
187+
if ((f & (uint)NpyIterFlags.EXLOOP) != 0) return _state->Shape[_state->NDim - 1];
188+
return 1;
189+
}
190+
147191
/// <summary>
148192
/// Struct-generic overload — the JIT devirtualizes and inlines the
149193
/// kernel call through the TKernel type parameter. Preferred when the
@@ -170,7 +214,7 @@ public void ExecuteGeneric<TKernel>(TKernel kernel) where TKernel : struct, INpy
170214
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
171215
private void ExecuteGenericSingle<TKernel>(TKernel kernel) where TKernel : struct, INpyInnerLoop
172216
{
173-
kernel.Execute(GetDataPtrArray(), GetInnerLoopByteStrides(), *GetInnerLoopSizePtr());
217+
kernel.Execute(GetDataPtrArray(), GetInnerLoopByteStrides(), ResolveInnerLoopCount());
174218
}
175219

176220
/// <summary>Multi-loop path with do/while driver.</summary>
@@ -179,12 +223,22 @@ private void ExecuteGenericMulti<TKernel>(TKernel kernel) where TKernel : struct
179223
{
180224
void** dataptrs = GetDataPtrArray();
181225
long* byteStrides = GetInnerLoopByteStrides();
182-
long* innerSize = GetInnerLoopSizePtr();
183226
var iternext = GetIterNext();
184227

228+
if ((_state->ItFlags & (uint)NpyIterFlags.BUFFER) != 0)
229+
{
230+
long* bufSize = GetInnerLoopSizePtr();
231+
do
232+
{
233+
kernel.Execute(dataptrs, byteStrides, *bufSize);
234+
} while (iternext(ref *_state));
235+
return;
236+
}
237+
238+
long innerSize = ResolveInnerLoopCount();
185239
do
186240
{
187-
kernel.Execute(dataptrs, byteStrides, *innerSize);
241+
kernel.Execute(dataptrs, byteStrides, innerSize);
188242
} while (iternext(ref *_state));
189243
}
190244

@@ -216,19 +270,31 @@ public TAccum ExecuteReducing<TKernel, TAccum>(TKernel kernel, TAccum init)
216270
{
217271
void** dataptrs = GetDataPtrArray();
218272
long* byteStrides = GetInnerLoopByteStrides();
219-
long* innerSize = GetInnerLoopSizePtr();
220273
TAccum accum = init;
221274

222275
if (IsSingleInnerLoop())
223276
{
224-
kernel.Execute(dataptrs, byteStrides, *innerSize, ref accum);
277+
kernel.Execute(dataptrs, byteStrides, ResolveInnerLoopCount(), ref accum);
225278
return accum;
226279
}
227280

228281
var iternext = GetIterNext();
282+
283+
if ((_state->ItFlags & (uint)NpyIterFlags.BUFFER) != 0)
284+
{
285+
long* bufSize = GetInnerLoopSizePtr();
286+
do
287+
{
288+
if (!kernel.Execute(dataptrs, byteStrides, *bufSize, ref accum))
289+
break;
290+
} while (iternext(ref *_state));
291+
return accum;
292+
}
293+
294+
long innerSize = ResolveInnerLoopCount();
229295
do
230296
{
231-
if (!kernel.Execute(dataptrs, byteStrides, *innerSize, ref accum))
297+
if (!kernel.Execute(dataptrs, byteStrides, innerSize, ref accum))
232298
break;
233299
} while (iternext(ref *_state));
234300
return accum;

0 commit comments

Comments
 (0)