Skip to content

Commit 51ad43c

Browse files
committed
fix(npyiter): ForEach/ExecuteGeneric/ExecuteReducing read past end without EXTERNAL_LOOP
Three symptoms of one bug in NpyIter.Execution.cs. The driver loops — ForEach, ExecuteGeneric(Single/Multi), and ExecuteReducing — pulled their per-call count from `GetInnerLoopSizePtr()`, which always returns `&_state->Shape[NDim - 1]` when the iterator isn't BUFFER'd. In EXLOOP mode that's correct: `iternext` (via ExternalLoopNext) advances `IterIndex` by `Shape[NDim - 1]` per call. But in the default non-EXLOOP non-BUFFER mode, `iternext` (via StandardNext) only advances by one element per call — `state.Advance()` increments `IterIndex` by 1. The kernel was still told `count = Shape[NDim - 1]`, so: 1. The kernel reads `Shape[NDim - 1]` elements starting at the current data pointer, which extends past the last valid element of the source array. 2. The driver then calls `iternext`, which advances the pointer by one element. 3. The next kernel call reads `Shape[NDim - 1]` elements starting one element later — again past the end — and so on. Net effect: an N-element 1-D array triggers N kernel invocations, each reading N "elements" (with massive overlap), the last ~N-1 of which read uninitialized memory. For `np.array([1, 2, NaN, 4, 5])` the returned NanSum was 46 instead of 12 because the kernel saw the array plus four trailing garbage floats added together four times over. Discovered during the Phase 2 migration when wiring the NaN reduction kernels into NpyIter. Worked around at the call sites by always passing `NpyIterGlobalFlags.EXTERNAL_LOOP`, which keeps `iternext` and `GetInnerLoopSizePtr()` in agreement. This commit fixes the bug at the source so future callers don't need the workaround. Approach: - New helper `ResolveInnerLoopCount()` returns the correct count given the current flag combination: BUFFER → `_state->BufIterEnd`; EXLOOP → `_state->Shape[NDim - 1]`; otherwise → 1. - ForEach, ExecuteGenericSingle, ExecuteGenericMulti, and ExecuteReducing use `ResolveInnerLoopCount()` instead of dereferencing `GetInnerLoopSizePtr()`.
BUFFER mode still reads the pointer per iteration because buffer fills can shrink at the tail. Both EXLOOP and non-EXLOOP paths now produce correct results. The existing Phase 2 call sites keep EXLOOP because it's the SIMD-optimal mode (one call covers the whole inner dimension), but callers who omit the flag no longer get silently-wrong output. Test impact: 6,748 / 6,748 passing on net8.0 and net10.0, and the bug-repro smoke test (NanSum over a strided 1-D array without EXTERNAL_LOOP) now returns the correct sum.
1 parent 7264173 commit 51ad43c

1 file changed

Lines changed: 75 additions & 9 deletions

File tree

src/NumSharp.Core/Backends/Iterators/NpyIter.Execution.cs

Lines changed: 75 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -129,21 +129,65 @@ public void ForEach(NpyInnerLoopFunc kernel, void* auxdata = null)
129129

130130
void** dataptrs = GetDataPtrArray();
131131
long* byteStrides = GetInnerLoopByteStrides();
132-
long* innerSize = GetInnerLoopSizePtr();
132+
long innerSize = ResolveInnerLoopCount();
133133

134134
if (IsSingleInnerLoop())
135135
{
136-
kernel(dataptrs, byteStrides, *innerSize, auxdata);
136+
kernel(dataptrs, byteStrides, innerSize, auxdata);
137137
return;
138138
}
139139

140140
var iternext = GetIterNext();
141+
142+
// Buffered fills can change size at the tail, so re-read per call.
143+
if ((_state->ItFlags & (uint)NpyIterFlags.BUFFER) != 0)
144+
{
145+
long* bufSize = GetInnerLoopSizePtr();
146+
do
147+
{
148+
kernel(dataptrs, byteStrides, *bufSize, auxdata);
149+
} while (iternext(ref *_state));
150+
return;
151+
}
152+
153+
// EXLOOP and non-EXLOOP both have a stable innerSize across iterations.
141154
do
142155
{
143-
kernel(dataptrs, byteStrides, *innerSize, auxdata);
156+
kernel(dataptrs, byteStrides, innerSize, auxdata);
144157
} while (iternext(ref *_state));
145158
}
146159

160+
/// <summary>
161+
/// Returns the number of elements the kernel processes per inner-loop
162+
/// invocation, in a way that is correct regardless of which iterator
163+
/// flags are set:
164+
///
165+
/// <list type="bullet">
166+
/// <item>BUFFER: size of the current buffer fill (callers that can
167+
/// observe per-iteration changes should re-read it from
168+
/// <see cref="GetInnerLoopSizePtr"/>).</item>
169+
/// <item>EXTERNAL_LOOP (EXLOOP): innermost coalesced shape dimension —
170+
/// the iterator advances in strides of that size.</item>
171+
/// <item>Otherwise: 1 — the iterator's <c>iternext</c> increments
172+
/// <see cref="NpyIterState.IterIndex"/> by one per call, so the
173+
/// kernel processes one element per invocation.</item>
174+
/// </list>
175+
///
176+
/// Fixes the pre-existing inconsistency where
177+
/// <see cref="GetInnerLoopSizePtr"/> on a non-BUFFER, non-EXLOOP
178+
/// iterator reported <c>Shape[NDim - 1]</c> (the innermost dimension)
179+
/// while <c>Iternext</c> only advanced by one element — causing the
180+
/// kernel to over-read past the end of the array.
181+
/// </summary>
182+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
183+
private long ResolveInnerLoopCount()
184+
{
185+
uint f = _state->ItFlags;
186+
if ((f & (uint)NpyIterFlags.BUFFER) != 0) return _state->BufIterEnd;
187+
if ((f & (uint)NpyIterFlags.EXLOOP) != 0) return _state->Shape[_state->NDim - 1];
188+
return 1;
189+
}
190+
147191
/// <summary>
148192
/// Struct-generic overload — the JIT devirtualizes and inlines the
149193
/// kernel call through the TKernel type parameter. Preferred when the
@@ -170,7 +214,7 @@ public void ExecuteGeneric<TKernel>(TKernel kernel) where TKernel : struct, INpy
170214
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
171215
private void ExecuteGenericSingle<TKernel>(TKernel kernel) where TKernel : struct, INpyInnerLoop
172216
{
173-
kernel.Execute(GetDataPtrArray(), GetInnerLoopByteStrides(), *GetInnerLoopSizePtr());
217+
kernel.Execute(GetDataPtrArray(), GetInnerLoopByteStrides(), ResolveInnerLoopCount());
174218
}
175219

176220
/// <summary>Multi-loop path with do/while driver.</summary>
@@ -179,12 +223,22 @@ private void ExecuteGenericMulti<TKernel>(TKernel kernel) where TKernel : struct
179223
{
180224
void** dataptrs = GetDataPtrArray();
181225
long* byteStrides = GetInnerLoopByteStrides();
182-
long* innerSize = GetInnerLoopSizePtr();
183226
var iternext = GetIterNext();
184227

228+
if ((_state->ItFlags & (uint)NpyIterFlags.BUFFER) != 0)
229+
{
230+
long* bufSize = GetInnerLoopSizePtr();
231+
do
232+
{
233+
kernel.Execute(dataptrs, byteStrides, *bufSize);
234+
} while (iternext(ref *_state));
235+
return;
236+
}
237+
238+
long innerSize = ResolveInnerLoopCount();
185239
do
186240
{
187-
kernel.Execute(dataptrs, byteStrides, *innerSize);
241+
kernel.Execute(dataptrs, byteStrides, innerSize);
188242
} while (iternext(ref *_state));
189243
}
190244

@@ -216,19 +270,31 @@ public TAccum ExecuteReducing<TKernel, TAccum>(TKernel kernel, TAccum init)
216270
{
217271
void** dataptrs = GetDataPtrArray();
218272
long* byteStrides = GetInnerLoopByteStrides();
219-
long* innerSize = GetInnerLoopSizePtr();
220273
TAccum accum = init;
221274

222275
if (IsSingleInnerLoop())
223276
{
224-
kernel.Execute(dataptrs, byteStrides, *innerSize, ref accum);
277+
kernel.Execute(dataptrs, byteStrides, ResolveInnerLoopCount(), ref accum);
225278
return accum;
226279
}
227280

228281
var iternext = GetIterNext();
282+
283+
if ((_state->ItFlags & (uint)NpyIterFlags.BUFFER) != 0)
284+
{
285+
long* bufSize = GetInnerLoopSizePtr();
286+
do
287+
{
288+
if (!kernel.Execute(dataptrs, byteStrides, *bufSize, ref accum))
289+
break;
290+
} while (iternext(ref *_state));
291+
return accum;
292+
}
293+
294+
long innerSize = ResolveInnerLoopCount();
229295
do
230296
{
231-
if (!kernel.Execute(dataptrs, byteStrides, *innerSize, ref accum))
297+
if (!kernel.Execute(dataptrs, byteStrides, innerSize, ref accum))
232298
break;
233299
} while (iternext(ref *_state));
234300
return accum;

0 commit comments

Comments
 (0)