ooples
diff --git a/‎.github/workflows/ci-shard-closure-policy.yml‎
Lines changed: 122 additions & 0 deletions b/‎.github/workflows/ci-shard-closure-policy.yml‎
Lines changed: 122 additions & 0 deletions
diff --git a/‎.github/workflows/sonarcloud.yml‎
Lines changed: 111 additions & 1 deletion b/‎.github/workflows/sonarcloud.yml‎
Lines changed: 111 additions & 1 deletion
diff --git a/‎Directory.Packages.props‎
Lines changed: 12 additions & 4 deletions b/‎Directory.Packages.props‎
Lines changed: 12 additions & 4 deletions
@@ -0,0 +1,122 @@
+name: CI Shard Closure Policy
+
+# Enforces the "shard stays open until shard goes green" rule established in
+# audit comment https://github.com/ooples/AiDotNet/issues/1315#issuecomment-4501244896.
+# When an issue tagged `ci-failure` is closed, this action checks the latest
+# Build & SonarCloud run on master and warns / auto-reopens if the shard the
+# issue claims to fix is still red.
+#
+# Why this exists: historically issues were closed when their originally-listed
+# tests passed, but the shard those tests belong to was still failing due to
+# OTHER tests in the same shard. Result: dashboard said "fixed" while CI was
+# perpetually red. The audit pulled 4 prematurely-closed issues that were
+# still tracking red shards (#1304, #1305, #1307, #1313). This action stops
+# that pattern at the source.
+
+on:
+  issues:
+    # Only the close event is audited; the job below filters further by label.
+    types: [closed]
+
+permissions:
+  # Needed by `gh issue reopen --comment` in the final step.
+  issues: write
+  # Needed by `gh run list` / `gh run view` to read workflow run and job results.
+  actions: read
+
+jobs:
+  check-shard-still-red:
+    runs-on: ubuntu-latest
+    # Only run on ci-failure-labeled issues. Other issue closures are out of
+    # scope — this isn't a generic "did you fix it" guardrail, just a CI-shard
+    # accountability check.
+    if: contains(github.event.issue.labels.*.name, 'ci-failure')
+    steps:
+      - name: Extract shard name from issue title
+        id: extract
+        env:
+          ISSUE_TITLE: ${{ github.event.issue.title }}
+        run: |
+          # The shard short name is whatever follows "Tests (net10.0) - " in
+          # the title, up to (not including) the first colon. Title shapes
+          # seen so far:
+          #   "[PR #1290 CI] Tests (net10.0) - ModelFamily - NeuralNetworks: 5 failing tests"
+          #   "[PR #1290 CI Cluster 6] Long-training timeouts ..."
+          #   "[CI] Tests (net10.0) - Unit - 08d NN-Adapters/Other: MoE MoreData failing"
+          # The title arrives through the ISSUE_TITLE env var, never through
+          # inline expression expansion, so its content is treated as data.
+          marker_re='Tests \(net10\.0\) - \K[^:]+'
+          # First match only, with trailing whitespace trimmed.
+          shard=$(grep -oP "$marker_re" <<< "$ISSUE_TITLE" | head -1 | sed 's/[[:space:]]*$//')
+          if [ -n "$shard" ]; then
+            echo "Extracted shard: '$shard'"
+            echo "shard=$shard" >> "$GITHUB_OUTPUT"
+          else
+            # An empty output makes the `if:` on the check step evaluate false.
+            echo "No shard name found in title — issue not bound to a named shard, skipping check."
+            echo "shard=" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Check latest master CI run for this shard
+        id: check
+        if: steps.extract.outputs.shard != ''
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          SHARD_NAME: ${{ steps.extract.outputs.shard }}
+        run: |
+          # Step 1: pick the newest COMPLETED "Build & SonarCloud" run on
+          # master, whatever its outcome. Preferring green runs and only
+          # falling back to red ones would hide a newer red run behind any
+          # older green one, leaving the tracking issue closed while the shard
+          # is red right now. Cancelled runs are excluded on purpose: they
+          # were superseded by a newer commit and say nothing about shard
+          # health.
+          newest_completed='[.[] | select(.status == "completed" and .conclusion != "cancelled")] | sort_by(.createdAt) | last | .databaseId'
+          run_id=$(gh run list \
+            --repo "$GITHUB_REPOSITORY" \
+            --workflow "Build & SonarCloud" \
+            --branch master \
+            --limit 20 \
+            --json databaseId,status,conclusion,createdAt \
+            --jq "$newest_completed")
+
+          if [ -z "$run_id" ]; then
+            echo "No recent completed master runs found — cannot verify shard state."
+            echo "shard_status=unknown" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          echo "Checking run $run_id for shard '$SHARD_NAME'"
+          # Step 2: look up the conclusion of the job whose name ends with the
+          # shard name (job names look like
+          # "Tests (net10.0) - ModelFamily - NeuralNetworks").
+          # The shard name originates from an issue title, i.e. user input, so
+          # it is handed to jq as a --arg value instead of being spliced into
+          # the filter text: a stray quote or backslash would otherwise break
+          # the jq program, yield an empty status and bypass the audit.
+          job_filter='.jobs[] | select(.name | endswith($shard)) | .conclusion'
+          status=$(gh run view "$run_id" \
+            --repo "$GITHUB_REPOSITORY" \
+            --json jobs \
+            | jq -r --arg shard "$SHARD_NAME" "$job_filter" \
+            | head -1)
+
+          if [ -n "$status" ]; then
+            echo "Shard '$SHARD_NAME' last status: $status"
+            {
+              echo "shard_status=$status"
+              echo "run_id=$run_id"
+            } >> "$GITHUB_OUTPUT"
+          else
+            echo "Could not find matching job for shard '$SHARD_NAME' in run $run_id."
+            echo "shard_status=unknown" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Reopen issue if shard still red
+        # "Red" means the shard job ended in any non-green terminal state. A
+        # job that exceeds its time limit concludes as `timed_out`, not
+        # `failure`, so it is listed explicitly; otherwise a hung shard would
+        # let its tracking issue stay closed. An empty or `unknown` status
+        # (check step skipped or inconclusive) matches nothing and is a no-op.
+        if: contains(fromJSON('["failure", "cancelled", "timed_out"]'), steps.check.outputs.shard_status)
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          ISSUE_NUMBER: ${{ github.event.issue.number }}
+          SHARD_NAME: ${{ steps.extract.outputs.shard }}
+          SHARD_STATUS: ${{ steps.check.outputs.shard_status }}
+          RUN_ID: ${{ steps.check.outputs.run_id }}
+        run: |
+          # Repository and server URL come from the runner's default
+          # GITHUB_REPOSITORY / GITHUB_SERVER_URL variables so that no
+          # expression is expanded into the script text. SHARD_NAME is
+          # title-derived; it is only ever expanded inside the double-quoted
+          # comment body, where the shell does not re-evaluate its content.
+          gh issue reopen "$ISSUE_NUMBER" \
+            --repo "$GITHUB_REPOSITORY" \
+            --comment "⚠️ Auto-reopened by **CI Shard Closure Policy**.
+
+          This issue was closed, but the shard \`$SHARD_NAME\` is still **$SHARD_STATUS** in the latest master CI run ([run $RUN_ID]($GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$RUN_ID)).
+
+          Per the closure policy established in #1315: **a shard's tracking issue stays open until the shard goes green in CI**, not until the originally-listed tests pass. The shard may still be failing because (a) other tests in the same shard are red, (b) a new failure appeared after the original list was filed, or (c) the runner was cancelled or timed out and we don't yet know what's failing.
+
+          To close cleanly:
+          1. Verify the latest CI run on master shows this shard as ✅ success
+          2. If new failures appeared, file a fresh issue or expand this one's scope first
+          3. Then close — at which point this guard won't fire."
@@ -43,6 +43,43 @@ env:
   DOTNET_NOLOGO: true
   DOTNET_CLI_TELEMETRY_OPTOUT: 1
 
+  # ---------------------------------------------------------------------------
+  # Memory-management knobs for CI test execution
+  # ---------------------------------------------------------------------------
+  #
+  # Streaming-pool threshold engagement (AIDOTNET_STREAMING_THRESHOLD_PARAMS
+  # below the compiled 10 B default) has been attempted twice in this PR:
+  #  1. b43dd9323: 1 M — produced
+  #     `WeightRegistry.Configure: existing streaming pool has N registered
+  #     entries` because the pool state leaked across xUnit-parallel
+  #     collections.
+  #  2. 81052b16f: 100 M — even with `WeightRegistry.Reset()` in
+  #     NeuralNetworkModelTestBase.InitializeAsync (commit 8ab358d2b), this
+  #     produced a new class of failures: `Streaming pool: handle N is
+  #     unknown` on SimCSE and other models. The pool was being reset, but
+  #     tensor instances from the prior test still hold stale streaming-pool
+  #     handle references that point at the cleared state; on materialize the
+  #     pool throws because the handle was just cleared.
+  #
+  # Fix is non-trivial (need per-tensor handle reset in
+  # WeightRegistry.Reset, or test-isolation strategy that doesn't reset the
+  # pool mid-run). Left at the compiled default until the underlying
+  # handle-leak is fixed properly.
+  #
+  # Memory pressure on heavy shards is handled instead by per-shard
+  # `xunit.MaxParallelThreads=1` (see the test step below), which
+  # serializes model loads on the heaviest shards.
+
+  # GC tuning for tests: switch to Server GC (multi-threaded, larger heap
+  # segments, batched Gen-2 collections) — Workstation GC's per-thread heap
+  # mode keeps Gen-2 retention pinned to the test thread for longer than we
+  # can afford under parallel test collections. ServerGC ALSO triggers
+  # background concurrent collection more aggressively, reducing the chance
+  # that one test's working set blocks another test from getting a fresh
+  # allocation context.
+  DOTNET_gcServer: 1
+  # NOTE(review): the rationale above only covers Server GC. GCConserveMemory
+  # is presumably set to 9 to make the GC favour a smaller heap over
+  # throughput — confirm the intended level against the .NET GC settings docs.
+  DOTNET_GCConserveMemory: 9
+
 jobs:
   # CodeQL runs on Ubuntu with net10.0 only - parallel with SonarCloud
   # Runs on PRs, pushes to master/main, and weekly schedule for security analysis
@@ -431,7 +468,80 @@ jobs:
           $sanitizedShardName = $shardName -replace '[\\/:*?"<>|\s-]+', '_'
           $results = Join-Path "TestResults" $sanitizedShardName
           New-Item -Path $results -ItemType Directory -Force | Out-Null
-          dotnet test ${{ matrix.shard.project }} -c Release --framework ${{ matrix.shard.framework }} --no-build --no-restore --filter "${{ matrix.shard.filter }}" --collect:"XPlat Code Coverage" --settings coverlet.runsettings --logger "trx;LogFileName=test-results.trx" --logger "console;verbosity=normal" --results-directory $results --blame-hang-timeout 5min --blame-hang-dump-type none
+
+          # Pre-test resource snapshot. Cancelled-runner shards (Diffusion S-Z,
+          # ModelFamily-NN, Generated Layers, NN-Remaining, Unit-03 Diffusion)
+          # die with `The runner has received a shutdown signal` 2-6 min into
+          # test execution. The dotnet test step exits before producing TRX
+          # output so we have no idea what was running at OOM time. Dump
+          # memory + disk + CPU info on entry so the next cancellation has
+          # forensic data.
+          Write-Host "=== Pre-test resource snapshot ==="
+          Write-Host "Memory:"
+          free -h
+          Write-Host "Disk:"
+          df -h /
+          Write-Host "Processors:"
+          nproc
+
+          # Per-shard parallelism control. The streaming + Server GC
+          # changes earlier in this PR helped most shards stop cancelling,
+          # but the heaviest model-family shards (Diffusion A-I/J-R/S-Z,
+          # Generated Layers, ModelFamily-NeuralNetworks, NN-Remaining,
+          # Unit-03 Diffusion) still trip OOM with 4 parallel BERT-class
+          # model loads in flight on a 16 GB ubuntu-latest runner. Per-iter
+          # peak memory ≈ 880 MB weights + 1.76 GB Adam state per slot;
+          # 4 slots × 2.6 GB + dotnet/xUnit overhead overruns the envelope.
+          # For these specific shards we pass `xunit.MaxParallelThreads=1`
+          # so heavy models load serially — every other shard stays at
+          # the JSON default (= ProcessorCount = 4) and runs full-speed.
+          $heavyShards = @(
+            'ModelFamily - Diffusion A-I',
+            'ModelFamily - Diffusion J-R',
+            'ModelFamily - Diffusion S-Z',
+            'ModelFamily - Generated Layers',
+            'ModelFamily - NeuralNetworks',
+            'Unit - 08e NN-Remaining (catch-all)',
+            'Unit - 03 Diffusion/Encoding'
+          )
+          $serializeShard = $heavyShards -contains $shardName
+          Write-Host "Running shard '$shardName' (serialized: $serializeShard)"
+
+          # Build the argument list as a PowerShell array so the `--`
+          # separator and the runner args reach `dotnet test` as distinct
+          # tokens. Earlier we joined them into one string and pwsh's
+          # token splitter parsed `--` as a standalone switch that MSBuild
+          # then rejected with `MSB1001: Unknown switch`.
+          $dotnetArgs = @(
+            'test', '${{ matrix.shard.project }}',
+            '-c', 'Release',
+            '--framework', '${{ matrix.shard.framework }}',
+            '--no-build', '--no-restore',
+            '--filter', '${{ matrix.shard.filter }}',
+            '--collect:XPlat Code Coverage',
+            '--settings', 'coverlet.runsettings',
+            '--logger', 'trx;LogFileName=test-results.trx',
+            '--logger', 'console;verbosity=normal',
+            '--results-directory', $results,
+            '--blame-hang-timeout', '5min',
+            '--blame-hang-dump-type', 'none'
+          )
+          if ($serializeShard) {
+            $dotnetArgs += '--'
+            $dotnetArgs += 'xunit.MaxParallelThreads=1'
+          }
+          & dotnet @dotnetArgs
+
+          # Post-test resource snapshot. If the runner survives this point,
+          # the test step finished naturally and the snapshot tells us what
+          # the high-water mark looked like.
+          # Save dotnet's exit code first: `free` and `df` below are native
+          # commands and overwrite $LASTEXITCODE, and the step must exit with
+          # the test result rather than the snapshot's.
+          $exitCode = $LASTEXITCODE
+          Write-Host "=== Post-test resource snapshot ==="
+          Write-Host "Memory:"
+          free -h
+          Write-Host "Disk:"
+          df -h /
+          exit $exitCode
 
       - name: Report slow tests
         if: always()
 
@@ -5,10 +5,18 @@
   <ItemGroup>
     <!-- AiDotNet ecosystem -->
     <PackageVersion Include="AiDotNet" Version="0.113.0" />
-    <!-- AiDotNet.Tensors 0.81.0 is the projected version for PR ooples/AiDotNet.Tensors#359
-         (paired with this PR's GraFPrint perf-overhaul + scheduler-fused + determinism work).
-         CI will fail to restore until that Tensors PR merges and the new NuGet publishes;
-         after release, bump the literal here. -->
+    <!-- AiDotNet.Tensors needs a version bump after ooples/AiDotNet.Tensors#424
+         publishes a new NuGet. That PR replaces the silent `checked(int * int)`
+         dim-product overflow in TensorAllocator.Rent / RentPinned with a
+         `long` accumulator that names the requested shape when the element
+         count exceeds Array.MaxLength, plus an ArgumentOutOfRangeException
+         naming the index and value for negative dims (lazy-layer `-1`
+         sentinel propagation). Diagnoses the otherwise-opaque
+         `OverflowException` failures on TimeMachine / DQN / OWLViT /
+         DGCNN / TabTransformer / TabDPT / SlimSAM / TriaffineNER tests on
+         SonarCloud run 26241806890. (Previous PR ooples/AiDotNet.Tensors#359
+         — GraFPrint perf-overhaul + scheduler-fused + determinism — is
+         already in 0.81.3.) -->
     <PackageVersion Include="AiDotNet.Tensors" Version="0.81.3" />
     <PackageVersion Include="AiDotNet.Native.OneDNN" Version="0.81.3" />
     <PackageVersion Include="AiDotNet.Native.OpenBLAS" Version="0.81.3" />