ooples
diff --git a/‎src/Models/Options/GradientBasedOptimizerOptions.cs‎
Lines changed: 11 additions & 1 deletion b/‎src/Models/Options/GradientBasedOptimizerOptions.cs‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎src/Optimizers/AMSGradOptimizer.cs‎
Lines changed: 23 additions & 0 deletions b/‎src/Optimizers/AMSGradOptimizer.cs‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎src/Optimizers/AdaDeltaOptimizer.cs‎
Lines changed: 19 additions & 0 deletions b/‎src/Optimizers/AdaDeltaOptimizer.cs‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎src/Optimizers/AdaMaxOptimizer.cs‎
Lines changed: 20 additions & 0 deletions b/‎src/Optimizers/AdaMaxOptimizer.cs‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎src/Optimizers/AdagradOptimizer.cs‎
Lines changed: 9 additions & 0 deletions b/‎src/Optimizers/AdagradOptimizer.cs‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/Optimizers/Adam8BitOptimizer.cs‎
Lines changed: 9 additions & 0 deletions b/‎src/Optimizers/Adam8BitOptimizer.cs‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/Optimizers/AdamOptimizer.cs‎
Lines changed: 48 additions & 70 deletions b/‎src/Optimizers/AdamOptimizer.cs‎
Lines changed: 48 additions & 70 deletions
diff --git a/‎src/Optimizers/AdamWOptimizer.cs‎
Lines changed: 9 additions & 0 deletions b/‎src/Optimizers/AdamWOptimizer.cs‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/Optimizers/FTRLOptimizer.cs‎
Lines changed: 9 additions & 0 deletions b/‎src/Optimizers/FTRLOptimizer.cs‎
Lines changed: 9 additions & 0 deletions
@@ -193,7 +193,17 @@ internal void SetLossFunctionFromAutoSync(ILossFunction<T> lossFunction)
     /// putting a speed limit on these updates to keep training stable.
     /// </para>
     /// </remarks>
-    public bool EnableGradientClipping { get; set; } = false;
+    // #1380 part 2: default ON. PyTorch does not clip by default, but the
+    // common transformer training recipe explicitly calls
+    // torch.nn.utils.clip_grad_norm_(params, max_norm=1.0). The previous
+    // default of `false` produced runaway parameter growth on the
+    // Optimize / BuildAsync code path when paired with the default
+    // DataSplitter.Split 70/15/15 small-validation sets, since the small-
+    // batch evaluation forward passes generate large gradient spikes that
+    // need clipping to stay stable. Callers who want the previous behavior
+    // can opt out by setting this to false explicitly. See AiDotNet#1413
+    // for the bisection evidence and AiDotNet#1380 for the headline issue.
+    public bool EnableGradientClipping { get; set; } = true;
 
     /// <summary>
     /// Gets or sets the gradient clipping method to use.
 
@@ -116,6 +116,20 @@ public override OptimizationResult<T, TInput, TOutput> Optimize(OptimizationInpu
         _m = new Vector<T>(parameters.Length);
         _v = new Vector<T>(parameters.Length);
         _vHat = new Vector<T>(parameters.Length);
+        // Reset the NN tape-side state. The flat-vector path got fresh
+        // Vectors above; the tape path uses parameter-tensor-keyed
+        // dictionaries (_tapeM/_tapeV/_tapeVHat) plus a separate
+        // _tapeStep counter that PERSIST across Optimize calls on the
+        // same optimizer instance. Without this clear, a second Optimize
+        // call on the same optimizer would carry the prior run's first/
+        // second moments AND its v̂_max running maximum (the AMSGrad
+        // bound that defines this optimizer), plus a pre-advanced bias-
+        // correction counter — biasing every per-parameter step from
+        // iteration 1.
+        _tapeM.Clear();
+        _tapeV.Clear();
+        _tapeVHat.Clear();
+        _tapeStep = 0;
         InitializeAdaptiveParameters();
 
         for (int epoch = 0; epoch < _options.MaxIterations; epoch++)
@@ -171,6 +185,15 @@ public override OptimizationResult<T, TInput, TOutput> Optimize(OptimizationInpu
     /// </remarks>
     protected override IFullModel<T, TInput, TOutput> UpdateSolution(IFullModel<T, TInput, TOutput> currentSolution, Vector<T> gradient)
     {
+        // #1413 CONSOLIDATION: NN solutions go through base.UpdateSolution
+        // which synthesizes a TapeStepContext and delegates to Step
+        // (one source of truth, matches PyTorch/TF/JAX). Non-NN solutions
+        // (regression, clustering, classical models) keep the legacy
+        // flat-vector path below for backward compatibility.
+        if (currentSolution is AiDotNet.Interfaces.INeuralNetwork<T>)
+        {
+            return base.UpdateSolution(currentSolution, gradient);
+        }
         var parameters = InterfaceGuard.Parameterizable(currentSolution).GetParameters();
 
         // Use shared UpdateParameters method to eliminate duplication
 
@@ -194,6 +194,16 @@ public override OptimizationResult<T, TInput, TOutput> Optimize(OptimizationInpu
 
         _accumulatedSquaredGradients = new Vector<T>(parameters.Length);
         _accumulatedSquaredUpdates = new Vector<T>(parameters.Length);
+        // Reset the NN tape-side accumulators too. The flat-vector path
+        // gets a fresh Vector<T> per Optimize call (lines above); the
+        // tape-tracked path uses parameter-tensor-keyed dictionaries
+        // (_tapeAccSqGrad / _tapeAccSqUpd) that PERSIST across Optimize
+        // calls on the same optimizer instance. Without this clear, a
+        // second Optimize call on the same optimizer would carry the
+        // prior run's AdaDelta history into the new model, biasing
+        // every per-parameter step size from iteration 1.
+        _tapeAccSqGrad.Clear();
+        _tapeAccSqUpd.Clear();
         InitializeAdaptiveParameters();
 
         for (int epoch = 0; epoch < _options.MaxIterations; epoch++)
@@ -260,6 +270,15 @@ public override OptimizationResult<T, TInput, TOutput> Optimize(OptimizationInpu
     /// </remarks>
     protected override IFullModel<T, TInput, TOutput> UpdateSolution(IFullModel<T, TInput, TOutput> currentSolution, Vector<T> gradient)
     {
+        // #1413 CONSOLIDATION: NN solutions go through base.UpdateSolution
+        // which synthesizes a TapeStepContext and delegates to Step
+        // (one source of truth, matches PyTorch/TF/JAX). Non-NN solutions
+        // (regression, clustering, classical models) keep the legacy
+        // flat-vector path below for backward compatibility.
+        if (currentSolution is AiDotNet.Interfaces.INeuralNetwork<T>)
+        {
+            return base.UpdateSolution(currentSolution, gradient);
+        }
         var parameters = InterfaceGuard.Parameterizable(currentSolution).GetParameters();
 
         // Initialize state vectors if needed
 
@@ -202,6 +202,17 @@ public override OptimizationResult<T, TInput, TOutput> Optimize(OptimizationInpu
 
         _m = new Vector<T>(parameters.Length);
         _u = new Vector<T>(parameters.Length);
+        // Reset the NN tape-side accumulators + bias-correction step count.
+        // The flat-vector path gets fresh Vectors above; the tape path
+        // uses parameter-tensor-keyed dictionaries (_tapeM/_tapeU) and a
+        // separate _tapeStep counter that PERSIST across Optimize calls
+        // on the same optimizer instance. Without this clear, a second
+        // Optimize call on the same optimizer would carry the prior run's
+        // first/inf moments AND a pre-advanced bias-correction counter,
+        // biasing every per-parameter step from iteration 1.
+        _tapeM.Clear();
+        _tapeU.Clear();
+        _tapeStep = 0;
         InitializeAdaptiveParameters();
 
         for (int epoch = 0; epoch < _options.MaxIterations; epoch++)
@@ -269,6 +280,15 @@ public override OptimizationResult<T, TInput, TOutput> Optimize(OptimizationInpu
     /// </remarks>
     protected override IFullModel<T, TInput, TOutput> UpdateSolution(IFullModel<T, TInput, TOutput> currentSolution, Vector<T> gradient)
     {
+        // #1413 CONSOLIDATION: NN solutions go through base.UpdateSolution
+        // which synthesizes a TapeStepContext and delegates to Step
+        // (one source of truth, matches PyTorch/TF/JAX). Non-NN solutions
+        // (regression, clustering, classical models) keep the legacy
+        // flat-vector path below for backward compatibility.
+        if (currentSolution is AiDotNet.Interfaces.INeuralNetwork<T>)
+        {
+            return base.UpdateSolution(currentSolution, gradient);
+        }
         var parameters = InterfaceGuard.Parameterizable(currentSolution).GetParameters();
 
         // Initialize state vectors if needed
 
@@ -250,6 +250,15 @@ private void UpdateAccumulatedSquaredGradients(Vector<T> gradient)
     /// </remarks>
     protected override IFullModel<T, TInput, TOutput> UpdateSolution(IFullModel<T, TInput, TOutput> currentSolution, Vector<T> gradient)
     {
+        // #1413 CONSOLIDATION: NN solutions go through base.UpdateSolution
+        // which synthesizes a TapeStepContext and delegates to Step
+        // (one source of truth, matches PyTorch/TF/JAX). Non-NN solutions
+        // (regression, clustering, classical models) keep the legacy
+        // flat-vector path below for backward compatibility.
+        if (currentSolution is AiDotNet.Interfaces.INeuralNetwork<T>)
+        {
+            return base.UpdateSolution(currentSolution, gradient);
+        }
         var parameters = InterfaceGuard.Parameterizable(currentSolution).GetParameters();
 
         // === Vectorized Adagrad Update using IEngine (Phase B: US-GPU-015) ===
 
@@ -758,6 +758,15 @@ protected override void UpdateAdaptiveParameters(OptimizationStepData<T, TInput,
     /// </summary>
     protected override IFullModel<T, TInput, TOutput> UpdateSolution(IFullModel<T, TInput, TOutput> currentSolution, Vector<T> gradient)
     {
+        // #1413 CONSOLIDATION: NN solutions go through base.UpdateSolution
+        // which synthesizes a TapeStepContext and delegates to Step
+        // (one source of truth, matches PyTorch/TF/JAX). Non-NN solutions
+        // (regression, clustering, classical models) keep the legacy
+        // flat-vector path below for backward compatibility.
+        if (currentSolution is AiDotNet.Interfaces.INeuralNetwork<T>)
+        {
+            return base.UpdateSolution(currentSolution, gradient);
+        }
         var parameters = InterfaceGuard.Parameterizable(currentSolution).GetParameters();
 
         if (_mQuantized == null && _mFullPrecision == null)
 
@@ -157,6 +157,19 @@ public override OptimizationResult<T, TInput, TOutput> Optimize(OptimizationInpu
         // previous run's running maximum as a lower bound and suppress
         // early updates in the new run. (PR #1350 round-2 review.)
         _vMaxVector = null;
+        // Reset the NN tape-side state. The flat-vector path got reset
+        // above; the tape path uses parameter-tensor-keyed dictionaries
+        // (_tapeM, _tapeV, _tapeVMax) and a separate _tapeStep counter
+        // that PERSIST across Optimize calls on the same optimizer
+        // instance. Without this clear, a second Optimize call on the
+        // same optimizer would carry the prior run's first/second moments
+        // (and AMSGrad's running maximum) plus a pre-advanced bias-
+        // correction counter, biasing every per-parameter step from
+        // iteration 1.
+        _tapeM.Clear();
+        _tapeV.Clear();
+        _tapeVMax.Clear();
+        _tapeStep = 0;
 
         // Initialize parameters
         InitializeAdaptiveParameters();
@@ -314,7 +327,15 @@ protected override void UpdateAdaptiveParameters(OptimizationStepData<T, TInput,
     }
 
     /// <summary>
-    /// Updates the current solution using the Adam update rule.
+    /// Updates the current solution using the Adam update rule. AdamOptimizer no
+    /// longer overrides this method (#1413); the base-class
+    /// <see cref="GradientBasedOptimizerBase{T,TInput,TOutput}.UpdateSolution"/>
+    /// intercepts NN solutions (<see cref="AiDotNet.Interfaces.INeuralNetwork{T}"/>)
+    /// and delegates to <see cref="Step(TapeStepContext{T})"/> via
+    /// <see cref="GradientBasedOptimizerBase{T,TInput,TOutput}.SynthesizeTapeStepContext"/>.
+    /// Non-NN models (regression, clustering, classical models) fall through to
+    /// UpdateParameters, eliminating the historical two-Adam-implementations split.
+    /// All NN training goes through Step, which has the anomaly guard + gradient clipping safeguards.
     /// </summary>
     /// <param name="currentSolution">The current solution being optimized.</param>
     /// <param name="gradient">The calculated gradient for the current solution.</param>
@@ -324,75 +345,16 @@ protected override void UpdateAdaptiveParameters(OptimizationStepData<T, TInput,
     /// It uses the current gradient and past information to decide how to change each parameter.
     /// </para>
     /// </remarks>
-    protected override IFullModel<T, TInput, TOutput> UpdateSolution(IFullModel<T, TInput, TOutput> currentSolution, Vector<T> gradient)
-    {
-        var parameters = InterfaceGuard.Parameterizable(currentSolution).GetParameters();
-
-        // Right-size _m/_v to gradient on first call or after lazy-layer expansion.
-        if (_m.Length != gradient.Length)
-        {
-            var newM = new Vector<T>(gradient.Length);
-            var newV = new Vector<T>(gradient.Length);
-            int copyLen = Math.Min(_m.Length, gradient.Length);
-            for (int i = 0; i < copyLen; i++) { newM[i] = _m[i]; newV[i] = _v[i]; }
-            _m = newM;
-            _v = newV;
-        }
-
-        // === Vectorized Adam Update using IEngine ===
-        // Phase B: US-GPU-015 - GPU-accelerated gradient updates
-
-        T oneMinusBeta1 = NumOps.Subtract(NumOps.One, _currentBeta1);
-        T oneMinusBeta2 = NumOps.Subtract(NumOps.One, _currentBeta2);
-        T biasCorrection1 = NumOps.Subtract(NumOps.One, NumOps.Power(_currentBeta1, NumOps.FromDouble(_t)));
-        T biasCorrection2 = NumOps.Subtract(NumOps.One, NumOps.Power(_currentBeta2, NumOps.FromDouble(_t)));
-        T epsilon = NumOps.FromDouble(_options.Epsilon);
-
-        // Update biased first moment: m = beta1 * m + (1 - beta1) * gradient
-        var mScaled = (Vector<T>)Engine.Multiply(_m, _currentBeta1);
-        var gradScaled = (Vector<T>)Engine.Multiply(gradient, oneMinusBeta1);
-        _m = (Vector<T>)Engine.Add(mScaled, gradScaled);
-
-        // Update biased second moment: v = beta2 * v + (1 - beta2) * gradient^2
-        var gradSquared = (Vector<T>)Engine.Multiply(gradient, gradient);
-        var vScaled = (Vector<T>)Engine.Multiply(_v, _currentBeta2);
-        var gradSquaredScaled = (Vector<T>)Engine.Multiply(gradSquared, oneMinusBeta2);
-        _v = (Vector<T>)Engine.Add(vScaled, gradSquaredScaled);
-
-        // Compute bias-corrected first moment: mHat = m / (1 - beta1^t)
-        var mHat = (Vector<T>)Engine.Divide(_m, biasCorrection1);
-
-        // Compute bias-corrected second moment: vHat = v / (1 - beta2^t)
-        var vHat = (Vector<T>)Engine.Divide(_v, biasCorrection2);
-
-        // AMSGrad: when enabled, divide by sqrt(running max of v̂) instead
-        // of sqrt(v̂) — the same correction the vector UpdateParameters
-        // path applies. Without this branch the UpdateSolution path
-        // silently ran plain Adam even with UseAMSGrad=true, defeating
-        // the purpose of the AMSGrad option on the BuildAsync/Optimize
-        // call path. PR #1350 review.
-        var vHatForDenominator = vHat;
-        if (_options.UseAMSGrad)
-        {
-            if (_vMaxVector is null || _vMaxVector.Length != vHat.Length)
-                _vMaxVector = new Vector<T>(vHat.Length);
-            _vMaxVector = (Vector<T>)Engine.Max(_vMaxVector, vHat);
-            vHatForDenominator = _vMaxVector;
-        }
-
-        // Compute update: update = learningRate * mHat / (sqrt(vHat_used) + epsilon)
-        var vHatSqrt = (Vector<T>)Engine.Sqrt(vHatForDenominator);
-        // Create epsilon vector for addition
-        var epsilonVec = Vector<T>.CreateDefault(vHatSqrt.Length, epsilon);
-        var denominator = (Vector<T>)Engine.Add(vHatSqrt, epsilonVec);
-        var updateDiv = (Vector<T>)Engine.Divide(mHat, denominator);
-        var update = (Vector<T>)Engine.Multiply(updateDiv, CurrentLearningRate);
-
-        // Apply update: parameters = parameters - update
-        var updatedParams = (Vector<T>)Engine.Subtract(parameters, update);
-
-        return InterfaceGuard.Parameterizable(currentSolution).WithParameters(updatedParams);
-    }
+    // #1413 ARCHITECTURAL CONSOLIDATION: AdamOptimizer's flat-vector
+    // UpdateSolution override is REMOVED. NN solutions go through the base
+    // class's UpdateSolution which synthesizes a TapeStepContext from the
+    // flat gradient and delegates to Step(TapeStepContext) — the SAME code
+    // path the per-sample nn.Train bypass uses, with the SAME anomaly
+    // guard, gradient clipping, AMSGrad, and float-loop fast path. Non-NN
+    // solutions fall through to the base's UpdateParameters dispatch which
+    // resolves to AdamOptimizer.UpdateParameters (still present below).
+    // This is the elimination of the two-Adam-implementations split that
+    // caused #1380.
 
     /// <summary>
     /// Updates a vector of parameters using the Adam optimization algorithm.
@@ -1450,6 +1412,22 @@ private bool AnyGradientIsAnomalous(TapeStepContext<T> context)
         return false;
     }
 
+    /// <summary>
+    /// Flat-vector overload of <see cref="AnyGradientIsAnomalous(TapeStepContext{T})"/>:
+    /// returns true at the first element that is NaN or infinite once converted
+    /// to double, false otherwise. NOTE(review): this change adds no caller, and
+    /// the AdamOptimizer UpdateSolution override it targeted is removed — confirm need.
+    /// </summary>
+    private bool AnyGradientIsAnomalous(Vector<T> gradient)
+    {
+        for (int i = 0; i < gradient.Length; i++)
+        {
+            double v = NumOps.ToDouble(gradient[i]);
+            if (double.IsNaN(v) || double.IsInfinity(v)) return true;
+        }
+        return false;
+    }
+
     private static void ApplyGlobalNormGradientClipping(
         TapeStepContext<T> context,
         double maxNorm)
 
@@ -255,6 +255,15 @@ protected override void UpdateAdaptiveParameters(OptimizationStepData<T, TInput,
     /// <returns>A new solution with updated parameters.</returns>
     protected override IFullModel<T, TInput, TOutput> UpdateSolution(IFullModel<T, TInput, TOutput> currentSolution, Vector<T> gradient)
     {
+        // #1413 CONSOLIDATION: NN solutions go through base.UpdateSolution
+        // which synthesizes a TapeStepContext and delegates to Step
+        // (one source of truth, matches PyTorch/TF/JAX). Non-NN solutions
+        // (regression, clustering, classical models) keep the legacy
+        // flat-vector path below for backward compatibility.
+        if (currentSolution is AiDotNet.Interfaces.INeuralNetwork<T>)
+        {
+            return base.UpdateSolution(currentSolution, gradient);
+        }
         var parameters = InterfaceGuard.Parameterizable(currentSolution).GetParameters();
 
         // Right-size _m/_v/_vMax to gradient on first call or after lazy-layer
 
@@ -344,6 +344,15 @@ public override OptimizationResult<T, TInput, TOutput> Optimize(OptimizationInpu
     /// <returns>The updated solution.</returns>
     protected override IFullModel<T, TInput, TOutput> UpdateSolution(IFullModel<T, TInput, TOutput> currentSolution, Vector<T> gradient)
     {
+        // #1413 CONSOLIDATION: NN solutions go through base.UpdateSolution
+        // which synthesizes a TapeStepContext and delegates to Step
+        // (one source of truth, matches PyTorch/TF/JAX). Non-NN solutions
+        // (regression, clustering, classical models) keep the legacy
+        // flat-vector path below for backward compatibility.
+        if (currentSolution is AiDotNet.Interfaces.INeuralNetwork<T>)
+        {
+            return base.UpdateSolution(currentSolution, gradient);
+        }
         // === Partially Vectorized FTRL Update using IEngine (Phase B: US-GPU-015) ===
         // FTRL uses L1 thresholding which requires conditional logic per-element
         // Vectorized: gradient operations, sigma calculation, state updates