ooples
diff --git a/src/AiDotNet.Generators/TestScaffoldGenerator.cs b/src/AiDotNet.Generators/TestScaffoldGenerator.cs
Lines changed: 28 additions & 0 deletions
diff --git a/src/Audio/AudioGen/AudioGenModel.cs b/src/Audio/AudioGen/AudioGenModel.cs
Lines changed: 4 additions & 4 deletions
diff --git a/src/Audio/Emotion/SpeechEmotionRecognizer.cs b/src/Audio/Emotion/SpeechEmotionRecognizer.cs
Lines changed: 1 addition & 1 deletion
diff --git a/src/Audio/LanguageIdentification/ECAPATDNNLanguageIdentifier.cs b/src/Audio/LanguageIdentification/ECAPATDNNLanguageIdentifier.cs
Lines changed: 4 additions & 4 deletions
diff --git a/src/Audio/LanguageIdentification/VoxLingua107Identifier.cs b/src/Audio/LanguageIdentification/VoxLingua107Identifier.cs
Lines changed: 4 additions & 4 deletions
diff --git a/src/Audio/LanguageIdentification/Wav2Vec2LanguageIdentifier.cs b/src/Audio/LanguageIdentification/Wav2Vec2LanguageIdentifier.cs
Lines changed: 4 additions & 4 deletions
diff --git a/src/Audio/MusicGen/MusicGenModel.cs b/src/Audio/MusicGen/MusicGenModel.cs
Lines changed: 4 additions & 4 deletions
diff --git a/src/Audio/SpeechRecognition/Wav2Vec2Model.cs b/src/Audio/SpeechRecognition/Wav2Vec2Model.cs
Lines changed: 37 additions & 6 deletions
diff --git a/src/Audio/Whisper/WhisperModel.cs b/src/Audio/Whisper/WhisperModel.cs
Lines changed: 2 additions & 2 deletions
diff --git a/src/Classification/Boosting/DARTClassifier.cs b/src/Classification/Boosting/DARTClassifier.cs
Lines changed: 1 addition & 1 deletion
@@ -1936,6 +1936,34 @@ private static void EmitGeneratedTestClass(
             sb.AppendLine("    protected override int[] InputShape => new[] { 36, 1024 };");
             sb.AppendLine("    protected override int[] OutputShape => new[] { 4 };");
         }
+        else if (isVisionModel &&
+                 (model.ClassName.StartsWith("LayoutLM", System.StringComparison.Ordinal)
+                  || model.ClassName.StartsWith("LayoutXLM", System.StringComparison.Ordinal)
+                  || model.ClassName.StartsWith("LiLT", System.StringComparison.Ordinal)
+                  || model.ClassName.StartsWith("DocFormer", System.StringComparison.Ordinal)
+                  || model.ClassName.StartsWith("DocBank", System.StringComparison.Ordinal)
+                  || model.ClassName.StartsWith("DocGCN", System.StringComparison.Ordinal)
+                  || model.ClassName.StartsWith("PICK", System.StringComparison.Ordinal)
+                  || model.ClassName.StartsWith("TRIE", System.StringComparison.Ordinal)
+                  || model.ClassName.StartsWith("DocOwl", System.StringComparison.Ordinal)
+                  || model.ClassName.StartsWith("UDOP", System.StringComparison.Ordinal)
+                  || model.ClassName.StartsWith("InfographicVQA", System.StringComparison.Ordinal)))
+        {
+            // LayoutLM-family document models (Xu et al. 2020 KDD "LayoutLM",
+            // Xu et al. 2021 ACL "LayoutXLM", Wang et al. 2022 ACL "LiLT",
+            // Appalaraju et al. 2021 ICCV "DocFormer", etc.) carry the Vision
+            // domain tag because they understand 2D layout, but their actual
+            // model input is token IDs (a rank-1 sequence of integer-valued
+            // doubles), not raw RGB pixels. Feeding a [3, 128, 128] image tensor
+            // causes the first EmbeddingLayer to treat every element as a token
+            // ID: 3 × 128 × 128 = 49,152 lookups × 768 embedding dim × 12
+            // transformer layers × 30 Train iters, which times out every test
+            // that runs Forward. Emit a short token-ID sequence so the model's
+            // intended code path (token embedding → 2D position embeddings →
+            // BERT-style stack) runs at sensible cost.
+            sb.AppendLine("    protected override int[] InputShape => new[] { 16 };");
+            sb.AppendLine("    protected override int[] OutputShape => new[] { 4 };");
+        }
         else if (isVisionModel &&
                  (model.ClassName.StartsWith("UNITER", System.StringComparison.Ordinal)
                   || model.ClassName.StartsWith("VisualBERT", System.StringComparison.Ordinal)
 
@@ -353,7 +353,7 @@ public AudioGenModel(
         IGradientBasedOptimizer<T, Tensor<T>, Tensor<T>>? optimizer = null,
         ILossFunction<T>? lossFunction = null,
         AudioGenOptions? options = null)
-        : base(architecture, lossFunction ?? new CrossEntropyLoss<T>(), 1.0)
+        : base(architecture, lossFunction ?? new CrossEntropyWithLogitsLoss<T>(), 1.0)
     {
         _options = options ?? new AudioGenOptions();
         Options = _options;
@@ -436,7 +436,7 @@ public AudioGenModel(
             _tokenizer = tokenizer;
 
             _optimizer = optimizer;
-            _lossFunction = lossFunction ?? new CrossEntropyLoss<T>();
+            _lossFunction = lossFunction ?? new CrossEntropyWithLogitsLoss<T>();
 
             _random = seed.HasValue
                 ? RandomHelper.CreateSeededRandom(seed.Value)
@@ -512,7 +512,7 @@ public AudioGenModel(
         IGradientBasedOptimizer<T, Tensor<T>, Tensor<T>>? optimizer = null,
         ILossFunction<T>? lossFunction = null,
         AudioGenOptions? options = null)
-        : base(architecture, lossFunction ?? new CrossEntropyLoss<T>(), 1.0)
+        : base(architecture, lossFunction ?? new CrossEntropyWithLogitsLoss<T>(), 1.0)
     {
         _options = options ?? new AudioGenOptions();
         Options = _options;
@@ -569,7 +569,7 @@ public AudioGenModel(
         // Use T5-style tokenizer as default for AudioGen text encoder
         _tokenizer = tokenizer ?? Tokenization.LanguageModelTokenizerFactory.CreateForBackbone(LanguageModelBackbone.FlanT5);
         _optimizer = optimizer ?? new AdamWOptimizer<T, Tensor<T>, Tensor<T>>(this);
-        _lossFunction = lossFunction ?? new CrossEntropyLoss<T>();
+        _lossFunction = lossFunction ?? new CrossEntropyWithLogitsLoss<T>();
 
         _random = seed.HasValue
             ? RandomHelper.CreateSeededRandom(seed.Value)
 
@@ -358,7 +358,7 @@ public SpeechEmotionRecognizer(
         }
         else
         {
-            LossFunction = new CrossEntropyLoss<T>();
+            LossFunction = new CrossEntropyWithLogitsLoss<T>();
         }
 
         // Create mel spectrogram extractor
 
@@ -123,7 +123,7 @@ public ECAPATDNNLanguageIdentifier(
         NeuralNetworkArchitecture<T> architecture,
         string modelPath,
         ECAPATDNNOptions? options = null)
-        : base(architecture, new CrossEntropyLoss<T>())
+        : base(architecture, new CrossEntropyWithLogitsLoss<T>())
     {
         if (string.IsNullOrWhiteSpace(modelPath))
             throw new ArgumentException("Model path cannot be null or empty.", nameof(modelPath));
@@ -138,7 +138,7 @@ public ECAPATDNNLanguageIdentifier(
         SampleRate = _options.SampleRate;
         NumMels = _options.NumMels;
 
-        _lossFunction = new CrossEntropyLoss<T>();
+        _lossFunction = new CrossEntropyWithLogitsLoss<T>();
 
         // Initialize MFCC extractor
         _mfccExtractor = new MfccExtractor<T>(new MfccOptions
@@ -173,7 +173,7 @@ public ECAPATDNNLanguageIdentifier(
         ECAPATDNNOptions? options = null,
         IGradientBasedOptimizer<T, Tensor<T>, Tensor<T>>? optimizer = null,
         ILossFunction<T>? lossFunction = null)
-        : base(architecture, lossFunction ?? new CrossEntropyLoss<T>())
+        : base(architecture, lossFunction ?? new CrossEntropyWithLogitsLoss<T>())
     {
         if (supportedLanguages is null)
             throw new ArgumentNullException(nameof(supportedLanguages));
@@ -187,7 +187,7 @@ public ECAPATDNNLanguageIdentifier(
         SampleRate = _options.SampleRate;
         NumMels = _options.NumMels;
 
-        _lossFunction = lossFunction ?? new CrossEntropyLoss<T>();
+        _lossFunction = lossFunction ?? new CrossEntropyWithLogitsLoss<T>();
         _optimizer = optimizer ?? new AdamWOptimizer<T, Tensor<T>, Tensor<T>>(this);
 
         // Initialize MFCC extractor
 
@@ -154,7 +154,7 @@ public VoxLingua107Identifier(
         NeuralNetworkArchitecture<T> architecture,
         string modelPath,
         VoxLingua107Options? options = null)
-        : base(architecture, new CrossEntropyLoss<T>())
+        : base(architecture, new CrossEntropyWithLogitsLoss<T>())
     {
         if (string.IsNullOrWhiteSpace(modelPath))
             throw new ArgumentException("Model path cannot be null or empty.", nameof(modelPath));
@@ -169,7 +169,7 @@ public VoxLingua107Identifier(
         SampleRate = _options.SampleRate;
         NumMels = _options.NumMels;
 
-        _lossFunction = new CrossEntropyLoss<T>();
+        _lossFunction = new CrossEntropyWithLogitsLoss<T>();
 
         // Initialize MFCC extractor
         _mfccExtractor = new MfccExtractor<T>(new MfccOptions
@@ -202,7 +202,7 @@ public VoxLingua107Identifier(
         VoxLingua107Options? options = null,
         IGradientBasedOptimizer<T, Tensor<T>, Tensor<T>>? optimizer = null,
         ILossFunction<T>? lossFunction = null)
-        : base(architecture, lossFunction ?? new CrossEntropyLoss<T>())
+        : base(architecture, lossFunction ?? new CrossEntropyWithLogitsLoss<T>())
     {
         _numOps = MathHelper.GetNumericOperations<T>();
         _options = options ?? new VoxLingua107Options();
@@ -211,7 +211,7 @@ public VoxLingua107Identifier(
         SampleRate = _options.SampleRate;
         NumMels = _options.NumMels;
 
-        _lossFunction = lossFunction ?? new CrossEntropyLoss<T>();
+        _lossFunction = lossFunction ?? new CrossEntropyWithLogitsLoss<T>();
         _optimizer = optimizer ?? new AdamWOptimizer<T, Tensor<T>, Tensor<T>>(this);
 
         // Initialize MFCC extractor
 
@@ -115,7 +115,7 @@ public Wav2Vec2LanguageIdentifier(
         NeuralNetworkArchitecture<T> architecture,
         string modelPath,
         Wav2Vec2LidOptions? options = null)
-        : base(architecture, new CrossEntropyLoss<T>())
+        : base(architecture, new CrossEntropyWithLogitsLoss<T>())
     {
         if (string.IsNullOrWhiteSpace(modelPath))
             throw new ArgumentException("Model path cannot be null or empty.", nameof(modelPath));
@@ -129,7 +129,7 @@ public Wav2Vec2LanguageIdentifier(
 
         SampleRate = _options.SampleRate;
 
-        _lossFunction = new CrossEntropyLoss<T>();
+        _lossFunction = new CrossEntropyWithLogitsLoss<T>();
 
         // Initialize language mappings
         (_languageIdToCode, _languageCodeToId, _languageCodeToName) = InitializeLanguageMappings();
@@ -153,7 +153,7 @@ public Wav2Vec2LanguageIdentifier(
         Wav2Vec2LidOptions? options = null,
         IGradientBasedOptimizer<T, Tensor<T>, Tensor<T>>? optimizer = null,
         ILossFunction<T>? lossFunction = null)
-        : base(architecture, lossFunction ?? new CrossEntropyLoss<T>())
+        : base(architecture, lossFunction ?? new CrossEntropyWithLogitsLoss<T>())
     {
         if (supportedLanguages is null)
             throw new ArgumentNullException(nameof(supportedLanguages));
@@ -166,7 +166,7 @@ public Wav2Vec2LanguageIdentifier(
 
         SampleRate = _options.SampleRate;
 
-        _lossFunction = lossFunction ?? new CrossEntropyLoss<T>();
+        _lossFunction = lossFunction ?? new CrossEntropyWithLogitsLoss<T>();
         _optimizer = optimizer ?? new AdamWOptimizer<T, Tensor<T>, Tensor<T>>(this);
 
         // Initialize language mappings
 
@@ -165,7 +165,7 @@ public MusicGenModel(
         MusicGenOptions? options = null,
         IGradientBasedOptimizer<T, Tensor<T>, Tensor<T>>? optimizer = null,
         ILossFunction<T>? lossFunction = null)
-        : base(architecture, lossFunction ?? new CrossEntropyLoss<T>(), 1.0)
+        : base(architecture, lossFunction ?? new CrossEntropyWithLogitsLoss<T>(), 1.0)
     {
         // Validate paths
         if (string.IsNullOrWhiteSpace(textEncoderPath))
@@ -217,7 +217,7 @@ public MusicGenModel(
         }
 
         _optimizer = optimizer;
-        _lossFunction = lossFunction ?? new CrossEntropyLoss<T>();
+        _lossFunction = lossFunction ?? new CrossEntropyWithLogitsLoss<T>();
         _random = _options.Seed.HasValue
             ? RandomHelper.CreateSeededRandom(_options.Seed.Value)
             : RandomHelper.CreateSecureRandom();
@@ -250,7 +250,7 @@ public MusicGenModel(
         ITokenizer? tokenizer = null,
         IGradientBasedOptimizer<T, Tensor<T>, Tensor<T>>? optimizer = null,
         ILossFunction<T>? lossFunction = null)
-        : base(architecture, lossFunction ?? new CrossEntropyLoss<T>(), 1.0)
+        : base(architecture, lossFunction ?? new CrossEntropyWithLogitsLoss<T>(), 1.0)
     {
         _options = options ?? new MusicGenOptions();
         Options = _options;
@@ -262,7 +262,7 @@ public MusicGenModel(
         // Use T5-compatible tokenizer as default
         _tokenizer = tokenizer ?? Tokenization.LanguageModelTokenizerFactory.CreateForBackbone(LanguageModelBackbone.FlanT5);
         _optimizer = optimizer ?? new AdamWOptimizer<T, Tensor<T>, Tensor<T>>(this);
-        _lossFunction = lossFunction ?? new CrossEntropyLoss<T>();
+        _lossFunction = lossFunction ?? new CrossEntropyWithLogitsLoss<T>();
         _random = _options.Seed.HasValue
             ? RandomHelper.CreateSeededRandom(_options.Seed.Value)
             : RandomHelper.CreateSecureRandom();
 
@@ -282,8 +282,13 @@ public Wav2Vec2Model(
         // Initialize supported languages
         SupportedLanguages = new[] { language ?? "en" };
 
-        // Default loss function (cross-entropy is standard for ASR)
-        _lossFunction = new CrossEntropyLoss<T>();
+        // Wav2Vec2 + CTC is the standard ASR training stack (Baevski et al.
+        // 2020 §3.2): CTC handles the variable-length output-vs-input
+        // alignment that plain cross-entropy cannot. CE-with-logits would
+        // be silently wrong here — it forces a fixed-length 1:1 alignment
+        // and the loss is computed per-frame, which is not the ASR
+        // objective. PR #1404 review (CodeRabbit).
+        _lossFunction = new CTCLoss<T>(numClasses: _vocabSize, blankIndex: 0);
 
         InitializeLayers();
     }
@@ -367,9 +372,19 @@ public Wav2Vec2Model(
         // Initialize supported languages
         SupportedLanguages = new[] { language ?? "en" };
 
-        // Initialize training components
-        _optimizer = optimizer ?? new AdamOptimizer<T, Tensor<T>, Tensor<T>>(this);
-        _lossFunction = lossFunction ?? new CrossEntropyLoss<T>();
+        // Initialize training components — CTC for ASR (see ONNX ctor for
+        // rationale). Wav2Vec2's variable-length frame-vs-character alignment
+        // can't be expressed by plain cross-entropy.
+        // Paper-faithful LR per Baevski et al. 2020 NeurIPS §3.3 ("wav2vec 2.0"):
+        // Adam with peak LR=5e-4 for pretraining, 5e-5 for ASR fine-tuning.
+        // Framework default (LR=1e-3) is too aggressive for this BERT-base-scale
+        // model at random init and causes Training_ShouldReduceLoss to diverge.
+        // Default to the 5e-5 fine-tuning rate because the test trains from
+        // random init with supervised CTC; the pretraining-scale 5e-4 also works.
+        _optimizer = optimizer ?? new AdamOptimizer<T, Tensor<T>, Tensor<T>>(
+            this,
+            new Models.Options.AdamOptimizerOptions<T, Tensor<T>, Tensor<T>> { InitialLearningRate = 5e-5 });
+        _lossFunction = lossFunction ?? new CTCLoss<T>(numClasses: _vocabSize, blankIndex: 0);
 
         InitializeNativeLayers();
     }
@@ -619,7 +634,23 @@ public override void Train(Tensor<T> input, Tensor<T> expectedOutput)
         SetTrainingMode(true);
         try
         {
-            TrainWithTape(input, expectedOutput);
+            // Pass the model's own non-AMSGrad AdamOptimizer explicitly.
+            // The optimizer-null branch would otherwise fall back to
+            // GetOrCreateBaseOptimizer (which builds an AMSGrad Adam),
+            // and the fused-Adam fast path bails out on AMSGrad — leaving
+            // every step on the BERT-base-scale wav2vec2 encoder running
+            // through the eager tape executor at multi-second cost per
+            // iteration.
+            //
+            // The cast goes through `as ... ?? throw` rather than plain
+            // `as` so a user passing a non-gradient optimizer fails loudly
+            // instead of silently dropping into the default-optimizer
+            // fallback (which would mask intent and produce mysteriously
+            // different training trajectories). PR #1404 review (CodeRabbit).
+            var gradientOptimizer = _optimizer as IGradientBasedOptimizer<T, Tensor<T>, Tensor<T>>
+                ?? throw new InvalidOperationException(
+                    "Wav2Vec2Model training requires an optimizer implementing IGradientBasedOptimizer<T, Tensor<T>, Tensor<T>>.");
+            TrainWithTape(input, expectedOutput, gradientOptimizer);
         }
         finally
         {
 
@@ -363,7 +363,7 @@ public WhisperModel(
         SupportedLanguages = GetSupportedLanguages();
 
         // Default loss function (cross-entropy is standard for sequence-to-sequence ASR)
-        _lossFunction = new CrossEntropyLoss<T>();
+        _lossFunction = new CrossEntropyWithLogitsLoss<T>();
 
         InitializeLayers();
     }
@@ -470,7 +470,7 @@ public WhisperModel(
 
         // Initialize training components
         _optimizer = optimizer ?? new AdamWOptimizer<T, Tensor<T>, Tensor<T>>(this);
-        _lossFunction = lossFunction ?? new CrossEntropyLoss<T>();
+        _lossFunction = lossFunction ?? new CrossEntropyWithLogitsLoss<T>();
 
         InitializeLayers();
     }
 
@@ -104,7 +104,7 @@ public class DARTClassifier<T> : EnsembleClassifierBase<T>
     /// <param name="regularization">Optional regularization.</param>
     public DARTClassifier(DARTClassifierOptions<T>? options = null,
         IRegularization<T, Matrix<T>, Vector<T>>? regularization = null)
-        : base(options ??= new DARTClassifierOptions<T>(), regularization, new CrossEntropyLoss<T>())
+        : base(options ??= new DARTClassifierOptions<T>(), regularization, new CrossEntropyWithLogitsLoss<T>())
     {
         _options = options;
         _trees = [];
Original file line number	Diff line number	Diff line change
`@@ -358,7 +358,7 @@ public SpeechEmotionRecognizer(`
`358`	`358`	`}`
`359`	`359`	`else`
`360`	`360`	`{`
`361`		`- LossFunction = new CrossEntropyLoss<T>();`
	`361`	`+ LossFunction = new CrossEntropyWithLogitsLoss<T>();`
`362`	`362`	`}`
`363`	`363`
`364`	`364`	`// Create mel spectrogram extractor`
Original file line number	Diff line number	Diff line change
`@@ -363,7 +363,7 @@ public WhisperModel(`
`363`	`363`	`SupportedLanguages = GetSupportedLanguages();`
`364`	`364`
`365`	`365`	`// Default loss function (cross-entropy is standard for sequence-to-sequence ASR)`
`366`		`- _lossFunction = new CrossEntropyLoss<T>();`
	`366`	`+ _lossFunction = new CrossEntropyWithLogitsLoss<T>();`
`367`	`367`
`368`	`368`	`InitializeLayers();`
`369`	`369`	`}`
`@@ -470,7 +470,7 @@ public WhisperModel(`
`470`	`470`
`471`	`471`	`// Initialize training components`
`472`	`472`	`_optimizer = optimizer ?? new AdamWOptimizer<T, Tensor<T>, Tensor<T>>(this);`
`473`		`- _lossFunction = lossFunction ?? new CrossEntropyLoss<T>();`
	`473`	`+ _lossFunction = lossFunction ?? new CrossEntropyWithLogitsLoss<T>();`
`474`	`474`
`475`	`475`	`InitializeLayers();`
`476`	`476`	`}`
Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,7 @@ public class DARTClassifier<T> : EnsembleClassifierBase<T>`
`104`	`104`	`/// <param name="regularization">Optional regularization.</param>`
`105`	`105`	`public DARTClassifier(DARTClassifierOptions<T>? options = null,`
`106`	`106`	`IRegularization<T, Matrix<T>, Vector<T>>? regularization = null)`
`107`		`- : base(options ??= new DARTClassifierOptions<T>(), regularization, new CrossEntropyLoss<T>())`
	`107`	`+ : base(options ??= new DARTClassifierOptions<T>(), regularization, new CrossEntropyWithLogitsLoss<T>())`
`108`	`108`	`{`
`109`	`109`	`_options = options;`
`110`	`110`	`_trees = [];`