ooples
diff --git a/‎src/AiModelBuilder.cs‎
Lines changed: 156 additions & 56 deletions b/‎src/AiModelBuilder.cs‎
Lines changed: 156 additions & 56 deletions
diff --git a/‎src/LoRA/DefaultLoRAConfiguration.cs‎
Lines changed: 68 additions & 38 deletions b/‎src/LoRA/DefaultLoRAConfiguration.cs‎
Lines changed: 68 additions & 38 deletions
@@ -2813,73 +2813,159 @@ void OnAutoMLCandidate(IFullModel<T, TInput, TOutput> candidate)
         {
             System.Diagnostics.Trace.TraceInformation("Applying LoRA adapters to neural network layers...");
 
-            // Warmup forward to materialise lazy-init layers BEFORE LoRA
-            // wrapping. LoRAAdapterBase.CreateLoRALayer needs the
-            // layer's input/output dimensions at adapter-construction
-            // time; lazy layers (LayerNorm gamma/beta, MultiHeadAttention
-            // lazy weight banks) report (0, …) until first Forward
-            // materialises the shape. Without the warmup, LoRALayer's
-            // ctor would throw ArgumentOutOfRangeException("Output size
-            // must be positive"). Best-effort: if the warmup throws
-            // (e.g. the user wired a forward path that requires training
-            // mode), the ApplyLoRA-side IsShapeResolved guard silently
-            // skips still-unresolved layers so the wrap loop succeeds on
-            // the materialised ones. Discovered by AiDotNet#1345 Bucket10
-            // ConfigureLoRA test.
-            try
+            // AiDotNet#1370 shape oracle: pre-loop asks every layer to declare its
+            // shape from constructor args alone (TryDeclareShape). Layers like
+            // MultiHeadAttentionLayer (knows embeddingDim from ctor) and any
+            // layer constructed with explicit shape (e.g. LayerNormalizationLayer
+            // with the featureSize ctor) return true without needing input.
+            // Lazy convs / inferred-shape layers still return false and trigger
+            // the existing warmup forward as a fallback.
+            // PR #1388 review C7iL5: TryDeclareShape() is a public virtual
+            // extension point — a custom layer override can throw arbitrary
+            // exceptions. Treat non-fatal failures as "shape not declared"
+            // (falls back to the warmup forward below), but let cancellation
+            // and OOM propagate so the host can still abort. Trace the
+            // failure with the layer type + full exception so the operator
+            // can diagnose silently-skipped declarations.
+            static bool TryDeclareShapeSafely(NeuralNetworks.Layers.LayerBase<T> layer)
             {
-                bool prevTrainingMode = neuralNetForLoRA.IsTrainingMode;
-                neuralNetForLoRA.SetTrainingMode(false);
                 try
                 {
-                    // One sample is enough to resolve lazy-layer shapes;
-                    // a full-dataset forward would do O(N) work and
-                    // allocate a full pass of activation tensors just to
-                    // shape-resolve. Carve off a 1-row probe.
-                    var warmupProbe = TrySliceFirstSampleForLoRAWarmup(x);
-                    var warmupResult = _model.Predict(warmupProbe);
-                    System.GC.KeepAlive(warmupResult);
+                    return layer.TryDeclareShape(); // virtual call: an override may throw, hence the filter below
                 }
-                finally
+                catch (Exception ex) when (
+                    ex is not OperationCanceledException
+                    && ex is not OutOfMemoryException
+                    && ex is not StackOverflowException) // NOTE(review): SOE terminates the process on modern .NET, so this clause is redundant
                 {
-                    neuralNetForLoRA.SetTrainingMode(prevTrainingMode);
+                    System.Diagnostics.Trace.TraceWarning(
+                        $"TryDeclareShape failed for {layer.GetType().FullName} — " +
+                        $"treating as 'needs warmup': {ex}");
+                    return false; // callers treat false as "shape not declared"
                 }
             }
-            catch (OperationCanceledException)
+
+            // PR #1388 review C8mvN: only let LoRA-targeted layers drive the
+            // warmup-skip decision. A non-target lazy layer (e.g. a lazy
+            // ActivationLayer or DropoutLayer) won't be wrapped by ApplyLoRA
+            // anyway — counting it as "needs warmup" forces the warmup
+            // forward needlessly on mixed networks. Use the configuration's
+            // own non-mutating eligibility predicate when available; for a
+            // custom ILoRAConfiguration implementation that doesn't expose
+            // one, fall back to "every LayerBase counts" (conservative —
+            // may force an unnecessary warmup, but never skips one
+            // incorrectly).
+            var loraTargetProbe = _loraConfiguration as LoRA.DefaultLoRAConfiguration<T>;
+
+            int declaredCount = 0;
+            int needsWarmupCount = 0;
+            for (int i = 0; i < neuralNetForLoRA.Layers.Count; i++)
             {
-                // Cancellation propagates — caller wants out, not a swallowed warmup.
-                throw;
+                var layer = neuralNetForLoRA.Layers[i];
+                if (layer is not NeuralNetworks.Layers.LayerBase<T> declarable)
+                {
+                    // Non-LayerBase<T> layers (rare, e.g. wrapper adapters from a
+                    // prior pass) bypass the oracle entirely — the ApplyLoRA call
+                    // handles its own shape probing.
+                    continue;
+                }
+                if (loraTargetProbe is not null && !loraTargetProbe.IsLoRATarget(declarable))
+                {
+                    // Not a LoRA target — its shape doesn't gate the warmup-skip
+                    // decision. Skip without bumping either counter.
+                    continue;
+                }
+                if (TryDeclareShapeSafely(declarable))
+                    declaredCount++;
+                else
+                    needsWarmupCount++;
             }
-            catch (OutOfMemoryException)
+
+            // If every counted layer declared its shape (only LoRA targets when the
+            // configuration is a DefaultLoRAConfiguration<T>, otherwise every
+            // LayerBase<T>), skip the warmup forward entirely. Unlike PyTorch /
+            // HuggingFace PEFT's construction-time shape requirement, we get zero-warmup
+            // behavior when shapes are known AND keep the warmup fallback for lazy layers.
+            bool skipWarmup = needsWarmupCount == 0;
+            if (skipWarmup)
             {
-                // Critical: don't mask. The host may need to abort.
-                // StackOverflowException is intentionally NOT listed —
-                // modern .NET terminates the process on SOE rather than
-                // letting it propagate, so a catch clause for it is
-                // unreachable (review #1368 C7mpq).
-                throw;
+                System.Diagnostics.Trace.TraceInformation(
+                    $"LoRA warmup forward SKIPPED — all {declaredCount} shape-aware layer(s) " +
+                    "declared shape from constructor args (AiDotNet#1370 shape oracle).");
             }
-            catch (Exception ex)
+            else
             {
-                // Best-effort warmup: documented forward-mode requirements
-                // (e.g. layers that need IsTrainingMode=true) can throw here.
-                // The ApplyLoRA-side IsShapeResolved guard silently skips
-                // still-unresolved layers so the wrap loop succeeds on
-                // materialized ones (review #1368 C6WOG: narrowed to let
-                // OperationCanceledException + OutOfMemoryException +
-                // StackOverflowException propagate; everything else is
-                // genuine warmup variance and stays as a Trace warning).
-                // Include ex.ToString() so the trace carries the full
-                // stack trace + inner exceptions, not just the top-frame
-                // message. Trace.TraceWarning is the only signal an
-                // operator has when the warmup fails silently (this PR's
-                // review C88M6: ex.Message dropped the origin frame and
-                // any chained inner exception, leaving a downstream
-                // skipped-lazy-layer mystery if the warmup actually
-                // failed inside an unrelated subsystem).
-                System.Diagnostics.Trace.TraceWarning(
-                    $"LoRA warmup forward failed (proceeding — layers that materialised get wrapped; " +
-                    $"lazy ones skipped via IsShapeResolved guard): {ex}");
+                System.Diagnostics.Trace.TraceInformation(
+                    $"LoRA warmup forward required — {needsWarmupCount} layer(s) still need a forward " +
+                    $"pass to resolve shape ({declaredCount} declared from ctor).");
+
+                // Warmup forward to materialise lazy-init layers that didn't
+                // self-declare. LoRAAdapterBase.CreateLoRALayer needs the
+                // layer's input/output dimensions at adapter-construction
+                // time; lazy layers that fall through TryDeclareShape report
+                // (0, …) until first Forward materialises the shape.
+                // Without the warmup, LoRALayer's ctor would throw
+                // ArgumentOutOfRangeException("Output size must be positive").
+                // Best-effort: if the warmup throws (e.g. the user wired a
+                // forward path that requires training mode), the ApplyLoRA-side
+                // IsShapeResolved guard silently skips still-unresolved layers
+                // so the wrap loop succeeds on the materialised ones.
+                // Discovered by AiDotNet#1345 Bucket10 ConfigureLoRA test.
+                try
+                {
+                    bool prevTrainingMode = neuralNetForLoRA.IsTrainingMode;
+                    neuralNetForLoRA.SetTrainingMode(false);
+                    try
+                    {
+                        // One sample is enough to resolve lazy-layer shapes;
+                        // a full-dataset forward would do O(N) work and
+                        // allocate a full pass of activation tensors just to
+                        // shape-resolve. Carve off a 1-row probe.
+                        var warmupProbe = TrySliceFirstSampleForLoRAWarmup(x);
+                        var warmupResult = _model.Predict(warmupProbe);
+                        System.GC.KeepAlive(warmupResult);
+                    }
+                    finally
+                    {
+                        neuralNetForLoRA.SetTrainingMode(prevTrainingMode);
+                    }
+                }
+                catch (OperationCanceledException)
+                {
+                    // Cancellation propagates — caller wants out, not a swallowed warmup.
+                    throw;
+                }
+                catch (OutOfMemoryException)
+                {
+                    // Critical: don't mask. The host may need to abort.
+                    // StackOverflowException is intentionally NOT listed —
+                    // modern .NET terminates the process on SOE rather than
+                    // letting it propagate, so a catch clause for it is
+                    // unreachable (review #1368 C7mpq).
+                    throw;
+                }
+                catch (Exception ex)
+                {
+                    // Best-effort warmup: documented forward-mode requirements
+                    // (e.g. layers that need IsTrainingMode=true) can throw here.
+                    // The ApplyLoRA-side IsShapeResolved guard silently skips
+                    // still-unresolved layers so the wrap loop succeeds on
+                    // materialised ones (review #1368 C6WOG: narrowed to let
+                    // OperationCanceledException + OutOfMemoryException
+                    // propagate; everything else is genuine warmup variance
+                    // and stays as a Trace warning).
+                    // Include ex.ToString() so the trace carries the full
+                    // stack trace + inner exceptions, not just the top-frame
+                    // message. Trace.TraceWarning is the only signal an
+                    // operator has when the warmup fails silently (this PR's
+                    // review C88M6: ex.Message dropped the origin frame and
+                    // any chained inner exception, leaving a downstream
+                    // skipped-lazy-layer mystery if the warmup actually
+                    // failed inside an unrelated subsystem).
+                    System.Diagnostics.Trace.TraceWarning(
+                        $"LoRA warmup forward failed (proceeding — layers that materialised get wrapped; " +
+                        $"lazy ones skipped via IsShapeResolved guard): {ex}");
+                }
             }
 
             int adaptedCount = 0;
@@ -2888,8 +2974,22 @@ void OnAutoMLCandidate(IFullModel<T, TInput, TOutput> candidate)
             {
                 var originalLayer = neuralNetForLoRA.Layers[i];
 
+                // AiDotNet#1370: gate on TryDeclareShape() rather than IsShapeResolved.
+                // Layers like MHA that allocate weights from ctor-known dims return true
+                // from TryDeclareShape even when InputShape still has a -1 seq placeholder
+                // — LoRA wraps weight matrices, the seq placeholder doesn't matter.
+                //
+                // PR #1388 follow-up review C9PtZ: only probe TryDeclareShape on
+                // layers that ApplyLoRA would actually wrap. A non-target lazy
+                // layer (e.g. a lazy ActivationLayer or DropoutLayer) would get
+                // its TryDeclareShape called, potentially allocating weights or
+                // emitting a Trace warning, only for ApplyLoRA below to return
+                // it unchanged. Gate on the same IsLoRATarget predicate the
+                // pre-scan loop uses so the side effects of TryDeclareShape
+                // only run for actual adaptation candidates.
                 if (originalLayer is NeuralNetworks.Layers.LayerBase<T> lazyCheck
-                    && !lazyCheck.IsShapeResolved)
+                    && (loraTargetProbe is null || loraTargetProbe.IsLoRATarget(lazyCheck))
+                    && !TryDeclareShapeSafely(lazyCheck))
                 {
                     skippedLazyCount++;
                     continue;
 
@@ -276,77 +276,107 @@ public ILayer<T> ApplyLoRA(ILayer<T> layer)
             return layer;
         }
 
-        // Check if this is a layer type that benefits from LoRA adaptation
-        // (layers with trainable weight matrices)
+        // Graph convolutional layers - use specialized GraphConvolutionalLoRAAdapter
+        // which implements IGraphConvolutionLayer<T> and properly delegates graph methods.
+        // Kept separate from the IsLoRATargetType type-whitelist (which uses
+        // CreateAdapter for everything else) because the GraphConvolutionalLoRAAdapter
+        // ctor takes (layer, Rank, Alpha, FreezeBaseLayer) directly rather than going
+        // through the standard CreateAdapter dispatch.
+        if (layer is IGraphConvolutionLayer<T>)
+        {
+            return new GraphConvolutionalLoRAAdapter<T>(layer, Rank, Alpha, FreezeBaseLayer);
+        }
 
-        // Dense/Linear layers
-        if (layer is DenseLayer<T> || layer is FullyConnectedLayer<T> || layer is FeedForwardLayer<T>)
+        if (IsLoRATargetType(layer))
         {
             return CreateAdapter(layer);
         }
 
+        // Return layers without trainable weights unchanged
+        // (Activation, Pooling, Dropout, Flatten, Reshape, Normalization, etc.)
+        return layer;
+    }
+
+    /// <summary>
+    /// Non-mutating predicate: returns <c>true</c> when this configuration would
+    /// wrap <paramref name="layer"/> with a LoRA adapter (modulo the
+    /// shape-resolved guard, which is independent of the layer type).
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// Shares the same layer-type whitelist as <see cref="ApplyLoRA"/> so a
+    /// caller (typically <see cref="AiModelBuilder{T,TInput,TOutput}"/>'s
+    /// pre-wrap warmup-skip decision) can probe which layers will actually
+    /// participate in the LoRA pass without paying for adapter construction.
+    /// Returns <c>true</c> for graph-convolutional layers too — they route
+    /// through <see cref="GraphConvolutionalLoRAAdapter{T}"/> in
+    /// <see cref="ApplyLoRA"/>, but the warmup-skip pre-scan only needs to
+    /// know "would I wrap this", not which adapter type.
+    /// </para>
+    /// <para>
+    /// AiDotNet#1370 PR #1388 review C8mvN — the pre-scan that decides
+    /// <c>skipWarmup</c> was treating every <see cref="LayerBase{T}"/> as a
+    /// LoRA candidate, which forced the warmup forward whenever ANY lazy
+    /// layer (even a non-target like a lazy Activation) hadn't declared.
+    /// This predicate lets the pre-scan restrict its count to actual LoRA
+    /// targets so non-target lazy layers don't block the zero-warmup path.
+    /// </para>
+    /// </remarks>
+    public bool IsLoRATarget(ILayer<T> layer)
+    {
+        if (layer is null) return false; // explicit null guard; the type patterns below would also reject null
+        return layer is IGraphConvolutionLayer<T> || IsLoRATargetType(layer); // graph layers get a dedicated adapter in ApplyLoRA
+    }
+
+    /// <summary>
+    /// Whitelist of concrete layer types that <see cref="ApplyLoRA"/> wraps via
+    /// <see cref="CreateAdapter"/>. Kept as a single private method so the
+    /// public <see cref="IsLoRATarget"/> probe and <see cref="ApplyLoRA"/>'s
+    /// dispatch can't drift.
+    /// </summary>
+    private static bool IsLoRATargetType(ILayer<T> layer)
+    {
+        // Dense/Linear layers
+        if (layer is DenseLayer<T> || layer is FullyConnectedLayer<T> || layer is FeedForwardLayer<T>)
+            return true;
+
         // Convolutional layers
         if (layer is ConvolutionalLayer<T> || layer is DeconvolutionalLayer<T> ||
             layer is DepthwiseSeparableConvolutionalLayer<T> || layer is DilatedConvolutionalLayer<T> ||
             layer is SeparableConvolutionalLayer<T> || layer is SubpixelConvolutionalLayer<T>)
-        {
-            return CreateAdapter(layer);
-        }
+            return true;
 
         // Recurrent layers (LSTM, GRU, etc.)
         if (layer is LSTMLayer<T> || layer is GRULayer<T> || layer is RecurrentLayer<T> ||
             layer is ConvLSTMLayer<T> || layer is BidirectionalLayer<T>)
-        {
-            return CreateAdapter(layer);
-        }
+            return true;
 
         // Attention layers
         if (layer is AttentionLayer<T> || layer is MultiHeadAttentionLayer<T> || layer is SelfAttentionLayer<T>)
-        {
-            return CreateAdapter(layer);
-        }
+            return true;
 
         // Transformer layers
         if (layer is TransformerEncoderLayer<T> || layer is TransformerDecoderLayer<T>)
-        {
-            return CreateAdapter(layer);
-        }
+            return true;
 
         // Embedding layers
         if (layer is EmbeddingLayer<T> || layer is PatchEmbeddingLayer<T>)
-        {
-            return CreateAdapter(layer);
-        }
+            return true;
 
         // Specialized layers with trainable weights
         if (layer is LocallyConnectedLayer<T> || layer is HighwayLayer<T> ||
             layer is GatedLinearUnitLayer<T> || layer is SqueezeAndExcitationLayer<T>)
-        {
-            return CreateAdapter(layer);
-        }
-
-        // Graph convolutional layers - use specialized GraphConvolutionalLoRAAdapter
-        // which implements IGraphConvolutionLayer<T> and properly delegates graph methods
-        if (layer is IGraphConvolutionLayer<T>)
-        {
-            return new GraphConvolutionalLoRAAdapter<T>(layer, Rank, Alpha, FreezeBaseLayer);
-        }
+            return true;
 
         // Capsule layers
         if (layer is CapsuleLayer<T> || layer is PrimaryCapsuleLayer<T> || layer is DigitCapsuleLayer<T>)
-        {
-            return CreateAdapter(layer);
-        }
+            return true;
 
         // CRF and other advanced layers
         if (layer is ConditionalRandomFieldLayer<T>)
-        {
-            return CreateAdapter(layer);
-        }
+            return true;
 
-        // Return layers without trainable weights unchanged
-        // (Activation, Pooling, Dropout, Flatten, Reshape, Normalization, etc.)
-        return layer;
+        return false; // not whitelisted: ApplyLoRA returns such a layer unchanged unless it is graph-convolutional
     }
 
     /// <summary>