fix: resolve production issues in time series models

ooples · claude · ooples · commit 65efe4cccd5d · 2025-12-12T14:40:42.000-05:00
InformerModel:
- Fixed options divergence using constructor chaining pattern
- Added seed parameter to encoder/decoder blocks for unique weights per layer
- Made _embeddingDim non-readonly for deserialization compatibility
- Encoder Forward() now uses both attention and FFN weights
- Decoder Forward() now uses both self-attention and cross-attention weights
- Set _embeddingDim from deserialized matrix size in both blocks
- Fixed layer initialization to pass different seeds per layer

DeepARModel:
- UpdateWeights now updates ALL mean weights, mean bias, scale weights, and scale bias
- Previously only updated 5 out of 64+ weights per batch (tiny subset)

DeepANT:
- Fixed options divergence using constructor chaining pattern
- Fixed copy constructor to copy all base class properties
- Fixed ConvLayer kernel shape from [out, in*k] to [out, k] - was wasting weights
- Added seed parameter to ConvLayer for unique weights per layer

ChronosFoundationModel:
- Fixed options divergence using constructor chaining pattern
- Fixed copy constructor to copy all base class properties
- Added seed parameter to TransformerBlock for unique weights per layer
- Fixed layer initialization to pass different seeds per layer

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/src/TimeSeries/AnomalyDetection/DeepANT.cs b/src/TimeSeries/AnomalyDetection/DeepANT.cs
@@ -44,24 +44,33 @@ public class DeepANT<T> : TimeSeriesModelBase<T>
     /// Initializes a new instance of the DeepANT class.
     /// </summary>
     public DeepANT(DeepANTOptions<T>? options = null)
-        : base(options ?? new DeepANTOptions<T>())
+        : this(options ?? new DeepANTOptions<T>(), initializeModel: true)
     {
-        _options = options ?? new DeepANTOptions<T>();
+    }
+
+    /// <summary>
+    /// Private constructor for proper options instance management.
+    /// </summary>
+    private DeepANT(DeepANTOptions<T> options, bool initializeModel)
+        : base(options)
+    {
+        _options = options;
         _numOps = MathHelper.GetNumericOperations<T>();
         _convLayers = new List<ConvLayer<T>>();
         _anomalyThreshold = _numOps.FromDouble(3.0); // 3 sigma by default
 
-        InitializeModel();
+        if (initializeModel)
+            InitializeModel();
     }
 
     private void InitializeModel()
     {
         var random = new Random(42);
 
-        // Initialize convolutional layers
+        // Initialize convolutional layers with different seeds
         _convLayers.Clear();
-        _convLayers.Add(new ConvLayer<T>(_options.WindowSize, 32, 3));
-        _convLayers.Add(new ConvLayer<T>(32, 32, 3));
+        _convLayers.Add(new ConvLayer<T>(_options.WindowSize, 32, 3, seed: 42));
+        _convLayers.Add(new ConvLayer<T>(32, 32, 3, seed: 1042));
 
         // Initialize fully connected output layer
         double stddev = Math.Sqrt(2.0 / 32);
@@ -322,10 +331,23 @@ public DeepANTOptions() { }
     public DeepANTOptions(DeepANTOptions<T> other)
     {
         if (other == null) throw new ArgumentNullException(nameof(other));
+        // Copy DeepANT-specific properties
         WindowSize = other.WindowSize;
         LearningRate = other.LearningRate;
         Epochs = other.Epochs;
         BatchSize = other.BatchSize;
+
+        // Copy TimeSeriesRegressionOptions properties
+        LagOrder = other.LagOrder;
+        IncludeTrend = other.IncludeTrend;
+        SeasonalPeriod = other.SeasonalPeriod;
+        AutocorrelationCorrection = other.AutocorrelationCorrection;
+        ModelType = other.ModelType;
+        LossFunction = other.LossFunction;
+
+        // Copy RegressionOptions properties
+        DecompositionMethod = other.DecompositionMethod;
+        UseIntercept = other.UseIntercept;
     }
 }
 
@@ -335,25 +357,24 @@ public DeepANTOptions(DeepANTOptions<T> other)
 internal class ConvLayer<T>
 {
     private readonly INumericOperations<T> _numOps;
-    private readonly int _inputChannels;
     private readonly int _outputChannels;
     private readonly int _kernelSize;
     private readonly Matrix<T> _kernels;
     private readonly Vector<T> _biases;
 
     public int ParameterCount => _kernels.Rows * _kernels.Columns + _biases.Length;
 
-    public ConvLayer(int inputChannels, int outputChannels, int kernelSize)
+    public ConvLayer(int inputChannels, int outputChannels, int kernelSize, int seed = 42)
     {
         _numOps = MathHelper.GetNumericOperations<T>();
-        _inputChannels = inputChannels;
         _outputChannels = outputChannels;
         _kernelSize = kernelSize;
 
-        var random = new Random(42);
-        double stddev = Math.Sqrt(2.0 / ((double)inputChannels * kernelSize));
+        var random = new Random(seed);
+        // Use kernelSize weights per output channel for 1D convolution
+        double stddev = Math.Sqrt(2.0 / kernelSize);
 
-        _kernels = new Matrix<T>(outputChannels, inputChannels * kernelSize);
+        _kernels = new Matrix<T>(outputChannels, kernelSize);
         for (int i = 0; i < _kernels.Rows; i++)
             for (int j = 0; j < _kernels.Columns; j++)
                 _kernels[i, j] = _numOps.FromDouble((random.NextDouble() * 2 - 1) * stddev);
@@ -379,11 +400,10 @@ public Vector<T> Forward(Vector<T> input)
             {
                 T positionSum = _biases[outChannel];
 
-                // Apply kernel at this position
+                // Apply kernel at this position - use all kernelSize weights
                 for (int k = 0; k < _kernelSize && (pos + k) < input.Length; k++)
                 {
-                    int kernelIdx = k % _kernels.Columns;
-                    T weight = _kernels[outChannel, kernelIdx];
+                    T weight = _kernels[outChannel, k];
                     T inputVal = input[pos + k];
                     positionSum = _numOps.Add(positionSum, _numOps.Multiply(weight, inputVal));
                 }
diff --git a/src/TimeSeries/ChronosFoundationModel.cs b/src/TimeSeries/ChronosFoundationModel.cs
@@ -43,14 +43,23 @@ public class ChronosFoundationModel<T> : TimeSeriesModelBase<T>
     private Vector<T> _outputBias = new Vector<T>(0);
 
     public ChronosFoundationModel(ChronosOptions<T>? options = null)
-        : base(options ?? new ChronosOptions<T>())
+        : this(options ?? new ChronosOptions<T>(), initializeModel: true)
     {
-        _options = options ?? new ChronosOptions<T>();
+    }
+
+    /// <summary>
+    /// Private constructor for proper options instance management.
+    /// </summary>
+    private ChronosFoundationModel(ChronosOptions<T> options, bool initializeModel)
+        : base(options)
+    {
+        _options = options;
         _numOps = MathHelper.GetNumericOperations<T>();
         _vocabularySize = _options.VocabularySize;
         _transformerLayers = new List<TransformerBlock<T>>();
 
-        InitializeModel();
+        if (initializeModel)
+            InitializeModel();
     }
 
     private void InitializeModel()
@@ -73,10 +82,10 @@ private void InitializeModel()
             for (int j = 0; j < _tokenEmbeddings.Columns; j++)
                 _tokenEmbeddings[i, j] = _numOps.FromDouble((random.NextDouble() * 2 - 1) * stddev);
 
-        // Transformer layers
+        // Transformer layers - use different seeds for each layer
         for (int i = 0; i < _options.NumLayers; i++)
         {
-            _transformerLayers.Add(new TransformerBlock<T>(_options.EmbeddingDim, _options.NumHeads));
+            _transformerLayers.Add(new TransformerBlock<T>(_options.EmbeddingDim, _options.NumHeads, seed: 42 + i * 1000));
         }
 
         // Output projection (back to vocabulary)
@@ -422,6 +431,7 @@ public ChronosOptions() { }
     public ChronosOptions(ChronosOptions<T> other)
     {
         if (other == null) throw new ArgumentNullException(nameof(other));
+        // Copy Chronos-specific properties
         ContextLength = other.ContextLength;
         ForecastHorizon = other.ForecastHorizon;
         VocabularySize = other.VocabularySize;
@@ -430,6 +440,18 @@ public ChronosOptions(ChronosOptions<T> other)
         NumHeads = other.NumHeads;
         LearningRate = other.LearningRate;
         Epochs = other.Epochs;
+
+        // Copy TimeSeriesRegressionOptions properties
+        LagOrder = other.LagOrder;
+        IncludeTrend = other.IncludeTrend;
+        SeasonalPeriod = other.SeasonalPeriod;
+        AutocorrelationCorrection = other.AutocorrelationCorrection;
+        ModelType = other.ModelType;
+        LossFunction = other.LossFunction;
+
+        // Copy RegressionOptions properties
+        DecompositionMethod = other.DecompositionMethod;
+        UseIntercept = other.UseIntercept;
     }
 }
 
@@ -443,10 +465,10 @@ internal class TransformerBlock<T>
 
     public int ParameterCount => _weights.Rows * _weights.Columns;
 
-    public TransformerBlock(int embeddingDim, int numHeads)
+    public TransformerBlock(int embeddingDim, int numHeads, int seed = 42)
     {
         _numOps = MathHelper.GetNumericOperations<T>();
-        var random = new Random(42);
+        var random = new Random(seed);
         double stddev = Math.Sqrt(2.0 / embeddingDim);
 
         _weights = new Matrix<T>(embeddingDim, embeddingDim);
diff --git a/src/TimeSeries/DeepARModel.cs b/src/TimeSeries/DeepARModel.cs
@@ -181,11 +181,12 @@ private T ComputeBatchLoss(Matrix<T> x, Vector<T> y, int batchStart, int batchEn
     private void UpdateWeights(Matrix<T> x, Vector<T> y, int batchStart, int batchEnd, T learningRate)
     {
         T epsilon = _numOps.FromDouble(1e-6);
+        T twoEpsilon = _numOps.Multiply(_numOps.FromDouble(2.0), epsilon);
 
-        // Update mean weights (sample a few for efficiency)
-        for (int i = 0; i < Math.Min(5, _meanWeights.Rows); i++)
+        // Update all mean weights (shape is [1, HiddenSize])
+        for (int i = 0; i < _meanWeights.Rows; i++)
         {
-            for (int j = 0; j < Math.Min(5, _meanWeights.Columns); j++)
+            for (int j = 0; j < _meanWeights.Columns; j++)
             {
                 T original = _meanWeights[i, j];
 
@@ -197,14 +198,64 @@ private void UpdateWeights(Matrix<T> x, Vector<T> y, int batchStart, int batchEn
 
                 _meanWeights[i, j] = original;
 
-                T gradient = _numOps.Divide(
-                    _numOps.Subtract(lossPlus, lossMinus),
-                    _numOps.Multiply(_numOps.FromDouble(2.0), epsilon)
-                );
-
+                T gradient = _numOps.Divide(_numOps.Subtract(lossPlus, lossMinus), twoEpsilon);
                 _meanWeights[i, j] = _numOps.Subtract(original, _numOps.Multiply(learningRate, gradient));
             }
         }
+
+        // Update mean bias
+        for (int i = 0; i < _meanBias.Length; i++)
+        {
+            T original = _meanBias[i];
+
+            _meanBias[i] = _numOps.Add(original, epsilon);
+            T lossPlus = ComputeBatchLoss(x, y, batchStart, batchEnd);
+
+            _meanBias[i] = _numOps.Subtract(original, epsilon);
+            T lossMinus = ComputeBatchLoss(x, y, batchStart, batchEnd);
+
+            _meanBias[i] = original;
+
+            T gradient = _numOps.Divide(_numOps.Subtract(lossPlus, lossMinus), twoEpsilon);
+            _meanBias[i] = _numOps.Subtract(original, _numOps.Multiply(learningRate, gradient));
+        }
+
+        // Update all scale weights (shape is [1, HiddenSize])
+        for (int i = 0; i < _scaleWeights.Rows; i++)
+        {
+            for (int j = 0; j < _scaleWeights.Columns; j++)
+            {
+                T original = _scaleWeights[i, j];
+
+                _scaleWeights[i, j] = _numOps.Add(original, epsilon);
+                T lossPlus = ComputeBatchLoss(x, y, batchStart, batchEnd);
+
+                _scaleWeights[i, j] = _numOps.Subtract(original, epsilon);
+                T lossMinus = ComputeBatchLoss(x, y, batchStart, batchEnd);
+
+                _scaleWeights[i, j] = original;
+
+                T gradient = _numOps.Divide(_numOps.Subtract(lossPlus, lossMinus), twoEpsilon);
+                _scaleWeights[i, j] = _numOps.Subtract(original, _numOps.Multiply(learningRate, gradient));
+            }
+        }
+
+        // Update scale bias
+        for (int i = 0; i < _scaleBias.Length; i++)
+        {
+            T original = _scaleBias[i];
+
+            _scaleBias[i] = _numOps.Add(original, epsilon);
+            T lossPlus = ComputeBatchLoss(x, y, batchStart, batchEnd);
+
+            _scaleBias[i] = _numOps.Subtract(original, epsilon);
+            T lossMinus = ComputeBatchLoss(x, y, batchStart, batchEnd);
+
+            _scaleBias[i] = original;
+
+            T gradient = _numOps.Divide(_numOps.Subtract(lossPlus, lossMinus), twoEpsilon);
+            _scaleBias[i] = _numOps.Subtract(original, _numOps.Multiply(learningRate, gradient));
+        }
     }
 
     /// <summary>
diff --git a/src/TimeSeries/InformerModel.cs b/src/TimeSeries/InformerModel.cs