Skip to content

Commit 65efe4c

Browse files
ooples and claude committed
fix: resolve production issues in time series models
InformerModel:
- Fixed options divergence using constructor chaining pattern
- Added seed parameter to encoder/decoder blocks for unique weights per layer
- Made _embeddingDim non-readonly for deserialization compatibility
- Encoder Forward() now uses both attention and FFN weights
- Decoder Forward() now uses both self-attention and cross-attention weights
- Set _embeddingDim from deserialized matrix size in both blocks
- Fixed layer initialization to pass different seeds per layer

DeepARModel:
- UpdateWeights now updates ALL mean weights, mean bias, scale weights, and scale bias
- Previously only updated 5 out of 64+ weights per batch (tiny subset)

DeepANT:
- Fixed options divergence using constructor chaining pattern
- Fixed copy constructor to copy all base class properties
- Fixed ConvLayer kernel shape from [out, in*k] to [out, k] - was wasting weights
- Added seed parameter to ConvLayer for unique weights per layer

ChronosFoundationModel:
- Fixed options divergence using constructor chaining pattern
- Fixed copy constructor to copy all base class properties
- Added seed parameter to TransformerBlock for unique weights per layer
- Fixed layer initialization to pass different seeds per layer

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 0cb9224 commit 65efe4c

4 files changed

Lines changed: 180 additions & 50 deletions

File tree

src/TimeSeries/AnomalyDetection/DeepANT.cs

Lines changed: 35 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -44,24 +44,33 @@ public class DeepANT<T> : TimeSeriesModelBase<T>
4444
/// Initializes a new instance of the DeepANT class.
4545
/// </summary>
4646
public DeepANT(DeepANTOptions<T>? options = null)
47-
: base(options ?? new DeepANTOptions<T>())
47+
: this(options ?? new DeepANTOptions<T>(), initializeModel: true)
4848
{
49-
_options = options ?? new DeepANTOptions<T>();
49+
}
50+
51+
/// <summary>
52+
/// Private constructor for proper options instance management.
53+
/// </summary>
54+
private DeepANT(DeepANTOptions<T> options, bool initializeModel)
55+
: base(options)
56+
{
57+
_options = options;
5058
_numOps = MathHelper.GetNumericOperations<T>();
5159
_convLayers = new List<ConvLayer<T>>();
5260
_anomalyThreshold = _numOps.FromDouble(3.0); // 3 sigma by default
5361

54-
InitializeModel();
62+
if (initializeModel)
63+
InitializeModel();
5564
}
5665

5766
private void InitializeModel()
5867
{
5968
var random = new Random(42);
6069

61-
// Initialize convolutional layers
70+
// Initialize convolutional layers with different seeds
6271
_convLayers.Clear();
63-
_convLayers.Add(new ConvLayer<T>(_options.WindowSize, 32, 3));
64-
_convLayers.Add(new ConvLayer<T>(32, 32, 3));
72+
_convLayers.Add(new ConvLayer<T>(_options.WindowSize, 32, 3, seed: 42));
73+
_convLayers.Add(new ConvLayer<T>(32, 32, 3, seed: 1042));
6574

6675
// Initialize fully connected output layer
6776
double stddev = Math.Sqrt(2.0 / 32);
@@ -322,10 +331,23 @@ public DeepANTOptions() { }
322331
public DeepANTOptions(DeepANTOptions<T> other)
323332
{
324333
if (other == null) throw new ArgumentNullException(nameof(other));
334+
// Copy DeepANT-specific properties
325335
WindowSize = other.WindowSize;
326336
LearningRate = other.LearningRate;
327337
Epochs = other.Epochs;
328338
BatchSize = other.BatchSize;
339+
340+
// Copy TimeSeriesRegressionOptions properties
341+
LagOrder = other.LagOrder;
342+
IncludeTrend = other.IncludeTrend;
343+
SeasonalPeriod = other.SeasonalPeriod;
344+
AutocorrelationCorrection = other.AutocorrelationCorrection;
345+
ModelType = other.ModelType;
346+
LossFunction = other.LossFunction;
347+
348+
// Copy RegressionOptions properties
349+
DecompositionMethod = other.DecompositionMethod;
350+
UseIntercept = other.UseIntercept;
329351
}
330352
}
331353

@@ -335,25 +357,24 @@ public DeepANTOptions(DeepANTOptions<T> other)
335357
internal class ConvLayer<T>
336358
{
337359
private readonly INumericOperations<T> _numOps;
338-
private readonly int _inputChannels;
339360
private readonly int _outputChannels;
340361
private readonly int _kernelSize;
341362
private readonly Matrix<T> _kernels;
342363
private readonly Vector<T> _biases;
343364

344365
public int ParameterCount => _kernels.Rows * _kernels.Columns + _biases.Length;
345366

346-
public ConvLayer(int inputChannels, int outputChannels, int kernelSize)
367+
public ConvLayer(int inputChannels, int outputChannels, int kernelSize, int seed = 42)
347368
{
348369
_numOps = MathHelper.GetNumericOperations<T>();
349-
_inputChannels = inputChannels;
350370
_outputChannels = outputChannels;
351371
_kernelSize = kernelSize;
352372

353-
var random = new Random(42);
354-
double stddev = Math.Sqrt(2.0 / ((double)inputChannels * kernelSize));
373+
var random = new Random(seed);
374+
// Use kernelSize weights per output channel for 1D convolution
375+
double stddev = Math.Sqrt(2.0 / kernelSize);
355376

356-
_kernels = new Matrix<T>(outputChannels, inputChannels * kernelSize);
377+
_kernels = new Matrix<T>(outputChannels, kernelSize);
357378
for (int i = 0; i < _kernels.Rows; i++)
358379
for (int j = 0; j < _kernels.Columns; j++)
359380
_kernels[i, j] = _numOps.FromDouble((random.NextDouble() * 2 - 1) * stddev);
@@ -379,11 +400,10 @@ public Vector<T> Forward(Vector<T> input)
379400
{
380401
T positionSum = _biases[outChannel];
381402

382-
// Apply kernel at this position
403+
// Apply kernel at this position - use all kernelSize weights
383404
for (int k = 0; k < _kernelSize && (pos + k) < input.Length; k++)
384405
{
385-
int kernelIdx = k % _kernels.Columns;
386-
T weight = _kernels[outChannel, kernelIdx];
406+
T weight = _kernels[outChannel, k];
387407
T inputVal = input[pos + k];
388408
positionSum = _numOps.Add(positionSum, _numOps.Multiply(weight, inputVal));
389409
}

src/TimeSeries/ChronosFoundationModel.cs

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,23 @@ public class ChronosFoundationModel<T> : TimeSeriesModelBase<T>
4343
private Vector<T> _outputBias = new Vector<T>(0);
4444

4545
public ChronosFoundationModel(ChronosOptions<T>? options = null)
46-
: base(options ?? new ChronosOptions<T>())
46+
: this(options ?? new ChronosOptions<T>(), initializeModel: true)
4747
{
48-
_options = options ?? new ChronosOptions<T>();
48+
}
49+
50+
/// <summary>
51+
/// Private constructor for proper options instance management.
52+
/// </summary>
53+
private ChronosFoundationModel(ChronosOptions<T> options, bool initializeModel)
54+
: base(options)
55+
{
56+
_options = options;
4957
_numOps = MathHelper.GetNumericOperations<T>();
5058
_vocabularySize = _options.VocabularySize;
5159
_transformerLayers = new List<TransformerBlock<T>>();
5260

53-
InitializeModel();
61+
if (initializeModel)
62+
InitializeModel();
5463
}
5564

5665
private void InitializeModel()
@@ -73,10 +82,10 @@ private void InitializeModel()
7382
for (int j = 0; j < _tokenEmbeddings.Columns; j++)
7483
_tokenEmbeddings[i, j] = _numOps.FromDouble((random.NextDouble() * 2 - 1) * stddev);
7584

76-
// Transformer layers
85+
// Transformer layers - use different seeds for each layer
7786
for (int i = 0; i < _options.NumLayers; i++)
7887
{
79-
_transformerLayers.Add(new TransformerBlock<T>(_options.EmbeddingDim, _options.NumHeads));
88+
_transformerLayers.Add(new TransformerBlock<T>(_options.EmbeddingDim, _options.NumHeads, seed: 42 + i * 1000));
8089
}
8190

8291
// Output projection (back to vocabulary)
@@ -422,6 +431,7 @@ public ChronosOptions() { }
422431
public ChronosOptions(ChronosOptions<T> other)
423432
{
424433
if (other == null) throw new ArgumentNullException(nameof(other));
434+
// Copy Chronos-specific properties
425435
ContextLength = other.ContextLength;
426436
ForecastHorizon = other.ForecastHorizon;
427437
VocabularySize = other.VocabularySize;
@@ -430,6 +440,18 @@ public ChronosOptions(ChronosOptions<T> other)
430440
NumHeads = other.NumHeads;
431441
LearningRate = other.LearningRate;
432442
Epochs = other.Epochs;
443+
444+
// Copy TimeSeriesRegressionOptions properties
445+
LagOrder = other.LagOrder;
446+
IncludeTrend = other.IncludeTrend;
447+
SeasonalPeriod = other.SeasonalPeriod;
448+
AutocorrelationCorrection = other.AutocorrelationCorrection;
449+
ModelType = other.ModelType;
450+
LossFunction = other.LossFunction;
451+
452+
// Copy RegressionOptions properties
453+
DecompositionMethod = other.DecompositionMethod;
454+
UseIntercept = other.UseIntercept;
433455
}
434456
}
435457

@@ -443,10 +465,10 @@ internal class TransformerBlock<T>
443465

444466
public int ParameterCount => _weights.Rows * _weights.Columns;
445467

446-
public TransformerBlock(int embeddingDim, int numHeads)
468+
public TransformerBlock(int embeddingDim, int numHeads, int seed = 42)
447469
{
448470
_numOps = MathHelper.GetNumericOperations<T>();
449-
var random = new Random(42);
471+
var random = new Random(seed);
450472
double stddev = Math.Sqrt(2.0 / embeddingDim);
451473

452474
_weights = new Matrix<T>(embeddingDim, embeddingDim);

src/TimeSeries/DeepARModel.cs

Lines changed: 59 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -181,11 +181,12 @@ private T ComputeBatchLoss(Matrix<T> x, Vector<T> y, int batchStart, int batchEn
181181
private void UpdateWeights(Matrix<T> x, Vector<T> y, int batchStart, int batchEnd, T learningRate)
182182
{
183183
T epsilon = _numOps.FromDouble(1e-6);
184+
T twoEpsilon = _numOps.Multiply(_numOps.FromDouble(2.0), epsilon);
184185

185-
// Update mean weights (sample a few for efficiency)
186-
for (int i = 0; i < Math.Min(5, _meanWeights.Rows); i++)
186+
// Update all mean weights (shape is [1, HiddenSize])
187+
for (int i = 0; i < _meanWeights.Rows; i++)
187188
{
188-
for (int j = 0; j < Math.Min(5, _meanWeights.Columns); j++)
189+
for (int j = 0; j < _meanWeights.Columns; j++)
189190
{
190191
T original = _meanWeights[i, j];
191192

@@ -197,14 +198,64 @@ private void UpdateWeights(Matrix<T> x, Vector<T> y, int batchStart, int batchEn
197198

198199
_meanWeights[i, j] = original;
199200

200-
T gradient = _numOps.Divide(
201-
_numOps.Subtract(lossPlus, lossMinus),
202-
_numOps.Multiply(_numOps.FromDouble(2.0), epsilon)
203-
);
204-
201+
T gradient = _numOps.Divide(_numOps.Subtract(lossPlus, lossMinus), twoEpsilon);
205202
_meanWeights[i, j] = _numOps.Subtract(original, _numOps.Multiply(learningRate, gradient));
206203
}
207204
}
205+
206+
// Update mean bias
207+
for (int i = 0; i < _meanBias.Length; i++)
208+
{
209+
T original = _meanBias[i];
210+
211+
_meanBias[i] = _numOps.Add(original, epsilon);
212+
T lossPlus = ComputeBatchLoss(x, y, batchStart, batchEnd);
213+
214+
_meanBias[i] = _numOps.Subtract(original, epsilon);
215+
T lossMinus = ComputeBatchLoss(x, y, batchStart, batchEnd);
216+
217+
_meanBias[i] = original;
218+
219+
T gradient = _numOps.Divide(_numOps.Subtract(lossPlus, lossMinus), twoEpsilon);
220+
_meanBias[i] = _numOps.Subtract(original, _numOps.Multiply(learningRate, gradient));
221+
}
222+
223+
// Update all scale weights (shape is [1, HiddenSize])
224+
for (int i = 0; i < _scaleWeights.Rows; i++)
225+
{
226+
for (int j = 0; j < _scaleWeights.Columns; j++)
227+
{
228+
T original = _scaleWeights[i, j];
229+
230+
_scaleWeights[i, j] = _numOps.Add(original, epsilon);
231+
T lossPlus = ComputeBatchLoss(x, y, batchStart, batchEnd);
232+
233+
_scaleWeights[i, j] = _numOps.Subtract(original, epsilon);
234+
T lossMinus = ComputeBatchLoss(x, y, batchStart, batchEnd);
235+
236+
_scaleWeights[i, j] = original;
237+
238+
T gradient = _numOps.Divide(_numOps.Subtract(lossPlus, lossMinus), twoEpsilon);
239+
_scaleWeights[i, j] = _numOps.Subtract(original, _numOps.Multiply(learningRate, gradient));
240+
}
241+
}
242+
243+
// Update scale bias
244+
for (int i = 0; i < _scaleBias.Length; i++)
245+
{
246+
T original = _scaleBias[i];
247+
248+
_scaleBias[i] = _numOps.Add(original, epsilon);
249+
T lossPlus = ComputeBatchLoss(x, y, batchStart, batchEnd);
250+
251+
_scaleBias[i] = _numOps.Subtract(original, epsilon);
252+
T lossMinus = ComputeBatchLoss(x, y, batchStart, batchEnd);
253+
254+
_scaleBias[i] = original;
255+
256+
T gradient = _numOps.Divide(_numOps.Subtract(lossPlus, lossMinus), twoEpsilon);
257+
_scaleBias[i] = _numOps.Subtract(original, _numOps.Multiply(learningRate, gradient));
258+
}
208259
}
209260

210261
/// <summary>

0 commit comments

Comments
 (0)