Skip to content

Commit b6e18bc

Browse files
authored
[CRE][DONTime] Respect batchSize config (#2184)
1 parent f39eba3 commit b6e18bc

4 files changed

Lines changed: 198 additions & 26 deletions

File tree

pkg/workflows/dontime/pb/dontime.pb.go

Lines changed: 12 additions & 13 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/workflows/dontime/pb/dontime.proto

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ option go_package = "github.com/smartcontractkit/chainlink-common/pkg/workflows/
55
// Observation is what a single node submits for a round: its local timestamp and its pending requests.
message Observation {
66
// Node-local wall-clock time, populated from UTC Unix milliseconds by the plugin's Observation phase.
int64 timestamp = 1;
77
// Maps workflow execution ID to the requested sequence number.
map<string, int64> requests = 2;
8-
// Flag to roll out execution pruning fix. Can be removed after rollout (once unused in the outcome phase).
9-
bool prune_executions = 3;
8+
// Field number 3 previously carried prune_executions; it is reserved so the number cannot be reused.
reserved 3;
9+
// Rollout flag for batch-size limiting. The outcome phase trims to the batch size only
// when no processed observation has this flag unset.
bool limit_by_batch_size_flag = 4;
1010
}
1111

1212
message ObservedDonTimes {

pkg/workflows/dontime/plugin.go

Lines changed: 88 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@ import (
2222
)
2323

2424
// pluginMetrics groups the gauges recorded by the plugin's Observation and Outcome phases.
type pluginMetrics struct {
25-
donTime metric.Int64Gauge
26-
donTimeEntries metric.Int64Gauge
27-
outcomeSize metric.Int64Gauge
25+
// donTime is recorded with the timestamp of the computed outcome.
donTime metric.Int64Gauge
26+
// donTimeEntries is recorded with the number of ObservedDonTimes entries in the outcome.
donTimeEntries metric.Int64Gauge
27+
// outcomeSize is recorded with the byte length of the marshalled outcome.
outcomeSize metric.Int64Gauge
28+
// observationBatchOverflow is the number of pending requests excluded from the
// observation due to the batch size limit.
observationBatchOverflow metric.Int64Gauge
29+
// outcomeBatchOverflow is the number of workflow execution entries removed from the
// outcome due to the batch size limit.
outcomeBatchOverflow metric.Int64Gauge
2830
}
2931

3032
func newPluginMetrics() (pluginMetrics, error) {
@@ -54,10 +56,28 @@ func newPluginMetrics() (pluginMetrics, error) {
5456
return pluginMetrics{}, fmt.Errorf("failed to create outcome_size gauge: %w", err)
5557
}
5658

59+
observationBatchOverflow, err := meter.Int64Gauge("platform_dontime_observation_batch_overflow",
60+
metric.WithDescription("Number of pending requests excluded from the observation due to batch size limit"),
61+
metric.WithUnit("{request}"),
62+
)
63+
if err != nil {
64+
return pluginMetrics{}, fmt.Errorf("failed to create observation_batch_overflow gauge: %w", err)
65+
}
66+
67+
outcomeBatchOverflow, err := meter.Int64Gauge("platform_dontime_outcome_batch_overflow",
68+
metric.WithDescription("Number of workflow execution entries removed from the outcome due to batch size limit"),
69+
metric.WithUnit("{entry}"),
70+
)
71+
if err != nil {
72+
return pluginMetrics{}, fmt.Errorf("failed to create outcome_batch_overflow gauge: %w", err)
73+
}
74+
5775
return pluginMetrics{
58-
donTime: donTime,
59-
donTimeEntries: donTimeEntries,
60-
outcomeSize: outcomeSize,
76+
donTime: donTime,
77+
donTimeEntries: donTimeEntries,
78+
outcomeSize: outcomeSize,
79+
observationBatchOverflow: observationBatchOverflow,
80+
outcomeBatchOverflow: outcomeBatchOverflow,
6181
}, nil
6282
}
6383

@@ -106,14 +126,33 @@ func (p *Plugin) Query(_ context.Context, _ ocr3types.OutcomeContext) (types.Que
106126
return nil, nil
107127
}
108128

109-
func (p *Plugin) Observation(_ context.Context, outctx ocr3types.OutcomeContext, query types.Query) (types.Observation, error) {
129+
func sortedRequests(requests map[string]*Request) []*Request {
130+
if len(requests) == 0 {
131+
return nil
132+
}
133+
134+
ids := make([]string, 0, len(requests))
135+
for id := range requests {
136+
ids = append(ids, id)
137+
}
138+
slices.Sort(ids)
139+
140+
sorted := make([]*Request, 0, len(ids))
141+
for _, id := range ids {
142+
sorted = append(sorted, requests[id])
143+
}
144+
return sorted
145+
}
146+
147+
func (p *Plugin) Observation(ctx context.Context, outctx ocr3types.OutcomeContext, query types.Query) (types.Observation, error) {
110148
previousOutcome := &pb.Outcome{}
111149
if err := proto.Unmarshal(outctx.PreviousOutcome, previousOutcome); err != nil {
112150
p.lggr.Errorf("failed to unmarshal previous outcome in Observation phase")
113151
}
114152

153+
sortedRequests := sortedRequests(p.store.GetRequests())
115154
requests := map[string]int64{} // Maps executionID --> seqNum
116-
for _, req := range p.store.GetRequests() {
155+
for _, req := range sortedRequests {
117156
// Validate request sequence number
118157
numObservedDonTimes := 0
119158
times, ok := previousOutcome.ObservedDonTimes[req.WorkflowExecutionID]
@@ -135,12 +174,27 @@ func (p *Plugin) Observation(_ context.Context, outctx ocr3types.OutcomeContext,
135174
}
136175

137176
requests[req.WorkflowExecutionID] = int64(req.SeqNum)
177+
if len(requests) >= p.batchSize {
178+
break
179+
}
138180
}
139181

182+
overflowCount := len(sortedRequests) - len(requests)
183+
p.lggr.Debugw("Observation batch processed",
184+
"inputRequests", len(sortedRequests),
185+
"batchSize", p.batchSize,
186+
"includedRequests", len(requests),
187+
"overflowRequests", overflowCount,
188+
)
189+
if overflowCount > 0 {
190+
p.lggr.Warnw("Observation batch overflow", "overflowRequests", overflowCount)
191+
}
192+
p.metrics.observationBatchOverflow.Record(ctx, int64(overflowCount))
193+
140194
observation := &pb.Observation{
141-
Timestamp: time.Now().UTC().UnixMilli(),
142-
Requests: requests,
143-
PruneExecutions: true,
195+
Timestamp: time.Now().UTC().UnixMilli(),
196+
Requests: requests,
197+
LimitByBatchSizeFlag: true,
144198
}
145199

146200
return proto.MarshalOptions{Deterministic: true}.Marshal(observation)
@@ -162,6 +216,7 @@ func (p *Plugin) Outcome(ctx context.Context, outctx ocr3types.OutcomeContext, _
162216
OffsetFromMedian int64
163217
}
164218
var timestampNodePairs []timestampNodePair
219+
limitByBatchSizeFlagEnabled := true
165220

166221
prevOutcome := &pb.Outcome{}
167222
if err := proto.Unmarshal(outctx.PreviousOutcome, prevOutcome); err != nil {
@@ -178,6 +233,10 @@ func (p *Plugin) Outcome(ctx context.Context, outctx ocr3types.OutcomeContext, _
178233
continue
179234
}
180235

236+
if !observation.GetLimitByBatchSizeFlag() {
237+
limitByBatchSizeFlagEnabled = false
238+
}
239+
181240
for id, requestSeqNum := range observation.Requests {
182241
var currSeqNum int64
183242
if times, ok := prevOutcome.ObservedDonTimes[id]; ok {
@@ -249,13 +308,31 @@ func (p *Plugin) Outcome(ctx context.Context, outctx ocr3types.OutcomeContext, _
249308
}
250309
}
251310

311+
var outcomeBatchOverflowCount int64
312+
if len(outcome.ObservedDonTimes) > p.batchSize && limitByBatchSizeFlagEnabled {
313+
ids := make([]string, 0, len(outcome.ObservedDonTimes))
314+
for id := range outcome.ObservedDonTimes {
315+
ids = append(ids, id)
316+
}
317+
slices.Sort(ids)
318+
outcomeBatchOverflowCount = int64(len(ids) - p.batchSize)
319+
for _, id := range ids[p.batchSize:] {
320+
delete(outcome.ObservedDonTimes, id)
321+
}
322+
p.lggr.Warnw("Trimmed outcome observed don times to batch size",
323+
"batchSize", p.batchSize,
324+
"removedEntries", outcomeBatchOverflowCount,
325+
)
326+
}
327+
252328
outcomeBytes, err := proto.MarshalOptions{Deterministic: true}.Marshal(outcome)
253329
p.lggr.Infow("Outcome computed",
254330
"observedDonTimesEntries", len(outcome.ObservedDonTimes),
255331
"outcomeSizeBytes", len(outcomeBytes),
256332
)
257333
p.metrics.donTime.Record(ctx, outcome.Timestamp)
258334
p.metrics.donTimeEntries.Record(ctx, int64(len(outcome.ObservedDonTimes)))
335+
p.metrics.outcomeBatchOverflow.Record(ctx, outcomeBatchOverflowCount)
259336
p.metrics.outcomeSize.Record(ctx, int64(len(outcomeBytes)))
260337
return outcomeBytes, err
261338
}

pkg/workflows/dontime/plugin_test.go

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,35 @@ func TestPlugin_Observation(t *testing.T) {
8282
require.Equal(t, expectedRequests, obsProto.Requests)
8383
store.deleteExecutionID("workflow-123")
8484
})
85+
86+
t.Run("batch limit excludes overflow requests", func(t *testing.T) {
87+
offchainCfg := newTestPluginOffchainConfig(t)
88+
offchainCfg.MaxBatchSize = 2
89+
plugin, err := NewPlugin(store, config, offchainCfg, lggr)
90+
require.NoError(t, err)
91+
92+
for _, id := range []string{"workflow-c", "workflow-a", "workflow-b"} {
93+
_ = store.RequestDonTime(id, 0)
94+
}
95+
96+
observation, err := plugin.Observation(ctx, outcomeCtx, query)
97+
require.NoError(t, err)
98+
99+
obsProto := &pb.Observation{}
100+
err = proto.Unmarshal(observation, obsProto)
101+
require.NoError(t, err)
102+
103+
expectedRequests := map[string]int64{
104+
"workflow-a": 0,
105+
"workflow-b": 0,
106+
}
107+
require.Equal(t, expectedRequests, obsProto.Requests)
108+
require.NotNil(t, store.GetRequest("workflow-c"))
109+
110+
for _, id := range []string{"workflow-a", "workflow-b", "workflow-c"} {
111+
store.deleteExecutionID(id)
112+
}
113+
})
85114
}
86115

87116
func TestPlugin_ValidateObservation(t *testing.T) {
@@ -518,6 +547,73 @@ func TestPlugin_FinishedExecutions(t *testing.T) {
518547
})
519548
}
520549

550+
func TestPlugin_Outcome_TrimByBatchSize(t *testing.T) {
551+
lggr := logger.Test(t)
552+
store := NewStore(DefaultRequestTimeout)
553+
config, offchainCfg := newTestPluginConfig(t), newTestPluginOffchainConfig(t)
554+
offchainCfg.MaxBatchSize = 2
555+
ctx := t.Context()
556+
557+
plugin, err := NewPlugin(store, config, offchainCfg, lggr)
558+
require.NoError(t, err)
559+
560+
query, err := plugin.Query(ctx, ocr3types.OutcomeContext{PreviousOutcome: []byte("")})
561+
require.NoError(t, err)
562+
563+
timestamp := time.Now().UnixMilli()
564+
makeObservations := func(limitByBatchSize bool) []types.AttributedObservation {
565+
aos := make([]types.AttributedObservation, 4)
566+
for i := 0; i < 4; i++ {
567+
obs := &pb.Observation{
568+
Timestamp: timestamp + int64(i),
569+
Requests: map[string]int64{},
570+
LimitByBatchSizeFlag: limitByBatchSize,
571+
}
572+
rawObs, err := proto.Marshal(obs)
573+
require.NoError(t, err)
574+
aos[i] = types.AttributedObservation{
575+
Observation: rawObs,
576+
Observer: commontypes.OracleID(i),
577+
}
578+
}
579+
return aos
580+
}
581+
582+
prevOutcome := &pb.Outcome{
583+
Timestamp: timestamp - 1000,
584+
ObservedDonTimes: map[string]*pb.ObservedDonTimes{
585+
"workflow-a": {Timestamps: []int64{timestamp - 1000}},
586+
"workflow-b": {Timestamps: []int64{timestamp - 1000}},
587+
"workflow-c": {Timestamps: []int64{timestamp - 1000}},
588+
},
589+
}
590+
prevOutcomeBytes, err := proto.Marshal(prevOutcome)
591+
require.NoError(t, err)
592+
593+
t.Run("trims when all observations set batch size flag", func(t *testing.T) {
594+
outcome, err := plugin.Outcome(ctx, ocr3types.OutcomeContext{PreviousOutcome: prevOutcomeBytes}, query, makeObservations(true))
595+
require.NoError(t, err)
596+
597+
outcomeProto := &pb.Outcome{}
598+
err = proto.Unmarshal(outcome, outcomeProto)
599+
require.NoError(t, err)
600+
require.Len(t, outcomeProto.ObservedDonTimes, 2)
601+
require.Contains(t, outcomeProto.ObservedDonTimes, "workflow-a")
602+
require.Contains(t, outcomeProto.ObservedDonTimes, "workflow-b")
603+
require.NotContains(t, outcomeProto.ObservedDonTimes, "workflow-c")
604+
})
605+
606+
t.Run("does not trim when batch size flag is missing", func(t *testing.T) {
607+
outcome, err := plugin.Outcome(ctx, ocr3types.OutcomeContext{PreviousOutcome: prevOutcomeBytes}, query, makeObservations(false))
608+
require.NoError(t, err)
609+
610+
outcomeProto := &pb.Outcome{}
611+
err = proto.Unmarshal(outcome, outcomeProto)
612+
require.NoError(t, err)
613+
require.Len(t, outcomeProto.ObservedDonTimes, 3)
614+
})
615+
}
616+
521617
func TestPlugin_ExpiredRequest(t *testing.T) {
522618
lggr := logger.Test(t)
523619
store := NewStore(0)

0 commit comments

Comments
 (0)