77 "slices"
88 "time"
99
10+ "go.opentelemetry.io/otel/metric"
1011 "google.golang.org/protobuf/proto"
1112 "google.golang.org/protobuf/types/known/structpb"
1213
@@ -15,10 +16,51 @@ import (
1516 "github.com/smartcontractkit/libocr/offchainreporting2plus/types"
1617 "github.com/smartcontractkit/libocr/quorumhelper"
1718
19+ "github.com/smartcontractkit/chainlink-common/pkg/beholder"
1820 "github.com/smartcontractkit/chainlink-common/pkg/logger"
1921 "github.com/smartcontractkit/chainlink-common/pkg/workflows/dontime/pb"
2022)
2123
24+ type pluginMetrics struct {
25+ donTime metric.Int64Gauge
26+ donTimeEntries metric.Int64Gauge
27+ outcomeSize metric.Int64Gauge
28+ }
29+
30+ func newPluginMetrics () (pluginMetrics , error ) {
31+ meter := beholder .GetMeter ()
32+
33+ donTime , err := meter .Int64Gauge ("dontime_outcome_don_time_ms" ,
34+ metric .WithDescription ("DON consensus timestamp included in the latest outcome, in milliseconds" ),
35+ metric .WithUnit ("ms" ),
36+ )
37+ if err != nil {
38+ return pluginMetrics {}, fmt .Errorf ("failed to create don_time gauge: %w" , err )
39+ }
40+
41+ donTimeEntries , err := meter .Int64Gauge ("dontime_outcome_entries" ,
42+ metric .WithDescription ("Number of workflow execution entries tracked in the latest outcome" ),
43+ metric .WithUnit ("{entry}" ),
44+ )
45+ if err != nil {
46+ return pluginMetrics {}, fmt .Errorf ("failed to create don_time_entries gauge: %w" , err )
47+ }
48+
49+ outcomeSize , err := meter .Int64Gauge ("dontime_outcome_size_bytes" ,
50+ metric .WithDescription ("Serialised size of the latest outcome in bytes" ),
51+ metric .WithUnit ("By" ),
52+ )
53+ if err != nil {
54+ return pluginMetrics {}, fmt .Errorf ("failed to create outcome_size gauge: %w" , err )
55+ }
56+
57+ return pluginMetrics {
58+ donTime : donTime ,
59+ donTimeEntries : donTimeEntries ,
60+ outcomeSize : outcomeSize ,
61+ }, nil
62+ }
63+
2264type Plugin struct {
2365 store * Store
2466 config ocr3types.ReportingPluginConfig
@@ -27,6 +69,8 @@ type Plugin struct {
2769
2870 batchSize int
2971 minTimeIncrease int64
72+
73+ metrics pluginMetrics
3074}
3175
// Compile-time assertion that *Plugin satisfies ocr3types.ReportingPlugin[[]byte];
// the blank identifier means no value is kept.
3276var _ ocr3types.ReportingPlugin [[]byte ] = (* Plugin )(nil )
@@ -42,13 +86,19 @@ func NewPlugin(store *Store, config ocr3types.ReportingPluginConfig, offchainCfg
4286 return nil , errors .New ("execution removal time must be positive" )
4387 }
4488
89+ metrics , err := newPluginMetrics ()
90+ if err != nil {
91+ return nil , err
92+ }
93+
4594 return & Plugin {
4695 store : store ,
4796 config : config ,
4897 offChainConfig : offchainCfg ,
4998 lggr : logger .Named (lggr , "DONTimePlugin" ),
5099 batchSize : int (offchainCfg .MaxBatchSize ),
51100 minTimeIncrease : offchainCfg .MinTimeIncrease / int64 (time .Millisecond ),
101+ metrics : metrics ,
52102 }, nil
53103}
54104
@@ -97,8 +147,9 @@ func (p *Plugin) Observation(_ context.Context, outctx ocr3types.OutcomeContext,
97147 }
98148
99149 observation := & pb.Observation {
100- Timestamp : time .Now ().UTC ().UnixMilli (),
101- Requests : requests ,
150+ Timestamp : time .Now ().UTC ().UnixMilli (),
151+ Requests : requests ,
152+ PruneExecutions : true ,
102153 }
103154
104155 return proto.MarshalOptions {Deterministic : true }.Marshal (observation )
@@ -112,7 +163,7 @@ func (p *Plugin) ObservationQuorum(_ context.Context, _ ocr3types.OutcomeContext
112163 return quorumhelper .ObservationCountReachesObservationQuorum (quorumhelper .QuorumTwoFPlusOne , p .config .N , p .config .F , aos ), nil
113164}
114165
115- func (p * Plugin ) Outcome (_ context.Context , outctx ocr3types.OutcomeContext , _ types.Query , aos []types.AttributedObservation ) (ocr3types.Outcome , error ) {
166+ func (p * Plugin ) Outcome (ctx context.Context , outctx ocr3types.OutcomeContext , _ types.Query , aos []types.AttributedObservation ) (ocr3types.Outcome , error ) {
116167 observationCounts := map [string ]int64 {} // counts how many nodes reported where a new DON timestamp might be needed
117168 type timestampNodePair struct {
118169 Timestamp int64
@@ -129,14 +180,33 @@ func (p *Plugin) Outcome(_ context.Context, outctx ocr3types.OutcomeContext, _ t
129180 prevOutcome .ObservedDonTimes = make (map [string ]* pb.ObservedDonTimes )
130181 }
131182
183+ // Unmarshal all observations once and compute pruneExecutions.
184+ // Only prune when all nodes are updated. Even if this rolls back, the logic is still correct.
185+ parsedAOs := make ([]* pb.Observation , len (aos ))
186+ pruneExecutions := true
132187 for idx , ao := range aos {
133188 observation := & pb.Observation {}
134189 if err := proto .Unmarshal (ao .Observation , observation ); err != nil {
135190 p .lggr .Errorf ("failed to unmarshal observation in Outcome phase" )
136191 continue
137192 }
193+ parsedAOs [idx ] = observation
194+ if ! observation .PruneExecutions {
195+ pruneExecutions = false // need all nodes to agree
196+ }
197+ }
198+
199+ for idx , observation := range parsedAOs {
200+ if observation == nil {
201+ continue
202+ }
138203
139204 for id , requestSeqNum := range observation .Requests {
205+ if ! pruneExecutions { // TODO(CRE-2497): legacy behavior, remove after rollout
206+ if _ , ok := prevOutcome .ObservedDonTimes [id ]; ! ok {
207+ prevOutcome .ObservedDonTimes [id ] = & pb.ObservedDonTimes {}
208+ }
209+ }
140210 var currSeqNum int64
141211 if times , ok := prevOutcome .ObservedDonTimes [id ]; ok {
142212 currSeqNum = int64 (len (times .Timestamps ))
@@ -196,14 +266,23 @@ func (p *Plugin) Outcome(_ context.Context, outctx ocr3types.OutcomeContext, _ t
196266
197267 // Remove expired and empty workflow executions
198268 for id , observedTimes := range outcome .ObservedDonTimes {
199- if observedTimes == nil || len (observedTimes .Timestamps ) == 0 {
200- delete (outcome .ObservedDonTimes , id )
201- p .store .deleteExecutionID (id )
202- continue
203- }
204- if donTime >= observedTimes .Timestamps [0 ]+ p .offChainConfig .ExecutionRemovalTime .AsDuration ().Milliseconds () {
205- delete (outcome .ObservedDonTimes , id )
206- p .store .deleteExecutionID (id )
269+ if ! pruneExecutions { // TODO(CRE-2497): legacy behavior, remove after rollout
270+ if observedTimes != nil && len (observedTimes .Timestamps ) > 0 {
271+ if donTime >= observedTimes .Timestamps [0 ]+ p .offChainConfig .ExecutionRemovalTime .AsDuration ().Milliseconds () {
272+ delete (outcome .ObservedDonTimes , id )
273+ p .store .deleteExecutionID (id )
274+ }
275+ }
276+ } else {
277+ if observedTimes == nil || len (observedTimes .Timestamps ) == 0 {
278+ delete (outcome .ObservedDonTimes , id )
279+ p .store .deleteExecutionID (id )
280+ continue
281+ }
282+ if donTime >= observedTimes .Timestamps [0 ]+ p .offChainConfig .ExecutionRemovalTime .AsDuration ().Milliseconds () {
283+ delete (outcome .ObservedDonTimes , id )
284+ p .store .deleteExecutionID (id )
285+ }
207286 }
208287 }
209288
@@ -212,6 +291,9 @@ func (p *Plugin) Outcome(_ context.Context, outctx ocr3types.OutcomeContext, _ t
212291 "observedDonTimesEntries" , len (outcome .ObservedDonTimes ),
213292 "outcomeSizeBytes" , len (outcomeBytes ),
214293 )
294+ p .metrics .donTime .Record (ctx , outcome .Timestamp )
295+ p .metrics .donTimeEntries .Record (ctx , int64 (len (outcome .ObservedDonTimes )))
296+ p .metrics .outcomeSize .Record (ctx , int64 (len (outcomeBytes )))
215297 return outcomeBytes , err
216298}
217299
0 commit comments