Skip to content

Commit 0b92e83

Browse files
authored
Flaky oplog_processor_locality_recovery fix (#3185)
* increased startup timeout for oplog_processor_locality_recovery * Fix oplog processor delivery during locality recovery * Format and remove temporary non_flaky
1 parent 83e0371 commit 0b92e83

42 files changed

Lines changed: 2772 additions & 3853 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 307 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,307 @@
1+
syntax = "proto3";
2+
3+
package golem.worker;
4+
5+
import "golem/common/account_id.proto";
6+
import "golem/common/environment.proto";
7+
import "golem/common/uuid.proto";
8+
import "golem/component/plugin.proto";
9+
import "golem/worker/idempotency_key.proto";
10+
import "golem/worker/public_oplog.proto";
11+
import "golem/worker/retry_policy.proto";
12+
import "golem/worker/worker_error.proto";
13+
import "golem/worker/worker_id.proto";
14+
import "google/protobuf/timestamp.proto";
15+
16+
// An oplog entry in its raw/internal representation.
17+
message RawOplogEntry {
18+
google.protobuf.Timestamp timestamp = 1;
19+
oneof entry {
20+
RawCreateParameters create = 2;
21+
RawHostCallParameters host_call = 3;
22+
RawAgentInvocationStartedParameters agent_invocation_started = 4;
23+
RawAgentInvocationFinishedParameters agent_invocation_finished = 5;
24+
RawTimestampOnly suspend = 6;
25+
RawErrorParameters error = 7;
26+
RawTimestampOnly no_op = 8;
27+
RawJumpParameters jump = 9;
28+
RawTimestampOnly interrupted = 10;
29+
RawTimestampOnly exited = 11;
30+
RawTimestampOnly begin_atomic_region = 12;
31+
RawEndAtomicRegionParameters end_atomic_region = 13;
32+
RawTimestampOnly begin_remote_write = 14;
33+
RawEndRemoteWriteParameters end_remote_write = 15;
34+
RawPendingAgentInvocationParameters pending_agent_invocation = 16;
35+
RawPendingUpdateParameters pending_update = 17;
36+
RawSuccessfulUpdateParameters successful_update = 18;
37+
RawFailedUpdateParameters failed_update = 19;
38+
RawGrowMemoryParameters grow_memory = 20;
39+
RawFilesystemStorageUsageUpdateParameters filesystem_storage_usage_update = 21;
40+
RawCreateResourceParameters create_resource = 22;
41+
RawDropResourceParameters drop_resource = 23;
42+
RawLogParameters log = 24;
43+
RawTimestampOnly restart = 25;
44+
RawActivatePluginParameters activate_plugin = 26;
45+
RawDeactivatePluginParameters deactivate_plugin = 27;
46+
RawRevertParameters revert = 28;
47+
RawCancelPendingInvocationParameters cancel_pending_invocation = 29;
48+
RawStartSpanParameters start_span = 30;
49+
RawFinishSpanParameters finish_span = 31;
50+
RawSetSpanAttributeParameters set_span_attribute = 32;
51+
RawChangePersistenceLevelParameters change_persistence_level = 33;
52+
RawBeginRemoteTransactionParameters begin_remote_transaction = 34;
53+
RawRemoteTransactionParameters pre_commit_remote_transaction = 35;
54+
RawRemoteTransactionParameters pre_rollback_remote_transaction = 36;
55+
RawRemoteTransactionParameters committed_remote_transaction = 37;
56+
RawRemoteTransactionParameters rolled_back_remote_transaction = 38;
57+
RawSnapshotParameters snapshot = 39;
58+
RawOplogProcessorCheckpointParameters oplog_processor_checkpoint = 40;
59+
RawSetRetryPolicyParameters set_retry_policy = 41;
60+
RawRemoveRetryPolicyParameters remove_retry_policy = 42;
61+
}
62+
}
63+
64+
// Empty message used for timestamp-only oplog variants.
65+
message RawTimestampOnly {}
66+
67+
// Payload that can be stored inline or externally.
68+
message RawOplogPayload {
69+
oneof payload {
70+
bytes inline_data = 1;
71+
RawExternalPayload external = 2;
72+
}
73+
}
74+
75+
// Reference to an externally stored payload.
76+
message RawExternalPayload {
77+
golem.common.UUID payload_id = 1;
78+
bytes md5_hash = 2;
79+
}
80+
81+
// Span data in raw form.
82+
message RawSpanData {
83+
oneof span {
84+
RawLocalSpan local_span = 1;
85+
RawExternalSpan external_span = 2;
86+
}
87+
}
88+
89+
message RawLocalSpan {
90+
string span_id = 1;
91+
google.protobuf.Timestamp start = 2;
92+
optional string parent_id = 3;
93+
repeated RawSpanData linked_context = 4;
94+
map<string, string> attributes = 5;
95+
bool inherited = 6;
96+
}
97+
98+
message RawExternalSpan {
99+
string span_id = 1;
100+
}
101+
102+
// Update description in raw form.
103+
message RawUpdateDescription {
104+
oneof description {
105+
uint64 automatic_target_revision = 1;
106+
RawSnapshotBasedUpdate snapshot_based = 2;
107+
}
108+
}
109+
110+
message RawSnapshotBasedUpdate {
111+
uint64 target_revision = 1;
112+
RawOplogPayload payload = 2;
113+
string mime_type = 3;
114+
}
115+
116+
// Oplog region (start..end index range).
117+
message RawOplogRegion {
118+
uint64 start = 1;
119+
uint64 end = 2;
120+
}
121+
122+
// Key-value pair for environment variables.
123+
message RawEnvVar {
124+
string key = 1;
125+
string value = 2;
126+
}
127+
128+
// --- Variant-specific parameter messages ---
129+
130+
message RawCreateParameters {
131+
AgentId agent_id = 1;
132+
uint64 component_revision = 2;
133+
repeated RawEnvVar env = 3;
134+
golem.common.EnvironmentId environment_id = 4;
135+
golem.common.AccountId created_by = 5;
136+
optional AgentId parent = 6;
137+
uint64 component_size = 7;
138+
uint64 initial_total_linear_memory_size = 8;
139+
repeated golem.component.EnvironmentPluginGrantId initial_active_plugins = 9;
140+
map<string, string> config_vars = 10;
141+
repeated bytes local_agent_config = 11;
142+
optional golem.common.UUID original_phantom_id = 12;
143+
}
144+
145+
message RawHostCallParameters {
146+
string function_name = 1;
147+
RawOplogPayload request = 2;
148+
RawOplogPayload response = 3;
149+
WrappedFunctionType durable_function_type = 4;
150+
}
151+
152+
message RawAgentInvocationStartedParameters {
153+
IdempotencyKey idempotency_key = 1;
154+
RawOplogPayload payload = 2;
155+
string trace_id = 3;
156+
repeated string trace_states = 4;
157+
repeated RawSpanData invocation_context = 5;
158+
}
159+
160+
message RawAgentInvocationFinishedParameters {
161+
RawOplogPayload result = 1;
162+
int64 consumed_fuel = 2;
163+
uint64 component_revision = 3;
164+
}
165+
166+
message RawErrorParameters {
167+
AgentError error = 1;
168+
uint64 retry_from = 2;
169+
bool inside_atomic_region = 3;
170+
optional bytes retry_policy_state = 4;
171+
}
172+
173+
message RawJumpParameters {
174+
RawOplogRegion jump = 1;
175+
}
176+
177+
message RawEndAtomicRegionParameters {
178+
uint64 begin_index = 1;
179+
}
180+
181+
message RawEndRemoteWriteParameters {
182+
uint64 begin_index = 1;
183+
}
184+
185+
message RawPendingAgentInvocationParameters {
186+
IdempotencyKey idempotency_key = 1;
187+
RawOplogPayload payload = 2;
188+
string trace_id = 3;
189+
repeated string trace_states = 4;
190+
repeated RawSpanData invocation_context = 5;
191+
}
192+
193+
message RawPendingUpdateParameters {
194+
RawUpdateDescription description = 1;
195+
}
196+
197+
message RawSuccessfulUpdateParameters {
198+
uint64 target_revision = 1;
199+
uint64 new_component_size = 2;
200+
repeated golem.component.EnvironmentPluginGrantId new_active_plugins = 3;
201+
}
202+
203+
message RawFailedUpdateParameters {
204+
uint64 target_revision = 1;
205+
optional string details = 2;
206+
}
207+
208+
message RawGrowMemoryParameters {
209+
uint64 delta = 1;
210+
}
211+
212+
message RawFilesystemStorageUsageUpdateParameters {
213+
int64 delta = 1;
214+
}
215+
216+
message RawResourceTypeId {
217+
string name = 1;
218+
string owner = 2;
219+
}
220+
221+
message RawCreateResourceParameters {
222+
uint64 id = 1;
223+
RawResourceTypeId resource_type_id = 2;
224+
}
225+
226+
message RawDropResourceParameters {
227+
uint64 id = 1;
228+
RawResourceTypeId resource_type_id = 2;
229+
}
230+
231+
message RawLogParameters {
232+
OplogLogLevel level = 1;
233+
string context = 2;
234+
string message = 3;
235+
}
236+
237+
message RawActivatePluginParameters {
238+
golem.component.EnvironmentPluginGrantId plugin_grant_id = 1;
239+
}
240+
241+
message RawDeactivatePluginParameters {
242+
golem.component.EnvironmentPluginGrantId plugin_grant_id = 1;
243+
}
244+
245+
message RawRevertParameters {
246+
RawOplogRegion dropped_region = 1;
247+
}
248+
249+
message RawCancelPendingInvocationParameters {
250+
IdempotencyKey idempotency_key = 1;
251+
}
252+
253+
message RawStartSpanParameters {
254+
string span_id = 1;
255+
optional string parent = 2;
256+
optional string linked_context_id = 3;
257+
map<string, string> attributes = 4;
258+
}
259+
260+
message RawFinishSpanParameters {
261+
string span_id = 1;
262+
}
263+
264+
message RawSetSpanAttributeParameters {
265+
string span_id = 1;
266+
string key = 2;
267+
string value = 3;
268+
}
269+
270+
message RawChangePersistenceLevelParameters {
271+
PersistenceLevel persistence_level = 1;
272+
}
273+
274+
message RawBeginRemoteTransactionParameters {
275+
string transaction_id = 1;
276+
optional uint64 original_begin_index = 2;
277+
}
278+
279+
message RawRemoteTransactionParameters {
280+
uint64 begin_index = 1;
281+
}
282+
283+
message RawSnapshotParameters {
284+
RawOplogPayload data = 1;
285+
string mime_type = 2;
286+
}
287+
288+
message RawOplogProcessorCheckpointParameters {
289+
golem.component.EnvironmentPluginGrantId plugin_grant_id = 1;
290+
AgentId target_agent_id = 2;
291+
uint64 confirmed_up_to = 3;
292+
uint64 sending_up_to = 4;
293+
uint64 last_batch_start = 5;
294+
}
295+
296+
message RawSetRetryPolicyParameters {
297+
golem.worker.retry.NamedRetryPolicy policy = 1;
298+
}
299+
300+
message RawRemoveRetryPolicyParameters {
301+
string name = 1;
302+
}
303+
304+
message RawOplogEntryWithIndex {
305+
uint64 oplog_index = 1;
306+
RawOplogEntry entry = 2;
307+
}

golem-api-grpc/proto/golem/worker/v1/worker_service.proto

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ syntax = "proto3";
33
package golem.worker.v1;
44

55
import "golem/auth/auth_ctx.proto";
6+
import "golem/common/account_id.proto";
67
import "golem/common/empty.proto";
78
import "golem/common/environment.proto";
89
import "golem/common/revert_worker_response.proto";
@@ -23,6 +24,7 @@ import "golem/worker/update_mode.proto";
2324
import "golem/worker/v1/worker_error.proto";
2425
import "golem/worker/worker_filter.proto";
2526
import "golem/worker/worker_id.proto";
27+
import "golem/worker/raw_oplog.proto";
2628
import "golem/worker/worker_metadata.proto";
2729
import "google/protobuf/timestamp.proto";
2830

@@ -35,6 +37,7 @@ service WorkerService {
3537
rpc CompletePromise (CompletePromiseRequest) returns (CompletePromiseResponse);
3638
rpc InvokeAgent (InvokeAgentRequest) returns (InvokeAgentResponse);
3739
rpc CancelInvocation (CancelInvocationRequest) returns (CancelInvocationResponse);
40+
rpc ProcessOplogEntries (ProcessOplogEntriesRequest) returns (ProcessOplogEntriesResponse);
3841
}
3942

4043
message LaunchNewWorkerRequest {
@@ -172,3 +175,23 @@ message CancelInvocationResponse {
172175
AgentError error = 2;
173176
}
174177
}
178+
179+
message ProcessOplogEntriesRequest {
180+
golem.worker.AgentId agent_id = 1;
181+
golem.common.EnvironmentId environment_id = 2;
182+
uint64 component_revision = 3;
183+
golem.worker.IdempotencyKey idempotency_key = 4;
184+
golem.common.AccountId account_id = 5;
185+
map<string, string> config = 6;
186+
golem.worker.AgentMetadata metadata = 7;
187+
uint64 first_entry_index = 8;
188+
repeated golem.worker.RawOplogEntry entries = 9;
189+
golem.auth.AuthCtx auth_ctx = 10;
190+
}
191+
192+
message ProcessOplogEntriesResponse {
193+
oneof result {
194+
golem.common.Empty success = 1;
195+
golem.worker.v1.AgentError error = 2;
196+
}
197+
}

golem-api-grpc/proto/golem/workerexecutor/v1/worker_executor.proto

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import "golem/worker/log_event.proto";
2121
import "golem/worker/oplog_cursor.proto";
2222
import "golem/worker/promise_id.proto";
2323
import "golem/worker/public_oplog.proto";
24+
import "golem/worker/raw_oplog.proto";
2425
import "golem/worker/update_mode.proto";
2526
import "golem/worker/v1/worker_execution_error.proto";
2627
import "golem/worker/worker_filter.proto";
@@ -56,6 +57,8 @@ service WorkerExecutor {
5657
rpc DeactivatePlugin(DeactivatePluginRequest) returns (DeactivatePluginResponse);
5758

5859
rpc InvokeAgent(InvokeAgentRequest) returns (InvokeAgentResponse);
60+
61+
rpc ProcessOplogEntries(ProcessOplogEntriesRequest) returns (ProcessOplogEntriesResponse);
5962
}
6063

6164
message ForkWorkerRequest {
@@ -450,3 +453,23 @@ message InvokeAgentSuccess {
450453
optional uint64 component_revision = 3;
451454
optional golem.worker.InvocationStatus status = 4;
452455
}
456+
457+
message ProcessOplogEntriesRequest {
458+
golem.worker.AgentId agent_id = 1;
459+
golem.common.EnvironmentId environment_id = 2;
460+
uint64 component_revision = 3;
461+
golem.worker.IdempotencyKey idempotency_key = 4;
462+
golem.common.AccountId account_id = 5;
463+
map<string, string> config = 6;
464+
golem.worker.AgentMetadata metadata = 7;
465+
uint64 first_entry_index = 8;
466+
repeated golem.worker.RawOplogEntry entries = 9;
467+
golem.auth.AuthCtx auth_ctx = 10;
468+
}
469+
470+
message ProcessOplogEntriesResponse {
471+
oneof result {
472+
golem.common.Empty success = 1;
473+
golem.worker.v1.WorkerExecutionError failure = 2;
474+
}
475+
}

0 commit comments

Comments
 (0)