Skip to content

Commit 93f4ead

Browse files
authored
Fix the scheduler timeouts and errors (TraceMachina#2083)
* Fix the scheduler timeouts by removing the keepalives
* Fix the memory leak by removing the counter
1 parent e38af3d commit 93f4ead

14 files changed

Lines changed: 873 additions & 539 deletions
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
{
2+
stores: [
3+
{
4+
name: "AC_MAIN_STORE",
5+
filesystem: {
6+
content_path: "/tmp/nativelink/data-worker-test/content_path-ac",
7+
temp_path: "/tmp/nativelink/data-worker-test/tmp_path-ac",
8+
eviction_policy: {
9+
max_bytes: 1000000000,
10+
},
11+
},
12+
},
13+
{
14+
name: "WORKER_FAST_SLOW_STORE",
15+
fast_slow: {
16+
fast: {
17+
filesystem: {
18+
content_path: "/tmp/nativelink/data-worker-test/content_path-cas",
19+
temp_path: "/tmp/nativelink/data-worker-test/tmp_path-cas",
20+
eviction_policy: {
21+
max_bytes: 10000000000,
22+
},
23+
},
24+
},
25+
slow: {
26+
noop: {},
27+
},
28+
},
29+
},
30+
{
31+
name: "SCHEDULER_REDIS_STORE",
32+
redis_store: {
33+
addresses: [
34+
"redis://127.0.0.1:6379",
35+
],
36+
connection_pool_size: 10,
37+
experimental_pub_sub_channel: "scheduler_key_change",
38+
},
39+
},
40+
],
41+
schedulers: [
42+
{
43+
name: "MAIN_SCHEDULER",
44+
simple: {
45+
worker_timeout_s: 30,
46+
worker_match_logging_interval_s: -1,
47+
supported_platform_properties: {
48+
cpu_count: "minimum",
49+
memory_kb: "minimum",
50+
network_kbps: "minimum",
51+
disk_read_iops: "minimum",
52+
disk_read_bps: "minimum",
53+
disk_write_iops: "minimum",
54+
disk_write_bps: "minimum",
55+
shm_size: "minimum",
56+
gpu_count: "minimum",
57+
gpu_model: "exact",
58+
cpu_vendor: "exact",
59+
cpu_arch: "exact",
60+
cpu_model: "exact",
61+
kernel_version: "exact",
62+
OSFamily: "priority",
63+
"container-image": "priority",
64+
"lre-rs": "priority",
65+
ISA: "exact",
66+
},
67+
experimental_backend: {
68+
redis: {
69+
redis_store: "SCHEDULER_REDIS_STORE",
70+
},
71+
},
72+
},
73+
},
74+
],
75+
workers: [
76+
{
77+
local: {
78+
worker_api_endpoint: {
79+
uri: "grpc://127.0.0.1:50061",
80+
},
81+
cas_fast_slow_store: "WORKER_FAST_SLOW_STORE",
82+
upload_action_result: {
83+
ac_store: "AC_MAIN_STORE",
84+
},
85+
work_directory: "/tmp/nativelink/work",
86+
platform_properties: {
87+
cpu_count: {
88+
values: [
89+
"14",
90+
],
91+
},
92+
memory_kb: {
93+
values: [
94+
"32000000",
95+
],
96+
},
97+
network_kbps: {
98+
values: [
99+
"100000",
100+
],
101+
},
102+
cpu_arch: {
103+
values: [
104+
"aarch64",
105+
],
106+
},
107+
OSFamily: {
108+
values: [
109+
"Darwin",
110+
"",
111+
],
112+
},
113+
"container-image": {
114+
values: [
115+
"",
116+
],
117+
},
118+
"lre-rs": {
119+
values: [
120+
"",
121+
],
122+
},
123+
ISA: {
124+
values: [
125+
"aarch64",
126+
],
127+
},
128+
},
129+
},
130+
},
131+
],
132+
servers: [
133+
{
134+
name: "public",
135+
listener: {
136+
http: {
137+
socket_address: "0.0.0.0:50051",
138+
},
139+
},
140+
services: {
141+
cas: [
142+
{
143+
instance_name: "main",
144+
cas_store: "WORKER_FAST_SLOW_STORE",
145+
},
146+
],
147+
ac: [
148+
{
149+
instance_name: "main",
150+
ac_store: "AC_MAIN_STORE",
151+
},
152+
],
153+
execution: [
154+
{
155+
instance_name: "main",
156+
cas_store: "WORKER_FAST_SLOW_STORE",
157+
scheduler: "MAIN_SCHEDULER",
158+
},
159+
],
160+
capabilities: [
161+
{
162+
instance_name: "main",
163+
remote_execution: {
164+
scheduler: "MAIN_SCHEDULER",
165+
},
166+
},
167+
],
168+
bytestream: [
169+
{
170+
instance_name: "main",
171+
cas_store: "WORKER_FAST_SLOW_STORE",
172+
},
173+
],
174+
health: {},
175+
admin: {},
176+
},
177+
},
178+
{
179+
name: "private_workers_servers",
180+
listener: {
181+
http: {
182+
socket_address: "0.0.0.0:50061",
183+
},
184+
},
185+
services: {
186+
worker_api: {
187+
scheduler: "MAIN_SCHEDULER",
188+
},
189+
admin: {},
190+
health: {},
191+
},
192+
},
193+
],
194+
global: {
195+
max_open_files: 24576,
196+
},
197+
}

nativelink-scheduler/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ rust_library(
25 25
"src/simple_scheduler_state_manager.rs",
26 26
"src/store_awaited_action_db.rs",
27 27
"src/worker.rs",
28+
"src/worker_registry.rs",
28 29
"src/worker_scheduler.rs",
29 30
],
30 31
proc_macro_deps = [

0 commit comments

Comments
 (0)