Skip to content

Commit d4cd4dc

Browse files
guptaaka authored and copybara-github committed
Rename pw-service-example to pw-service
PiperOrigin-RevId: 903319076
1 parent 66e9754 commit d4cd4dc

2 files changed

Lines changed: 153 additions & 1 deletion

File tree

pathwaysutils/experimental/shared_pathways_service/deploy_pathways_service.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
_TEMPLATE_FILE = flags.DEFINE_string(
4444
"template_file",
4545
os.path.join(
46-
os.path.dirname(__file__), "yamls/pw-service-example.yaml",
46+
os.path.dirname(__file__), "yamls/pw-service.yaml",
4747
),
4848
"Path to the JobSet YAML template file",
4949
)
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
# JobSet template for a Pathways service: one `pathways-head` job running the
# resource manager, plus one `worker` job per slice.
#
# NOTE(review): presumably this is the yamls/pw-service.yaml file that
# deploy_pathways_service.py loads through --template_file -- confirm.
#
# Placeholders filled in before the manifest is applied:
#   ${JOBSET_NAME}           JobSet name
#   ${SERVER_IMAGE}          image used by both the head and worker containers
#   ${GCS_SCRATCH_LOCATION}  passed to --gcs_scratch_location
#   ${NUM_SLICES}            worker replica count and --instance_count
#   ${INSTANCE_TYPE}         passed to --instance_type
#   ${VMS_PER_SLICE}         completions/parallelism of each worker job
#   ${CHIPS_PER_VM}          google.com/tpu limit per worker pod
#   ${ACCELERATOR_LABEL}     cloud.google.com/gke-tpu-accelerator node label
#   ${TOPOLOGY}              cloud.google.com/gke-tpu-topology node label
#
# String-typed placeholders are quoted so a rendered value that looks like a
# number or boolean is still parsed as a string. Integer-typed placeholders
# (replicas, completions, parallelism) must stay unquoted.
---
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  name: "${JOBSET_NAME}"
  namespace: default
spec:
  # The head job is the coordinator; pods read its address from the
  # jobset.sigs.k8s.io/coordinator label (see the env sections below).
  coordinator:
    replicatedJob: pathways-head
  failurePolicy:
    maxRestarts: 1
    restartStrategy: Recreate
  network:
    enableDNSHostnames: true
    publishNotReadyAddresses: true
  replicatedJobs:
    - name: pathways-head
      replicas: 1
      template:
        metadata:
          annotations:
            alpha.jobset.sigs.k8s.io/exclusive-topology: kubernetes.io/hostname
        spec:
          backoffLimit: 3
          completionMode: Indexed
          completions: 1
          parallelism: 1
          template:
            metadata:
              annotations:
                alpha.jobset.sigs.k8s.io/exclusive-topology: kubernetes.io/hostname
            spec:
              containers:
                - name: pathways-rm
                  image: ${SERVER_IMAGE}
                  imagePullPolicy: Always
                  args:
                    - --server_port=29001
                    - --gcs_scratch_location=${GCS_SCRATCH_LOCATION}
                    - --node_type=resource_manager
                    - --instance_count=${NUM_SLICES}
                    - --instance_type=${INSTANCE_TYPE}
                  env:
                    - name: REPLICATED_JOB_NAME
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
                    - name: JOBSET_NAME
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
                    - name: HOST_ADDRESS
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.labels['jobset.sigs.k8s.io/coordinator']
                    - name: TPU_SKIP_MDS_QUERY
                      value: "true"
                  ports:
                    # 29001 matches --server_port above.
                    - containerPort: 29001
                      protocol: TCP
                    - containerPort: 29002
                      protocol: TCP
                  resources:
                    limits:
                      cpu: "8"
                      memory: 32G
              dnsPolicy: ClusterFirstWithHostNet
              hostNetwork: true
              restartPolicy: OnFailure
    - name: worker
      # Integer field: keep the placeholder unquoted.
      replicas: ${NUM_SLICES}
      template:
        spec:
          # Very high limit -- presumably so worker pods are retried
          # effectively without bound; confirm intent.
          backoffLimit: 1000000
          completionMode: Indexed
          # Integer fields: keep the placeholders unquoted.
          completions: ${VMS_PER_SLICE}
          parallelism: ${VMS_PER_SLICE}
          template:
            metadata:
              annotations:
                alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
            spec:
              containers:
                - name: pathways-worker
                  image: ${SERVER_IMAGE}
                  imagePullPolicy: Always
                  args:
                    - --server_port=29005
                    # $(PATHWAYS_HEAD) is expanded by Kubernetes from the env
                    # var defined below; 29001 is the head's --server_port.
                    # NOTE(review): "$$" is presumably the template escape for
                    # a literal "$" -- confirm against the deploy script.
                    - --resource_manager_address=$$(PATHWAYS_HEAD):29001
                    - --gcs_scratch_location=${GCS_SCRATCH_LOCATION}
                  env:
                    - name: TPU_MIN_LOG_LEVEL
                      value: "0"
                    - name: TF_CPP_MIN_LOG_LEVEL
                      value: "0"
                    - name: XCLOUD_ENVIRONMENT
                      value: GCP
                    - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER
                      value: "false"
                    - name: MEGASCALE_NUM_SLICES
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']
                    - name: JOBSET_NAME
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
                    - name: REPLICATED_JOB_NAME
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
                    - name: MEGASCALE_SLICE_ID
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.labels['jobset.sigs.k8s.io/job-index']
                    - name: PATHWAYS_HEAD
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.labels['jobset.sigs.k8s.io/coordinator']
                    - name: MEGASCALE_COORDINATOR_ADDRESS
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.labels['jobset.sigs.k8s.io/coordinator']
                  ports:
                    # 29005 matches --server_port above.
                    - containerPort: 29005
                      protocol: TCP
                    - containerPort: 29006
                      protocol: TCP
                    - containerPort: 8471
                      protocol: TCP
                    - containerPort: 8080
                      protocol: TCP
                  resources:
                    limits:
                      google.com/tpu: "${CHIPS_PER_VM}"
                  volumeMounts:
                    - mountPath: /tmp
                      name: shared-tmp
              dnsPolicy: ClusterFirstWithHostNet
              hostNetwork: true
              nodeSelector:
                cloud.google.com/gke-tpu-accelerator: "${ACCELERATOR_LABEL}"
                cloud.google.com/gke-tpu-topology: "${TOPOLOGY}"
              restartPolicy: OnFailure
              volumes:
                # Backs the container's /tmp with the node's /tmp.
                - name: shared-tmp
                  hostPath:
                    path: /tmp
                    type: DirectoryOrCreate
  startupPolicy:
    startupPolicyOrder: InOrder
  successPolicy:
    operator: All

0 commit comments

Comments
 (0)