Skip to content

Commit aca675f

Browse files
committed
Add run-ai support for hyperpod and eks
1 parent 9c5dbd5 commit aca675f

45 files changed

Lines changed: 1919 additions & 0 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
README.pdf

Container-Root/hyperpod/deployment/eks/run-ai/README.md

Lines changed: 1401 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
#
# Lists the contents of the Run:ai control-plane volumes (postgres, redis,
# thanos) by mounting their PVCs in a temporary helper pod (fsx-pod).
#
# Requires: kubectl on PATH and ./fsx-pod.yaml in the current directory.
# Optional env:
#   WAIT_TIMEOUT - how long to wait for fsx-pod to become Ready
#                  (kubectl duration string, default 300s).

NAMESPACE=runai-backend
POD_MANIFEST=./fsx-pod.yaml
WAIT_TIMEOUT=${WAIT_TIMEOUT:-300s}

# Deletes the helper pod. Registered as an EXIT trap so the pod is removed
# on every exit path, including a wait timeout.
remove_pod() {
  echo ""
  echo "Removing fsx-pod ..."
  kubectl delete -f "$POD_MANIFEST" -n "$NAMESPACE"
  echo ""
}

echo ""
echo "Showing content of Run:ai FSX volumes ..."

echo ""
echo "Starting fsx-pod ..."
if ! kubectl apply -f "$POD_MANIFEST" -n "$NAMESPACE"; then
  echo "Failed to create fsx-pod from $POD_MANIFEST" >&2
  exit 1
fi
trap remove_pod EXIT

echo ""
echo "Waiting for fsx-pod to start ..."
# kubectl wait replaces a grep/awk polling loop that had no timeout: a pod
# stuck in Pending or ImagePullBackOff made the script spin forever, and the
# substring grep also matched any other pod with "fsx-pod" in its name.
if ! kubectl wait --for=condition=Ready pod/fsx-pod \
  -n "$NAMESPACE" --timeout="$WAIT_TIMEOUT"; then
  echo "fsx-pod did not become Ready within $WAIT_TIMEOUT" >&2
  kubectl get pod fsx-pod -n "$NAMESPACE" >&2
  exit 1
fi

echo ""
echo "Showing fsx volumes ..."
# No -it: the command is non-interactive, and requesting a TTY produces a
# warning when the script runs without a terminal (CI, cron, pipes).
kubectl -n "$NAMESPACE" exec fsx-pod -- bash -c 'echo "" && echo /fsx-postgres && ls -alh /fsx-postgres && echo "" && echo /fsx-redis && ls -alh /fsx-redis && echo "" && echo /fsx-thanos && ls -alh /fsx-thanos'
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/bin/bash
#
# Sets the owner of the Run:ai control-plane volumes (postgres, redis,
# thanos) to 1001:1001 and then lists them, using a temporary helper pod
# (fsx-pod) that mounts the three PVCs.
#
# Requires: kubectl on PATH and ./fsx-pod.yaml in the current directory.
# Optional env:
#   WAIT_TIMEOUT - how long to wait for fsx-pod to become Ready
#                  (kubectl duration string, default 300s).
# Exit status: non-zero if the pod could not be started or the chown failed.

NAMESPACE=runai-backend
POD_MANIFEST=./fsx-pod.yaml
WAIT_TIMEOUT=${WAIT_TIMEOUT:-300s}

# Deletes the helper pod. Registered as an EXIT trap so the pod is removed
# on every exit path, including a wait timeout.
remove_pod() {
  echo ""
  echo "Removing fsx-pod ..."
  kubectl delete -f "$POD_MANIFEST" -n "$NAMESPACE"
  echo ""
}

echo ""
echo "Patching Run:ai FSX volumes ..."

echo ""
echo "Starting fsx-pod ..."
if ! kubectl apply -f "$POD_MANIFEST" -n "$NAMESPACE"; then
  echo "Failed to create fsx-pod from $POD_MANIFEST" >&2
  exit 1
fi
trap remove_pod EXIT

echo ""
echo "Waiting for fsx-pod to start ..."
# kubectl wait replaces a grep/awk polling loop that had no timeout: a pod
# stuck in Pending or ImagePullBackOff made the script spin forever, and the
# substring grep also matched any other pod with "fsx-pod" in its name.
if ! kubectl wait --for=condition=Ready pod/fsx-pod \
  -n "$NAMESPACE" --timeout="$WAIT_TIMEOUT"; then
  echo "fsx-pod did not become Ready within $WAIT_TIMEOUT" >&2
  kubectl get pod fsx-pod -n "$NAMESPACE" >&2
  exit 1
fi

rc=0

echo ""
echo "Setting owner of fsx volumes ..."
# No -it on the exec calls: they are non-interactive, and requesting a TTY
# produces a warning when the script runs without a terminal.
if ! kubectl -n "$NAMESPACE" exec fsx-pod -- bash -c 'chown -R 1001:1001 /fsx-postgres && chown -R 1001:1001 /fsx-redis && chown -R 1001:1001 /fsx-thanos'; then
  echo "Failed to set owner of one or more fsx volumes" >&2
  rc=1
fi

echo ""
echo "Showing fsx volumes ..."
# Still list the volumes after a chown failure so the current ownership is
# visible for diagnosis.
kubectl -n "$NAMESPACE" exec fsx-pod -- bash -c 'echo "" && echo /fsx-postgres && ls -alh /fsx-postgres && echo "" && echo /fsx-redis && ls -alh /fsx-redis && echo "" && echo /fsx-thanos && ls -alh /fsx-thanos'

exit "$rc"
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Helper pod that mounts the three Run:ai control-plane PVCs (postgres,
# redis, thanos) so their contents can be inspected or patched via
# `kubectl exec`. No namespace is set here; it is supplied with `-n` when the
# manifest is applied.
apiVersion: v1
kind: Pod
metadata:
  name: fsx-pod
spec:
  containers:
  - name: app
    image: ubuntu
    command: ["/bin/sh"]
    # Keep-alive loop only. The previous command appended to
    # /fsx/health.log, but nothing is mounted at /fsx (the mounts are
    # /fsx-postgres, /fsx-redis and /fsx-thanos), so every iteration failed.
    # The trap lets the shell exit promptly on SIGTERM so `kubectl delete`
    # does not have to wait out the termination grace period.
    args: ["-c", "trap 'exit 0' TERM INT; while true; do sleep 5 & wait $!; done"]
    volumeMounts:
    - name: fsx-postgres
      mountPath: /fsx-postgres
    - name: fsx-redis
      mountPath: /fsx-redis
    - name: fsx-thanos
      mountPath: /fsx-thanos
  volumes:
  - name: fsx-postgres
    persistentVolumeClaim:
      claimName: data-runai-backend-postgresql-0
  - name: fsx-redis
    persistentVolumeClaim:
      claimName: data-runai-backend-redis-queue-master-0
  - name: fsx-thanos
    persistentVolumeClaim:
      claimName: data-runai-backend-thanos-receive-0
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
#
# Installs or upgrades the Run:ai control plane Helm release
# (runai-backend) in the runai-backend namespace.
#
# Optional env (the defaults reproduce the previously hard-coded values):
#   RUNAI_VERSION - control-plane chart version (default 2.20.24)
#   DNS_NAME      - domain passed to the chart as global.domain
#
# Requires: helm on PATH.

# Honour values already present in the environment instead of overwriting
# them, so the script can be reused for other domains without editing it.
export RUNAI_VERSION=${RUNAI_VERSION:-2.20.24}
export DNS_NAME=${DNS_NAME:-runai-remote.iankouls.do.wwso.aws.dev}

echo ""
echo "Installing Run.ai control plane version $RUNAI_VERSION at DNS $DNS_NAME ..."

# Abort if the chart repo cannot be set up: continuing would install from a
# stale or missing repo cache. --force-update makes re-runs safe when the
# repo entry already exists.
helm repo add runai-backend https://runai.jfrog.io/artifactory/cp-charts-prod --force-update || exit 1
helm repo update || exit 1

helm upgrade --install runai-backend -n runai-backend runai-backend/control-plane \
  --version "$RUNAI_VERSION" \
  --set "global.domain=$DNS_NAME" \
  --set global.ingress.tlsSecretName=runai-backend-tls \
  --set global.customCA.enabled=true
12+
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
#
# Tears down the Run:ai control plane: uninstalls the runai-backend Helm
# release, then deletes the three PVCs it leaves behind.

ns=runai-backend
pvcs=(
  data-runai-backend-postgresql-0
  data-runai-backend-redis-queue-master-0
  data-runai-backend-thanos-receive-0
)

printf '\n%s\n' "Removing Run.ai control plane ..."
helm uninstall runai-backend --namespace "$ns"

printf '\n%s\n' "Deleting Run.ai PVCs ..."
kubectl delete pvc "${pvcs[@]}" --namespace "$ns"

printf '\n'
12+
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/bin/bash
#
# Installs or upgrades the Run:ai cluster Helm release (runai-cluster) in the
# runai namespace, using the same DNS name for the control plane and the
# cluster URL.
#
# Required env:
#   DNS_NAME      - passed as controlPlane.url and cluster.url
#   CLIENT_SECRET - passed as controlPlane.clientSecret
#   CLUSTER_UID   - passed as cluster.uid
#
# Requires: kubectl and helm on PATH.

export RUNAI_VERSION=2.20.24

# Fail fast with a clear message instead of installing a release with empty
# values when a required variable is missing.
: "${DNS_NAME:?DNS_NAME must be set}"
: "${CLIENT_SECRET:?CLIENT_SECRET must be set}"
: "${CLUSTER_UID:?CLUSTER_UID must be set}"
export DNS_NAME CLIENT_SECRET CLUSTER_UID

# Ensure runai-reg-creds.yaml is applied to the runai namespace.
# This is only a warning because the namespace may not exist yet (helm is
# run with --create-namespace below).
if kubectl get secret runai-reg-creds -n runai >/dev/null 2>&1; then
  echo "Found secret runai-reg-creds in namespace runai"
else
  echo "WARNING: secret runai-reg-creds not found in namespace runai;" \
    "apply runai-reg-creds.yaml before the cluster pods need it" >&2
fi

echo ""
echo "Installing Run:ai local cluster version ${RUNAI_VERSION} at $DNS_NAME ..."

helm repo add runai https://runai.jfrog.io/artifactory/api/helm/run-ai-charts --force-update || exit 1
helm repo update || exit 1

# Values are quoted so that whitespace or glob characters in them cannot
# split or alter the helm arguments.
helm upgrade -i runai-cluster runai/runai-cluster -n runai \
  --set "controlPlane.url=${DNS_NAME}" \
  --set "controlPlane.clientSecret=${CLIENT_SECRET}" \
  --set "cluster.uid=${CLUSTER_UID}" \
  --set "cluster.url=${DNS_NAME}" \
  --set global.customCA.enabled=true \
  --version="${RUNAI_VERSION}" --create-namespace
rc=$?

echo ""
exit "$rc"
25+
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
#
# Installs or upgrades the Run:ai cluster Helm release (runai-cluster) in the
# runai namespace for a cluster whose URL differs from the control plane's.
#
# Required env:
#   DNS_NAME      - passed as controlPlane.url
#   CLIENT_SECRET - passed as controlPlane.clientSecret
#   CLUSTER_UID   - passed as cluster.uid
#   CLUSTER_URL   - passed as cluster.url
#
# Requires: kubectl and helm on PATH.

export RUNAI_VERSION=2.20.24

# Fail fast with a clear message instead of installing a release with empty
# values when a required variable is missing.
: "${DNS_NAME:?DNS_NAME must be set}"
: "${CLIENT_SECRET:?CLIENT_SECRET must be set}"
: "${CLUSTER_UID:?CLUSTER_UID must be set}"
: "${CLUSTER_URL:?CLUSTER_URL must be set}"
export DNS_NAME CLIENT_SECRET CLUSTER_UID CLUSTER_URL

# Ensure runai-reg-creds.yaml is applied to the runai namespace.
# This is only a warning because the namespace may not exist yet (helm is
# run with --create-namespace below).
if kubectl get secret runai-reg-creds -n runai >/dev/null 2>&1; then
  echo "Found secret runai-reg-creds in namespace runai"
else
  echo "WARNING: secret runai-reg-creds not found in namespace runai;" \
    "apply runai-reg-creds.yaml before the cluster pods need it" >&2
fi

echo ""
echo "Installing Run:ai remote cluster version ${RUNAI_VERSION} at $CLUSTER_URL"
echo "on control plane https://$DNS_NAME ..."

helm repo add runai https://runai.jfrog.io/artifactory/api/helm/run-ai-charts --force-update || exit 1
helm repo update || exit 1

# Values are quoted so that whitespace or glob characters in them cannot
# split or alter the helm arguments.
helm upgrade -i runai-cluster runai/runai-cluster -n runai \
  --set "controlPlane.url=${DNS_NAME}" \
  --set "controlPlane.clientSecret=${CLIENT_SECRET}" \
  --set "cluster.uid=${CLUSTER_UID}" \
  --set "cluster.url=${CLUSTER_URL}" \
  --set global.customCA.enabled=true \
  --version="${RUNAI_VERSION}" --create-namespace
rc=$?

echo ""
exit "$rc"
27+
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
#
# Removes the Run:ai cluster installation: uninstalls the runai-cluster Helm
# release, clears the finalizers on the "runai" runaiconfig object, and then
# deletes that object.

# Prints a blank line followed by the given progress message.
announce() {
  echo ""
  echo "$1"
}

announce "Removing Run:ai cluster ..."

helm uninstall runai-cluster -n runai

announce "Removing runaiconfig finalizer ..."
kubectl patch runaiconfig runai -n runai -p '{"metadata":{"finalizers":null}}' --type=merge

announce "Removing runaiconfig runai ..."
kubectl delete runaiconfig runai -n runai

echo ""
17+

0 commit comments

Comments
 (0)