Skip to content

Commit e7e10f9

Browse files
committed
[Automation] GCS Permission check and fix
1 parent 94ddada commit e7e10f9

4 files changed

Lines changed: 172 additions & 1 deletion

File tree

Ironwood/guides/automation/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ You can configure the behavior using the following environment variable:
5252
5353
| Variable | Description | Required | Default |
5454
| :--- | :--- | :--- | :--- |
55-
| `GCS_BUCKET_ROOT_DIR` | The root GCS path where results will be stored. Must start with `gs://`. | **Yes** | `gs://amylin-microbenchmark` (Change this!) |
55+
| `GCS_BUCKET_ROOT_DIR` | The root GCS path where results will be stored. Must start with `gs://`. | **Yes** | `gs://example-microbenchmark` (Change this!) |
5656
5757
## Usage Guide
5858

Ironwood/guides/automation/automation_launch.sh

100644100755
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
77
export GCS_BUCKET_ROOT_DIR=""
88
export GCS_SA_NAME="gcs-writer" # Service account with write access to GCS_BUCKET_ROOT_DIR
9+
export PROJECT_ID=$(gcloud config get-value project 2>/dev/null)
910

1011
MAX_RETRIES=3
1112
TIMEOUT_SECOND=3600
@@ -45,6 +46,19 @@ for topology in "${required_topologies[@]}"; do
4546
envsubst '${TOPOLOGY} ${TPUS}' < ${SCRIPT_DIR}/job-queue.yaml | kubectl apply -f -
4647
done
4748

49+
######################################################################
50+
# GCS PERMISSION CHECK
51+
######################################################################
52+
53+
# Hand the service-account name and project to the checker through the
# environment, then abort the launch if the checker reports a failure.
export SA_NAME="${GCS_SA_NAME}" PROJECT_ID="${PROJECT_ID}"
bash "${SCRIPT_DIR}/check_gcs_permissions.sh" || {
  echo "GCS Permission Check Failed. Exiting."
  exit 1
}
60+
61+
4862
######################################################################
4963
# LAUNCH JOBS & WAIT FOR COMPLETION
5064
######################################################################
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#!/usr/bin/env bash
2+
3+
# This script checks if the configured Service Account has write permissions to the specified GCS bucket.
4+
# If permissions are missing, it attempts to fix them by creating the Kubernetes ServiceAccount (namespace "default") and granting its Workload Identity principal roles/storage.admin on the bucket.
5+
#
6+
# Expected Environment Variables:
7+
# GCS_BUCKET_ROOT_DIR: The GCS path (must start with gs://)
8+
# SA_NAME: The Kubernetes ServiceAccount name used by the check pod (default: gcs-writer)
9+
# PROJECT_ID: The GCP Project ID (optional, will try to detect if not set)
10+
11+
# Directory holding this script; the pod manifest is resolved relative to it.
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
# Kubernetes ServiceAccount the check pod runs as.
SA_NAME="${SA_NAME:-gcs-writer}"
# Fall back to the active gcloud project when the caller did not provide one.
# gcloud errors are silenced on purpose: an empty PROJECT_ID is only fatal
# later, if a permission fix is actually needed.
PROJECT_ID="${PROJECT_ID:-$(gcloud config get-value project 2>/dev/null)}"

# Require gs://<bucket>[/...]. A bare "gs://" (or "gs:///path") has no bucket
# name and would otherwise produce an empty bucket in later commands.
if [[ ! "${GCS_BUCKET_ROOT_DIR:-}" =~ ^gs://[^/]+ ]]; then
  echo "Error: GCS_BUCKET_ROOT_DIR must be set and start with gs://" >&2
  exit 1
fi
19+
20+
#######################################
# Best-effort repair of GCS access for the check ServiceAccount.
# Ensures the Kubernetes ServiceAccount exists in the "default" namespace and
# grants roles/storage.admin on the bucket to its Workload Identity principal.
# See more context in
# https://docs.cloud.google.com/kubernetes-engine/docs/how-to/workload-identity#authenticating_to
# Globals:
#   PROJECT_ID (read), GCS_BUCKET_ROOT_DIR (read), SA_NAME (read)
# Arguments:
#   None
# Outputs:
#   Progress messages on stdout, error messages on stderr.
# Returns:
#   0 when every step succeeded; 1 when a kubectl/gcloud step failed.
#   Exits the whole script with status 1 when PROJECT_ID is empty.
#######################################
fix_gcs_permissions() {
  echo "Attempting to fix GCS permissions..."

  if [[ -z "${PROJECT_ID}" ]]; then
    echo "Error: PROJECT_ID is not set and could not be detected." >&2
    echo "Please export PROJECT_ID=<your-project-id> and rerun." >&2
    exit 1
  fi

  # gs://bucket/some/dir -> bucket
  local bucket_name="${GCS_BUCKET_ROOT_DIR#gs://}"
  bucket_name="${bucket_name%%/*}"
  local ns_name="default"

  echo "Ensuring ServiceAccount ${SA_NAME} exists in namespace ${ns_name}..."
  # create --dry-run | apply makes the step idempotent: it succeeds whether or
  # not the ServiceAccount already exists.
  if ! kubectl create serviceaccount "${SA_NAME}" --namespace "${ns_name}" \
      --dry-run=client -o yaml | kubectl apply -f -; then
    echo "Error: could not create ServiceAccount ${SA_NAME} in namespace ${ns_name}." >&2
    return 1
  fi

  # Declared separately from the assignment so a gcloud failure is not masked
  # by the exit status of 'local'.
  local project_number
  if ! project_number="$(gcloud projects describe "${PROJECT_ID}" \
      --format="value(projectNumber)")" || [[ -z "${project_number}" ]]; then
    # Without the project number the principal below would be malformed.
    echo "Error: could not determine the project number for ${PROJECT_ID}." >&2
    return 1
  fi

  local member="principal://iam.googleapis.com/projects/${project_number}/locations/global/workloadIdentityPools/${PROJECT_ID}.svc.id.goog/subject/ns/${ns_name}/sa/${SA_NAME}"

  echo "Granting roles/storage.admin to ${SA_NAME} on gs://${bucket_name}..."
  if ! gcloud storage buckets add-iam-policy-binding "gs://${bucket_name}" \
      --role=roles/storage.admin \
      --member="${member}"; then
    echo "Error: could not grant roles/storage.admin on gs://${bucket_name}." >&2
    return 1
  fi

  echo "Permission fix command executed."
}
45+
46+
#######################################
# Verifies that the ServiceAccount can write to the GCS bucket by launching a
# short-lived pod from gcs-write.yaml and looking for its success marker in
# the pod logs.
# Globals:
#   GCS_BUCKET_ROOT_DIR (read), SA_NAME (read, re-exported),
#   SCRIPT_DIR (read), GCS_CHECK_PATH (written and exported)
# Arguments:
#   None
# Outputs:
#   Progress and pod log excerpts on stdout, launch errors on stderr.
# Returns:
#   0 if the pod logged "GCS test complete!", 1 otherwise (including a
#   missing ServiceAccount or a pod that could not be created).
#######################################
check_gcs_permission() {
  # Must match metadata.namespace in gcs-write.yaml; used for every kubectl
  # call so the check does not depend on the context's current namespace.
  local ns_name="default"

  echo "Checking GCS write permissions..."
  # Drop a trailing slash from the root so the object path has no "//".
  export GCS_CHECK_PATH="${GCS_BUCKET_ROOT_DIR%/}/permission-check-$(date +%s).txt"
  export SA_NAME="${SA_NAME}"

  # Check if ServiceAccount exists first to fail fast
  if ! kubectl get serviceaccount "${SA_NAME}" --namespace "${ns_name}" &> /dev/null; then
    echo "ServiceAccount '${SA_NAME}' not found."
    return 1
  fi

  # Launch check pod. Only the two listed variables are substituted so the
  # script embedded in the manifest keeps its own ${...} references.
  # Declaration is split from assignment so a kubectl failure is not masked.
  local create_output
  # shellcheck disable=SC2016 # envsubst needs the literal variable names
  if ! create_output="$(envsubst '${SA_NAME} ${GCS_CHECK_PATH}' \
      < "${SCRIPT_DIR}/gcs-write.yaml" | kubectl create -f -)"; then
    echo "Failed to create GCS check pod." >&2
    return 1
  fi

  # output example: pod/gcs-writer-test-abcde created
  local pod_name
  pod_name="$(awk -F'[/ ]' '/^pod\// { print $2; exit }' <<< "${create_output}")"
  if [[ -z "${pod_name}" ]]; then
    echo "Could not determine GCS check pod name from: ${create_output}" >&2
    return 1
  fi

  echo "Launched GCS check pod: ${pod_name}"

  # Wait for a terminal phase: up to 20 polls, 5 seconds apart.
  local phase=""
  local attempt
  for attempt in {1..20}; do
    sleep 5
    phase="$(kubectl get pod "${pod_name}" --namespace "${ns_name}" \
      -o jsonpath='{.status.phase}' 2>/dev/null)"
    if [[ "${phase}" == "Succeeded" || "${phase}" == "Failed" ]]; then
      break
    fi
  done

  # The verdict comes from the logs: the marker is the last thing the pod
  # script prints, so it is only present when every GCS step succeeded.
  local check_status="FAILED"
  local pod_logs
  pod_logs="$(kubectl logs "${pod_name}" --namespace "${ns_name}" 2>/dev/null)"
  if grep -q "GCS test complete!" <<< "${pod_logs}"; then
    echo "GCS permission check PASSED."
    check_status="SUCCESS"
  else
    echo "GCS permission check FAILED."
    echo "Logs from ${pod_name}:"
    tail -n 10 <<< "${pod_logs}"
  fi

  # Cleanup
  kubectl delete pod "${pod_name}" --namespace "${ns_name}" \
    --grace-period=0 --force &> /dev/null

  if [[ "${check_status}" != "SUCCESS" ]]; then
    return 1
  fi
  return 0
}
98+
99+
# Entry point: run the check once; on failure attempt a single repair and
# re-check, aborting with status 1 if the second check also fails.
banner="======================================================================"
echo "${banner}"
echo "Starting GCS Permission Check (SA: ${SA_NAME}, Bucket: ${GCS_BUCKET_ROOT_DIR})"
echo "${banner}"

check_gcs_permission || {
  echo "GCS check failed. Attempting to fix..."
  fix_gcs_permissions

  echo "Retrying GCS check..."
  check_gcs_permission || {
    echo "GCS permissions check failed even after attempted fix."
    echo "Please verify your Service Account '${SA_NAME}' has proper permissions on ${GCS_BUCKET_ROOT_DIR}"
    exit 1
  }
}

echo "GCS Check Verified Successfully."
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# One-shot pod that proves the ServiceAccount can write to, read from and
# delete an object in GCS. The manifest is a template: the SA_NAME and
# GCS_CHECK_PATH placeholders are filled in by envsubst before it is created.
apiVersion: v1
kind: Pod
metadata:
  # generateName gives every check run its own uniquely named pod.
  generateName: gcs-writer-test-
  namespace: default
spec:
  # Run once; a failed check must not be restarted.
  restartPolicy: Never
  serviceAccountName: ${SA_NAME}
  containers:
    - name: gcs-test-container
      image: google/cloud-sdk:slim
      command: ["bash", "-c"]
      args:
        - |
          set -ex
          TIMESTAMP=$(date +%s)
          LOCAL_FILE="/tmp/test-file-${TIMESTAMP}.txt"

          # The GCS path below is filled in by envsubst at launch time
          echo "Using GCS Path: ${GCS_CHECK_PATH}"

          echo "Testing GCS write from pod at $(date)" > "${LOCAL_FILE}"

          echo "--- Configuration ---"
          gcloud auth list
          gcloud config list
          # Best effort: report the service account email, but keep going if the metadata server does not answer
          curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/email" || echo "Could not fetch SA email"
          echo

          echo "--- Writing to GCS ---"
          gsutil cp "${LOCAL_FILE}" "${GCS_CHECK_PATH}"

          echo "--- Verifying from GCS ---"
          gsutil cat "${GCS_CHECK_PATH}"

          echo "--- Cleaning up GCS object ---"
          gsutil rm "${GCS_CHECK_PATH}"

          # Success marker: the launcher script greps the pod logs for this line
          echo "GCS test complete!"

0 commit comments

Comments
 (0)