marcelorobj
diff --git a/‎3-fleetscope/modules/private_install_manifest/main.tf‎
Lines changed: 1 addition & 1 deletion b/‎3-fleetscope/modules/private_install_manifest/main.tf‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎5-appinfra/modules/hpc-ai-training-infra/Dockerfile‎
Lines changed: 20 additions & 0 deletions b/‎5-appinfra/modules/hpc-ai-training-infra/Dockerfile‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎5-appinfra/modules/hpc-ai-training-infra/container_image.tf‎
Lines changed: 117 additions & 0 deletions b/‎5-appinfra/modules/hpc-ai-training-infra/container_image.tf‎
Lines changed: 117 additions & 0 deletions
diff --git a/‎5-appinfra/modules/hpc-ai-training-infra/main.tf‎
Lines changed: 47 additions & 0 deletions b/‎5-appinfra/modules/hpc-ai-training-infra/main.tf‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎5-appinfra/modules/hpc-ai-training-infra/outputs.tf‎
Lines changed: 20 additions & 0 deletions b/‎5-appinfra/modules/hpc-ai-training-infra/outputs.tf‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎5-appinfra/modules/hpc-ai-training-infra/requirements.txt‎
Lines changed: 1 addition & 0 deletions b/‎5-appinfra/modules/hpc-ai-training-infra/requirements.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎5-appinfra/modules/hpc-ai-training-infra/tensorflow_mnist_train_distributed.py‎
Lines changed: 117 additions & 0 deletions b/‎5-appinfra/modules/hpc-ai-training-infra/tensorflow_mnist_train_distributed.py‎
Lines changed: 117 additions & 0 deletions
diff --git a/‎5-appinfra/modules/hpc-ai-training-infra/variables.tf‎
Lines changed: 57 additions & 0 deletions b/‎5-appinfra/modules/hpc-ai-training-infra/variables.tf‎
Lines changed: 57 additions & 0 deletions
@@ -73,7 +73,7 @@ module "kubectl" {
   membership_name         = var.cluster_name
   membership_location     = var.cluster_region
   kubectl_create_command  = "kubectl apply --server-side -f ${path.module}/manifest-${random_uuid.uid.result}-${var.cluster_name}.yaml"
-  kubectl_destroy_command = "kubectl delete -f ${path.module}/manifest-${random_uuid.uid.result}-${var.cluster_name}.yaml || exit 0"
+  kubectl_destroy_command = "timeout 300s kubectl delete -f ${path.module}/manifest-${random_uuid.uid.result}-${var.cluster_name}.yaml || exit 0"
 
   module_depends_on = [
     local_file.downloaded_file.filename,
 
@@ -0,0 +1,20 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Base image: GPU-enabled TensorFlow, pinned by digest so the build resolves to
+# the same image regardless of where the `latest-gpu` tag currently points.
+FROM tensorflow/tensorflow:latest-gpu@sha256:1f16fbd9be8bb84891de12533e332bbd500511caeb5cf4db501dbe39d422f9c7
+WORKDIR /data/tensorflow-mnist-example
+# Install the Python dependencies before copying the sources so this layer can
+# be reused from cache when only the training script changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copies the entire build context into the working directory.
+COPY . .
+# Default command: run the MNIST training script through bash.
+CMD ["/bin/bash", "-c", "--", "python tensorflow_mnist_train_distributed.py"]
@@ -0,0 +1,117 @@
+/**
+ * Copyright 2025 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+# Service account passed to `gcloud builds submit --service-account` for the
+# training image build.
+resource "google_service_account" "builder" {
+  project    = var.infra_project
+  account_id = "ai-builder"
+}
+
+# Bucket passed as `--gcs-log-dir` to the image build. The name embeds the
+# infra project ID; whether a non-empty bucket may be deleted is controlled by
+# var.bucket_force_destroy.
+resource "google_storage_bucket" "build_logs" {
+  name                        = "cb-ai-builder-logs-${var.infra_project}"
+  project                     = var.infra_project
+  uniform_bucket_level_access = true
+  force_destroy               = var.bucket_force_destroy
+  location                    = var.region
+}
+
+# IAM roles required to build the AI training image on Google Cloud Build
+# Grants the builder service account roles/storage.admin, scoped to the build
+# logs bucket only.
+resource "google_storage_bucket_iam_member" "builder_admin" {
+  member = google_service_account.builder.member
+  bucket = google_storage_bucket.build_logs.name
+  role   = "roles/storage.admin"
+}
+
+# Grants the builder service account roles/storage.objectUser on the whole
+# infra project.
+# NOTE(review): presumably needed for the bucket where `gcloud builds submit`
+# stages the uploaded source - confirm, and narrow to that bucket if possible.
+resource "google_project_iam_member" "builder_object_user" {
+  member  = google_service_account.builder.member
+  project = var.infra_project
+  role    = "roles/storage.objectUser"
+}
+
+# Grants the builder service account roles/artifactregistry.repoAdmin on the
+# private images repository so the build can push the image.
+# NOTE(review): a narrower role may be sufficient for pushing only - confirm.
+resource "google_artifact_registry_repository_iam_member" "builder" {
+  project    = google_artifact_registry_repository.private_images.project
+  location   = google_artifact_registry_repository.private_images.location
+  repository = google_artifact_registry_repository.private_images.name
+  role       = "roles/artifactregistry.repoAdmin"
+  member     = google_service_account.builder.member
+}
+
+# Gives every service account in var.cluster_service_accounts read access to
+# the private images repository. Only the map values are used (as service
+# account emails); the map keys become the for_each instance keys.
+resource "google_artifact_registry_repository_iam_member" "allow_cluster_sa_download" {
+  for_each   = var.cluster_service_accounts
+  project    = google_artifact_registry_repository.private_images.project
+  location   = google_artifact_registry_repository.private_images.location
+  repository = google_artifact_registry_repository.private_images.name
+  role       = "roles/artifactregistry.reader"
+  member     = "serviceAccount:${each.value}"
+}
+
+# Pauses 60 seconds after the builder IAM bindings are created. The image
+# build module lists this resource in module_depends_on, so the build starts
+# only after the pause.
+resource "time_sleep" "wait_iam_propagation" {
+  create_duration = "60s"
+
+  depends_on = [
+    google_artifact_registry_repository_iam_member.builder,
+    google_storage_bucket_iam_member.builder_admin,
+    google_project_iam_member.builder_object_user,
+  ]
+}
+
+
+# Pauses 20 seconds after the project services are enabled; the Artifact
+# Registry repository depends on this resource.
+resource "time_sleep" "wait_api" {
+  create_duration = "20s"
+
+  depends_on = [
+    google_project_service.enable_apis
+  ]
+}
+
+# Docker-format Artifact Registry repository that receives the training image.
+# Creation waits for time_sleep.wait_api.
+resource "google_artifact_registry_repository" "private_images" {
+  location      = var.region
+  project       = var.infra_project
+  repository_id = "private-images"
+  description   = "Docker repository for private images"
+  format        = "DOCKER"
+
+  depends_on = [
+    time_sleep.wait_api
+  ]
+}
+
+# Builds the training image with Cloud Build from this module's directory and
+# tags it ai-train:<docker_tag_version_terraform> in the private images
+# repository. The build runs as the builder service account and writes its logs
+# to the build logs bucket.
+module "build_ai_run_image_image" {
+  source  = "terraform-google-modules/gcloud/google"
+  version = "~> 3.5"
+  upgrade = false
+
+  # Re-run the build when the tag changes or when any file baked into the
+  # image changes. The tag is a constant, so on its own it does not change
+  # when the Dockerfile, the requirements or the training script are edited.
+  create_cmd_triggers = {
+    "tag_version"         = local.docker_tag_version_terraform
+    "dockerfile_sha256"   = filesha256("${path.module}/Dockerfile")
+    "requirements_sha256" = filesha256("${path.module}/requirements.txt")
+    "train_script_sha256" = filesha256("${path.module}/tensorflow_mnist_train_distributed.py")
+  }
+
+  create_cmd_entrypoint = "bash"
+
+  # Submit the build; if the first attempt fails, wait 45 seconds and retry once.
+  # NOTE(review): how the module combines create_cmd_entrypoint with this body
+  # is not visible here - keep the command text unchanged unless verified
+  # against the module.
+  create_cmd_body = <<EOF
+gcloud builds submit ${path.module} \
+  --tag ${var.region}-docker.pkg.dev/${var.infra_project}/${google_artifact_registry_repository.private_images.name}/ai-train:${local.docker_tag_version_terraform} \
+  --project=${var.infra_project} \
+  --service-account=${google_service_account.builder.id} \
+  --gcs-log-dir=${google_storage_bucket.build_logs.url} || (
+    sleep 45 && gcloud builds submit ${path.module} \
+      --tag ${var.region}-docker.pkg.dev/${var.infra_project}/${google_artifact_registry_repository.private_images.name}/ai-train:${local.docker_tag_version_terraform} \
+      --project=${var.infra_project} \
+      --service-account=${google_service_account.builder.id} \
+      --gcs-log-dir=${google_storage_bucket.build_logs.url}
+  )
+EOF
+
+  # Start only after the builder IAM bindings have had time to propagate.
+  module_depends_on = [time_sleep.wait_iam_propagation]
+}
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2025 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+locals {
+  # Tag applied to the training image; also used as a trigger of the image
+  # build module.
+  docker_tag_version_terraform = "v1"
+  # Full image reference:
+  # <location>-docker.pkg.dev/<project>/<repository_id>/ai-train:<tag>
+  image_url                    = "${google_artifact_registry_repository.private_images.location}-docker.pkg.dev/${google_artifact_registry_repository.private_images.project}/${google_artifact_registry_repository.private_images.repository_id}/ai-train:${local.docker_tag_version_terraform}"
+  # <team>-<env>; used as the namespace component of the workload identity
+  # principal set that receives the team roles.
+  namespace                    = "${var.team}-${var.env}"
+}
+
+# Grants storage and Pub/Sub roles on the infra project to the principal set
+# for namespace local.namespace in the cluster project's workload identity
+# pool (<cluster_project>.svc.id.goog). One binding is created per role.
+resource "google_project_iam_member" "team_roles" {
+  for_each = toset([
+    "roles/storage.objectUser",
+    "roles/pubsub.publisher",
+    "roles/pubsub.viewer"
+  ])
+
+  project = var.infra_project
+  role    = each.value
+  member  = "principalSet://iam.googleapis.com/projects/${var.cluster_project_number}/locations/global/workloadIdentityPools/${var.cluster_project}.svc.id.goog/namespace/${local.namespace}"
+}
+
+# Enables the listed APIs on the infra project, one resource instance per API.
+# disable_on_destroy = false leaves the APIs enabled when these resources are
+# destroyed.
+resource "google_project_service" "enable_apis" {
+  for_each = toset([
+    "storage.googleapis.com",
+    "cloudresourcemanager.googleapis.com",
+    "logging.googleapis.com",
+    "batch.googleapis.com",
+    "cloudbuild.googleapis.com",
+    "artifactregistry.googleapis.com",
+  ])
+  project            = var.infra_project
+  service            = each.key
+  disable_on_destroy = false
+}
@@ -0,0 +1,20 @@
+/**
+ * Copyright 2025 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+# Full reference (registry host, project, repository, image name and tag) of
+# the training image.
+output "image_url" {
+  description = "AI Image URL"
+  value       = local.image_url
+
+  # local.image_url is derived from the repository resource only, so without
+  # this dependency the value would be available to consumers before the image
+  # has been built and pushed.
+  depends_on = [module.build_ai_run_image_image]
+}
@@ -0,0 +1 @@
+tensorflow-datasets
@@ -0,0 +1,117 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# flake8: noqa
+
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+import tensorflow_datasets as tfds
+import tensorflow as tf
+import keras
+import glob
+
+# Load MNIST together with its metadata; as_supervised=True makes each example
+# an (image, label) pair, which is the shape scale() below expects.
+datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)
+
+mnist_train, mnist_test = datasets['train'], datasets['test']
+
+print('******************')
+print('MNIST TRAINING JOB')
+print('******************')
+
+# Distribution strategy used to build and train the model; the number of
+# replicas it reports drives the global batch size below.
+strategy = tf.distribute.MirroredStrategy()
+print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
+# Split sizes from the dataset metadata (not referenced again in this script).
+num_train_examples = info.splits['train'].num_examples
+num_test_examples = info.splits['test'].num_examples
+
+# Size of the shuffle buffer for the training pipeline.
+BUFFER_SIZE = 10000
+
+# Global batch size = per-replica batch size x number of replicas.
+BATCH_SIZE_PER_REPLICA = 64
+BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
+
+def scale(image, label):
+    """Cast the image to float32 and divide by 255; the label passes through."""
+    scaled = tf.cast(image, tf.float32) / 255
+    return scaled, label
+
+# Training pipeline: scale, cache, shuffle, then batch with the global batch
+# size. Evaluation pipeline: scale and batch only.
+train_dataset = mnist_train.map(scale).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
+eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE)
+
+# Build and compile the model inside the strategy scope.
+with strategy.scope():
+    model = keras.Sequential([
+        keras.Input(shape=(28, 28, 1)),
+        keras.layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
+        keras.layers.MaxPooling2D(),
+        keras.layers.Flatten(),
+        keras.layers.Dense(64, activation='relu'),
+        # No activation on the output layer: the loss below is configured with
+        # from_logits=True.
+        keras.layers.Dense(10)
+    ])
+
+    model.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+                    optimizer=keras.optimizers.Adam(),
+                    metrics=['accuracy'])
+
+# Define the checkpoint directory to store the checkpoints.
+checkpoint_dir = './training_checkpoints'
+# Define the name of the checkpoint files; {epoch} is a placeholder left in the
+# path for the checkpoint callback.
+checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")
+
+def decay(epoch):
+    """Step learning-rate schedule keyed on the zero-based epoch index.
+
+    Returns 1e-3 for epochs 0-2, 1e-4 for epochs 3-6 and 1e-5 afterwards.
+    """
+    if epoch < 3:
+        return 1e-3
+    if epoch < 7:
+        return 1e-4
+    return 1e-5
+
+# Define a callback for printing the learning rate at the end of each epoch.
+class PrintLR(keras.callbacks.Callback):
+    """Prints the optimizer's current learning rate after every epoch."""
+
+    def on_epoch_end(self, epoch, logs=None):
+        # Read the model Keras attaches to the callback instead of the
+        # module-level `model`, so the callback reports on whichever model it
+        # is fitted with.
+        current_lr = self.model.optimizer.learning_rate.numpy()
+        # `epoch` is zero-based; print it one-based.
+        print('\nLearning rate for epoch {} is {}'.format(epoch + 1, current_lr))
+
+# Callbacks used during training. All of them come from the same `keras`
+# namespace as the model and the PrintLR base class, rather than mixing in
+# `tf.keras`.
+callbacks = [
+    # Write TensorBoard logs under ./logs.
+    keras.callbacks.TensorBoard(log_dir='./logs'),
+    # Save weights only, using the checkpoint path template defined above.
+    keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
+                                    save_weights_only=True),
+    # Apply the step schedule from decay().
+    keras.callbacks.LearningRateScheduler(decay),
+    PrintLR()
+]
+
+EPOCHS = 12
+
+model.fit(train_dataset, epochs=EPOCHS, callbacks=callbacks)
+
+# Function to find the latest .h5 file
+def find_latest_h5_checkpoint(checkpoint_dir):
+    """Return the path of the newest ``*.h5`` file in ``checkpoint_dir``.
+
+    "Newest" is decided by ``os.path.getctime``. Returns ``None`` when the
+    directory contains no ``.h5`` file.
+    """
+    candidates = glob.glob(f'{checkpoint_dir}/*.h5')
+    if not candidates:
+        return None
+    return max(candidates, key=os.path.getctime)
+
+# Reload the newest checkpoint written during training before evaluating.
+# find_latest_h5_checkpoint() returns None when no .h5 file exists; fail with a
+# clear message instead of passing None to load_weights().
+latest_checkpoint = find_latest_h5_checkpoint(checkpoint_dir)
+if latest_checkpoint is None:
+    raise FileNotFoundError('No .h5 checkpoint found in {}'.format(checkpoint_dir))
+model.load_weights(latest_checkpoint)
+
+# Evaluate the reloaded weights on the test pipeline and report the metrics.
+eval_loss, eval_acc = model.evaluate(eval_dataset)
+
+print(f'Eval loss: {eval_loss}, Eval accuracy: {eval_acc}')
+
+# Make sure the output directory exists, then save the whole model.
+path = '/data/mnist_saved_model'
+os.makedirs(path, exist_ok=True)
+
+model_file = '/data/mnist_saved_model/mnist.keras'
+model.save(model_file)
+
+print('Training finished. Model saved')
+
@@ -0,0 +1,57 @@
+
+/**
+ * Copyright 2025 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+# Project that holds everything this module creates: the builder service
+# account, the build logs bucket, the Artifact Registry repository, the enabled
+# APIs and the team role bindings.
+variable "infra_project" {
+  description = "The infrastructure project where resources will be managed."
+  type        = string
+}
+
+# Project ID of the cluster project; used to form the workload identity pool
+# name <cluster_project>.svc.id.goog.
+variable "cluster_project" {
+  description = "The project that hosts the Kubernetes cluster."
+  type        = string
+}
+
+# Used as the location of the build logs bucket and of the Artifact Registry
+# repository, and as the registry host prefix of the image tag.
+variable "region" {
+  description = "The region where the cloud resources will be deployed."
+  type        = string
+}
+
+# Passed to force_destroy on the build logs bucket.
+variable "bucket_force_destroy" {
+  description = "When deleting a bucket, this boolean option will delete all contained objects. If false, Terraform will fail to delete buckets which contain objects."
+  type        = bool
+  default     = false
+}
+
+# Project number of the cluster project; used in the workload identity
+# principal set path.
+variable "cluster_project_number" {
+  description = "The numerical identifier for the cluster project."
+  type        = string
+}
+
+# Combined with var.team as <team>-<env> to form the namespace name.
+variable "env" {
+  description = "The environment in which resources are deployed (e.g., development, nonproduction, production)."
+  type        = string
+}
+
+# Map of arbitrary key => service account email. Each value is interpolated as
+# "serviceAccount:<email>" when granting read access to the private images
+# repository, so the values must be strings.
+variable "cluster_service_accounts" {
+  description = "A map of service accounts emails associated with the Kubernetes cluster, these will be granted access to created Docker images."
+  type        = map(string)
+}
+
+# Combined with var.env as <team>-<env> to form the namespace name.
+variable "team" {
+  description = "Environment Team, must be the same as the fleet scope team"
+  type        = string
+}