# Source: superlinked/terraform-google-sie, examples/dev-l4-spot/main.tf (branch: main)

# SIE GKE Cluster - Development Example (L4 Spot)
#
# Creates a GKE cluster with GPU nodes. K8s resources (KEDA, Prometheus,
# SIE application) are deployed via Helm after this terraform apply.
#
# See oci://ghcr.io/superlinked/charts/sie-cluster for the Helm chart.
#
# Features:
#   - 1x L4 GPU spot pool (scale 0-5)
#   - NAP enabled for automatic node provisioning
#   - Workload Identity for GCS access
#   - Artifact Registry for SIE images
#
# Prerequisites:
#   1. GCP project with billing enabled
#   2. GPU quota (check with: gcloud compute regions describe REGION --format='table(quotas.filter(metric:NVIDIA))')
#   3. APIs enabled: container.googleapis.com, compute.googleapis.com
#
# Usage:
#   export TF_VAR_project_id="your-project-id"
#   terraform init
#   terraform plan
#   terraform apply
#
# After apply, deploy K8s resources (batteries-included Helm chart):
#   $(terraform output -raw kubectl_command)
#   helm upgrade --install sie-cluster oci://ghcr.io/superlinked/charts/sie-cluster \
#     -f values-gke.yaml \
#     --create-namespace -n sie \
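#     --set serviceAccount.annotations."iam\.gke\.io/gcp-service-account"="$(terraform output -raw sie_workload_service_account)"
#
#   NOTE(review): this file declares no `sie_workload_service_account` output, so the
#   `terraform output` call above fails as written. The Workload Identity output exposed
#   here is `workload_identity_annotation`; confirm which value the chart expects and
#   either reference that output or add the missing one.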
#
# Cleanup:
#   helm uninstall sie-cluster -n sie
#   terraform destroy

# Root Terraform settings for this example: only the minimum CLI version is
# constrained here; no backend is configured unless the block below is enabled.
terraform {
  # Accept any Terraform CLI release at or above 1.14.
  required_version = ">= 1.14"

  # Uncomment to use GCS backend for state
  # (replace the bucket name with an existing bucket you control).
  # backend "gcs" {
  #   bucket = "your-terraform-state-bucket"
  #   prefix = "sie/gke"
  # }
}

# =============================================================================
# Variables
# =============================================================================

# Target GCP project. There is no default, so a value must be supplied
# (for example through the TF_VAR_project_id environment variable).
variable "project_id" {
  type        = string
  description = "GCP project ID"
}

# Region handed to the infra module; falls back to us-central1 when unset.
variable "region" {
  type        = string
  default     = "us-central1"
  description = "GCP region"
}

# Name given to the GKE cluster; "sie-dev" unless overridden.
variable "cluster_name" {
  type        = string
  default     = "sie-dev"
  description = "GKE cluster name"
}

# Toggle forwarded to the infra module's create_artifact_registry input;
# enabled by default.
variable "create_artifact_registry" {
  type        = bool
  default     = true
  description = "Whether to create an Artifact Registry for SIE images"
}

# Optional deployer identity forwarded to the infra module. The empty-string
# default means "not provided".
variable "deployer_service_account" {
  type        = string
  default     = ""
  description = "Email of the service account running Terraform (optional, for CI/CD)"
}

# =============================================================================
# SIE GKE Infra Module
# =============================================================================

# Optional override for the zones of the L4 spot GPU node pool.
# Declared next to the module because it only feeds gpu_node_pools[0].zones.
variable "gpu_zones" {
  description = "Zones for the L4 spot GPU node pool. Must belong to var.region and offer L4 GPUs. Leave empty to use <region>-a, <region>-b and <region>-c."
  type        = list(string)
  default     = []
}

# Provisions the development GKE infrastructure through the published
# superlinked/sie/google module (pinned to 0.3.4): a dedicated network, a
# private-node cluster with Node Auto-Provisioning, one CPU pool, one L4 spot
# GPU pool, Workload Identity, and (optionally) an Artifact Registry.
module "infra" {
  source  = "superlinked/sie/google"
  version = "0.3.4"

  project_id               = var.project_id
  region                   = var.region
  cluster_name             = var.cluster_name
  deployer_service_account = var.deployer_service_account
  deletion_protection      = false # Dev cluster - allow easy cleanup

  # Network
  create_network = true
  network        = "sie-network"
  subnetwork     = "sie-subnet"

  # Private cluster with NAT
  enable_private_nodes = true

  # Node Auto-Provisioning (NAP)
  enable_node_auto_provisioning = true
  nap_max_cpu                   = 100
  nap_max_memory_gb             = 400

  # CPU node pool for system workloads
  cpu_node_pool = {
    machine_type   = "e2-standard-4"
    min_node_count = 1
    max_node_count = 3
  }

  # GPU node pool - L4 for inference
  gpu_node_pools = [
    {
      name            = "l4-spot"
      machine_type    = "g2-standard-8" # 8 vCPU, 32GB RAM, 1x L4
      gpu_type        = "nvidia-l4"
      gpu_count       = 1
      min_node_count  = 0 # Scale to zero when idle
      max_node_count  = 5
      spot            = true # ~60% savings
      local_ssd_count = 1    # 375GB local SSD for model cache
      # Zones must live in the cluster's region. They were previously fixed to
      # us-central1-*, which only matched the default var.region. Derive them
      # from var.region (identical result for us-central1) and let callers
      # override via var.gpu_zones where the a/b/c suffixes do not exist or
      # do not offer L4 GPUs.
      zones = length(var.gpu_zones) > 0 ? var.gpu_zones : [for suffix in ["a", "b", "c"] : "${var.region}-${suffix}"]
      # Keep non-GPU workloads off these nodes unless they tolerate the taint.
      taints = [{
        key    = "nvidia.com/gpu"
        value  = "present"
        effect = "NO_SCHEDULE"
      }]
      labels = {
        "sie.superlinked.com/gpu-type" = "l4"
      }
    }
  ]

  # Workload Identity for GCS access
  enable_workload_identity = true
  sie_namespace            = "sie"
  sie_service_account_name = "sie-server"

  # Artifact Registry for SIE images
  create_artifact_registry = var.create_artifact_registry

  # GKE native logging
  enable_cloud_logging = true

  labels = {
    "environment" = "dev"
    "managed-by"  = "terraform"
  }
}

# =============================================================================
# Outputs
# =============================================================================

# Re-exports the cluster name reported by the infra module.
output "cluster_name" {
  value       = module.infra.cluster_name
  description = "GKE cluster name"
}

# Re-exports the module's kubectl_config_command; intended to be run as
# $(terraform output -raw kubectl_command) after apply.
output "kubectl_command" {
  value       = module.infra.kubectl_config_command
  description = "Command to configure kubectl"
}

# Re-exports the Artifact Registry URL reported by the infra module.
output "artifact_registry_url" {
  value       = module.infra.artifact_registry_url
  description = "Artifact Registry URL for pushing images"
}

# Re-exports the Workload Identity annotation reported by the infra module.
output "workload_identity_annotation" {
  value       = module.infra.workload_identity_annotation
  description = "Annotation for Kubernetes service accounts (Workload Identity)"
}