Skip to content

Commit 2db381f

Browse files
authored
Merge pull request #11 from cristofima/dev - docs: update CI/CD documentation and cost estimates; add architecture diagrams
- Added Python script to generate 5 architecture diagrams using the diagrams library - Updated cost estimates consistently across all documentation files - Enhanced CI/CD documentation with detailed IAM policy structure and Amplify auto-deployment flow - Added visual architecture diagrams to README and documentation files
2 parents e51a961 + 1f600a4 commit 2db381f

16 files changed

Lines changed: 711 additions & 94 deletions

.github/SETUP_CICD.md

Lines changed: 197 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,16 @@ This project uses GitHub Actions for automated CI/CD with AWS using OIDC (OpenID
66

77
## Architecture
88

9+
![CI/CD Pipeline](../docs/diagrams/architecture-cicd.png)
10+
11+
<details>
12+
<summary>Text version</summary>
13+
914
```
1015
Commit to dev → Auto Deploy to DEV → Build Training Container
1116
Commit to main → Plan → Manual Approval → Deploy to PROD → Build Container
1217
```
18+
</details>
1319

1420
---
1521

@@ -74,30 +80,165 @@ cat > github-actions-permissions.json <<EOF
7480
"Version": "2012-10-17",
7581
"Statement": [
7682
{
83+
"Sid": "TerraformStateManagement",
7784
"Effect": "Allow",
7885
"Action": [
79-
"s3:*",
80-
"dynamodb:*",
81-
"lambda:*",
82-
"apigateway:*",
83-
"batch:*",
84-
"ecr:*",
85-
"iam:*",
86-
"logs:*",
87-
"ec2:Describe*",
86+
"s3:GetObject",
87+
"s3:PutObject",
88+
"s3:DeleteObject",
89+
"s3:ListBucket"
90+
],
91+
"Resource": [
92+
"arn:aws:s3:::automl-lite-terraform-state-*",
93+
"arn:aws:s3:::automl-lite-terraform-state-*/*"
94+
]
95+
},
96+
{
97+
"Sid": "TerraformStateLocking",
98+
"Effect": "Allow",
99+
"Action": [
100+
"dynamodb:GetItem",
101+
"dynamodb:PutItem",
102+
"dynamodb:DeleteItem"
103+
],
104+
"Resource": "arn:aws:dynamodb:*:*:table/automl-lite-terraform-locks"
105+
},
106+
{
107+
"Sid": "S3Management",
108+
"Effect": "Allow",
109+
"Action": "s3:*",
110+
"Resource": [
111+
"arn:aws:s3:::automl-lite-*",
112+
"arn:aws:s3:::automl-lite-*/*"
113+
]
114+
},
115+
{
116+
"Sid": "DynamoDBManagement",
117+
"Effect": "Allow",
118+
"Action": "dynamodb:*",
119+
"Resource": "arn:aws:dynamodb:*:*:table/automl-lite-*"
120+
},
121+
{
122+
"Sid": "LambdaManagement",
123+
"Effect": "Allow",
124+
"Action": "lambda:*",
125+
"Resource": "arn:aws:lambda:*:*:function:automl-lite-*"
126+
},
127+
{
128+
"Sid": "APIGatewayManagement",
129+
"Effect": "Allow",
130+
"Action": "apigateway:*",
131+
"Resource": "*"
132+
},
133+
{
134+
"Sid": "BatchManagement",
135+
"Effect": "Allow",
136+
"Action": "batch:*",
137+
"Resource": "*"
138+
},
139+
{
140+
"Sid": "ECRManagement",
141+
"Effect": "Allow",
142+
"Action": "ecr:*",
143+
"Resource": "arn:aws:ecr:*:*:repository/automl-lite-*"
144+
},
145+
{
146+
"Sid": "ECRAuth",
147+
"Effect": "Allow",
148+
"Action": "ecr:GetAuthorizationToken",
149+
"Resource": "*"
150+
},
151+
{
152+
"Sid": "AmplifyManagement",
153+
"Effect": "Allow",
154+
"Action": "amplify:*",
155+
"Resource": "arn:aws:amplify:*:*:apps/*"
156+
},
157+
{
158+
"Sid": "IAMRoleManagement",
159+
"Effect": "Allow",
160+
"Action": [
161+
"iam:GetRole",
162+
"iam:CreateRole",
163+
"iam:DeleteRole",
164+
"iam:UpdateRole",
165+
"iam:AttachRolePolicy",
166+
"iam:DetachRolePolicy",
167+
"iam:PutRolePolicy",
168+
"iam:DeleteRolePolicy",
169+
"iam:GetRolePolicy",
170+
"iam:PassRole",
171+
"iam:ListRolePolicies",
172+
"iam:ListAttachedRolePolicies",
173+
"iam:ListInstanceProfilesForRole",
174+
"iam:TagRole",
175+
"iam:UntagRole"
176+
],
177+
"Resource": "arn:aws:iam::*:role/automl-lite-*"
178+
},
179+
{
180+
"Sid": "IAMServiceLinkedRoles",
181+
"Effect": "Allow",
182+
"Action": [
183+
"iam:CreateServiceLinkedRole",
184+
"iam:DeleteServiceLinkedRole",
185+
"iam:GetServiceLinkedRoleDeletionStatus"
186+
],
187+
"Resource": [
188+
"arn:aws:iam::*:role/aws-service-role/batch.amazonaws.com/*",
189+
"arn:aws:iam::*:role/aws-service-role/ecs.amazonaws.com/*",
190+
"arn:aws:iam::*:role/aws-service-role/spot.amazonaws.com/*",
191+
"arn:aws:iam::*:role/aws-service-role/spotfleet.amazonaws.com/*"
192+
]
193+
},
194+
{
195+
"Sid": "NetworkingForBatch",
196+
"Effect": "Allow",
197+
"Action": [
198+
"ec2:DescribeVpcs",
199+
"ec2:DescribeSubnets",
200+
"ec2:DescribeSecurityGroups",
201+
"ec2:DescribeNetworkInterfaces",
202+
"ec2:DescribeAccountAttributes",
203+
"ec2:DescribeInternetGateways",
204+
"ec2:DescribeRouteTables",
88205
"ec2:CreateSecurityGroup",
89206
"ec2:DeleteSecurityGroup",
90207
"ec2:AuthorizeSecurityGroupIngress",
91208
"ec2:RevokeSecurityGroupIngress",
92209
"ec2:AuthorizeSecurityGroupEgress",
93210
"ec2:RevokeSecurityGroupEgress",
94-
"ecs:*",
95-
"xray:*",
96-
"cloudwatch:*",
97-
"amplify:*",
98-
"sts:GetCallerIdentity"
211+
"ec2:CreateTags",
212+
"ec2:DeleteTags"
99213
],
100214
"Resource": "*"
215+
},
216+
{
217+
"Sid": "CloudWatchLogs",
218+
"Effect": "Allow",
219+
"Action": [
220+
"logs:CreateLogGroup",
221+
"logs:DeleteLogGroup",
222+
"logs:PutRetentionPolicy",
223+
"logs:DescribeLogGroups",
224+
"logs:ListTagsLogGroup",
225+
"logs:TagLogGroup",
226+
"logs:UntagLogGroup",
227+
"logs:ListTagsForResource",
228+
"logs:TagResource",
229+
"logs:UntagResource"
230+
],
231+
"Resource": [
232+
"arn:aws:logs:*:*:log-group:/aws/lambda/automl-lite-*",
233+
"arn:aws:logs:*:*:log-group:/aws/batch/automl-lite-*",
234+
"arn:aws:logs:*:*:log-group:/aws/apigateway/automl-lite-*"
235+
]
236+
},
237+
{
238+
"Sid": "CallerIdentity",
239+
"Effect": "Allow",
240+
"Action": "sts:GetCallerIdentity",
241+
"Resource": "*"
101242
}
102243
]
103244
}
@@ -200,14 +341,12 @@ Go to: **Settings → Actions → General**
200341
**Fast deployment:** ~3-5 minutes (only container, no infrastructure)
201342

202343
### **deploy-frontend.yml** - Frontend Deployment
203-
**Triggers:** Changes to `frontend/` or manual
344+
**Triggers:** Manual only (Amplify auto-deploys on push)
204345
**Actions:**
205-
1. Check infrastructure exists (auto-validation)
206-
2. Get API URL from Terraform outputs (automatic)
207-
3. Build Next.js static export with API URL
208-
4. Deploy to S3 bucket
209-
5. Invalidate CloudFront cache
210-
6. Test frontend accessibility
346+
1. Get Amplify App ID for environment
347+
2. Trigger Amplify build job
348+
3. Wait for build completion
349+
4. Output deployment URL
211350

212351
**Smart features:**
213352
- Automatically validates infrastructure is deployed first
@@ -271,7 +410,7 @@ git add frontend/
271410
git commit -m "feat: Add training progress bar"
272411
git push origin dev
273412

274-
#Only frontend updated (S3 + CloudFront invalidation)
413+
# Amplify auto-deploys on push (webhook)
275414
# ✅ Infrastructure untouched
276415
# ✅ API untouched
277416
# ✅ API URL comes from Amplify environment variables (set by Terraform)
@@ -333,11 +472,11 @@ The workflows are smart and **automatically validate dependencies**:
333472

334473
```
335474
Infrastructure (Terraform)
336-
↓ (creates API Gateway, S3, CloudFront)
475+
↓ (creates API Gateway, Amplify, S3, ECR)
337476
├→ Backend API (Lambda)
338477
├→ Training Container (ECR/Batch)
339-
└→ Frontend (S3 + CloudFront)
340-
↓ (automatically gets API URL from Terraform outputs)
478+
└→ Frontend (Amplify - auto-deploys on push)
479+
↓ (gets API URL from Amplify environment variables)
341480
```
342481

343482
### ✅ First-Time Setup Order
@@ -352,7 +491,7 @@ terraform apply
352491
```
353492
- Creates all AWS resources
354493
- Outputs API Gateway URL
355-
- Creates S3 + CloudFront for frontend
494+
- Creates Amplify app for frontend
356495
- Takes ~5-10 minutes
357496
358497
2. **Deploy Backend API** (Automatic after infrastructure)
@@ -373,16 +512,14 @@ terraform apply
373512
- Builds and pushes Docker image to ECR
374513
- Takes ~3-5 minutes
375514
376-
4. **Deploy Frontend** (Automatic dependency check)
515+
4. **Deploy Frontend** (Automatic via Amplify)
377516
```bash
378-
# Either:
379-
# - Push changes to frontend/
380-
# - Or manually: Actions → Deploy Frontend
517+
# Push to branch triggers Amplify auto-deploy
518+
git push origin dev
381519
```
382-
- **Automatically checks if infrastructure exists**
383-
- **Automatically retrieves API URL from Terraform**
384-
- Builds Next.js static site with correct API URL
385-
- Deploys to S3 and invalidates CloudFront
520+
- **Amplify auto-deploys on push** (webhook)
521+
- **API URL set in Amplify environment variables**
522+
- Builds Next.js SSR app
386523
- Takes ~3-5 minutes
387524
388525
### 🔄 Subsequent Deployments
@@ -412,48 +549,39 @@ After initial setup, you can deploy components **independently**:
412549
413550
### 🎯 How the API URL Is Passed (Automatic)
414551
415-
You asked: *"¿Cómo se pasa la variable para la URL del backend?"*
416-
417-
**Answer: Completely automatic via Terraform outputs!**
418-
419-
```yaml
420-
# In deploy-frontend.yml workflow:
552+
**Fully automatic: Terraform sets the API URL as an Amplify environment variable!**
421553
422-
# Step 1: Get API URL from Terraform (automatic)
423-
- name: Get Infrastructure Outputs
424-
run: |
425-
cd infrastructure/terraform
426-
terraform workspace select ${{ env.ENVIRONMENT }}
427-
API_URL=$(terraform output -raw api_gateway_url) # ← Automatic!
428-
echo "api_url=$API_URL" >> $GITHUB_OUTPUT
429-
430-
# Step 2: Build with API URL as environment variable
431-
- name: Build Frontend
432-
env:
433-
NEXT_PUBLIC_API_URL: ${{ steps.infra.outputs.api_url }} # ← Injected automatically!
434-
run: pnpm build
554+
```hcl
555+
# In amplify.tf:
556+
resource "aws_amplify_app" "frontend" {
557+
# ...
558+
environment_variables = {
559+
NEXT_PUBLIC_API_URL = aws_api_gateway_stage.main.invoke_url # ← Set by Terraform!
560+
}
561+
}
435562
```
436563
437-
**No manual configuration needed!** The workflow:
438-
1. ✅ Checks infrastructure exists
439-
2. ✅ Retrieves API URL from Terraform state
440-
3. ✅ Builds frontend with correct API URL
441-
4. ✅ Deploys to S3
442-
5. ✅ Invalidates CloudFront cache
564+
When Amplify builds, it reads `NEXT_PUBLIC_API_URL` from its environment variables (set by Terraform) and injects it into the Next.js build.
565+
566+
**No manual configuration needed!** The flow is:
567+
1. ✅ Terraform creates API Gateway
568+
2. ✅ Terraform creates Amplify app with `NEXT_PUBLIC_API_URL` = API Gateway URL
569+
3. ✅ Push to branch triggers Amplify webhook
570+
4. ✅ Amplify builds with correct API URL
571+
5. ✅ Amplify deploys to CDN
443572
444573
### 🚦 Validation Flow
445574
446575
```mermaid
447576
graph TD
448-
A[Frontend Workflow Starts] --> B{Infrastructure exists?}
449-
B -->|No| C[❌ Fail with instructions]
450-
B -->|Yes| D[Get API URL from Terraform]
451-
D --> E[Build Next.js with API URL]
452-
E --> F{Frontend resources exist?}
453-
F -->|No| G[⚠️ Build only, skip deploy]
454-
F -->|Yes| H[Deploy to S3]
455-
H --> I[Invalidate CloudFront]
456-
I --> J[✅ Success]
577+
A[Push to Branch] --> B{Amplify webhook triggers?}
578+
B -->|Yes| C[Amplify starts build]
579+
C --> D[Install dependencies]
580+
D --> E[Build Next.js SSR]
581+
E --> F[Deploy to Amplify CDN]
582+
F --> G[✅ Success]
583+
B -->|No webhook| H[Manual: Actions → Re-deploy Frontend]
584+
H --> C
457585
```
458586
459587
### 📝 Manual Override (Optional)
@@ -546,7 +674,7 @@ But in CI/CD, **it's always automatic** - no manual steps required!
546674
- **Previous approach:** ~100-200 deployments/month (full deploys only)
547675
548676
### AWS Costs
549-
- Infrastructure: ~$7-10/month (as documented)
677+
- Infrastructure: ~$10-25/month (depends on usage)
550678
- No additional CI/CD costs
551679
- Monitor: AWS Cost Explorer with Project tag filter
552680

.github/copilot-instructions.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ Serverless AutoML platform with **split architecture**:
1313
|-----------|------------|------------|-----|
1414
| Backend API | FastAPI + Mangum | Lambda ZIP (5MB) | Fast cold starts, simple deploys |
1515
| Training | FLAML + scikit-learn | Docker on AWS Batch | 265MB deps, >15min runtime exceed Lambda limits |
16-
| Frontend | Next.js 16 App Router | AWS Amplify | SSR support |
16+
| Frontend | Next.js 16 App Router | AWS Amplify | SSR support, auto-deploy on push |
1717
| Infrastructure | Terraform | `infrastructure/terraform/` | State management |
1818

1919
**Key insight:** Containers ONLY for training - ML deps (265MB) exceed Lambda's 250MB limit.
@@ -87,6 +87,9 @@ cd infrastructure/terraform; terraform apply -target=aws_lambda_function.api
8787
$EcrUrl = terraform output -raw ecr_repository_url
8888
docker build -t automl-training:latest backend/training
8989
docker tag automl-training:latest "$EcrUrl:latest"; docker push "$EcrUrl:latest"
90+
91+
# Generate architecture diagrams (requires: pip install diagrams + Graphviz)
92+
python scripts/generate_architecture_diagram.py
9093
```
9194

9295
## Common Pitfalls
@@ -120,7 +123,17 @@ Backend Pydantic and Frontend TypeScript schemas must match. When adding fields:
120123
- Local API: `http://localhost:8000/docs` (Swagger UI)
121124
- Env var mismatch: Compare `batch_service.py` containerOverrides with `train.py` os.getenv()
122125

126+
## Utility Scripts
127+
128+
| Script | Purpose |
129+
|--------|---------|
130+
| `scripts/run-training-local.py` | Test training in local Docker container |
131+
| `scripts/predict.py` | Make predictions with trained models (Docker) |
132+
| `scripts/generate_architecture_diagram.py` | Generate AWS architecture diagrams |
133+
123134
## Key Docs
124135

125136
- `docs/LESSONS_LEARNED.md` - Critical debugging insights
137+
- `docs/QUICKSTART.md` - Deployment guide
138+
- `.github/SETUP_CICD.md` - CI/CD with GitHub Actions
126139
- `infrastructure/terraform/ARCHITECTURE_DECISIONS.md` - Why Lambda + Batch split

0 commit comments

Comments
 (0)