Skip to content

Commit f7c0b19

Browse files
committed
refactor: implement polars engine and decouple CI/CD infrastructure
- Implemented Polars as assembly and semantic stage engine with full lazy/streaming support - Enforced IaC and decoupled application CI/CD from infrastructure provisioning - Restructured logging and resolved logical error handling bugs - Simplified report structure initialization in semantic stage - Aligned test suites and documentation with stage refactoring
1 parent 755b5cf commit f7c0b19

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+1480
-31085
lines changed

.dockerignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ __pycache__/
1414
.pytest_cache/
1515
.coverage
1616
.coveragerc
17+
.ruff_cache/
1718

1819
.vscode/
1920
.idea/

.gcp/terraforms/jobs.tf

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
resource "google_cloud_run_v2_job" "pipeline" {
2+
name = "operations-pipeline-${var.environment}"
3+
location = var.region
4+
depends_on = [google_project_service.enabled_APIs]
5+
6+
template {
7+
template {
8+
service_account = google_service_account.platform_accounts["ops-pipeline-sa"].email
9+
10+
# 30-minute timeout and 0 retries
11+
timeout = "1800s"
12+
max_retries = 0
13+
14+
containers {
15+
image = "us-docker.pkg.dev/cloudrun/container/hello"
16+
17+
resources {
18+
limits = {
19+
cpu = "2"
20+
memory = "4Gi"
21+
}
22+
}
23+
}
24+
}
25+
}
26+
lifecycle {
27+
ignore_changes = [
28+
template[0].template[0].containers[0].image,
29+
client,
30+
client_version
31+
]
32+
}
33+
}
34+
35+
resource "google_cloud_run_v2_job" "extractor" {
36+
name = "drive-extractor-${var.environment}"
37+
location = var.region
38+
depends_on = [google_project_service.enabled_APIs]
39+
40+
template {
41+
template {
42+
service_account = google_service_account.platform_accounts["drive-extractor-sa"].email
43+
44+
# 15-minute timeout and 2 retries
45+
timeout = "900s"
46+
max_retries = 2
47+
48+
containers {
49+
image = "us-docker.pkg.dev/cloudrun/container/hello"
50+
51+
resources {
52+
limits = {
53+
cpu = "1"
54+
memory = "1Gi"
55+
}
56+
}
57+
}
58+
}
59+
}
60+
lifecycle {
61+
ignore_changes = [
62+
template[0].template[0].containers[0].image,
63+
client,
64+
client_version
65+
]
66+
}
67+
}
68+
69+
70+
71+
72+
resource "google_artifact_registry_repository" "ops_repo" {
73+
location = var.region
74+
repository_id = "operations-artifacts-${var.environment}"
75+
description = "Operations Artifacts Repository"
76+
format = "DOCKER"
77+
}
78+

.gcp/terraforms/main.tf

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,25 @@
1+
terraform {
2+
required_providers {
3+
google = {
4+
source = "hashicorp/google"
5+
version = "~> 7.0"
6+
}
7+
}
8+
}
9+
110
provider "google" {
211
project = var.project_id
312
region = var.region
413
}
514

15+
# Upload tfstate to state storage
16+
terraform {
17+
backend "gcs" {
18+
bucket = "operations-terraform-state-vault-2026"
19+
prefix = "terraform/state"
20+
}
21+
}
22+
623
# Enable needed GCP APIs
724
locals {
825
services = [
@@ -12,7 +29,7 @@ locals {
1229
"workflows.googleapis.com",
1330
"eventarc.googleapis.com",
1431
"cloudscheduler.googleapis.com",
15-
"iamcredentials.googleapis.com"
32+
"iamcredentials.googleapis.com",
1633
]
1734
}
1835

.gcp/terraforms/monitoring.tf

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
resource "google_monitoring_notification_channel" "email" {
2+
3+
for_each = nonsensitive(var.alert_email_map)
4+
display_name = "Pipeline Alert - ${each.value}"
5+
type = "email"
6+
labels = { email_address = each.value }
7+
}
8+
9+
# All CRITICAL alerts stay open for up to 6 hours and repeat notifications every 30 minutes
10+
resource "google_monitoring_alert_policy" "pipeline_failure" {
11+
display_name = "Pipeline Failure Alert"
12+
combiner = "OR"
13+
severity = "CRITICAL"
14+
15+
notification_channels = [for channel in google_monitoring_notification_channel.email : channel.name]
16+
17+
conditions {
18+
display_name = "pipeline_crashed"
19+
20+
condition_matched_log {
21+
filter = <<-EOT
22+
resource.type="cloud_run_job"
23+
resource.labels.job_name="operations-pipeline-${var.environment}"
24+
textPayload:"[ERROR]"
25+
EOT
26+
}
27+
}
28+
alert_strategy {
29+
30+
auto_close = "21600s"
31+
notification_prompts = ["OPENED"]
32+
33+
notification_rate_limit {
34+
period = "1800s"
35+
}
36+
}
37+
38+
documentation {
39+
mime_type = "text/markdown"
40+
content = <<-EOT
41+
## ALERT: Operations Pipeline Processing Failed!
42+
43+
**What Happened:** The `operations-pipeline-${var.environment}` Cloud Run Job crashed during data processing.
44+
45+
**Impact:** The raw data was extracted, but the final Parquet files were NOT updated. Dashboards will show yesterday's data.
46+
47+
**Next Steps for On-Call:**
48+
49+
1. Check job logs: Did it run out of memory (OOM)?
50+
2. Check runtime artifact logs: Was there any captured error logs?
51+
3. Fix the underlying issue and manually execute the pipeline job.
52+
EOT
53+
}
54+
}
55+
56+
resource "google_monitoring_alert_policy" "extractor_failure" {
57+
58+
display_name = "Drive Extractor Failure Alert"
59+
combiner = "OR"
60+
severity = "CRITICAL"
61+
62+
notification_channels = [for channel in google_monitoring_notification_channel.email : channel.name]
63+
64+
conditions {
65+
display_name = "extractor_crashed"
66+
67+
condition_matched_log {
68+
filter = <<-EOT
69+
resource.type="cloud_run_job"
70+
resource.labels.job_name="drive-extractor-${var.environment}"
71+
severity="ERROR"
72+
EOT
73+
}
74+
}
75+
76+
alert_strategy {
77+
auto_close = "21600s"
78+
notification_prompts = ["OPENED"]
79+
80+
notification_rate_limit {
81+
period = "1800s"
82+
}
83+
}
84+
85+
documentation {
86+
mime_type = "text/markdown"
87+
content = <<-EOT
88+
## ALERT: Drive Extractor Job Crashed!
89+
90+
**What Happened:** The `drive-extractor-${var.environment}` Cloud Run Job threw a fatal error. The pipeline is halted.
91+
92+
**Impact:**
93+
- Raw CSVs were not successfully pulled from Drive.
94+
- The `metadata.json` was not written.
95+
- Or the `.success` flag file was not written.
96+
97+
**Next Steps for On-Call Responder:**
98+
1. Check Cloud Run Job logs for Python tracebacks.
99+
2. Verify that the Google Drive folder is shared with the Drive Extractor SA email.
100+
3. Once fixed, manually execute the job.
101+
EOT
102+
}
103+
}
104+
105+
106+
resource "google_monitoring_alert_policy" "workflow_failure" {
107+
display_name = "Pipeline Dispatcher Failure Alert"
108+
combiner = "OR"
109+
severity = "CRITICAL"
110+
111+
notification_channels = [for channel in google_monitoring_notification_channel.email : channel.name]
112+
113+
conditions {
114+
display_name = "pipeline_dispatch_failed"
115+
116+
condition_matched_log {
117+
filter = <<-EOT
118+
resource.type="workflows.googleapis.com/Workflow"
119+
resource.labels.workflow_id="pipeline-dispatcher-${var.environment}"
120+
severity>=ERROR
121+
EOT
122+
}
123+
}
124+
alert_strategy {
125+
auto_close = "21600s"
126+
notification_prompts = ["OPENED"]
127+
128+
notification_rate_limit {
129+
period = "1800s"
130+
}
131+
}
132+
documentation {
133+
mime_type = "text/markdown"
134+
content = <<EOT
135+
## ALERT: Running Operations Pipeline has failed!
136+
137+
**What Happened**: The Eventarc Workflow `pipeline-dispatcher-${var.environment}` encountered a fatal error.
138+
139+
**Impact**: Dashboard consumers will see stale data.
140+
141+
**Next Steps for On-Call Responder:**
142+
143+
1. Click the "View Logs" button below to see the exact error.
144+
2. Check if the `drive-extractor` successfully dropped the `.success` file.
145+
3. Check if the `operations-pipeline` Cloud Run Job ran out of memory.
146+
EOT
147+
}
148+
}
149+
150+
151+
resource "google_monitoring_alert_policy" "scheduler_failure" {
152+
display_name = "Midnight Scheduler Failure Alert"
153+
combiner = "OR"
154+
severity = "CRITICAL"
155+
156+
notification_channels = [for channel in google_monitoring_notification_channel.email : channel.name]
157+
158+
conditions {
159+
display_name = "midnight_scheduler_failed"
160+
161+
condition_matched_log {
162+
filter = <<EOT
163+
resource.type="cloud_scheduler_job"
164+
jsonPayload.debugInfo="URL_ERROR-ERROR_NOT_FOUND. Original HTTP response code number = 404" OR jsonPayload.debugInfo="URL_ERROR-ERROR_AUTHENTICATION. Original HTTP response code number = 401"
165+
resource.labels.job_id="midnight-trigger-${var.environment}"
166+
EOT
167+
}
168+
}
169+
alert_strategy {
170+
auto_close = "21600s"
171+
notification_prompts = ["OPENED"]
172+
173+
notification_rate_limit {
174+
period = "1800s"
175+
}
176+
}
177+
documentation {
178+
mime_type = "text/markdown"
179+
content = <<EOT
180+
## ALERT: Cloud Scheduler Failed to Run!
181+
182+
**What Happened:** The `midnight-trigger-${var.environment}` job failed to execute.
183+
184+
**Impact:** The entire data pipeline has not started today.
185+
186+
**Next Steps for On-Call Responder:**
187+
188+
1. Check the Cloud Scheduler logs. Look for a 404 (job URL not found) or 401 (auth token expired or invalid) — these are the conditions this alert matches.
189+
2. Manually click "Force Run" in the Cloud Scheduler console to start today's extraction.
190+
EOT
191+
}
192+
}

.gcp/terraforms/orchestration.tf

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Google Workflows
22
resource "google_workflows_workflow" "pipeline_dispatcher" {
3-
name = "pipeline-trigger-flow-${var.environment}"
3+
name = "pipeline-dispatcher-${var.environment}"
44
region = var.region
55
description = "Evaluates .success files and triggers pipeline"
66
service_account = google_service_account.platform_accounts["eventarc-invoker-sa"].email
@@ -12,8 +12,8 @@ resource "google_workflows_workflow" "pipeline_dispatcher" {
1212
}
1313

1414
# Pipeline Trigger: Eventarc
15-
resource "google_eventarc_trigger" "archival_success_trigger" {
16-
name = "archival-success-trigger-${var.environment}"
15+
resource "google_eventarc_trigger" "pipeline_dispatcher" {
16+
name = "pipeline-trigger-${var.environment}"
1717
location = var.region
1818

1919
# Monitor Archival Bucket
@@ -33,7 +33,6 @@ resource "google_eventarc_trigger" "archival_success_trigger" {
3333

3434
service_account = google_service_account.platform_accounts["eventarc-invoker-sa"].email
3535

36-
# Waits on these SAs
3736
depends_on = [
3837
google_project_iam_member.eventarc_event_receiver,
3938
google_project_iam_member.eventarc_workflows_invoker
@@ -42,7 +41,7 @@ resource "google_eventarc_trigger" "archival_success_trigger" {
4241

4342
# Drive Extractor Trigger: Cloud Scheduler
4443
resource "google_cloud_scheduler_job" "extractor_trigger" {
45-
name = "midnight-extractor-trigger-${var.environment}"
44+
name = "midnight-trigger-${var.environment}"
4645
description = "Execute drive-extractor daily 12AM (PHT)"
4746
schedule = "0 0 * * *"
4847
time_zone = "Asia/Manila"
@@ -51,7 +50,7 @@ resource "google_cloud_scheduler_job" "extractor_trigger" {
5150
http_target {
5251
http_method = "POST"
5352
# Points to the Deployed Cloud Run job (data extractor)
54-
uri = "https://${var.region}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${var.project_id}/jobs/drive-extractor:run"
53+
uri = "https://${var.region}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${var.project_id}/jobs/drive-extractor-${var.environment}:run"
5554

5655
oauth_token {
5756
service_account_email = google_service_account.platform_accounts["job-invoker-sa"].email

.gcp/terraforms/storage.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Archival Bucket
22
resource "google_storage_bucket" "ops_archival_bucket" {
3-
name = "ops-archival-bucket-${var.environment}"
3+
name = "ops-archival-storage-${var.environment}"
44
location = var.region
55
force_destroy = false
66
uniform_bucket_level_access = true
@@ -30,7 +30,7 @@ resource "google_storage_bucket" "ops_archival_bucket" {
3030

3131
# Pipeline Bucket
3232
resource "google_storage_bucket" "ops_pipeline_bucket" {
33-
name = "ops-pipeline-bucket-${var.environment}"
33+
name = "ops-pipeline-storage-${var.environment}"
3434
location = var.region
3535
force_destroy = false
3636
uniform_bucket_level_access = true

.gcp/terraforms/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,9 @@ variable "github_repo" {
1818
description = "GitHub Repository (Format: owner/repository)"
1919
type = string
2020
}
21+
22+
variable "alert_email_map" {
23+
type = map(string)
24+
description = "Map of recipient keys to email addresses that receive pipeline alerts"
25+
sensitive = true
26+
}

.gcp/terraforms/wif.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ resource "google_iam_workload_identity_pool_provider" "github_provider" {
2828
}
2929
}
3030

31-
# CLI output
3231
output "GITHUB_WIF_PROVIDER_NAME" {
3332
value = google_iam_workload_identity_pool_provider.github_provider.name
3433
description = "GitHub Repository Secret: WIF_PROVIDER"
34+
sensitive = true
3535
}

0 commit comments

Comments
 (0)