Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
5ec25da
remote function
max-ostapenko Feb 24, 2025
b3297c5
connection
max-ostapenko Feb 24, 2025
76d39f1
masthead update
max-ostapenko Feb 24, 2025
7ce60cd
formatting
max-ostapenko Feb 24, 2025
62744e1
Merge branch 'main' into judicial-snake
max-ostapenko Feb 24, 2025
496fcb7
Merge branch 'main' into judicial-snake
max-ostapenko Feb 25, 2025
5bc3de9
extend description
max-ostapenko Mar 1, 2025
2470a8b
reservation off
max-ostapenko Mar 2, 2025
c2b84e0
tf update
max-ostapenko Mar 2, 2025
54a3e2f
dataform export routine
max-ostapenko Mar 2, 2025
e0ef441
bq connections
max-ostapenko Mar 2, 2025
363a482
spark procedure role
max-ostapenko Mar 2, 2025
14ab9a9
docker update
max-ostapenko Mar 2, 2025
a48ca80
lint
max-ostapenko Mar 2, 2025
bd1cf17
lint
max-ostapenko Mar 2, 2025
d58054d
lint
max-ostapenko Mar 2, 2025
55fe58b
lint
max-ostapenko Mar 2, 2025
fc0d496
more spark roles
max-ostapenko Mar 3, 2025
6ee4cfd
mh submodule
max-ostapenko Mar 12, 2025
4f17535
Merge remote-tracking branch 'origin/main' into judicial-snake
max-ostapenko Mar 13, 2025
5baab97
submodule
max-ostapenko Mar 13, 2025
f2047d3
lint
max-ostapenko Mar 13, 2025
a1fc9bf
lint
max-ostapenko Mar 13, 2025
f2e2600
use connections from dataform
max-ostapenko Mar 13, 2025
a95f907
sync with latest version
max-ostapenko Mar 13, 2025
6e246fb
fix package versions
max-ostapenko Mar 13, 2025
492dadb
remove submodule
max-ostapenko Mar 13, 2025
b13806d
update packages
max-ostapenko Mar 13, 2025
d3fd4b4
test
max-ostapenko Mar 13, 2025
b9dc12d
Merge branch 'judicial-snake' into judicial-snake
max-ostapenko Mar 13, 2025
d96e87e
test
max-ostapenko Mar 13, 2025
14452e9
Merge branch 'judicial-snake' into judicial-snake
max-ostapenko Mar 13, 2025
9976f3a
rewrite triggers
max-ostapenko Mar 13, 2025
4b477c2
adjust bq export
max-ostapenko Mar 13, 2025
202b345
mh roles update
max-ostapenko Mar 13, 2025
a0dc7a0
cleanup
max-ostapenko Mar 13, 2025
22df13b
packages update
max-ostapenko Mar 13, 2025
57ca803
test
max-ostapenko Mar 13, 2025
40d09ae
Merge branch 'judicial-snake' into judicial-snake
max-ostapenko Mar 13, 2025
e5aaf33
packages update
max-ostapenko Mar 13, 2025
90add5b
arguments renamed
max-ostapenko Mar 13, 2025
a1529fa
current nodejs
max-ostapenko Mar 13, 2025
0a38251
Update definitions/output/reports/tech_report_technologies.js
max-ostapenko Mar 14, 2025
fcf1672
remove reservation
max-ostapenko Mar 17, 2025
e3ec99f
Merge branch 'judicial-snake' of https://github.com/HTTPArchive/dataf…
max-ostapenko Mar 17, 2025
f96cdf1
Merge branch 'main' into judicial-snake
max-ostapenko Apr 10, 2025
37fa3b2
switch order
max-ostapenko Apr 10, 2025
8227d9b
deactivate spark procedure
max-ostapenko Apr 14, 2025
cd482ba
deactivate standard exports
max-ostapenko Apr 14, 2025
3bdd444
updated function calling
max-ostapenko Apr 14, 2025
06960f2
standard reports export draft
max-ostapenko Apr 14, 2025
a43a3db
update description
max-ostapenko Apr 14, 2025
95bca5b
dependabot update
max-ostapenko Apr 15, 2025
00fae36
fix month
max-ostapenko Apr 15, 2025
6b444c8
fix query generation
max-ostapenko Apr 15, 2025
77f355d
update
max-ostapenko Apr 15, 2025
71fa9e7
fix env var
max-ostapenko Apr 15, 2025
461077a
fix export config
max-ostapenko Apr 15, 2025
0747c8f
Merge branch 'main' into judicial-snake
max-ostapenko Apr 15, 2025
a722e60
formatting
max-ostapenko Apr 15, 2025
6c45fe0
Merge branch 'judicial-snake' of https://github.com/HTTPArchive/dataf…
max-ostapenko Apr 15, 2025
793fc64
fix
max-ostapenko Apr 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,18 @@ updates:
schedule:
interval: "weekly"
- package-ecosystem: "npm"
directory: "/src"
directory: "/infra/bigquery-export"
schedule:
interval: "weekly"
- package-ecosystem: "npm"
directory: "/infra/dataform-export"
schedule:
interval: "weekly"
- package-ecosystem: "npm"
directory: "/infra/dataform-trigger"
schedule:
interval: "weekly"
- package-ecosystem: "terraform"
directory: "/infra/tf/"
schedule:
interval: "weekly"
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,9 @@ tf_plan:

tf_apply:
terraform -chdir=infra/tf init && terraform -chdir=infra/tf apply -auto-approve
cd infra/bigquery-export/ && npm install && npm run buildpack

bigquery_export_deploy:
cd infra/bigquery-export && npm install && npm run buildpack

#bigquery_export_spark_deploy:
# cd infra/bigquery_export_spark && gcloud builds submit --region=global --tag us-docker.pkg.dev/httparchive/bigquery-spark-procedures/firestore_export:latest
39 changes: 36 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ Consumers:

### Triggering workflows

In order to unify the workflow triggering mechanism, we use [a Cloud Run function](./src/README.md) that can be invoked in a number of ways (e.g. listen to PubSub messages), do intermediate checks and trigger the particular Dataform workflow execution configuration.
To unify the workflow triggering mechanism, we use [a Cloud Run function](./infra/README.md) that can be invoked in a number of ways (e.g. by listening to PubSub messages), performs intermediate checks, and triggers the corresponding Dataform workflow execution configuration.

## Contributing

Expand All @@ -59,5 +59,38 @@ In order to unify the workflow triggering mechanism, we use [a Cloud Run functio

#### Workspace hints

1. In `workflow_settings.yaml` set `env_name: dev` to process sampled data.
2. In `includes/constants.js` set `today` or other variables to a custom value.
1. In `workflow_settings.yaml` set `environment: dev` to process sampled data.
2. For development and testing, you can modify variables in `includes/constants.js`, but note that these are programmatically generated.

## Repository Structure

- `definitions/` - Contains the core Dataform SQL definitions and declarations
- `output/` - Contains the main pipeline transformation logic
- `declarations/` - Contains referenced tables/views declarations and other resources definitions
- `includes/` - Contains shared JavaScript utilities and constants
- `infra/` - Infrastructure code and deployment configurations
- `dataform-trigger/` - Cloud Run function for workflow automation
- `tf/` - Terraform configurations
- `bigquery-export/` - BigQuery export configurations
- `docs/` - Additional documentation

## Development Setup

1. Install dependencies:

```bash
npm install
```

2. Available Scripts:

- `npm run format` - Format code using Standard.js, fix Markdown issues, and format Terraform files
- `npm run lint` - Run linting checks on JavaScript, Markdown files, and compile Dataform configs

## Code Quality

This repository uses:

- Standard.js for JavaScript code style
- Markdownlint for Markdown file formatting
- Dataform's built-in compiler for SQL validation
17 changes: 15 additions & 2 deletions definitions/output/reports/cwv_tech_adoption.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ publish('cwv_tech_adoption', {
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "date": "${pastMonth}", "name": "adoption", "type": "report"} */
SELECT
date,
app AS technology,
Expand All @@ -30,4 +29,18 @@ GROUP BY
app,
rank,
geo
`)
`).postOps(ctx => `
SELECT
reports.run_export_job(
JSON '''{
"destination": "firestore",
"config": {
"database": "tech-report-apis-${constants.environment}",
"collection": "adoption",
"type": "report",
"date": "${pastMonth}"
},
"query": "SELECT STRING(date) AS date, * EXCEPT(date) FROM ${ctx.self()} WHERE date = '${pastMonth}'"
}'''
);
`)
18 changes: 15 additions & 3 deletions definitions/output/reports/cwv_tech_categories.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ publish('cwv_tech_categories', {
type: 'table',
tags: ['crux_ready']
}).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "name": "categories", "type": "dict"} */
WITH pages AS (
SELECT DISTINCT
client,
Expand Down Expand Up @@ -50,7 +49,7 @@ technology_stats AS (
SELECT
technology,
category_obj AS categories,
SUM(origins.dektop + origins.mobile) AS total_origins
SUM(origins.desktop + origins.mobile) AS total_origins
FROM ${ctx.ref('reports', 'cwv_tech_technologies')}
GROUP BY
technology,
Expand Down Expand Up @@ -91,4 +90,17 @@ SELECT
) AS origins,
NULL AS technologies
FROM total_pages
`)
`).postOps(ctx => `
SELECT
reports.run_export_job(
JSON '''{
"destination": "firestore",
"config": {
"database": "tech-report-apis-${constants.environment}",
"collection": "categories",
"type": "dict"
},
"query": "SELECT * FROM ${ctx.self()}"
}'''
);
`)
17 changes: 15 additions & 2 deletions definitions/output/reports/cwv_tech_core_web_vitals.js
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ return Object.values(vitals)
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "date": "${pastMonth}", "name": "core_web_vitals", "type": "report"} */
SELECT
date,
app AS technology,
Expand Down Expand Up @@ -98,4 +97,18 @@ GROUP BY
app,
rank,
geo
`)
`).postOps(ctx => `
SELECT
reports.run_export_job(
JSON '''{
"destination": "firestore",
"config": {
"database": "tech-report-apis-${constants.environment}",
"collection": "core_web_vitals",
"type": "report",
"date": "${pastMonth}"
},
"query": "SELECT STRING(date) AS date, * EXCEPT(date) FROM ${ctx.self()} WHERE date = '${pastMonth}'"
}'''
);
`)
17 changes: 15 additions & 2 deletions definitions/output/reports/cwv_tech_lighthouse.js
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ return Object.values(lighthouse)
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "date": "${pastMonth}", "name": "lighthouse", "type": "report"} */
SELECT
date,
app AS technology,
Expand All @@ -75,4 +74,18 @@ GROUP BY
app,
rank,
geo
`)
`).postOps(ctx => `
SELECT
reports.run_export_job(
JSON '''{
"destination": "firestore",
"config": {
"database": "tech-report-apis-${constants.environment}",
"collection": "lighthouse",
"type": "report",
"date": "${pastMonth}"
},
"query": "SELECT STRING(date) AS date, * EXCEPT(date) FROM ${ctx.self()} WHERE date = '${pastMonth}'"
}'''
);
`)
17 changes: 15 additions & 2 deletions definitions/output/reports/cwv_tech_page_weight.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ return Object.values(pageWeight)
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "date": "${pastMonth}", "name": "page_weight", "type": "report"} */
SELECT
date,
app AS technology,
Expand All @@ -65,4 +64,18 @@ GROUP BY
app,
rank,
geo
`)
`).postOps(ctx => `
SELECT
reports.run_export_job(
JSON '''{
"destination": "firestore",
"config": {
"database": "tech-report-apis-${constants.environment}",
"collection": "page_weight",
"type": "report",
"date": "${pastMonth}"
},
"query": "SELECT STRING(date) AS date, * EXCEPT(date) FROM ${ctx.self()} WHERE date = '${pastMonth}'"
}'''
);
`)
16 changes: 14 additions & 2 deletions definitions/output/reports/cwv_tech_technologies.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ publish('cwv_tech_technologies', {
type: 'table',
tags: ['crux_ready']
}).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "name": "technologies", "type": "dict"} */
WITH pages AS (
SELECT DISTINCT
client,
Expand Down Expand Up @@ -86,4 +85,17 @@ SELECT
MAX(IF(client = 'mobile', origins, 0)) AS mobile
) AS origins
FROM total_pages
`)
`).postOps(ctx => `
SELECT
reports.run_export_job(
JSON '''{
"destination": "firestore",
"config": {
"database": "tech-report-apis-${constants.environment}",
"collection": "technologies",
"type": "dict"
},
"query": "SELECT * FROM ${ctx.self()}"
}'''
);
`)
82 changes: 71 additions & 11 deletions definitions/output/reports/reports_dynamic.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,48 @@
const configs = new reports.HTTPArchiveReports()
const metrics = configs.listMetrics()

const bucket = 'httparchive'
const storagePath = '/reports/'

/**
 * Builds the SQL statement that an export job runs against the current
 * report table, flattened so that it contains no line breaks.
 *
 * - `histogram`: selects every column except `date`, restricted to the
 *   rows whose `date` equals `params.date`.
 * - `timeseries`: selects all rows, with `date` rendered through
 *   FORMAT_DATE('%Y_%m_%d', ...).
 *
 * @param {object} metric - Metric descriptor (not read by this function;
 *   kept for a signature parallel to generateExportPath).
 * @param {object} sql - Report variant; only `sql.type` is read.
 * @param {object} params - Iteration parameters; `params.date` is read for
 *   histograms.
 * @param {object} ctx - Context object; `ctx.self()` supplies the table
 *   reference placed in the FROM clause.
 * @returns {string} The query with every run of CR/LF replaced by a space.
 * @throws {Error} When `sql.type` is neither `histogram` nor `timeseries`.
 */
function generateExportQuery (metric, sql, params, ctx) {
  let statement

  switch (sql.type) {
    case 'histogram':
      statement = `
SELECT
* EXCEPT(date)
FROM ${ctx.self()}
WHERE date = '${params.date}'
`
      break
    case 'timeseries':
      statement = `
SELECT
FORMAT_DATE('%Y_%m_%d', date) AS date,
* EXCEPT(date)
FROM ${ctx.self()}
`
      break
    default:
      throw new Error('Unknown SQL type')
  }

  // Collapse line breaks so the statement can sit inside a one-line string value.
  return statement.replace(/[\r\n]+/g, ' ')
}

/**
 * Computes the object name under which a report's JSON export is stored.
 *
 * - `histogram`: `<storagePath><date with '-' replaced by '_'>/<metric.id>.json`
 * - `timeseries`: `<storagePath><metric.id>.json`
 *
 * @param {object} metric - Metric descriptor; `metric.id` names the file.
 * @param {object} sql - Report variant; only `sql.type` is read.
 * @param {object} params - Iteration parameters; `params.date` is read for
 *   histograms.
 * @returns {string} The export path, prefixed with the module-level
 *   `storagePath`.
 * @throws {Error} When `sql.type` is neither `histogram` nor `timeseries`.
 */
function generateExportPath (metric, sql, params) {
  switch (sql.type) {
    case 'histogram': {
      // Histograms are stored per date, in a folder named with underscores.
      const dateFolder = params.date.replaceAll('-', '_')
      return `${storagePath}${dateFolder}/${metric.id}.json`
    }
    case 'timeseries':
      return `${storagePath}${metric.id}.json`
    default:
      throw new Error('Unknown SQL type')
  }
}

const iterations = []
for (
let month = constants.currentMonth; month >= constants.currentMonth; month = constants.fnPastMonth(month)) {
let date = constants.currentMonth; date >= constants.currentMonth; date = constants.fnPastMonth(date)) {
iterations.push({
date: month,
date,
devRankFilter: constants.devRankFilter
})
}
Expand All @@ -18,29 +55,52 @@ if (iterations.length === 1) {
type: 'incremental',
protected: true,
bigquery: sql.type === 'histogram' ? { partitionBy: 'date', clusterBy: ['client'] } : {},
schema: 'reports',
tags: ['crawl_complete', 'http_reports']
schema: 'reports'
// tags: ['crawl_complete', 'http_reports']
}).preOps(ctx => `
--DELETE FROM ${ctx.self()}
--WHERE date = '${params.date}';
`).query(ctx => `
/* {"dataform_trigger": "report_complete", "date": "${params.date}", "name": "${metric.id}", "type": "${sql.type}"} */` +
sql.query(ctx, params))
`).query(
ctx => sql.query(ctx, params)
).postOps(ctx => `
SELECT
reports.run_export_job(
JSON '''{
"destination": "cloud_storage",
"config": {
"bucket": "${bucket}",
"name": "${generateExportPath(metric, sql, params)}"
},
"query": "${generateExportQuery(metric, sql, params, ctx)}"
}'''
);
`)
})
})
} else {
iterations.forEach((params, i) => {
metrics.forEach(metric => {
metric.SQL.forEach(sql => {
operate(metric.id + '_' + sql.type + '_' + params.date, {
tags: ['crawl_complete']
// tags: ['crawl_complete']
}).queries(ctx => `
DELETE FROM reports.${metric.id}_${sql.type}
WHERE date = '${params.date}';

/* {"dataform_trigger": "report_complete", "date": "${params.date}", "name": "${metric.id}", "type": "${sql.type}"} */
INSERT INTO reports.${metric.id}_${sql.type}` +
sql.query(ctx, params))
INSERT INTO reports.${metric.id}_${sql.type}` + sql.query(ctx, params)
).postOps(ctx => `
SELECT
reports.run_export_job(
JSON '''{
"destination": "cloud_storage",
"config": {
"bucket": "${bucket}",
"name": "${generateExportPath(metric, sql, params)}"
},
"query": "${generateExportQuery(metric, sql, params, ctx)}"
}'''
);
`)
})
})
})
Expand Down
15 changes: 14 additions & 1 deletion definitions/output/reports/tech_report_adoption.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ publish('tech_report_adoption', {
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`).query(ctx => `
/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "adoption", "type": "report"} */
SELECT
date,
geo,
Expand All @@ -32,4 +31,18 @@ GROUP BY
rank,
technology,
version
`).postOps(ctx => `
SELECT
reports.run_export_job(
JSON '''{
"destination": "firestore",
"config": {
"database": "tech-report-api-${constants.environment}",
"collection": "adoption",
"type": "report",
"date": "${pastMonth}"
},
"query": "SELECT STRING(date) AS date, * EXCEPT(date) FROM ${ctx.self()} WHERE date = '${pastMonth}'"
}'''
);
`)
Loading