From 950b831628001be771fae2bdb81ab9c2d13e427e Mon Sep 17 00:00:00 2001
From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com>
Date: Wed, 30 Jul 2025 21:18:30 +0200
Subject: [PATCH 1/2] report

---
Reviewer notes (ignored by git am; not part of the commit):

  Adds an incremental Dataform table `html_elements_popularity` in the
  `reports` schema.

  * preOps: defines the temporary JavaScript UDF getElements(payload), which
    returns the top-level keys of a JSON object as ARRAY<STRING>. Arrays,
    non-object values and unparsable payloads all yield an empty array (a
    JSON `null` payload falls into the catch branch because Object.keys(null)
    throws). It then deletes any rows already stored for `pastMonth`, so a
    re-run replaces that month instead of duplicating it.
  * query: reads `crawl.pages` for `pastMonth`, and for every
    (client, element) pair counts distinct root pages whose
    custom_metrics.element_count contains that key. `total` is the number of
    distinct root pages per client, `pct` is pages / total, and elements seen
    on fewer than 10 root pages are dropped. Up to 5 distinct page URLs are
    kept as space-separated samples.
  * postOps: calls reports.run_export_job to export the month's rows
    (without the `date` column) to
    <storagePath><pastMonth with '-' replaced by '_'>/htmlElementPopularity.json
    in the configured bucket.

  NOTE(review): in this first patch the preOps body is a plain template
  literal that interpolates ctx.self() while no `ctx` binding is visible at
  that point, and `p.date` is selected without being grouped. Both are
  addressed by PATCH 2/2.

 .../reports_html_elements_popularity.js       | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 definitions/output/reports/reports_html_elements_popularity.js

diff --git a/definitions/output/reports/reports_html_elements_popularity.js b/definitions/output/reports/reports_html_elements_popularity.js
new file mode 100644
index 00000000..18a68c35
--- /dev/null
+++ b/definitions/output/reports/reports_html_elements_popularity.js
@@ -0,0 +1,76 @@
+const pastMonth = constants.fnPastMonth(constants.currentMonth)
+
+publish('html_elements_popularity', {
+  schema: 'reports',
+  type: 'incremental',
+  tags: ['crux_ready'],
+  description: `Contact: https://github.com/bkardell`
+}).preOps(`
+CREATE TEMPORARY FUNCTION getElements(payload STRING)
+RETURNS ARRAY<STRING> LANGUAGE js AS '''
+try {
+  var elements = JSON.parse(payload);
+  if (Array.isArray(elements) || typeof elements != 'object') return [];
+  return Object.keys(elements);
+} catch (e) {
+  return [];
+}
+''';
+
+DELETE FROM ${ctx.self()}
+WHERE date = '${pastMonth}';
+`).query(ctx => `
+WITH pages_data AS (
+  SELECT
+    date,
+    client,
+    root_page,
+    page,
+    custom_metrics.element_count
+  FROM ${ctx.ref('crawl', 'pages')}
+  WHERE
+    date = '${pastMonth}' ${constants.devRankFilter}
+),
+
+totals AS (
+  SELECT
+    client,
+    COUNT(DISTINCT root_page) AS total
+  FROM pages_data
+  GROUP BY client
+)
+
+SELECT
+  p.date,
+  p.client,
+  element,
+  COUNT(DISTINCT p.root_page) AS pages,
+  t.total,
+  COUNT(DISTINCT p.root_page) / t.total AS pct,
+  ARRAY_TO_STRING(ARRAY_AGG(DISTINCT p.page LIMIT 5), ' ') AS sample_urls
+FROM pages_data p
+JOIN totals t
+ON p.client = t.client,
+  UNNEST(getElements(TO_JSON_STRING(p.element_count))) AS element
+GROUP BY
+  p.client,
+  t.total,
+  element
+HAVING
+  COUNT(DISTINCT p.root_page) >= 10
+ORDER BY
+  pages / total DESC,
+  client
+`).postOps(ctx => `
+SELECT
+  reports.run_export_job(
+    JSON '''{
+      "destination": "cloud_storage",
+      "config": {
+        "bucket": "${constants.bucket}",
+        "name": "${constants.storagePath}${pastMonth.replaceAll('-', '_')}/htmlElementPopularity.json"
+      },
+      "query": "SELECT * EXCEPT(date) FROM ${ctx.self()} WHERE date = '${pastMonth}'"
+    }'''
+  );
+`)

From ad919d045173b12d971769341facc518c9438078 Mon Sep 17 00:00:00 2001
From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com>
Date: Wed, 30 Jul 2025 21:34:52 +0200
Subject: [PATCH 2/2] fixes

---
Reviewer notes (ignored by git am; not part of the commit):

  * Retags the report from `crux_ready` to `crawl_complete`.
  * Switches the description from a template literal to a plain string; it
    contains no interpolation.
  * Turns the preOps argument into a `ctx => ...` callback so that the
    ctx.self() interpolation in the DELETE statement has a `ctx` to use.
  * Adds `p.date` to GROUP BY, since it is selected without an aggregate.

 .../output/reports/reports_html_elements_popularity.js | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/definitions/output/reports/reports_html_elements_popularity.js b/definitions/output/reports/reports_html_elements_popularity.js
index 18a68c35..fe5f4f94 100644
--- a/definitions/output/reports/reports_html_elements_popularity.js
+++ b/definitions/output/reports/reports_html_elements_popularity.js
@@ -3,9 +3,9 @@ const pastMonth = constants.fnPastMonth(constants.currentMonth)
 publish('html_elements_popularity', {
   schema: 'reports',
   type: 'incremental',
-  tags: ['crux_ready'],
-  description: `Contact: https://github.com/bkardell`
-}).preOps(`
+  tags: ['crawl_complete'],
+  description: 'Contact: https://github.com/bkardell'
+}).preOps(ctx => `
 CREATE TEMPORARY FUNCTION getElements(payload STRING)
 RETURNS ARRAY<STRING> LANGUAGE js AS '''
 try {
@@ -53,6 +53,7 @@ JOIN totals t
 ON p.client = t.client,
   UNNEST(getElements(TO_JSON_STRING(p.element_count))) AS element
 GROUP BY
+  p.date,
   p.client,
   t.total,
   element