-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathreports_html_elements_popularity.js
More file actions
77 lines (73 loc) · 1.73 KB
/
reports_html_elements_popularity.js
File metadata and controls
77 lines (73 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
// Date value used in every `date = '...'` filter below.
// NOTE(review): presumably the month preceding constants.currentMonth —
// confirm against the `constants` include.
const pastMonth = constants.fnPastMonth(constants.currentMonth)

// Table settings handed to Dataform's publish().
const reportConfig = {
  schema: 'reports',
  type: 'incremental',
  tags: ['crawl_complete'],
  description: 'Contact: https://github.com/bkardell'
}

/**
 * Builds the statements that run before the main query.
 *
 * 1. Declares the temporary JS UDF `getElements`, which parses a JSON string
 *    and returns the keys of the resulting object. Arrays, non-objects, null
 *    and unparsable payloads all yield an empty array (null and bad JSON fall
 *    through the catch branch).
 * 2. Deletes the rows already stored for `pastMonth` in the target table —
 *    presumably so re-running the same month does not duplicate rows.
 *
 * @param {object} ctx - Dataform context; only `ctx.self()` is used.
 * @returns {string} SQL text.
 */
function buildPreOps (ctx) {
  return `
CREATE TEMPORARY FUNCTION getElements(payload STRING)
RETURNS ARRAY<STRING> LANGUAGE js AS '''
try {
var elements = JSON.parse(payload);
if (Array.isArray(elements) || typeof elements != 'object') return [];
return Object.keys(elements);
} catch (e) {
return [];
}
''';
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`
}

/**
 * Builds the main SELECT that produces the report rows.
 *
 * - `pages_data`: rows of the referenced `crawl.pages` table for `pastMonth`,
 *   with `constants.devRankFilter` appended to the WHERE clause
 *   (NOTE(review): the filter's content is defined elsewhere — verify).
 * - `totals`: number of distinct `root_page` values per client.
 * - Final select: for each date / client / element name returned by
 *   `getElements`, the number of distinct root pages using it, the client
 *   total, their ratio, and up to five distinct page URLs joined by spaces.
 *   Elements seen on fewer than 10 distinct root pages are dropped.
 *
 * @param {object} ctx - Dataform context; only `ctx.ref()` is used.
 * @returns {string} SQL text.
 */
function buildQuery (ctx) {
  return `
WITH pages_data AS (
SELECT
date,
client,
root_page,
page,
custom_metrics.element_count
FROM ${ctx.ref('crawl', 'pages')}
WHERE
date = '${pastMonth}' ${constants.devRankFilter}
),
totals AS (
SELECT
client,
COUNT(DISTINCT root_page) AS total
FROM pages_data
GROUP BY client
)
SELECT
p.date,
p.client,
element,
COUNT(DISTINCT p.root_page) AS pages,
t.total,
COUNT(DISTINCT p.root_page) / t.total AS pct,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT p.page LIMIT 5), ' ') AS sample_urls
FROM pages_data p
JOIN totals t
ON p.client = t.client,
UNNEST(getElements(TO_JSON_STRING(p.element_count))) AS element
GROUP BY
p.date,
p.client,
t.total,
element
HAVING
COUNT(DISTINCT p.root_page) >= 10
ORDER BY
pages / total DESC,
client
`
}

/**
 * Builds the statement that runs after the main query: a call to
 * `reports.run_export_job` with a JSON config naming a cloud_storage
 * destination (bucket and object name) and the query to export — every
 * column except `date` for the `pastMonth` rows of the target table.
 *
 * @param {object} ctx - Dataform context; only `ctx.self()` is used.
 * @returns {string} SQL text.
 */
function buildPostOps (ctx) {
  // Folder segment of the object name: the date with dashes turned into underscores.
  const exportFolder = pastMonth.replaceAll('-', '_')

  return `
SELECT
reports.run_export_job(
JSON '''{
"destination": "cloud_storage",
"config": {
"bucket": "${constants.bucket}",
"name": "${constants.storagePath}${exportFolder}/htmlElementPopularity.json"
},
"query": "SELECT * EXCEPT(date) FROM ${ctx.self()} WHERE date = '${pastMonth}'"
}'''
);
`
}

publish('html_elements_popularity', reportConfig)
  .preOps(buildPreOps)
  .query(buildQuery)
  .postOps(buildPostOps)