@@ -6,36 +6,65 @@ TAGS "Activity preprocessing pipeline"
NODE activities_deduplicated_copy_pipe_0
SQL >
    -- Incremental copy step: picks up rows of `activities` whose updatedAt is
    -- later than the newest updatedAt already stored in
    -- activities_deduplicated_ds, keeps a single row per id, and derives
    -- gitChangedLines / gitChangedLinesBucket for the target datasource.
    SELECT
        id,
        timestamp,
        platform,
        type,
        channel,
        isContribution,
        sourceId,
        sourceParentId,
        sentimentLabel,
        sentimentScore,
        gitChangedLines,
        -- Size bucket for the change; any value matching no range (0 included)
        -- gets the empty string.
        multiIf(
            gitChangedLines BETWEEN 1 AND 9,
            '1-9',
            gitChangedLines BETWEEN 10 AND 59,
            '10-59',
            gitChangedLines BETWEEN 60 AND 99,
            '60-99',
            gitChangedLines BETWEEN 100 AND 499,
            '100-499',
            gitChangedLines >= 500,
            '500+',
            ''
        ) AS gitChangedLinesBucket,
        score,
        attributes,
        body,
        title,
        url,
        updatedAt
    FROM
        (
            SELECT
                a.id,
                a.timestamp,
                a.platform,
                a.type,
                a.channel,
                a.isContribution,
                a.sourceId,
                a.sourceParentId,
                a.sentimentLabel,
                a.sentimentScore,
                (a.gitInsertions + a.gitDeletions) AS gitChangedLines,
                a.score,
                a.attributes,
                a.body,
                a.title,
                a.url,
                a.updatedAt
            FROM activities a
            WHERE
                -- Watermark: newest updatedAt already copied; coalesce falls
                -- back to the epoch when the aggregate yields NULL.
                -- NOTE(review): the comparison is strict, so rows that share
                -- the current maximum updatedAt but arrive after a copy run
                -- are never picked up - confirm this is acceptable.
                a.updatedAt > (
                    SELECT coalesce(max(updatedAt), toDateTime64('1970-01-01', 3))
                    FROM activities_deduplicated_ds
                )
            -- ORDER BY and LIMIT BY live in the same SELECT on purpose:
            -- LIMIT BY keeps the first row per id in this ORDER BY's order,
            -- i.e. the latest updatedAt (ties broken by the highest sourceId).
            -- Applying LIMIT BY in the outer query would depend on the
            -- subquery's row order being carried through.
            ORDER BY a.id, a.updatedAt DESC, a.sourceId DESC
            LIMIT 1 BY a.id
        )

TYPE COPY
TARGET_DATASOURCE activities_deduplicated_ds
0 commit comments