Skip to content

Commit 66be4e8

Browse files
committed
feat: add updatedAt on activities_deduplicated_ds, filter by updatedAt on copy pipe
1 parent bb01bce commit 66be4e8

2 files changed

Lines changed: 65 additions & 35 deletions

File tree

services/libs/tinybird/datasources/activities_deduplicated_ds.datasource

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -39,12 +39,13 @@ SCHEMA >
39 39
`attributes` String,
40 40
`body` String DEFAULT '',
41 41
`title` String DEFAULT '',
42-
`url` String DEFAULT ''
42+
`url` String DEFAULT '',
43+
`updatedAt` DateTime64(3)
44+
45+
INDEXES >
46+
idx_body_ngram3 body TYPE ngrambf_v1(3, 2048, 6, 0) GRANULARITY 64
47+
idx_title_ngram3 title TYPE ngrambf_v1(3, 512, 6, 0) GRANULARITY 64
43 48

44 49
ENGINE MergeTree
45 50
ENGINE_PARTITION_KEY toYear(timestamp)
46 51
ENGINE_SORTING_KEY id, platform, channel
47-
48-
INDEXES >
49-
INDEX idx_body_ngram3 body TYPE ngrambf_v1(3, 2048, 6, 0) GRANULARITY 64,
50-
INDEX idx_title_ngram3 title TYPE ngrambf_v1(3, 512, 6, 0) GRANULARITY 64

services/libs/tinybird/pipes/activities_deduplicated_copy_pipe.pipe

Lines changed: 59 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -6,36 +6,65 @@ TAGS "Activity preprocessing pipeline"
6 6
NODE activities_deduplicated_copy_pipe_0
7 7
SQL >
8 8
SELECT
9-
a.id,
10-
a.timestamp,
11-
a.platform,
12-
a.type,
13-
a.channel,
14-
a.isContribution,
15-
a.sourceId,
16-
a.sourceParentId,
17-
a.sentimentLabel,
18-
a.sentimentScore,
19-
(a.gitInsertions + gitDeletions) as gitChangedLines,
20-
case
21-
when gitChangedLines > 0 and gitChangedLines < 10
22-
then '1-9'
23-
when gitChangedLines > 9 and gitChangedLines < 60
24-
then '10-59'
25-
when gitChangedLines > 59 and gitChangedLines < 100
26-
then '60-99'
27-
when gitChangedLines > 99 and gitChangedLines < 500
28-
then '100-499'
29-
when gitChangedLines > 499
30-
then '500+'
31-
else ''
32-
end as "gitChangedLinesBucket",
33-
a.score,
34-
a.attributes,
35-
a.body,
36-
a.title,
37-
a.url
38-
FROM activities a final
9+
id,
10+
timestamp,
11+
platform,
12+
type,
13+
channel,
14+
isContribution,
15+
sourceId,
16+
sourceParentId,
17+
sentimentLabel,
18+
sentimentScore,
19+
gitChangedLines,
20+
multiIf(
21+
gitChangedLines BETWEEN 1 AND 9,
22+
'1-9',
23+
gitChangedLines BETWEEN 10 AND 59,
24+
'10-59',
25+
gitChangedLines BETWEEN 60 AND 99,
26+
'60-99',
27+
gitChangedLines BETWEEN 100 AND 499,
28+
'100-499',
29+
gitChangedLines >= 500,
30+
'500+',
31+
''
32+
) AS gitChangedLinesBucket,
33+
score,
34+
attributes,
35+
body,
36+
title,
37+
url,
38+
updatedAt
39+
FROM
40+
(
41+
SELECT
42+
a.id,
43+
a.timestamp,
44+
a.platform,
45+
a.type,
46+
a.channel,
47+
a.isContribution,
48+
a.sourceId,
49+
a.sourceParentId,
50+
a.sentimentLabel,
51+
a.sentimentScore,
52+
(a.gitInsertions + a.gitDeletions) AS gitChangedLines,
53+
a.score,
54+
a.attributes,
55+
a.body,
56+
a.title,
57+
a.url,
58+
a.updatedAt
59+
FROM activities a
60+
WHERE
61+
a.updatedAt > (
62+
SELECT coalesce(max(updatedAt), toDateTime64('1970-01-01', 3))
63+
FROM activities_deduplicated_ds
64+
)
65+
ORDER BY a.id, a.updatedAt DESC, a.sourceId DESC
66+
)
67+
LIMIT 1 BY id
39 68

40 69
TYPE COPY
41 70
TARGET_DATASOURCE activities_deduplicated_ds

0 commit comments

Comments
 (0)