Skip to content

Commit ce9ae1e

Browse files
committed
fix: add indexes, optimize query to use them
1 parent 9efe39d commit ce9ae1e

2 files changed

Lines changed: 57 additions & 19 deletions

File tree

services/libs/tinybird/datasources/activities_deduplicated_ds.datasource

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,5 @@ ENGINE_PARTITION_KEY toYear(timestamp)
4646
ENGINE_SORTING_KEY id, platform, channel
4747

4848
INDEXES >
49-
INDEX idx_body_ngrams body TYPE bloom_filter GRANULARITY 3,
50-
INDEX idx_title_ngrams title TYPE bloom_filter GRANULARITY 3
51-
49+
INDEX idx_body_ngram3 body TYPE ngrambf_v1(3, 2048, 6, 0) GRANULARITY 64,
50+
INDEX idx_title_ngram3 title TYPE ngrambf_v1(3, 512, 6, 0) GRANULARITY 64

services/libs/tinybird/pipes/activities_relations_filtered.pipe

Lines changed: 55 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ SQL >
8787
{% end %}
8888
)
8989
{% end %}
90-
-- ================== G1..G5 (OR, AND in groups) ==================
90+
-- ================== G1..G5 groups ==================
9191
{% set has_g1 = 0 %}
9292
{% if defined(G1_memberIds) %} {% set has_g1 = 1 %} {% end %}
9393
{% if defined(G1_memberIds_exclude) %} {% set has_g1 = 1 %} {% end %}
@@ -377,7 +377,6 @@ SQL >
377377
)
378378
{% end %}
379379
-- ================== end G1..G5 ==================
380-
-- closes the base_ar CTE
381380
)
382381
SELECT
383382
ar.id,
@@ -392,20 +391,28 @@ SQL >
392391
ar.timestamp,
393392
ar.type
394393
FROM base_ar AS ar
395-
WHERE
396-
1
397-
{% if defined(searchTerm) and searchTerm %}
398-
AND CAST(ar.id AS String) IN (
399-
SELECT CAST(id AS String)
394+
{% if defined(searchTerm) and searchTerm %}
395+
SEMI
396+
JOIN
397+
(
398+
SELECT CAST(id AS String) AS id
400399
FROM activities_deduplicated_ds
401400
WHERE
402-
CAST(id AS String) IN (SELECT CAST(id AS String) FROM base_ar)
401+
1
402+
{% if defined(startDate) %}
403+
AND timestamp > parseDateTimeBestEffort({{ String(startDate) }})
404+
{% end %}
405+
{% if defined(endDate) %}
406+
AND timestamp < parseDateTimeBestEffort({{ String(endDate) }})
407+
{% end %}
403408
AND (
404409
positionCaseInsensitive(title, {{ String(searchTerm) }}) > 0
405410
OR positionCaseInsensitive(body, {{ String(searchTerm) }}) > 0
406411
)
407-
)
408-
{% end %}
412+
) act
413+
ON act.id = CAST(ar.id AS String)
414+
{% end %}
415+
WHERE 1
409416
{% if not is_count %}
410417
ORDER BY ar.timestamp DESC, ar.id DESC
411418
LIMIT {{ Int32(pageSize, 10) }}
@@ -415,11 +422,19 @@ SQL >
415422
NODE activities_enriched_v1
416423
SQL >
417424
%
425+
{% set has_start = defined(startDate) %}
426+
{% set has_end = defined(endDate) %}
418427
{% if defined(countOnly) and countOnly == '1' %}
419-
SELECT countDistinct(fr.id) AS count FROM filtered_relations fr
428+
-- If IDs are guaranteed unique in filtered_relations, use COUNT().
429+
-- Otherwise use countDistinct for correctness.
430+
SELECT
431+
{% if defined(fr_ids_are_unique) and fr_ids_are_unique == '1' %} count()
432+
{% else %} countDistinct(fr.id)
433+
{% end %} AS count
434+
FROM filtered_relations AS fr
420435
{% else %}
421436
SELECT
422-
fr.id,
437+
fr.id_str AS id,
423438
fr.channel,
424439
fr.isContribution,
425440
fr.memberId,
@@ -434,12 +449,36 @@ SQL >
434449
a.url,
435450
a.body,
436451
a.title
437-
FROM filtered_relations AS fr ANY
452+
FROM
453+
(
454+
-- Cast IDs only once here and keep alias fr
455+
SELECT
456+
CAST(id AS String) AS id_str,
457+
channel,
458+
isContribution,
459+
memberId,
460+
organizationId,
461+
platform,
462+
segmentId,
463+
sourceId,
464+
sourceParentId,
465+
timestamp,
466+
type
467+
FROM filtered_relations
468+
) AS fr
469+
ANY
438470
LEFT JOIN
439471
(
440-
SELECT CAST(id AS String) AS activity_id, attributes, url, body, title
472+
SELECT CAST(id AS String) AS id_str, attributes, url, body, title
441473
FROM activities_deduplicated_ds
442-
WHERE CAST(id AS String) IN (SELECT DISTINCT CAST(id AS String) FROM filtered_relations)
474+
WHERE
475+
1
476+
{% if has_start %} AND timestamp > parseDateTimeBestEffort({{ String(startDate) }})
477+
{% end %}
478+
{% if has_end %} AND timestamp < parseDateTimeBestEffort({{ String(endDate) }})
479+
{% end %}
480+
-- activities_deduplicated_ds is partitioned by toYear(timestamp),
481+
-- so these timestamp bounds let the engine prune whole partitions
443482
) AS a
444-
ON CAST(fr.id AS String) = a.activity_id
483+
ON a.id_str = fr.id_str
445484
{% end %}

0 commit comments

Comments
 (0)