diff --git a/CHANGELOG.md b/CHANGELOG.md index f3296fd..760afa3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,23 @@ All notable changes to this project will be documented in this file. It uses the [Semantic Versioning]: https://semver.org/spec/v2.0.0.html "Semantic Versioning 2.0.0" +## [v0.2.1] — Unreleased + +### ⚡ Improvements + +* Added pushdown for [pg_re2] functions to their ClickHouse equivalents + (e.g., `re2match` → `match`, `re2extractall` → `extractAll`). Thanks to + Philip Dubé for the PR ([#204]). + +### 📚 Documentation + +* Added "Extension Pushdown" section to the [reference + docs](doc/pg_clickhouse.md), covering re2 and intarray support. + + [v0.2.1]: https://github.com/ClickHouse/pg_clickhouse/compare/v0.2.0...v0.2.1 + [pg_re2]: https://github.com/ClickHouse/pg_re2 + "pg_re2: ClickHouse-compatible regex functions using RE2" + ## [v0.2.0] — 2026-04-13 This release makes binary-compatible changes to the v0.1 releases. Once diff --git a/META.json b/META.json index 0b445bc..5df3e1d 100644 --- a/META.json +++ b/META.json @@ -16,6 +16,9 @@ "runtime": { "requires": { "PostgreSQL": "13.0.0" + }, + "recommends": { + "re2": "0.1.0" } } }, diff --git a/doc/pg_clickhouse.md b/doc/pg_clickhouse.md index 48f46aa..6bffe23 100644 --- a/doc/pg_clickhouse.md +++ b/doc/pg_clickhouse.md @@ -1105,6 +1105,34 @@ any of these functions cannot be pushed down they will raise an exception. * [dictGet](https://clickhouse.com/docs/sql-reference/functions/ext-dict-functions#dictget-dictgetordefault-dictgetornull) +### Extension Pushdown + +pg_clickhouse recognizes functions from select core and third-party extensions, +pushing them down to their ClickHouse equivalents. 
+ +#### re2 + +All [re2 extension] functions push down 1:1 to ClickHouse: + +* `re2match` → [match](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#match) +* `re2extract` → [extract](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#extract) +* `re2extractall` → [extractAll](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#extractAll) +* `re2regexpextract` → [regexpExtract](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#regexpExtract) +* `re2extractgroups` → [extractGroups](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#extractGroups) +* `re2replaceregexpone` → [replaceRegexpOne](https://clickhouse.com/docs/sql-reference/functions/string-replace-functions#replaceRegexpOne) +* `re2replaceregexpall` → [replaceRegexpAll](https://clickhouse.com/docs/sql-reference/functions/string-replace-functions#replaceRegexpAll) +* `re2countmatches` → [countMatches](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#countMatches) +* `re2countmatchescaseinsensitive` → [countMatchesCaseInsensitive](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#countMatchesCaseInsensitive) +* `re2multimatchany` → [multiMatchAny](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#multiMatchAny) +* `re2multimatchanyindex` → [multiMatchAnyIndex](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#multiMatchAnyIndex) +* `re2multimatchallindices` → [multiMatchAllIndices](https://clickhouse.com/docs/sql-reference/functions/string-search-functions#multiMatchAllIndices) + +#### intarray + +One [intarray] function pushes down to ClickHouse: + +* `idx` → [indexOf](https://clickhouse.com/docs/sql-reference/functions/array-functions#indexOf) + ### Pushdown Casts pg_clickhouse pushes down casts such as `CAST(x AS bigint)` for compatible @@ -1360,3 +1388,7 @@ Copyright (c) 
2025-2026, ClickHouse. [Postgres flags]: https://www.postgresql.org/docs/18/functions-matching.html#POSIX-EMBEDDED-OPTIONS-TABLE "PostgreSQL Docs: ARE Embedded-Option Letters" [RE2 Regular Expressions]: https://github.com/google/re2/wiki/Syntax "RE2 Syntax" + [re2 extension]: https://github.com/ClickHouse/pg_re2 + "pg_re2: ClickHouse-compatible regex functions using RE2" + [intarray]: https://www.postgresql.org/docs/current/intarray.html + "PostgreSQL Docs: intarray" diff --git a/src/custom_types.c b/src/custom_types.c index 72d161b..15ad6f5 100644 --- a/src/custom_types.c +++ b/src/custom_types.c @@ -163,6 +163,51 @@ chfdw_check_for_ordered_aggregate(Aggref * agg) return STR_EQUAL(extname, "pg_clickhouse"); } +/* + * Map sans-prefix pg_re2 function names to ClickHouse + * case-sensitive names. Must be kept in lexicographic order. + */ +static char *re2_func_map[][2] = { + {"countmatches", "countMatches"}, + {"countmatchescaseinsensitive", "countMatchesCaseInsensitive"}, + {"extractall", "extractAll"}, + {"extractgroups", "extractGroups"}, + {"multimatchallindices", "multiMatchAllIndices"}, + {"multimatchany", "multiMatchAny"}, + {"multimatchanyindex", "multiMatchAnyIndex"}, + {"regexpextract", "regexpExtract"}, + {"replaceregexpall", "replaceRegexpAll"}, + {"replaceregexpone", "replaceRegexpOne"}, + {NULL, NULL}, +}; + +/* + * Translate a pg_re2 function name into its ClickHouse function name. + * The "re2" prefix is stripped; when the remainder is listed in + * re2_func_map its case-sensitive ClickHouse spelling is returned, + * otherwise the remainder itself is returned. A name lacking the prefix + * is returned unchanged, so non-assert builds never skip past the + * terminator of a short name. + */ +static inline char * +re2_func_name(char *proname) +{ + char *stripped; + size_t i; + + Assert(strncmp(proname, "re2", 3) == 0); + if (strncmp(proname, "re2", 3) != 0) + return proname; + + stripped = proname + 3; + for (i = 0; re2_func_map[i][0] != NULL; i++) + { + if (STR_EQUAL(re2_func_map[i][0], stripped)) + return re2_func_map[i][1]; + } + return stripped; +} + /* * Map pg_clickhouse pushdown function names to ClickHouse case-sensitive * names. Must be kept in lexicographic order. @@ -520,6 +565,12 @@ chfdw_check_for_custom_function(Oid funcid) strcpy(entry->custom_name, "indexOf"); } } + else if (STR_EQUAL(extname, "re2")) + { + /* pg_re2: 1:1 pushdown to ClickHouse RE2 functions. 
*/ + entry->cf_type = CF_CH_FUNCTION; + strlcpy(entry->custom_name, re2_func_name(proname), NAMEDATALEN); + } else if (STR_EQUAL(extname, "pg_clickhouse")) { /* pg_clickhouse custom functions. */ diff --git a/src/deparse.c b/src/deparse.c index c1f742c..502a674 100644 --- a/src/deparse.c +++ b/src/deparse.c @@ -446,10 +446,6 @@ foreign_expr_walker(Node * node, break; } - /* Other variadic functions are not in ClickHouse. */ - if (fe->funcvariadic) - return false; - /* * Recurse to input subexpressions. */ diff --git a/test/expected/re2_functions.out b/test/expected/re2_functions.out new file mode 100644 index 0000000..9dcb94d --- /dev/null +++ b/test/expected/re2_functions.out @@ -0,0 +1,151 @@ +SELECT EXISTS(SELECT 1 FROM pg_available_extensions WHERE name = 're2') AS have_re2 \gset +\if :have_re2 +CREATE SERVER re2_svr FOREIGN DATA WRAPPER clickhouse_fdw OPTIONS(dbname 're2_test'); +CREATE USER MAPPING FOR CURRENT_USER SERVER re2_svr; +SELECT clickhouse_raw_query('DROP DATABASE IF EXISTS re2_test'); + clickhouse_raw_query +---------------------- + +(1 row) + +SELECT clickhouse_raw_query('CREATE DATABASE re2_test'); + clickhouse_raw_query +---------------------- + +(1 row) + +SELECT clickhouse_raw_query($$ + CREATE TABLE re2_test.t1 ( + id Int32, + val String + ) ENGINE = MergeTree ORDER BY id +$$); + clickhouse_raw_query +---------------------- + +(1 row) + +SELECT clickhouse_raw_query($$ + INSERT INTO re2_test.t1 VALUES + (1, 'POSIX uses BRE and ERE'), + (2, 're2 uses finite automata'), + (3, 'PCRE supports backtracking') +$$); + clickhouse_raw_query +---------------------- + +(1 row) + +CREATE SCHEMA re2_test; +IMPORT FOREIGN SCHEMA re2_test FROM SERVER re2_svr INTO re2_test; +SET search_path = re2_test, public; +CREATE EXTENSION re2; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2match(val, 're2'); + QUERY PLAN +------------------------------------------------------------------------- + Foreign Scan on re2_test.t1 + Output: id, val + Remote SQL: 
SELECT id, val FROM re2_test.t1 WHERE (match(val, 're2')) +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2extract(val, '(re2)') = 're2'; + QUERY PLAN +--------------------------------------------------------------------------------------- + Foreign Scan on re2_test.t1 + Output: id, val + Remote SQL: SELECT id, val FROM re2_test.t1 WHERE ((extract(val, '(re2)') = 're2')) +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2extractall(val, '[A-Z]+') = ARRAY['POSIX','BRE','ERE']; + QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Foreign Scan on re2_test.t1 + Output: id, val + Remote SQL: SELECT id, val FROM re2_test.t1 WHERE ((extractAll(val, '[A-Z]+') = ['POSIX','BRE','ERE'])) +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2regexpextract(val, '(re2)', 1) = 're2'; + QUERY PLAN +------------------------------------------------------------------------------------------------ + Foreign Scan on re2_test.t1 + Output: id, val + Remote SQL: SELECT id, val FROM re2_test.t1 WHERE ((regexpExtract(val, '(re2)', 1) = 're2')) +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2extractgroups(val, '(POSIX) uses (BRE)') = ARRAY['POSIX','BRE']; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------- + Foreign Scan on re2_test.t1 + Output: id, val + Remote SQL: SELECT id, val FROM re2_test.t1 WHERE ((extractGroups(val, '(POSIX) uses (BRE)') = ['POSIX','BRE'])) +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2replaceregexpone(val, 'POSIX', 're2') = 're2 uses BRE and ERE'; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ + Foreign Scan on re2_test.t1 + Output: id, val + Remote SQL: SELECT id, val FROM re2_test.t1 WHERE ((replaceRegexpOne(val, 'POSIX', 're2') = 're2 
uses BRE and ERE')) +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2replaceregexpall(val, ' ', '-') = 're2-uses-finite-automata'; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------- + Foreign Scan on re2_test.t1 + Output: id, val + Remote SQL: SELECT id, val FROM re2_test.t1 WHERE ((replaceRegexpAll(val, ' ', '-') = 're2-uses-finite-automata')) +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2countmatches(val, 'e') > 0; + QUERY PLAN +------------------------------------------------------------------------------------ + Foreign Scan on re2_test.t1 + Output: id, val + Remote SQL: SELECT id, val FROM re2_test.t1 WHERE ((countMatches(val, 'e') > 0)) +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2countmatchescaseinsensitive(val, 'E') > 0; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Foreign Scan on re2_test.t1 + Output: id, val + Remote SQL: SELECT id, val FROM re2_test.t1 WHERE ((countMatchesCaseInsensitive(val, 'E') > 0)) +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2multimatchany(val, 'POSIX','PCRE'); + QUERY PLAN +-------------------------------------------------------------------------------------------- + Foreign Scan on re2_test.t1 + Output: id, val + Remote SQL: SELECT id, val FROM re2_test.t1 WHERE (multiMatchAny(val, ['POSIX','PCRE'])) +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2multimatchanyindex(val, 'POSIX','PCRE') > 0; + QUERY PLAN +------------------------------------------------------------------------------------------------------- + Foreign Scan on re2_test.t1 + Output: id, val + Remote SQL: SELECT id, val FROM re2_test.t1 WHERE ((multiMatchAnyIndex(val, ['POSIX','PCRE']) > 0)) +(3 rows) + +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2multimatchallindices(val, 'POSIX','PCRE') = ARRAY[1]; + 
QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Foreign Scan on re2_test.t1 + Output: id, val + Remote SQL: SELECT id, val FROM re2_test.t1 WHERE ((multiMatchAllIndices(val, ['POSIX','PCRE']) = [1])) +(3 rows) + +DROP EXTENSION re2; +DROP USER MAPPING FOR CURRENT_USER SERVER re2_svr; +SELECT clickhouse_raw_query('DROP DATABASE re2_test'); + clickhouse_raw_query +---------------------- + +(1 row) + +DROP SERVER re2_svr CASCADE; +NOTICE: drop cascades to foreign table t1 +\else +\echo 'SKIP: re2 extension not available' +\endif diff --git a/test/expected/re2_functions_1.out b/test/expected/re2_functions_1.out new file mode 100644 index 0000000..068a21f --- /dev/null +++ b/test/expected/re2_functions_1.out @@ -0,0 +1,42 @@ +SELECT EXISTS(SELECT 1 FROM pg_available_extensions WHERE name = 're2') AS have_re2 \gset +\if :have_re2 +CREATE SERVER re2_svr FOREIGN DATA WRAPPER clickhouse_fdw OPTIONS(dbname 're2_test'); +CREATE USER MAPPING FOR CURRENT_USER SERVER re2_svr; +SELECT clickhouse_raw_query('DROP DATABASE IF EXISTS re2_test'); +SELECT clickhouse_raw_query('CREATE DATABASE re2_test'); +SELECT clickhouse_raw_query($$ + CREATE TABLE re2_test.t1 ( + id Int32, + val String + ) ENGINE = MergeTree ORDER BY id +$$); +SELECT clickhouse_raw_query($$ + INSERT INTO re2_test.t1 VALUES + (1, 'POSIX uses BRE and ERE'), + (2, 're2 uses finite automata'), + (3, 'PCRE supports backtracking') +$$); +CREATE SCHEMA re2_test; +IMPORT FOREIGN SCHEMA re2_test FROM SERVER re2_svr INTO re2_test; +SET search_path = re2_test, public; +CREATE EXTENSION re2; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2match(val, 're2'); +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2extract(val, '(re2)') = 're2'; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2extractall(val, '[A-Z]+') = ARRAY['POSIX','BRE','ERE']; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2regexpextract(val, '(re2)', 1) = 're2'; 
+EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2extractgroups(val, '(POSIX) uses (BRE)') = ARRAY['POSIX','BRE']; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2replaceregexpone(val, 'POSIX', 're2') = 're2 uses BRE and ERE'; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2replaceregexpall(val, ' ', '-') = 're2-uses-finite-automata'; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2countmatches(val, 'e') > 0; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2countmatchescaseinsensitive(val, 'E') > 0; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2multimatchany(val, 'POSIX','PCRE'); +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2multimatchanyindex(val, 'POSIX','PCRE') > 0; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2multimatchallindices(val, 'POSIX','PCRE') = ARRAY[1]; +DROP EXTENSION re2; +DROP USER MAPPING FOR CURRENT_USER SERVER re2_svr; +SELECT clickhouse_raw_query('DROP DATABASE re2_test'); +DROP SERVER re2_svr CASCADE; +\else +\echo 'SKIP: re2 extension not available' +SKIP: re2 extension not available +\endif diff --git a/test/expected/result_map.txt b/test/expected/result_map.txt index 2964e02..4a68a74 100644 --- a/test/expected/result_map.txt +++ b/test/expected/result_map.txt @@ -246,3 +246,11 @@ window_functions.sql 24.3 | window_functions_2.out 23.8 | window_functions_5.out 23.3 | window_functions_3.out + +re2_functions.sql +----------------- + + Postgres | pg_re2 | File +----------|-----------|--------------------- + 13+ | installed | re2_functions.out + 13+ | absent | re2_functions_1.out diff --git a/test/sql/re2_functions.sql b/test/sql/re2_functions.sql new file mode 100644 index 0000000..92fb927 --- /dev/null +++ b/test/sql/re2_functions.sql @@ -0,0 +1,48 @@ +SELECT EXISTS(SELECT 1 FROM pg_available_extensions WHERE name = 're2') AS have_re2 \gset +\if :have_re2 + +CREATE SERVER re2_svr FOREIGN DATA WRAPPER clickhouse_fdw OPTIONS(dbname 're2_test'); +CREATE USER MAPPING FOR 
CURRENT_USER SERVER re2_svr; + +SELECT clickhouse_raw_query('DROP DATABASE IF EXISTS re2_test'); +SELECT clickhouse_raw_query('CREATE DATABASE re2_test'); +SELECT clickhouse_raw_query($$ + CREATE TABLE re2_test.t1 ( + id Int32, + val String + ) ENGINE = MergeTree ORDER BY id +$$); +SELECT clickhouse_raw_query($$ + INSERT INTO re2_test.t1 VALUES + (1, 'POSIX uses BRE and ERE'), + (2, 're2 uses finite automata'), + (3, 'PCRE supports backtracking') +$$); + +CREATE SCHEMA re2_test; +IMPORT FOREIGN SCHEMA re2_test FROM SERVER re2_svr INTO re2_test; +SET search_path = re2_test, public; + +CREATE EXTENSION re2; + +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2match(val, 're2'); +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2extract(val, '(re2)') = 're2'; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2extractall(val, '[A-Z]+') = ARRAY['POSIX','BRE','ERE']; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2regexpextract(val, '(re2)', 1) = 're2'; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2extractgroups(val, '(POSIX) uses (BRE)') = ARRAY['POSIX','BRE']; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2replaceregexpone(val, 'POSIX', 're2') = 're2 uses BRE and ERE'; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2replaceregexpall(val, ' ', '-') = 're2-uses-finite-automata'; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2countmatches(val, 'e') > 0; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2countmatchescaseinsensitive(val, 'E') > 0; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2multimatchany(val, 'POSIX','PCRE'); +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2multimatchanyindex(val, 'POSIX','PCRE') > 0; +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM t1 WHERE re2multimatchallindices(val, 'POSIX','PCRE') = ARRAY[1]; + +DROP EXTENSION re2; +DROP USER MAPPING FOR CURRENT_USER SERVER re2_svr; +SELECT clickhouse_raw_query('DROP DATABASE re2_test'); +DROP SERVER re2_svr CASCADE; + +\else 
+\echo 'SKIP: re2 extension not available' +\endif