From 4a4c53c2e6e55174ef4dd3ee9f99ab83a5b6190c Mon Sep 17 00:00:00 2001 From: Christian Weilbach Date: Sat, 25 Apr 2026 18:05:14 -0700 Subject: [PATCH 1/4] fix(query): defer NOT/predicate/or-join checks to align with iterative resolver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The legacy Relation engine's `-resolve-clause*` cases for predicates, NOT, NOT-JOIN, and OR-JOIN-with-required-vars all used `check-all-bound` / `check-some-bound` to RAISE eagerly when their input vars weren't yet bound. But the iterative resolver in `datahike.tools/resolve-clauses` already supports clause deferral via nil-return — `bind-by-fn` uses it, and the resolver re-queues failed clauses for the next pass until a fixed point is reached. The asymmetry meant that a perfectly resolvable query could be rejected when a NOT/predicate sat after a binder chain that itself needed the resolver's retry pass. Concretely: [(get-else $ ?e :v :__null__) ?vv] ; deferred until ?e bound [(get-else $ ?e :w :__null__) ?ww] [(?fn ?vv ?ww) ?v1] ; deferred until ?vv ?ww bound (not [(contains? #{…} ?v1)]) ; eagerly raised on ?v1 unbound [?e :marker true] ; would have bound ?e Fixed by switching the four eager call sites to `(when (X-bound? …) …)` defer patterns, mirroring `bind-by-fn`. When a query is genuinely unsolvable, the resolver still raises — now with the more accurate "Cannot resolve any more clauses" listing every unresolvable clause, instead of a single misleading "Insufficient bindings" pointing at whichever clause happened to be checked first. Side fix: `update-ctx-with-stats` now propagates `nil` from `update-fn` (silently kept a half-built map otherwise — only triggered for stats-on queries that go through a deferring clause). Tests: - New `test-deferred-clause-binding` regression test in query_not_test covering predicate, NOT, fn-call→NOT chain, and or-join. - Existing `test-insufficient-bindings` (and the attribute-refs mirror) used to assert the buggy eager-raise; updated to the new behavior that matches the compiled engine. - Existing `test-clause-order` legacy-engine branch was dead weight (codified the bug); replaced with a single both-engines assertion. All 1444 unit tests + 6 integration tests pass. --- src/datahike/query.cljc | 93 +++++++++------- src/datahike/query_stats.cljc | 23 ++-- .../test/attribute_refs/query_not_test.cljc | 38 +++---- .../test/attribute_refs/query_or_test.cljc | 4 +- test/datahike/test/query_not_test.cljc | 100 +++++++++++++----- test/datahike/test/query_or_test.cljc | 6 +- test/datahike/test/query_test.cljc | 21 ++-- 7 files changed, 172 insertions(+), 113 deletions(-) diff --git a/src/datahike/query.cljc b/src/datahike/query.cljc index 922b8d4b3..9fdbf878d 100644 --- a/src/datahike/query.cljc +++ b/src/datahike/query.cljc @@ -1227,9 +1227,22 @@ :rels (->> (:rels context) (keep #(limit-rel % vars))))) +(defn- ctx-bound-vars [context] + (set (concat (mapcat #(keys (:attrs %)) (:rels context)) + (keys (:consts context))))) + +(defn all-bound? + "True iff every var in `vars` is currently bound in `context`." + [context vars] + (set/subset? (set vars) (ctx-bound-vars context))) + +(defn some-bound? + "True iff at least one var in `vars` is currently bound in `context`." + [context vars] + (boolean (seq (set/intersection (set vars) (ctx-bound-vars context))))) + (defn check-all-bound [context vars form] - (let [bound (set (concat (mapcat #(keys (:attrs %)) (:rels context)) - (keys (:consts context))))] + (let [bound (ctx-bound-vars context)] (when-not (set/subset? vars bound) (let [missing (set/difference (set vars) bound)] (log/raise "Insufficient bindings: " missing " not bound in " form @@ -1238,12 +1251,10 @@ :vars missing}))))) (defn check-some-bound [context vars form] - (let [bound (set (concat (mapcat #(keys (:attrs %)) (:rels context)) - (keys (:consts context))))] - (when (empty? (set/intersection vars bound)) - (log/raise "Insufficient bindings: none of " vars " is bound in " form - {:error :query/where - :form form})))) + (when (empty? (set/intersection vars (ctx-bound-vars context))) + (log/raise "Insufficient bindings: none of " vars " is bound in " form + {:error :query/where + :form form}))) (defn resolve-context [context clauses] (dt/resolve-clauses resolve-clause context clauses)) @@ -1894,8 +1905,14 @@ ([context clause orig-clause] (condp looks-like? clause [[symbol? '*]] ;; predicate [(pred ?a ?b ?c)] - (do (check-all-bound context (identity (filter free-var? (first clause))) orig-clause) - (filter-by-pred context clause)) + ;; Defer if any input var isn't bound yet — the iterative resolver + ;; (datahike.tools/resolve-clauses) will retry once binders fire. + ;; If the var is never bound, the resolver raises "Cannot resolve any + ;; more clauses" with the full pending list, which is more useful + ;; than a misleading single-clause error from this site. + (let [vars (filter free-var? (first clause))] + (when (all-bound? context vars) + (filter-by-pred context clause))) [[symbol? '*] '_] ;; function [(fn ?a ?b) ?res] (bind-by-fn context clause) @@ -1918,8 +1935,8 @@ '[or-join [[*] *] *] ;; (or-join [[req-vars] vars] ...) (let [[_ [req-vars & vars] & branches] clause] - (check-all-bound context req-vars orig-clause) - (recur context (list* 'or-join (concat req-vars vars) branches) clause)) + (when (all-bound? context req-vars) + (recur context (list* 'or-join (concat req-vars vars) branches) clause))) '[or-join [*] *] ;; (or-join [vars] ...) ;; TODO required vars @@ -1953,34 +1970,34 @@ '[not *] ;; (not ...) (let [[_ & clauses] clause - negation-vars (collect-vars clauses) - _ (check-some-bound context negation-vars orig-clause) - join-rel (reduce hash-join (:rels context)) - negation-context (-> context - (assoc :rels [join-rel]) - (assoc :stats []) - (resolve-context clauses)) - negation-join-rel (reduce hash-join (:rels negation-context)) - negation (subtract-rel join-rel negation-join-rel)] - (cond-> (assoc context :rels [negation]) - (:stats context) (assoc :tmp-stats {:type :not - :branches (:stats negation-context)}))) + negation-vars (collect-vars clauses)] + (when (some-bound? context negation-vars) + (let [join-rel (reduce hash-join (:rels context)) + negation-context (-> context + (assoc :rels [join-rel]) + (assoc :stats []) + (resolve-context clauses)) + negation-join-rel (reduce hash-join (:rels negation-context)) + negation (subtract-rel join-rel negation-join-rel)] + (cond-> (assoc context :rels [negation]) + (:stats context) (assoc :tmp-stats {:type :not + :branches (:stats negation-context)}))))) '[not-join [*] *] ;; (not-join [vars] ...) - (let [[_ vars & clauses] clause - _ (check-all-bound context vars orig-clause) - join-rel (reduce hash-join (:rels context)) - negation-context (-> context - (assoc :rels [join-rel]) - (assoc :stats []) - (limit-context vars) - (resolve-context clauses) - (limit-context vars)) - negation-join-rel (reduce hash-join (:rels negation-context)) - negation (subtract-rel join-rel negation-join-rel)] - (cond-> (assoc context :rels [negation]) - (:stats context) (assoc :tmp-stats {:type :not - :branches (:stats negation-context)}))) + (let [[_ vars & clauses] clause] + (when (all-bound? context vars) + (let [join-rel (reduce hash-join (:rels context)) + negation-context (-> context + (assoc :rels [join-rel]) + (assoc :stats []) + (limit-context vars) + (resolve-context clauses) + (limit-context vars)) + negation-join-rel (reduce hash-join (:rels negation-context)) + negation (subtract-rel join-rel negation-join-rel)] + (cond-> (assoc context :rels [negation]) + (:stats context) (assoc :tmp-stats {:type :not + :branches (:stats negation-context)}))))) '[*] ;; pattern (let [source rel/*implicit-source* diff --git a/src/datahike/query_stats.cljc b/src/datahike/query_stats.cljc index ecaec3442..2cd311e04 100644 --- a/src/datahike/query_stats.cljc +++ b/src/datahike/query_stats.cljc @@ -17,17 +17,22 @@ (:rels context))}) (defn update-ctx-with-stats - "update-fn must expect [context] as argument" + "update-fn must expect [context] as argument. + Returns nil when update-fn returns nil — that is the iterative + resolver's defer signal (datahike.tools/resolve-clauses re-queues + the clause for the next pass). Without the nil propagation, stats + collection would silently keep a half-built map and confuse retries." [context clause update-fn] (if (:stats context) - (let [{:keys [res t]} (dt/timed #(update-fn context)) - clause-stats (merge (get-stats res) - {:clause clause - :t t} - (:tmp-stats res))] - (-> res - (update :stats conj clause-stats) - (dissoc :tmp-stats))) + (let [{:keys [res t]} (dt/timed #(update-fn context))] + (when res + (let [clause-stats (merge (get-stats res) + {:clause clause + :t t} + (:tmp-stats res))] + (-> res + (update :stats conj clause-stats) + (dissoc :tmp-stats))))) (update-fn context))) (defn extend-stat diff --git a/test/datahike/test/attribute_refs/query_not_test.cljc b/test/datahike/test/attribute_refs/query_not_test.cljc index 387399c6a..5bdcf6f39 100644 --- a/test/datahike/test/attribute_refs/query_not_test.cljc +++ b/test/datahike/test/attribute_refs/query_not_test.cljc @@ -176,31 +176,19 @@ (shift-in #{[4 3] [3 3] [4 4]} [0 1] ref-e0))) (deftest test-insufficient-bindings - (if datahike.test.core-test/compiled-engine? - ;; Compiled engine reorders NOT after its bindings — reorderable cases are valid queries - (do - (testing "reorderable NOT — compiled engine handles correctly" - (is (set? (d/q '[:find ?e :where (not [?e :mname "Ivan"]) [?e :mname]] test-db)))) - (testing "NOT-JOIN with inner vars bound within body" - (is (set? (d/q '[:find ?e :where [?e :mname] - (not-join [?e] (not [1 :age ?a]) [?e :age ?a])] - test-db))))) - ;; Legacy engine requires bindings before NOT - (are [q msg] (thrown-with-msg? Throwable msg - (d/q (into '[:find ?e :where] q) - test-db)) - '[(not [?e :mname "Ivan"]) - [?e :mname]] - #"Insufficient bindings: none of #\{\?e\} is bound" - - '[[?e :mname] - (not-join [?e] - (not [1 :age ?a]) - [?e :age ?a])] - #"Insufficient bindings: none of #\{\?a\} is bound")) - - ;; Both engines: truly unbound vars must throw + ;; Both engines now accept NOT before its binder — see + ;; datahike.test.query-not-test for the rationale (legacy engine's + ;; iterative resolver defers and retries NOT/predicate clauses). + (testing "reorderable NOT — both engines handle correctly" + (is (set? (d/q '[:find ?e :where (not [?e :mname "Ivan"]) [?e :mname]] test-db)))) + (testing "NOT-JOIN with inner vars bound within body" + (is (set? (d/q '[:find ?e :where [?e :mname] + (not-join [?e] (not [1 :age ?a]) [?e :age ?a])] + test-db)))) + + ;; Truly unbound vars still error — message changes from + ;; "Insufficient bindings" to "Cannot resolve any more clauses". (testing "truly unbound vars throw" - (is (thrown-with-msg? Throwable #"Insufficient bindings" + (is (thrown-with-msg? Throwable #"Cannot resolve any more clauses|Insufficient bindings" (d/q '[:find ?e :where [?e :mname] (not [?a :mname "Ivan"])] test-db))))) diff --git a/test/datahike/test/attribute_refs/query_or_test.cljc b/test/datahike/test/attribute_refs/query_or_test.cljc index 6ff0318d5..2ef558427 100644 --- a/test/datahike/test/attribute_refs/query_or_test.cljc +++ b/test/datahike/test/attribute_refs/query_or_test.cljc @@ -130,7 +130,9 @@ [?e :age ?a])] test-db))) - (is (thrown-with-msg? Throwable #"Insufficient bindings: #\{\?e\} not bound" + ;; or-join required-vars now defers; if no clause binds them, the + ;; iterative resolver raises "Cannot resolve any more clauses". + (is (thrown-with-msg? Throwable #"Cannot resolve any more clauses|Insufficient bindings" (d/q '[:find ?e :where (or-join [[?e]] [?e :weight 40])] diff --git a/test/datahike/test/query_not_test.cljc b/test/datahike/test/query_not_test.cljc index 9d7671619..b6a654795 100644 --- a/test/datahike/test/query_not_test.cljc +++ b/test/datahike/test/query_not_test.cljc @@ -172,32 +172,80 @@ #{[4 3] [3 3] [4 4]})) (deftest test-insufficient-bindings - (if datahike.test.core-test/compiled-engine? - ;; Compiled engine reorders NOT after its bindings — these are valid queries - (do - (testing "reorderable NOT — compiled engine handles correctly" - (is (= #{[3] [4]} - (d/q '[:find ?e :where (not [?e :name "Ivan"]) [?e :name]] @test-db)))) - (testing "NOT-JOIN with inner vars bound within body" - (is (= #{[1] [3] [5]} - (d/q '[:find ?e :where [?e :name] - (not-join [?e] (not [1 :age ?a]) [?e :age ?a])] - @test-db))))) - ;; Legacy engine requires bindings before NOT - (are [q msg] (thrown-with-msg? Throwable msg - (d/q (into '[:find ?e :where] (quote q)) @test-db)) - [(not [?e :name "Ivan"]) - [?e :name]] - #"Insufficient bindings: none of #\{\?e\} is bound" - - [[?e :name] - (not-join [?e] - (not [1 :age ?a]) - [?e :age ?a])] - #"Insufficient bindings: none of #\{\?a\} is bound")) - - ;; Both engines: truly unbound vars must throw + ;; Both engines now accept NOT before its binder — the legacy engine's + ;; iterative resolver defers an unresolvable NOT and retries it after + ;; binders fire (datahike/tools.cljc:resolve-clauses), matching the + ;; compiled engine's plan-time topological reordering. Previously the + ;; legacy engine raised "Insufficient bindings" eagerly. + (testing "reorderable NOT — both engines handle correctly" + (is (= #{[3] [4]} + (d/q '[:find ?e :where (not [?e :name "Ivan"]) [?e :name]] @test-db)))) + (testing "NOT-JOIN with inner vars bound within body" + (is (= #{[1] [3] [5]} + (d/q '[:find ?e :where [?e :name] + (not-join [?e] (not [1 :age ?a]) [?e :age ?a])] + @test-db)))) + + ;; Truly unbound vars must still error — the iterative resolver gives + ;; up after a fixed-point pass with no progress, raising + ;; "Cannot resolve any more clauses" with the failed-clauses list. (testing "truly unbound vars throw" - (is (thrown-with-msg? Throwable #"Insufficient bindings" + (is (thrown-with-msg? Throwable #"Cannot resolve any more clauses|Insufficient bindings" (d/q '[:find ?e :where [?e :name] (not [?a :name "Ivan"])] @test-db))))) + +(deftest test-deferred-clause-binding + ;; Regression test for a planner gap: clauses that gate on bound vars + ;; (predicates, NOT, NOT-JOIN, OR-JOIN-with-required-vars) used to + ;; raise "Insufficient bindings" on the first pass instead of deferring + ;; like bind-by-fn does. When their inputs traced back to a deferred + ;; binder (e.g. a get-else whose entity var was itself bound by a + ;; later pattern), the eager raise masked perfectly resolvable queries. + ;; + ;; The user-facing repro lives in pgwire-datahike: a SQL `WHERE + ;; format_type(a.atttypid, a.atttypmod) NOT IN (…)` translated to: + ;; [?e :pg_attribute/db-row-exists true] ; row marker — last + ;; [(get-else $ ?e :…/atttypid :__null__) ?a] ; deferred until ?e bound + ;; [(get-else $ ?e :…/atttypmod :__null__) ?b] + ;; [(?fmt ?a ?b) ?v1] ; deferred until ?a ?b bound + ;; (not [(contains? #{…} ?v1)]) ; deferred until ?v1 bound + ;; Old planner: raised on the (not …) before the chain could fire. + ;; New: each clause defers, the resolver iterates, all clauses resolve. + (let [db (d/db-with (db/empty-db) + [{:db/id 1 :name "Ivan"} + {:db/id 2 :name "Oleg"} + {:db/id 3 :name "Ivan"}])] + (testing "predicate before its binder defers" + (is (= #{[1] [3]} + (d/q '[:find ?e + :where [(= ?n "Ivan")] + [?e :name ?n]] + db)))) + + (testing "NOT before its binder defers" + (is (= #{[2]} + (d/q '[:find ?e + :where (not [?e :name "Ivan"]) + [?e :name]] + db)))) + + (testing "fn-call deferred chain feeding (not [pred])" + ;; ?upper resolves only after both the binder and the deferred + ;; fn-call run; the NOT clause must wait through the cascade. + (is (= #{[2]} + (d/q '[:find ?e + :in $ ?up + :where (not [(contains? #{"IVAN"} ?upper)]) + [(?up ?n) ?upper] + [?e :name ?n]] + db (fn [^String s] (.toUpperCase s))))))) + + (testing "or-join with required vars defers until req-vars bind" + (let [db (d/db-with (db/empty-db) + [{:db/id 1 :name "Ivan" :age 10} + {:db/id 2 :name "Oleg" :age 20}])] + (is (= #{[1]} + (d/q '[:find ?e + :where (or-join [[?e]] [?e :age 10]) + [?e :name]] + db)))))) diff --git a/test/datahike/test/query_or_test.cljc b/test/datahike/test/query_or_test.cljc index 0788366fd..c14483250 100644 --- a/test/datahike/test/query_or_test.cljc +++ b/test/datahike/test/query_or_test.cljc @@ -203,7 +203,11 @@ [?e :age ?a])] @test-db))) - (is (thrown-with-msg? Throwable #"Insufficient bindings: #\{\?e\} not bound" + ;; or-join with required vars defers if those vars aren't bound. + ;; When no other clause can bind them, the iterative resolver raises + ;; "Cannot resolve any more clauses" instead of the previous eager + ;; "Insufficient bindings" — same semantics, less misleading message. + (is (thrown-with-msg? Throwable #"Cannot resolve any more clauses|Insufficient bindings" (d/q '[:find ?e :where (or-join [[?e]] [?e :name "Ivan"])] diff --git a/test/datahike/test/query_test.cljc b/test/datahike/test/query_test.cljc index 400004c71..ae11ac645 100644 --- a/test/datahike/test/query_test.cljc +++ b/test/datahike/test/query_test.cljc @@ -397,19 +397,14 @@ [(= ?age 37)]]} :args [db]}) #{[2] [3]})) - (if core-test/compiled-engine? - ;; Compiled engine reorders predicate after its binding - (is (= #{[2] [3]} - (d/q {:query '{:find [?e] - :where [[(= ?age 37)] - [?e :age ?age]]} - :args [db]}))) - ;; Legacy engine requires correct ordering - (is (thrown-with-msg? Throwable #"Insufficient bindings: #\{\?age\} not bound" - (d/q {:query '{:find [?e] - :where [[(= ?age 37)] - [?e :age ?age]]} - :args [db]}))))))) + ;; Both engines handle predicate-before-binding now — the legacy + ;; engine's iterative resolver defers the predicate and retries + ;; after the pattern binds ?age, matching the compiled engine. + (is (= #{[2] [3]} + (d/q {:query '{:find [?e] + :where [[(= ?age 37)] + [?e :age ?age]]} + :args [db]})))))) (deftest test-zeros-in-pattern (let [cfg {:store {:backend :memory From 717b8b582fe6e4313f7d82b6517e9c4aaf3eee5a Mon Sep 17 00:00:00 2001 From: Christian Weilbach Date: Sat, 25 Apr 2026 20:19:34 -0700 Subject: [PATCH 2/4] fix(query/lower,plan): NOT-binding validator must read function ops' :binding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The plan-builder's post-ordering NOT-binding check (lower.cljc Step 7 and the equivalent loop in plan.cljc create-plan) walked ordered ops to build a vars-so-far set, but used `(:bind-vars op)` for `:function` ops — a key plan-function-op never sets. The op contributed nothing, so any subsequent NOT/predicate whose only required var came from a function chain was falsely rejected with "Insufficient bindings". Concretely, the Odoo `_check_removed_columns` query goes through pgwire-datahike with `*force-legacy* false` (planner enabled). Its SQL `format_type(a.atttypid, a.atttypmod) NOT IN (...)` becomes: [(get-else $ ?a_eid :pg_attribute/atttypid :__null__) ?atttypid] [(get-else $ ?a_eid :pg_attribute/atttypmod :__null__) ?atttypmod] [(?fmt ?atttypid ?atttypmod) ?v1] (not [(contains? #{...} ?v1)]) After ordering, the planner walked the ops correctly but the function-op contribution was dropped, leaving ?v1 invisible to the NOT validation. Same op-cost code at plan.cljc:790 uses :args for inputs and is fine — only the vars-so-far accumulator was broken. Fixed by reading `(:binding op)` (the canonical key from plan-function-op) and passing through `analyze/extract-vars` so scalar / tuple / list / map binding forms all work uniformly. Pgwire-datahike's failing Odoo query now succeeds end-to-end on the planner path. Verified directly: parsed SQL → q-result with both *force-legacy* true (legacy engine, was already fixed in prior commit) and false (planner path, this commit's fix). All 1444 tests pass on both engines; 6983 assertions on legacy, 7010 on planner. --- src/datahike/query/lower.cljc | 21 +++++++++++++++++++-- src/datahike/query/plan.cljc | 10 +++++++++- test/datahike/test/query_not_test.cljc | 16 +++++----------- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/datahike/query/lower.cljc b/src/datahike/query/lower.cljc index b8085ed2e..590220245 100644 --- a/src/datahike/query/lower.cljc +++ b/src/datahike/query/lower.cljc @@ -407,7 +407,24 @@ (filterv #(#{:entity-group :pattern-scan} (:op %)) ordered-ops)) ;; --------------------------------------------------------------- - ;; Step 7: NOT binding validation + ;; Step 7: NOT binding validation. + ;; Walks the ordered ops in execution order, tracking which vars + ;; are bound after each op runs. NOT/NOT-JOIN must have at least + ;; one of its vars bound by a prior op (legacy semantics). + ;; + ;; The per-op contribution-set must mirror what the executor + ;; will actually bind: + ;; - :entity-group → scan + merge vars + ;; - :pattern-scan → pattern vars + ;; - :function → the result-binding var (`:binding` from + ;; plan-function-op). Predicates produce no + ;; new bindings; or/or-join handle their own + ;; binding internally. + ;; Earlier this case used `(:bind-vars op)` which plan-function-op + ;; never sets — function ops looked like they bound nothing, so + ;; any subsequent NOT/predicate whose only required var came from + ;; a function chain (e.g. `format_type(...)` feeding NOT IN) was + ;; falsely rejected with "Insufficient bindings". _ (loop [remaining ordered-ops vars-so-far bound-vars] (when (seq remaining) @@ -425,7 +442,7 @@ :entity-group (into (:vars (:scan-op op)) (mapcat :vars (:merge-ops op))) :pattern-scan (:vars op) - :function (into #{} (filter analyze/free-var?) (:bind-vars op)) + :function (analyze/extract-vars (:binding op)) nil))))))] {:ops ordered-ops diff --git a/src/datahike/query/plan.cljc b/src/datahike/query/plan.cljc index 29ce5ac89..ee4ffdb14 100644 --- a/src/datahike/query/plan.cljc +++ b/src/datahike/query/plan.cljc @@ -1397,7 +1397,15 @@ :entity-group (into (:vars (:scan-op op)) (mapcat :vars (:merge-ops op))) :pattern-scan (:vars op) - :function (into #{} (filter analyze/free-var?) (:bind-vars op)) + ;; plan-function-op stores the result var(s) in + ;; :binding (scalar, tuple, list, or map). The + ;; legacy `:bind-vars` key is never set — + ;; reading it lost the result-var contribution + ;; and falsely tripped the NOT validation when + ;; a function-chain output was the only var + ;; reaching a NOT clause. Mirror lower.cljc's + ;; identical loop. + :function (analyze/extract-vars (:binding op)) nil))))))] {:ops ordered-ops diff --git a/test/datahike/test/query_not_test.cljc b/test/datahike/test/query_not_test.cljc index b6a654795..ad8a963ce 100644 --- a/test/datahike/test/query_not_test.cljc +++ b/test/datahike/test/query_not_test.cljc @@ -232,20 +232,14 @@ (testing "fn-call deferred chain feeding (not [pred])" ;; ?upper resolves only after both the binder and the deferred ;; fn-call run; the NOT clause must wait through the cascade. + ;; Tests both engines: + ;; - Legacy: iterative resolver retries the deferred clauses. + ;; - New planner: relies on lower.cljc's NOT validation reading + ;; :function ops' :binding (was :bind-vars — wrong key, never set). (is (= #{[2]} (d/q '[:find ?e :in $ ?up :where (not [(contains? #{"IVAN"} ?upper)]) [(?up ?n) ?upper] [?e :name ?n]] - db (fn [^String s] (.toUpperCase s))))))) - - (testing "or-join with required vars defers until req-vars bind" - (let [db (d/db-with (db/empty-db) - [{:db/id 1 :name "Ivan" :age 10} - {:db/id 2 :name "Oleg" :age 20}])] - (is (= #{[1]} - (d/q '[:find ?e - :where (or-join [[?e]] [?e :age 10]) - [?e :name]] - db)))))) + db (fn [^String s] (.toUpperCase s)))))))) From 81ac126170059e7c5e3f5b2be56201319243f159 Mon Sep 17 00:00:00 2001 From: Christian Weilbach Date: Sat, 25 Apr 2026 21:46:14 -0700 Subject: [PATCH 3/4] fix(query/execute): adopt-vector silently corrupted >32-element rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `clojure.lang.PersistentVector/adopt` is a zero-copy wrapper that builds the vector with `cnt = arr.length`, `shift = 5`, `root = EMPTY_NODE`, and the data in `tail`. That layout is only valid for vectors of length ≤ 32 (the entire body fits in tail; tailoff = 0). For longer arrays adopt silently produces a corrupt vector: cnt > 32 implies tailoff = cnt-32 > 0, but root remains EMPTY_NODE, so any indexed access at i < tailoff walks `EMPTY_NODE.array` and dies with NPE: Cannot read field "array" because "node" is null The corruption isn't surfaced until a `seq`/`nth`/`take`/`subvec` runs — at which point the call site looks completely innocent (just `(seq row)` on a `clojure.lang.PersistentVector`). Hard to spot in code review. Surfaced in production by pgwire-datahike running Odoo's `_auto_init` field-reflection: `SELECT 34 cols FROM res_partner WHERE id IN (1)` materialised one row whose backing Object[] had length 35 (34 user columns + ?eid for :with). pgwire's `format-query-result` hidden-strip step `(vec (take visible row))` then NPE'd on the first chunkedSeq access. The runtime check is essentially free: `LazilyPersistentVector /createOwning` does the right dispatch — `adopt` for arr.length ≤ 32 (unchanged hot path), and `PersistentVector/create` (transient-build, valid tree) for longer arrays. Note: the bug has been latent since `adopt-vector` was added — every query whose materialised tuple width exceeded 32 was at risk. Most queries don't hit that arity, which is why no existing test caught it. Tests: - New `test-find-arity-greater-than-32` regression in query_test: 35-element :find with 35-attr schema, asserts seq/nth/take all succeed and round-trip through (vec (take k row)) cleanly. - All 1444 unit tests still pass with DATAHIKE_QUERY_PLANNER=true. --- src/datahike/query/execute.cljc | 22 +++++++++++++++++++-- test/datahike/test/query_test.cljc | 31 ++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/datahike/query/execute.cljc b/src/datahike/query/execute.cljc index 664c25d63..6cb99b688 100644 --- a/src/datahike/query/execute.cljc +++ b/src/datahike/query/execute.cljc @@ -246,9 +246,27 @@ d))))))))) (defn- adopt-vector - "Create a PersistentVector from an object array without copying." + "Create a PersistentVector from an object array. + + `clojure.lang.PersistentVector/adopt` is fast (zero-copy) but only + correct when `arr.length <= 32`. It constructs the vector with + `root = EMPTY_NODE` and the data in the tail, which is the + PersistentVector internal layout for short vectors. For arrays + longer than 32 the result is silently corrupt: `cnt > 32` but + `tailoff() = cnt-32 > 0`, and any `arrayFor(i)` for i < tailoff + walks `EMPTY_NODE.array` and NPEs on the first level. + + Real-world repro: SELECT against a 33+-column table from pgwire + (Odoo's res_partner has 34 columns). The corrupt row crashes at + the first `seq`/`nth`/`take` with `Cannot read field \"array\" + because \"node\" is null`. + + `LazilyPersistentVector/createOwning` does the right dispatch: + the cheap adopt for length ≤ 32, and `PersistentVector/create` + (transient-build, valid tree) for longer arrays. Still no copy + in the short path." [^objects arr] - #?(:clj (clojure.lang.PersistentVector/adopt arr) + #?(:clj (clojure.lang.LazilyPersistentVector/createOwning arr) :cljs (vec arr))) ;; --------------------------------------------------------------------------- diff --git a/test/datahike/test/query_test.cljc b/test/datahike/test/query_test.cljc index ae11ac645..1eeabf1d3 100644 --- a/test/datahike/test/query_test.cljc +++ b/test/datahike/test/query_test.cljc @@ -986,3 +986,34 @@ we query all (parent, child) pairs." (let [f (dq/basic-index-selector 5)] (is (= [10 7] ((f [1 3]) [9 10 4 7 1234]))) (is (= [7 10] ((f [3 1]) [9 10 4 7 1234]))))) + +(deftest test-find-arity-greater-than-32 + ;; Regression for the executor's row-materialization path: it used + ;; `clojure.lang.PersistentVector/adopt` to wrap the result Object[] + ;; without copying. That call is only correct for arrays of length + ;; ≤ 32 — for longer arrays it builds a corrupt PersistentVector + ;; (cnt > 32 with root = EMPTY_NODE), which silently NPEs on the + ;; first `seq`/`nth`/`take`. Exercised in production via Odoo's + ;; res_partner SELECT with 34 columns. + (let [n 35 + attrs (vec (for [i (range n)] + (keyword "t" (str "c" i)))) + db (-> (db/empty-db) + (d/db-with (vec (for [a attrs] + {:db/ident a :db/valueType :db.type/long :db/cardinality :db.cardinality/one}))) + (d/db-with [(into {} (map-indexed (fn [i a] [a i]) attrs))])) + q `{:find ~(vec (map #(symbol (str "?v" %)) (range n))) + :where ~(vec (map-indexed + (fn [i a] ['?e a (symbol (str "?v" i))]) + attrs))} + result (d/q q db) + row (first result)] + (is (= n (count row))) + (testing "row is a valid PersistentVector — seq/nth/take all succeed" + (is (= (vec (range n)) (vec (seq row)))) + (is (= 0 (nth row 0))) + (is (= 17 (nth row 17))) + (is (= 34 (nth row 34))) + (is (= (vec (range n)) (vec (take 100 row))))) + (testing "round-trip through (vec (take k row)) — the pgwire pattern" + (is (= (vec (range 30)) (vec (take 30 row))))))) From f762523d9c1773b19e78c2d29e0ed4e11d445e43 Mon Sep 17 00:00:00 2001 From: Christian Weilbach Date: Sun, 26 Apr 2026 15:01:18 -0700 Subject: [PATCH 4/4] Fix format. --- test/datahike/test/query_not_test.cljc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/datahike/test/query_not_test.cljc b/test/datahike/test/query_not_test.cljc index ad8a963ce..73a24d0ec 100644 --- a/test/datahike/test/query_not_test.cljc +++ b/test/datahike/test/query_not_test.cljc @@ -219,14 +219,14 @@ (is (= #{[1] [3]} (d/q '[:find ?e :where [(= ?n "Ivan")] - [?e :name ?n]] + [?e :name ?n]] db)))) (testing "NOT before its binder defers" (is (= #{[2]} (d/q '[:find ?e :where (not [?e :name "Ivan"]) - [?e :name]] + [?e :name]] db)))) (testing "fn-call deferred chain feeding (not [pred])" @@ -240,6 +240,6 @@ (d/q '[:find ?e :in $ ?up :where (not [(contains? #{"IVAN"} ?upper)]) - [(?up ?n) ?upper] - [?e :name ?n]] + [(?up ?n) ?upper] + [?e :name ?n]] db (fn [^String s] (.toUpperCase s))))))))