Skip to content

Commit 1257dac

Browse files
lmeyerovclaude
andcommitted
bench(gfql): large real-graph + bulk-OLAP + fairest-kuzu harnesses (4 engines vs kuzu)
Adds the dgx-spark benchmark harnesses backing the CSR-index + bulk-OLAP claims, all guarded for trust (timing only reported when the index path was actually taken AND index result == scan result; engine parity checked via matched row counts): - index_largegraph_bench.py: real SNAP edge lists (LiveJournal 35M, Orkut 117M, Friendster stretch), parquet-cached, degree-percentile + multi-seed sweeps. Shows seeded latency flat in N, scaling with seed degree (the O(degree) honesty sweep). - index_bulk_olap_bench.py: BULK regime the index deliberately avoids -- seeded multi-hop frontier expansion via the chain API (the one GFQL surface supporting all 4 engines) + full-graph out-degree aggregation, 4 engines + kuzu. Answers "is bulk OLAP better with GFQL cudf?": yes with GFQL, but on polars/polars-gpu (fused lazy), not cudf (eager per-op). polars-CPU 11-47x over pandas, 6-18x over cudf, 3-87x over kuzu on frontier expansion at 35-117M edges. - index_vs_kuzu_prepared.py: fairest seeded comparison -- kuzu prepared statement + columnar get_as_df, in-process (no bolt), matched rows. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 0cd717c commit 1257dac

3 files changed

Lines changed: 397 additions & 0 deletions

File tree

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
#!/usr/bin/env python3
2+
"""BULK-OLAP head-to-head: GFQL 4 engines vs kuzu on REAL graphs.
3+
4+
Answers "is bulk OLAP better with GFQL (cudf / polars-gpu)?" The seeded CSR index
5+
is O(degree) and wins tiny work; this bench deliberately AVOIDS that path and
6+
measures the BULK regime instead — large-frontier multi-hop + full-graph
7+
aggregation, i.e. the scan/join work where columnar GPU throughput should pay off
8+
and the index does NOT help. We run g0.chain on the un-indexed graph (NO resident index -> engine traversal,
9+
the honest bulk path) so every engine does the same materialized join work.
10+
11+
Tasks (all bulk, all materialized on both sides):
12+
BULK1 1-hop forward from K seeds (edge semijoin, frontier=K)
13+
BULK2 2-hop forward from K seeds (edge-edge join, frontier blows up)
14+
DEGALL full-graph out-degree aggregation (group_by over ALL edges; pure OLAP)
15+
K frontier sweep: 1k, 10k, 100k seeds. cudf/polars-gpu should overtake pandas as K
16+
(hence work) grows; kuzu is the WCOJ/optimizer peer for the multi-hop join.
17+
18+
Trust: GFQL rows reported per engine (engine parity is separately guaranteed by the
19+
conformance suite); kuzu rows reported alongside with a semantic note. Timing is the
20+
deliverable — rows are the honesty check that each system did real work.
21+
22+
Env: PARQUET=/path/edges.parquet KS=1000,10000,100000 ENGINES=pandas,polars,cudf,polars-gpu
23+
SYSTEMS=gfql,kuzu REPS=10 WARM=2 OUT=/tmp/bulk.jsonl SEED=0
24+
"""
25+
from __future__ import annotations
26+
import json, os, statistics, time, tempfile, shutil
27+
import numpy as np
28+
import pandas as pd
29+
import graphistry
30+
from graphistry.compute.ast import n, e_forward
31+
32+
33+
def _sync(engine):
    """Wait for outstanding GPU work when ``engine`` is GPU-backed.

    Only the "cudf" and "polars-gpu" engines trigger a device synchronize;
    every other engine name is a no-op. The call is best-effort: if cupy cannot
    be imported or the synchronize itself raises, the error is swallowed and
    the function returns normally.
    """
    if engine not in ("cudf", "polars-gpu"):
        return
    try:
        import cupy as gpu  # type: ignore
        gpu.cuda.runtime.deviceSynchronize()
    except Exception:
        # Best-effort on purpose: a missing/broken cupy must not abort the run.
        pass
40+
41+
42+
def timeit(fn, reps, engine="cpu", warmup=2):
    """Return the median latency of ``fn`` in milliseconds.

    ``fn`` is called ``warmup`` times untimed, then ``reps`` times timed with
    ``time.perf_counter``. ``_sync(engine)`` runs after every call and, for the
    timed calls, inside the measured window.

    Raises ``statistics.StatisticsError`` when ``reps`` is 0 (no samples).
    """
    for _ in range(warmup):
        fn()
        _sync(engine)
    samples_ms = []
    for _ in range(reps):
        started = time.perf_counter()
        fn()
        _sync(engine)
        samples_ms.append((time.perf_counter() - started) * 1e3)
    # statistics.median orders the samples itself.
    return statistics.median(samples_ms)
51+
52+
53+
def load_graph():
    """Read the edge list named by the PARQUET env var and derive the nodes.

    Returns ``(ndf, edf, nodes)``: ``edf`` is the edge frame with ``src`` and
    ``dst`` cast to int64, ``nodes`` is the sorted array of unique endpoint
    ids, and ``ndf`` is a one-column (``id``) frame built from ``nodes``.

    Raises ``KeyError`` when PARQUET is not set.
    """
    edge_path = os.environ["PARQUET"]
    edf = pd.read_parquet(edge_path).astype({"src": np.int64, "dst": np.int64})
    endpoints = np.concatenate([edf["src"].values, edf["dst"].values])
    nodes = np.unique(endpoints)
    ndf = pd.DataFrame({"id": nodes})
    return ndf, edf, nodes
58+
59+
60+
def gfql_trav(g0, seed_ids, hops, engine):
    """Run a seeded forward traversal of ``hops`` edge steps through g0.chain.

    The chain is ``n({"id": seed_ids})`` (the frontier filter) followed by one
    ``e_forward()`` per hop, executed with the requested ``engine``. The chain
    API is used because it is the GFQL surface this bench relies on for all
    four engines.

    Returns whatever ``g0.chain`` returns for that op list.
    """
    steps = [n({"id": seed_ids})]
    steps.extend(e_forward() for _ in range(hops))
    return g0.chain(steps, engine=engine)
66+
67+
68+
def run_gfql(ndf, edf, nodes, ks, engines, reps, warm, outf, seed):
    """Time the GFQL bulk tasks on every requested engine and report them.

    For each engine this builds a graph from ``ndf``/``edf``, runs one untimed
    1-hop traversal as a warm-up, then for every K in ``ks`` times BULK1
    (1 hop) and BULK2 (2 hops) from the same K random seeds, and finally times
    the DEGALL out-degree aggregation. Each result is printed and, when
    ``outf`` is given, appended to it as one JSON line.

    Seeds are drawn without replacement from ``nodes`` using
    ``np.random.default_rng(seed)``, once per K, before the engine loop.
    Failures are printed and skipped rather than raised.
    """
    n_nodes, n_edges = len(ndf), len(edf)
    rng = np.random.default_rng(seed)
    seed_sets = {}
    for k in ks:
        seed_sets[k] = rng.choice(nodes, size=min(k, len(nodes)), replace=False).tolist()

    def emit(rec):
        # One JSON object per line, flushed immediately.
        if outf:
            outf.write(json.dumps(rec) + "\n")
            outf.flush()

    for engine in engines:
        try:
            g0 = graphistry.nodes(ndf, "id").edges(edf, "src", "dst")
            # Untimed first traversal so one-off setup cost stays out of the
            # measured runs below.
            gfql_trav(g0, seed_sets[ks[0]], 1, engine)
        except Exception as ex:
            print(f" gfql {engine}: SETUP FAILED {type(ex).__name__}: {ex}")
            continue

        # Frontier sweep: BULK1 (1-hop) and BULK2 (2-hop) for every K.
        for k in ks:
            sids = seed_sets[k]
            for task, hops in (("BULK1", 1), ("BULK2", 2)):
                try:
                    res = gfql_trav(g0, sids, hops, engine)
                    rows = int(res._edges.shape[0])
                    nn = int(res._nodes.shape[0])
                    ms = timeit(lambda: gfql_trav(g0, sids, hops, engine), reps, engine, warm)
                except Exception as ex:
                    print(f" gfql {engine} {task} k={k} FAILED: {type(ex).__name__}: {ex}")
                    continue
                rec = dict(system="gfql", engine=engine, task=task, k=k, hops=hops,
                           n=n_nodes, edges=n_edges, warm_ms=ms, e_rows=rows, n_rows=nn)
                print(f" gfql {engine:11} {task} k={k:>7} {ms:10.3f}ms e_rows={rows:>10} n_rows={nn:>9}")
                emit(rec)

        # DEGALL: full-graph out-degree aggregation (no traversal involved).
        try:
            ms, rows = degall(edf, engine, reps, warm)
            rec = dict(system="gfql", engine=engine, task="DEGALL", k=None, hops=0,
                       n=n_nodes, edges=n_edges, warm_ms=ms, e_rows=rows, n_rows=rows)
            print(f" gfql {engine:11} DEGALL{'':>13} {ms:10.3f}ms groups={rows:>10}")
            emit(rec)
        except Exception as ex:
            print(f" gfql {engine} DEGALL FAILED: {type(ex).__name__}: {ex}")
102+
103+
104+
def degall(edf, engine, reps, warm):
    """Time a full-graph out-degree count (group by ``src``) on one engine.

    ``edf`` is the pandas edge frame; for "cudf" and "polars"/"polars-gpu" it
    is converted to that library's frame once, before timing. "polars-gpu"
    runs the lazy query through ``pl.GPUEngine(executor="in-memory",
    raise_on_fail=False)``.

    Returns ``(median_ms, groups)`` where ``groups`` is the row count of the
    aggregation result. Raises ``ValueError(engine)`` for an unknown engine.
    """
    if engine == "pandas":
        def count_out_degrees():
            return edf.groupby("src").size()
    elif engine == "cudf":
        import cudf
        gpu_frame = cudf.from_pandas(edf)

        def count_out_degrees():
            return gpu_frame.groupby("src").size()
    elif engine in ("polars", "polars-gpu"):
        import polars as pl
        pl_frame = pl.from_pandas(edf)
        if engine == "polars-gpu":
            gpu_engine = pl.GPUEngine(executor="in-memory", raise_on_fail=False)

            def count_out_degrees():
                return pl_frame.lazy().group_by("src").len().collect(engine=gpu_engine)
        else:
            def count_out_degrees():
                return pl_frame.group_by("src").len()
    else:
        raise ValueError(engine)

    # One untimed run supplies the group count that is reported next to the timing.
    groups = int(count_out_degrees().shape[0])
    median_ms = timeit(count_out_degrees, reps, engine, warm)
    return median_ms, groups
126+
127+
128+
def run_kuzu(ndf, edf, nodes, ks, reps, warm, outf, seed, tmpdir):
    """Time the same bulk tasks on an embedded kuzu database.

    Loads ``ndf``/``edf`` into a throwaway kuzu database created under
    ``tmpdir``, then for every K in ``ks`` times the prepared 1-hop (BULK1) and
    2-hop (BULK2) queries from K random seeds, and finally the DEGALL
    out-degree aggregation. Results are materialized with ``get_as_df()``,
    printed, and appended to ``outf`` (when given) as JSON lines.

    Seeds are drawn from ``nodes`` with ``np.random.default_rng(seed)`` using
    the same expression as ``run_gfql``. If kuzu cannot be imported the
    function prints a notice and returns. Per-task failures are printed and
    skipped. The temporary database directory is removed even when loading or
    preparing the queries raises.
    """
    try:
        import kuzu
    except Exception:
        print(" kuzu: NOT AVAILABLE (pip install kuzu)")
        return
    rng = np.random.default_rng(seed)
    seed_sets = {k: rng.choice(nodes, size=min(k, len(nodes)), replace=False).tolist() for k in ks}
    n_nodes, n_edges = len(ndf), len(edf)

    def emit(rec):
        if outf:
            outf.write(json.dumps(rec) + "\n")
            outf.flush()

    dbp = tempfile.mkdtemp(dir=tmpdir)
    try:
        db = kuzu.Database(os.path.join(dbp, "kz"))
        conn = kuzu.Connection(db)
        conn.execute("CREATE NODE TABLE N(id INT64, PRIMARY KEY(id))")
        conn.execute("CREATE REL TABLE E(FROM N TO N)")
        np_path = os.path.join(dbp, "n.parquet")
        ep_path = os.path.join(dbp, "e.parquet")
        ndf.to_parquet(np_path)
        # Write exactly the two endpoint columns, in FROM/TO order, so extra or
        # reordered columns in the input frame cannot affect the COPY.
        edf[["src", "dst"]].rename(columns={"src": "from", "dst": "to"}).to_parquet(ep_path)
        t0 = time.perf_counter()
        conn.execute(f'COPY N FROM "{np_path}"')
        conn.execute(f'COPY E FROM "{ep_path}"')
        load_ms = (time.perf_counter() - t0) * 1e3
        print(f" kuzu load: {load_ms:.0f}ms")

        # BULK1/BULK2: one b.id row per matched path from the K seeds (NOT
        # de-duplicated), materialized columnar via get_as_df.
        q1 = conn.prepare("MATCH (a:N)-[:E]->(b:N) WHERE a.id IN $seeds RETURN b.id")
        q2 = conn.prepare("MATCH (a:N)-[:E]->()-[:E]->(b:N) WHERE a.id IN $seeds RETURN b.id")
        for k in ks:
            s = seed_sets[k]
            for task, stmt in (("BULK1", q1), ("BULK2", q2)):
                try:
                    rows = len(conn.execute(stmt, {"seeds": s}).get_as_df())
                    ms = timeit(lambda: conn.execute(stmt, {"seeds": s}).get_as_df(), reps, "kuzu", warm)
                except Exception as ex:
                    print(f" kuzu {task} k={k} FAILED: {type(ex).__name__}: {ex}")
                    continue
                rec = dict(system="kuzu", engine="kuzu", task=task, k=k, n=n_nodes, edges=n_edges,
                           warm_ms=ms, e_rows=rows, n_rows=rows, load_ms=load_ms)
                print(f" kuzu {'':11} {task} k={k:>7} {ms:10.3f}ms rows={rows:>10} (b.id, not-distinct)")
                emit(rec)

        # DEGALL: full out-degree aggregation. timeit() already performs the
        # `warm` untimed runs, so no separate warm-up loop is needed here.
        try:
            qd = "MATCH (a:N)-[:E]->() RETURN a.id, count(*) AS deg"
            rows = len(conn.execute(qd).get_as_df())
            ms = timeit(lambda: conn.execute(qd).get_as_df(), reps, "kuzu", warm)
            rec = dict(system="kuzu", engine="kuzu", task="DEGALL", k=None, n=n_nodes, edges=n_edges,
                       warm_ms=ms, e_rows=rows, n_rows=rows, load_ms=load_ms)
            print(f" kuzu {'':11} DEGALL{'':>13} {ms:10.3f}ms groups={rows:>10}")
            emit(rec)
        except Exception as ex:
            print(f" kuzu DEGALL FAILED: {type(ex).__name__}: {ex}")
    finally:
        # mkdtemp never cleans up after itself; at 10^8 edges a leaked database
        # directory is large, so remove it on every exit path.
        shutil.rmtree(dbp, ignore_errors=True)
174+
175+
176+
def main():
    """Entry point: load the graph, read the env configuration, run the systems.

    Env vars read here: KS, ENGINES, SYSTEMS, REPS, WARM, SEED, TMPDIR_BENCH and
    OUT (PARQUET is read by ``load_graph``). Comma-separated lists tolerate
    surrounding whitespace and empty items. When OUT is set, results are
    appended to that file, which is closed even if a run raises.
    """
    ndf, edf, nodes = load_graph()
    print(f"===== graph: {len(ndf):,} nodes {len(edf):,} edges =====")

    def csv_env(name, default):
        # "pandas, polars" must yield "polars", not " polars".
        return [tok.strip() for tok in os.environ.get(name, default).split(",") if tok.strip()]

    ks = [int(x) for x in csv_env("KS", "1000,10000,100000")]
    engines = csv_env("ENGINES", "pandas,polars,cudf,polars-gpu")
    systems = csv_env("SYSTEMS", "gfql,kuzu")
    reps = int(os.environ.get("REPS", "10"))
    warm = int(os.environ.get("WARM", "2"))
    seed = int(os.environ.get("SEED", "0"))
    tmpdir = os.environ.get("TMPDIR_BENCH", "/tmp/bulkbench")
    os.makedirs(tmpdir, exist_ok=True)
    outf = open(os.environ["OUT"], "a") if os.environ.get("OUT") else None
    try:
        if "gfql" in systems:
            run_gfql(ndf, edf, nodes, ks, engines, reps, warm, outf, seed)
        if "kuzu" in systems:
            run_kuzu(ndf, edf, nodes, ks, reps, warm, outf, seed, tmpdir)
    finally:
        if outf:
            outf.close()


if __name__ == "__main__":
    main()
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
#!/usr/bin/env python3
2+
"""Large REAL-graph CSR-index bench (Step 7). Power-law topology exposes what the
3+
uniform deg-8 synthetic never did: the index is O(degree), so warm latency is flat
4+
in N but scales with SEED DEGREE — a hub seed is the adversarial worst case.
5+
6+
Same trust discipline as index_takeover_bench.py: every GFQL timing is GUARDED by
7+
(index path actually taken via index_trace) AND (index result == scan result). A
8+
cell failing either guard is reported INVALID, never as a speedup.
9+
10+
Datasets (SNAP edge lists, gzipped `u v`, load once -> parquet cache):
11+
com-Orkut 3.07M nodes / 117M edges https://snap.stanford.edu/data/bigdata/communities/com-orkut.ungraph.txt.gz
12+
com-LiveJournal 4.0M / 34.7M https://snap.stanford.edu/data/bigdata/communities/com-lj.ungraph.txt.gz
13+
soc-LiveJournal1 4.8M / 69M (directed) https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz
14+
com-Friendster 65.6M / 1.8B (STRETCH) https://snap.stanford.edu/data/bigdata/communities/com-friendster.ungraph.txt.gz
15+
twitter-2010 41.7M / 1.47B (STRETCH)
16+
LDBC SNB sf10/sf100 via ~/Work/pyg-bench loader + the live snb-interactive-neo4j.
17+
18+
Env: EDGELIST=/path/to/edges.txt.gz (or PARQUET=/path/edges.parquet)
19+
DEG_PCTLS=50,90,99,100 MULTISEED=1,10,100,1000 ENGINES=pandas,polars,cudf,polars-gpu
20+
REPS=15 OUT=/tmp/lg.jsonl MAXSCAN_REPS=3 (cap scan reps at large E)
21+
"""
22+
from __future__ import annotations
23+
import gzip, json, os, statistics, time
24+
import numpy as np
25+
import pandas as pd
26+
import graphistry
27+
from graphistry.compute.gfql.index import index_trace
28+
29+
30+
def load_graph(seed=0):
    """Load an edge list as a graphistry graph with int64 ids.

    Source selection, in order:
      1. PARQUET, when set and the file exists.
      2. EDGELIST (whitespace-separated ``u v`` text, optionally gzipped, ``#``
         comments skipped); the parsed edges are cached next to it as
         ``<EDGELIST>.parquet`` and that cache is reused on later runs.
      3. Neither variable set: a synthetic graph of SYNTH_N nodes (default
         1,000,000) and 8x as many edges, with Zipf-distributed destinations,
         generated from ``seed``.

    Returns ``(graph, ndf, edf)``.

    Raises ``FileNotFoundError`` when PARQUET is set but missing and no
    EDGELIST is given, instead of silently benchmarking the synthetic graph
    under a real dataset's name.
    """
    pq = os.environ.get("PARQUET")
    el = os.environ.get("EDGELIST")
    if pq and os.path.exists(pq):
        edf = pd.read_parquet(pq)
    elif el:
        cache = el + ".parquet"
        if os.path.exists(cache):
            edf = pd.read_parquet(cache)
        else:
            opener = gzip.open if el.endswith(".gz") else open
            with opener(el, "rt") as f:
                edf = pd.read_csv(f, sep=r"\s+", comment="#", header=None,
                                  names=["src", "dst"], dtype=np.int64)
            edf.to_parquet(cache)
            print(f" cached parquet -> {cache}")
    elif pq:
        raise FileNotFoundError(f"PARQUET={pq} does not exist and EDGELIST is not set")
    else:
        # fallback synthetic power-law (Barabasi-ish via preferential attachment proxy)
        rng = np.random.default_rng(seed)
        n = int(os.environ.get("SYNTH_N", "1000000"))
        m = n * 8
        deg = rng.zipf(2.2, m) % n
        edf = pd.DataFrame({"src": rng.integers(0, n, m, dtype=np.int64), "dst": deg.astype(np.int64)})
    # Parquet inputs may carry other integer widths; normalize so ids are int64
    # on every path. Columns that are already int64 are left untouched.
    for col in ("src", "dst"):
        if edf[col].dtype != np.int64:
            edf[col] = edf[col].astype(np.int64)
    nodes = np.unique(np.concatenate([edf["src"].values, edf["dst"].values]))
    ndf = pd.DataFrame({"id": nodes})
    return graphistry.nodes(ndf, "id").edges(edf, "src", "dst"), ndf, edf
56+
57+
58+
def degree_seeds(edf, pctls):
    """Pick one seed id at each out-degree percentile (the O(degree) honesty sweep).

    Out-degree is the number of rows per ``src`` in ``edf``. For a percentile
    ``p < 100`` the seed is a node whose degree is the smallest one that is
    still >= the p-th percentile of the degree distribution, so the seed really
    sits at that percentile (ties resolve to the first such node in ``src``
    order). For ``p >= 100`` the seed is the maximum-degree node.

    Returns a dict mapping ``"p<p>"`` (or ``"max"`` for p >= 100) to
    ``(degree, seed_id)``.
    """
    deg = edf.groupby("src").size()
    out = {}
    for p in pctls:
        if p >= 100:
            sid = int(deg.idxmax())
            out["max"] = (int(deg.max()), sid)
        else:
            thr = np.percentile(deg.values, p)
            # Taking the first candidate in index order could return any node
            # above the threshold (even the hub); take the lowest-degree one.
            sid = int(deg[deg >= thr].idxmin())
            out[f"p{p}"] = (int(deg.loc[sid]), sid)
    return out
70+
71+
72+
def _sync(engine):
    """Synchronize the CUDA device for GPU engines; do nothing otherwise.

    Acts only for "cudf" and "polars-gpu". Any failure (cupy not importable,
    synchronize raising) is ignored so the call never interrupts a run.
    """
    gpu_backed = engine in ("cudf", "polars-gpu")
    if not gpu_backed:
        return
    try:
        import cupy as cp
        cp.cuda.runtime.deviceSynchronize()
    except Exception:
        # Intentional best-effort.
        pass
78+
79+
80+
def timeit(fn, reps, engine, warmup=2):
    """Return the median time of ``fn`` in milliseconds over ``reps`` runs.

    ``warmup`` untimed calls come first. Every call is followed by
    ``_sync(engine)``, which for timed calls is part of the measured window.
    Raises ``statistics.StatisticsError`` when ``reps`` is 0.
    """
    for _ in range(warmup):
        fn()
        _sync(engine)
    samples_ms = []
    for _ in range(reps):
        begin = time.perf_counter()
        fn()
        _sync(engine)
        samples_ms.append((time.perf_counter() - begin) * 1e3)
    # statistics.median sorts internally.
    return statistics.median(samples_ms)
87+
88+
89+
def _sig(g):
    """Return a cheap signature of a result graph for index-vs-scan comparison.

    The signature is ``(node_count, edge_count, sum(src) + sum(dst))`` taken
    from ``g._nodes`` and ``g._edges``. Frames whose type lives in a polars or
    cudf module are converted with ``to_pandas()`` first.
    """
    def as_pandas(frame):
        module_name = type(frame).__module__
        if "polars" in module_name or "cudf" in module_name:
            return frame.to_pandas()
        return frame

    nodes_df = as_pandas(g._nodes)
    edges_df = as_pandas(g._edges)
    endpoint_sum = int(edges_df["src"].sum()) + int(edges_df["dst"].sum())
    return (len(nodes_df), len(edges_df), endpoint_sum)
94+
95+
96+
def bench(g0, ndf, edf, engines, reps):
    """Run the guarded index-vs-scan sweeps for every engine.

    Per engine: build the index with ``g0.gfql_index_all`` (timed as
    ``build_ms``), then
      T3  1-hop from one seed per degree percentile (DEG_PCTLS), timing the
          indexed graph against the un-indexed scan. A cell is ``valid`` only
          when the trace shows the index path was taken AND the indexed result
          signature equals the scan signature.
      T4  1-hop from K random seeds (MULTISEED), recording which path the
          trace reports.

    Scan timings use at most MAXSCAN_REPS repetitions. Results are printed and,
    when OUT is set, appended to that file as JSON lines. The file is closed
    even if a sweep raises. An invalid T3 cell keeps its raw timings but
    carries no speedup, in the record or in the printed line.
    """
    maxscan = int(os.environ.get("MAXSCAN_REPS", "3"))
    E = len(edf)
    pctls = [int(x) for x in os.environ.get("DEG_PCTLS", "50,90,99,100").split(",")]
    multiseed = [int(x) for x in os.environ.get("MULTISEED", "1,10,100,1000").split(",")]
    dseeds = degree_seeds(edf, pctls)
    print(f" degree seeds: " + ", ".join(f"{k}=deg{d}" for k, (d, _) in dseeds.items()))
    outf = open(os.environ["OUT"], "a") if os.environ.get("OUT") else None

    def emit(rec):
        if outf:
            outf.write(json.dumps(rec) + "\n")
            outf.flush()

    try:
        for engine in engines:
            try:
                t0 = time.perf_counter()
                gi = g0.gfql_index_all(engine=engine)
                _sync(engine)
                build_ms = (time.perf_counter() - t0) * 1e3
            except Exception as ex:
                print(f" {engine}: BUILD FAILED {type(ex).__name__}: {ex}")
                continue

            # T3: seed-degree sweep (1-hop), guarded
            for deg, sid in dseeds.values():
                seeds = pd.DataFrame({"id": [sid]})
                with index_trace() as steps:
                    gidx = gi.hop(nodes=seeds, engine=engine, hops=1, direction="forward")
                # NOTE(review): assumes the trace list stays readable after the
                # with-block exits — confirm against index_trace.
                took = any(s.get("path") == "index" for s in steps)
                gscan = g0.hop(nodes=seeds, engine=engine, hops=1, direction="forward")
                same = _sig(gidx) == _sig(gscan)
                valid = took and same
                wi = timeit(lambda: gi.hop(nodes=seeds, engine=engine, hops=1, direction="forward"), reps, engine)
                ws = timeit(lambda: g0.hop(nodes=seeds, engine=engine, hops=1, direction="forward"),
                            min(reps, maxscan), engine)
                # A cell that fails either guard must never surface as a
                # speedup; also avoid dividing by a zero index time.
                speedup = ws / wi if (valid and wi) else None
                rec = dict(system="gfql", engine=engine, task="degsweep", seed_deg=deg, n=len(ndf), edges=E,
                           valid=valid, warm_idx_ms=wi, warm_scan_ms=ws, speedup=speedup, build_ms=build_ms)
                speedup_txt = f"{speedup:7.1f}" if speedup is not None else f"{'n/a':>7}"
                print(f" {engine:11} deg={deg:>8} idx={wi:9.4f}ms scan={ws:10.3f}ms x{speedup_txt}{'' if valid else ' <<INVALID'}")
                emit(rec)

            # T4: multi-seed frontier sweep (where the cost gate flips index->scan).
            # The generator is re-created per engine so every engine sees the
            # same seed sets.
            rng = np.random.default_rng(0)
            allids = ndf["id"].values
            for k in multiseed:
                seeds = pd.DataFrame({"id": rng.choice(allids, size=min(k, len(allids)), replace=False)})
                with index_trace() as steps:
                    gidx = gi.hop(nodes=seeds, engine=engine, hops=1, direction="forward")
                took = any(s.get("path") == "index" for s in steps)
                wi = timeit(lambda: gi.hop(nodes=seeds, engine=engine, hops=1, direction="forward"), reps, engine)
                print(f" {engine:11} kseed={k:>6} idx={wi:9.4f}ms path={'index' if took else 'scan'}")
                emit(dict(system="gfql", engine=engine, task="multiseed", kseed=k,
                          n=len(ndf), edges=E, took_index=took, warm_idx_ms=wi))
    finally:
        if outf:
            outf.close()
139+
140+
141+
def main():
    """Entry point: load the graph, print its size, and run ``bench``.

    ENGINES (default "pandas,polars") selects the engines and REPS (default
    15) the number of timed repetitions.
    """
    g0, ndf, edf = load_graph()
    print(f"===== graph: {len(ndf):,} nodes {len(edf):,} edges =====")
    engine_names = os.environ.get("ENGINES", "pandas,polars").split(",")
    rep_count = int(os.environ.get("REPS", "15"))
    bench(g0, ndf, edf, engine_names, rep_count)


if __name__ == "__main__":
    main()
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/usr/bin/env python3
2+
"""Fairest GFQL-vs-kuzu seeded 1-hop: BOTH in-process, warm, kuzu using a PREPARED
3+
statement (its fast path) + result fully materialized on both sides, same seed,
4+
matched answer counts. Removes Cypher-parse-per-call from kuzu so the comparison is
5+
engine-vs-engine, not engine-vs-(parse+engine). kuzu is embedded like GFQL (no bolt
6+
network), so it's the cleanest peer. Env: PARQUET=/data/<edges>.parquet"""
7+
import os, time, statistics, tempfile, shutil
8+
import numpy as np, pandas as pd, graphistry, kuzu
9+
10+
11+
def med(fn, reps=25, warm=4):
    """Return the median wall-clock time of ``fn`` in milliseconds.

    ``fn`` runs ``warm`` times untimed and then ``reps`` times timed with
    ``time.perf_counter``. Raises ``statistics.StatisticsError`` if ``reps``
    is 0.
    """
    for _ in range(warm):
        fn()
    elapsed_ms = []
    for _ in range(reps):
        started = time.perf_counter()
        fn()
        elapsed_ms.append((time.perf_counter() - started) * 1e3)
    # statistics.median sorts internally.
    return statistics.median(elapsed_ms)
17+
18+
19+
def main():
    """Compare a warm seeded 1-hop on GFQL (pandas index) against kuzu prepared.

    Loads the PARQUET edge list, builds the GFQL index and ONE kuzu database,
    then for a typical (median-degree) seed and the hub (max-degree) seed
    prints both medians, both row counts, whether the counts match, and the
    kuzu/GFQL time ratio.

    The kuzu database is built once and shared by both seeds; its temporary
    directory is removed even if loading or querying raises.
    """
    edf = pd.read_parquet(os.environ["PARQUET"]).astype({"src": np.int64, "dst": np.int64})
    nodes = np.unique(np.concatenate([edf["src"].values, edf["dst"].values]))
    print(f"graph: {len(nodes):,} nodes / {len(edf):,} edges")
    ndf = pd.DataFrame({"id": nodes})
    g = graphistry.nodes(ndf, "id").edges(edf, "src", "dst")
    gi = g.gfql_index_all(engine="pandas")
    deg = edf.groupby("src").size()
    # "typical" = a node whose degree is the smallest one >= the median, so it
    # really is median-degree rather than whichever qualifying id sorts first.
    typical_sid = int(deg[deg >= deg.median()].idxmin())
    hub_sid = int(deg.idxmax())

    dbp = tempfile.mkdtemp()
    try:
        # Loading the graph into kuzu does not depend on the seed, so do it
        # once instead of once per seed.
        db = kuzu.Database(os.path.join(dbp, "kz"))
        conn = kuzu.Connection(db)
        conn.execute("CREATE NODE TABLE N(id INT64, PRIMARY KEY(id))")
        conn.execute("CREATE REL TABLE E(FROM N TO N)")
        np_path = os.path.join(dbp, "n.parquet")
        ep_path = os.path.join(dbp, "e.parquet")
        ndf.to_parquet(np_path)
        # Only the two endpoint columns, in FROM/TO order.
        edf[["src", "dst"]].rename(columns={"src": "from", "dst": "to"}).to_parquet(ep_path)
        conn.execute(f'COPY N FROM "{np_path}"')
        conn.execute(f'COPY E FROM "{ep_path}"')
        stmt = conn.prepare("MATCH (a:N {id:$sid})-[:E]->(b:N) RETURN b.id")

        for tag, sid in (("typical", typical_sid), ("hub", hub_sid)):
            d = int(deg.loc[sid])
            seeds = pd.DataFrame({"id": [sid]})
            gfql_rows = int(gi.hop(nodes=seeds, engine="pandas", hops=1)._edges.shape[0])
            gfql_ms = med(lambda: gi.hop(nodes=seeds, engine="pandas", hops=1))

            # Columnar materialization (kuzu's fast result path) == GFQL's DataFrame output.
            def kq():
                conn.execute(stmt, {"sid": sid}).get_as_df()

            kr = len(conn.execute(stmt, {"sid": sid}).get_as_df())
            kuzu_ms = med(kq)
            ratio = kuzu_ms / gfql_ms if gfql_ms else float("nan")
            print(f" {tag:8} deg={d:>7} GFQL-pandas {gfql_ms:8.4f}ms (rows={gfql_rows}) "
                  f"kuzu-prepared {kuzu_ms:8.4f}ms (rows={kr}) match={gfql_rows==kr} "
                  f"GFQL {'faster' if ratio>1 else 'SLOWER'} {ratio:.2f}x")
    finally:
        shutil.rmtree(dbp, ignore_errors=True)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)