Description
The code below performs 10x worse without persist().
What I found so far:
- Order doesn't matter: you can also try
`for persist in (True, False):` or `for persist in (1, 2, 0, 3, 4):` - same result (0/False performs 10x worse than the others).
- It doesn't seem to happen at a small scale, N=700 (even after introducing delays).
"""Merging unpersisted read_storage() UDF chain is ~10x slower than persisted.
Not reproducible with read_values() — the overhead is specific to file-backed chains.
rm -rf .datachain && python repro_merge_storage.py
"""
import os
import shutil
import time
import datachain as dc
from pydantic import BaseModel
# Number of files written into each of the "a" and "b" subdirectories.
N = 7000
# Scratch directory holding the generated files; it is deleted before the
# files are generated and again at the end of the script.
DIR = "/tmp/_repro"
# Single-field pydantic model; this is the return type of `classify`.
class Label(BaseModel):
    tag: str
def classify(file: dc.TextFile) -> Label:
    """Return a ``Label`` whose tag is the whitespace-stripped content of *file*."""
    content = file.read()
    return Label(tag=content.strip())
# Remove any tree left by a previous run: os.makedirs() below raises if the
# subdirectories already exist.
shutil.rmtree(DIR, ignore_errors=True)
for sub in ("a", "b"):
    os.makedirs(f"{DIR}/{sub}")
    for i in range(N):
        # Each file holds "<subdir>_<index>". A context manager closes every
        # handle deterministically instead of leaving 2 * N open handles to
        # the garbage collector.
        with open(f"{DIR}/{sub}/{i}.txt", "w") as fh:
            fh.write(f"{sub}_{i}")
# NOTE(review): the original indentation was lost; this pause is assumed to run
# once, after all files have been written - confirm.
time.sleep(1)
# Join key: the file stem computed from each row's "file.path" column.
stem = dc.func.path.file_stem(dc.C("file.path"))

# Time the same merge twice: first with the right-hand chain left unpersisted,
# then with it persisted before merging.
for persist in (False, True):
    started = time.time()

    # Right-hand side: map every file in b/ through the UDF, then keep only
    # the label and the join key.
    right = dc.read_storage(f"{DIR}/b/", type="text")
    right = right.map(label=classify)
    right = right.mutate(stem=stem)
    right = right.select("label", "stem")
    if persist:
        right = right.persist()

    # Left-hand side: the files in a/, merged with the right-hand chain on the
    # stem and saved under a per-variant dataset name.
    left = dc.read_storage(f"{DIR}/a/", type="text")
    merged = left.merge(right, on=stem, right_on="stem").save(f"repro_{persist}")

    # Take the elapsed time before calling count() so the row count lookup is
    # not part of the reported duration.
    elapsed = time.time() - started
    rows = merged.count()
    print(f"persist={persist}: {elapsed:.1f}s, {rows} rows")

# NOTE(review): the original indentation was lost; cleanup is assumed to run
# once, after both variants have been timed - confirm.
shutil.rmtree(DIR)
Version Info
0.49.2.dev114+gc646881e7.d20260405
Python 3.12.1
Description
The code below performs 10x worse without persist().
What I found so far:
`for persist in (True, False):` or `for persist in (1, 2, 0, 3, 4):` - same result (0/False performs 10x worse than the others).
Version Info