Skip to content

Commit 2ba1faa

Browse files
beinan and claude authored
fix: avoid dataset enumeration on GCS/S3 for query execution (lance-format#89)
Previously, every query would enumerate all datasets on cloud storage and load all of them, causing ~10s latency with 20+ datasets on GCS. Now the query parser extracts which tables are actually referenced (via node_labels() and relationship_types()), and only those specific datasets are loaded. Paths are computed directly from the root path without enumeration.

Fixes lance-format#87

Co-authored-by: Claude <noreply@anthropic.com>
1 parent 90bdc5e commit 2ba1faa

2 files changed

Lines changed: 25 additions & 6 deletions

File tree

python/python/knowledge_graph/service.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,18 @@ def run(
116116
*,
117117
datasets: Optional[Mapping[str, pa.Table]] = None,
118118
) -> pa.Table:
119-
"""Execute a Cypher statement against Lance datasets."""
119+
"""Execute a Cypher statement against Lance datasets.
120+
121+
Only loads the datasets referenced in the query, avoiding expensive
122+
enumeration of all datasets on cloud storage.
123+
"""
120124
query = CypherQuery(statement).with_config(self._config)
121-
base_tables: MutableMapping[str, "pa.Table"] = dict(self._store.load_tables())
125+
126+
# Only load tables that are actually referenced in the query
127+
referenced_tables = set(query.node_labels()) | set(query.relationship_types())
128+
base_tables: MutableMapping[str, "pa.Table"] = dict(
129+
self._store.load_tables(referenced_tables)
130+
)
122131
if datasets:
123132
base_tables.update(datasets)
124133
return query.execute(base_tables)

python/python/knowledge_graph/store.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,16 +138,26 @@ def load_tables(
138138
self,
139139
names: Optional[Iterable[str]] = None,
140140
) -> Mapping[str, "pa.Table"]:
141-
"""Load Lance datasets as PyArrow tables."""
141+
"""Load Lance datasets as PyArrow tables.
142+
143+
When specific names are provided, this method computes paths directly
144+
without enumerating all datasets - significantly faster on cloud storage.
145+
"""
142146
lance = self._get_lance()
143147

144148
self.ensure_layout()
145-
available = self.list_datasets()
146-
requested = list(names) if names is not None else list(available.keys())
149+
150+
# Only enumerate datasets when no specific names are requested
151+
if names is not None:
152+
requested = list(names)
153+
else:
154+
available = self.list_datasets()
155+
requested = list(available.keys())
147156

148157
tables: Dict[str, "pa.Table"] = {}
149158
for name in requested:
150-
path = available.get(name, self._dataset_path(name))
159+
# Compute path directly - no need to look up from enumeration
160+
path = self._dataset_path(name)
151161
if not self._path_exists(path):
152162
raise FileNotFoundError(f"Dataset '{name}' not found at {path}")
153163
dataset = lance.dataset(

0 commit comments

Comments
 (0)