Skip to content

Commit 2af042c

Browse files
Merge pull request #14 from VectifyAI/v0.4
bugs fixed
2 parents 4edcb1c + 922e931 commit 2af042c

15 files changed

Lines changed: 278 additions & 138 deletions

contextdb/__init__.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
__version__ = "0.4.0"
22

3-
from contextdb.api.condb import ConDB, ConDBError, LLMNotConfiguredError, QueryResult, TreeNotFoundError
3+
from contextdb.api.condb import (
4+
ConDB,
5+
ConDBError,
6+
LLMNotConfiguredError,
7+
QueryResult,
8+
TreeNotFoundError,
9+
open, # noqa: A004
10+
)
411
from contextdb.api.context_tree import ContextTree
5-
from contextdb.api.condb import open # noqa: A004
612
from contextdb.core.storage import Entity, Node, StorageProtocol, TreeDB
713
from contextdb.llm import LLMClient, LLMProtocol
814
from contextdb.retriever import BeamRetriever, BlockRetriever, ManualRetriever, RetrievalResult

contextdb/config/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
"""Configuration loader for ContextDB."""
22

33
import os
4-
import yaml
54
from pathlib import Path
6-
from typing import Any
75

6+
import yaml
87
from dotenv import load_dotenv
98

109
CONFIG_DIR = Path(__file__).parent

contextdb/retriever/algorithm/block_cutter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ def _generate_block_content(self, nodes: list[dict]) -> str:
367367
meta_lines.append(f" range: {page_start}-{page_end}")
368368

369369
node_metadata.append(meta_lines)
370-
metadata_chars += sum(len(l) for l in meta_lines)
370+
metadata_chars += sum(len(line) for line in meta_lines)
371371

372372
text = payload.get("text") or payload.get("content") or ""
373373
node_texts.append(text)

contextdb/retriever/algorithm/block_retriever.py

Lines changed: 140 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
]
6161

6262

63-
class BlockRetriever(BaseRetriever):
63+
class BlockRetriever(BlockRetrieverFilesystemSupport, BlockRetrieverPromptCacheSupport, BaseRetriever):
6464

6565
def __init__(
6666
self,
@@ -120,22 +120,14 @@ def __init__(
120120
self.max_tokens_per_block,
121121
self.min_tokens_per_block,
122122
)
123-
self._filesystem_support = BlockRetrieverFilesystemSupport(self)
124-
self._prompt_cache_support = BlockRetrieverPromptCacheSupport(self)
125123
self._plan_cache: dict[str, BlockTreePlan] = {}
126124
self._precomputed_tree_id: str = ""
127-
128-
def __getattr__(self, name: str):
129-
if name.startswith("__"):
130-
raise AttributeError(name)
131-
132-
for support_name in ("_filesystem_support", "_prompt_cache_support"):
133-
support = self.__dict__.get(support_name)
134-
if support is None:
135-
continue
136-
if hasattr(type(support), name):
137-
return getattr(support, name)
138-
raise AttributeError(f"{type(self).__name__!s} object has no attribute {name!r}")
125+
self._fs_node_cache: dict[tuple[str, str], dict[str, Any]] = {}
126+
self._fs_attrs_cache: dict[tuple[str, str], dict[str, Any]] = {}
127+
self._fs_path_cache: dict[tuple[str, str], str] = {}
128+
self._fs_is_dir_cache: dict[tuple[str, str], bool] = {}
129+
self._fs_children_cache: dict[tuple[str, str], list[dict[str, Any]]] = {}
130+
self._fs_block_render_cache: dict[tuple[str, ...], tuple[str, int]] = {}
139131

140132
def retrieve(
141133
self,
@@ -164,6 +156,7 @@ def _retrieve_fs(
164156
return self._empty_result()
165157

166158
if tree_id != self._precomputed_tree_id:
159+
self._clear_fs_lookup_cache()
167160
self.token_counter.clear_cache()
168161
self.token_counter.precompute_tree_tokens(self.storage, tree_id)
169162
self._precomputed_tree_id = tree_id
@@ -783,18 +776,18 @@ def _process_block(
783776
def _update_frontier(self, node_ids, tree_id, beam_size):
784777
next_frontier = []
785778
for node_id in node_ids:
786-
node = self.storage.get_node(tree_id, node_id)
787-
attrs = {}
788-
if node and node.attrs_json:
789-
try:
790-
attrs = json.loads(node.attrs_json)
791-
except json.JSONDecodeError:
792-
attrs = {}
793-
794-
frontier_path = attrs.get("rel_path", "") if self.mode == "filesystem" else (node.path if node else "")
779+
if self.mode == "filesystem":
780+
node = self._get_cached_fs_node_dict(tree_id, node_id)
781+
attrs = self._get_cached_fs_attrs(tree_id, node_id, node=node)
782+
frontier_path = attrs.get("rel_path", "")
783+
title = attrs.get("title", "")
784+
else:
785+
node = self.storage.get_node(tree_id, node_id)
786+
frontier_path = node.path if node else ""
787+
title = ""
795788
next_frontier.append({
796789
"node_id": node_id,
797-
"title": attrs.get("title", ""),
790+
"title": title,
798791
"path": frontier_path,
799792
})
800793
if beam_size and len(next_frontier) >= beam_size:
@@ -808,7 +801,13 @@ def _update_beams(self, node_ids, tree_id, beam_size):
808801
def _frontier_has_children(self, tree_id: str, frontier: list[dict[str, str]]) -> bool:
809802
for frontier_node in frontier:
810803
node_id = frontier_node.get("node_id", "")
811-
if node_id and self.storage.get_children(tree_id, node_id):
804+
if not node_id:
805+
continue
806+
if self.mode == "filesystem":
807+
if self._fs_node_has_children(tree_id, node_id):
808+
return True
809+
continue
810+
if self.storage.get_children(tree_id, node_id):
812811
return True
813812
return False
814813

@@ -831,14 +830,14 @@ def _override_done_if_dirs(self, result: BlockResult, tree_id: str, beams: list[
831830
return self._override_done_if_frontier_dirs(result, tree_id, beams)
832831

833832
def _is_fs_directory_id(self, tree_id: str, node_id: str) -> bool:
834-
node = self.storage.get_node(tree_id, node_id)
835-
if not node or not node.attrs_json:
836-
return False
837-
try:
838-
attrs = json.loads(node.attrs_json)
839-
except json.JSONDecodeError:
840-
return False
841-
return bool(attrs.get("is_dir", False))
833+
key = (tree_id, node_id)
834+
if key in self._fs_is_dir_cache:
835+
return self._fs_is_dir_cache[key]
836+
837+
attrs = self._get_cached_fs_attrs(tree_id, node_id)
838+
is_dir = bool(attrs.get("is_dir", False))
839+
self._fs_is_dir_cache[key] = is_dir
840+
return is_dir
842841

843842
# ---- allowed node filtering (dynamic, but content stays fixed) ----
844843

@@ -876,19 +875,120 @@ def _get_node_paths(self, tree_id: str, node_ids: list[str]) -> dict[str, str]:
876875

877876
cursor = self.storage.conn.cursor()
878877
path_map: dict[str, str] = {}
878+
missing: list[str] = []
879+
seen_missing: set[str] = set()
880+
881+
for node_id in node_ids:
882+
key = (tree_id, node_id)
883+
cached_path = self._fs_path_cache.get(key)
884+
if cached_path is not None:
885+
path_map[node_id] = cached_path
886+
elif node_id not in seen_missing:
887+
seen_missing.add(node_id)
888+
missing.append(node_id)
889+
890+
if not missing:
891+
return {node_id: path_map[node_id] for node_id in node_ids if node_id in path_map}
892+
879893
chunk_size = 500
880894

881-
for i in range(0, len(node_ids), chunk_size):
882-
chunk = node_ids[i:i + chunk_size]
895+
for i in range(0, len(missing), chunk_size):
896+
chunk = missing[i:i + chunk_size]
883897
placeholders = ",".join("?" for _ in chunk)
884898
cursor.execute(
885899
f"SELECT node_id, path FROM nodes WHERE tree_id = ? AND node_id IN ({placeholders})",
886900
(tree_id, *chunk),
887901
)
888902
for row in cursor.fetchall():
889-
path_map[row["node_id"]] = row["path"]
903+
node_id = row["node_id"]
904+
path = row["path"]
905+
self._fs_path_cache[(tree_id, node_id)] = path
906+
path_map[node_id] = path
907+
908+
return {node_id: path_map[node_id] for node_id in node_ids if node_id in path_map}
909+
910+
@staticmethod
def _parse_fs_attrs(attrs_value: Any) -> dict[str, Any]:
    """Normalize a node's attributes value into a dict.

    Args:
        attrs_value: An already-decoded dict, or a JSON string.

    Returns:
        The given dict itself (not a copy) when a dict is passed; the
        decoded object when a non-empty string decodes to a JSON object;
        an empty dict in every other case (empty string, invalid JSON,
        JSON that is not an object, or any other input type).
    """
    if isinstance(attrs_value, dict):
        return attrs_value
    if not isinstance(attrs_value, str) or not attrs_value:
        return {}
    try:
        parsed = json.loads(attrs_value)
    except json.JSONDecodeError:
        return {}
    # Valid JSON that is not an object (list, number, ...) cannot serve as attrs.
    return parsed if isinstance(parsed, dict) else {}
921+
922+
def _remember_fs_node(self, tree_id: str, node: dict[str, Any]) -> dict[str, Any]:
    """Record a node dict in the filesystem lookup caches and return it.

    Args:
        tree_id: Tree the node belongs to; forms the cache key together
            with the node's own "node_id".
        node: Node dict. Without a truthy "node_id" nothing is cached and
            the dict is returned unchanged.

    Returns:
        The dict now stored in the node cache. When an entry already
        existed this is a new dict combining the old entry with ``node``;
        otherwise it is ``node`` itself.
    """
    node_id = node.get("node_id")
    if not node_id:
        return node

    key = (tree_id, node_id)
    existing = self._fs_node_cache.get(key)
    if existing:
        # Later keys win, so fields absent from the new dict (for example
        # "entity") keep their previously cached value.
        node = {**existing, **node}

    self._fs_node_cache[key] = node
    path = node.get("path")
    if isinstance(path, str):
        self._fs_path_cache[key] = path

    # An "attrs" key, when present, takes precedence over "attrs_json".
    attrs = self._parse_fs_attrs(node.get("attrs") if "attrs" in node else node.get("attrs_json"))
    self._fs_attrs_cache[key] = attrs
    self._fs_is_dir_cache[key] = bool(attrs.get("is_dir", False))
    return node
944+
945+
def _get_cached_fs_node_dict(self, tree_id: str, node_id: str) -> dict[str, Any] | None:
946+
key = (tree_id, node_id)
947+
cached = self._fs_node_cache.get(key)
948+
if cached is not None:
949+
return cached
890950

891-
return path_map
951+
node = self.storage.get_node(tree_id, node_id)
952+
if not node:
953+
return None
954+
return self._remember_fs_node(tree_id, node.to_dict())
955+
956+
def _get_cached_fs_attrs(
957+
self,
958+
tree_id: str,
959+
node_id: str,
960+
node: dict[str, Any] | None = None,
961+
) -> dict[str, Any]:
962+
key = (tree_id, node_id)
963+
cached = self._fs_attrs_cache.get(key)
964+
if cached is not None:
965+
return cached
966+
967+
node = node or self._get_cached_fs_node_dict(tree_id, node_id)
968+
if not node:
969+
attrs: dict[str, Any] = {}
970+
else:
971+
attrs = self._parse_fs_attrs(node.get("attrs") if "attrs" in node else node.get("attrs_json"))
972+
self._remember_fs_node(tree_id, node)
973+
974+
self._fs_attrs_cache[key] = attrs
975+
self._fs_is_dir_cache[key] = bool(attrs.get("is_dir", False))
976+
return attrs
977+
978+
def _fs_node_has_children(self, tree_id: str, node_id: str) -> bool:
    """Report whether a filesystem node has at least one direct child.

    A child list already present in the children cache answers the
    question directly (an empty cached list means "no children");
    otherwise the children are fetched via ``_get_direct_children_nodes``.
    """
    cached = self._fs_children_cache.get((tree_id, node_id))
    if cached is not None:
        return bool(cached)
    # NOTE(review): presumably the helper fills the children cache itself;
    # this method never writes to it. Confirm against the helper.
    return bool(self._get_direct_children_nodes(tree_id, node_id))
984+
985+
def _clear_fs_lookup_cache(self) -> None:
    """Empty every filesystem lookup cache in place.

    The dict objects themselves are kept, so references held elsewhere
    stay valid; only their entries are removed.
    """
    self._fs_node_cache.clear()
    self._fs_attrs_cache.clear()
    self._fs_path_cache.clear()
    self._fs_is_dir_cache.clear()
    self._fs_children_cache.clear()
    self._fs_block_render_cache.clear()
892992

893993
# ---- DB helpers ----
894994

@@ -922,12 +1022,14 @@ def _empty_result(self):
9221022

9231023
def clear_cache(self):
    """Drop every cached plan and empty the filesystem lookup caches."""
    self._plan_cache.clear()
    self._clear_fs_lookup_cache()
9251026

9261027
def clear_plan_cache(self, tree_id=None):
9271028
if tree_id:
9281029
self._plan_cache.pop(tree_id, None)
9291030
else:
9301031
self._plan_cache.clear()
1032+
self._clear_fs_lookup_cache()
9311033

9321034
def get_cache_stats(self):
    """Return cache statistics as a dict holding the number of cached plans."""
    return {"plan_cache_size": len(self._plan_cache)}

0 commit comments

Comments
 (0)