Skip to content

Commit d13b7f9

Browse files
authored
feat: fix chunk id generation (#984)
* feat: fix chunk id generation * feat: fix remove doc * feat: clean up log
1 parent 7b04be2 commit d13b7f9

16 files changed

Lines changed: 486 additions & 334 deletions

File tree

aperag/db/repositories/lightrag.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -878,3 +878,23 @@ def _query(session):
878878
return [row[0] for row in result.fetchall()]
879879

880880
return self._execute_query(_query)
881+
882+
def query_lightrag_vdb_entity_all(self, workspace: str):
883+
"""Query all LightRAG VDB Entity records for workspace"""
884+
885+
def _query(session):
886+
stmt = select(LightRAGVDBEntityModel).where(LightRAGVDBEntityModel.workspace == workspace)
887+
result = session.execute(stmt)
888+
return {entity.id: entity for entity in result.scalars().all()}
889+
890+
return self._execute_query(_query)
891+
892+
def query_lightrag_vdb_relation_all(self, workspace: str):
893+
"""Query all LightRAG VDB Relation records for workspace"""
894+
895+
def _query(session):
896+
stmt = select(LightRAGVDBRelationModel).where(LightRAGVDBRelationModel.workspace == workspace)
897+
result = session.execute(stmt)
898+
return {relation.id: relation for relation in result.scalars().all()}
899+
900+
return self._execute_query(_query)

aperag/graph/lightrag/kg/neo4j_sync_impl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ async def initialize(self):
8282
# Prepare database in thread to avoid blocking
8383
self._DATABASE = await asyncio.to_thread(Neo4jSyncConnectionManager.prepare_database, self.workspace)
8484

85-
logger.info(f"Neo4JSyncStorage initialized for workspace '{self.workspace}', database '{self._DATABASE}'")
85+
logger.debug(f"Neo4JSyncStorage initialized for workspace '{self.workspace}', database '{self._DATABASE}'")
8686

8787
async def finalize(self):
8888
"""Clean up resources."""

aperag/graph/lightrag/kg/postgres_sync_impl.py

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class PGOpsSyncKVStorage(BaseKVStorage):
5353

5454
async def initialize(self):
5555
"""Initialize storage."""
56-
logger.info(f"PGOpsSyncKVStorage initialized for workspace '{self.workspace}'")
56+
logger.debug(f"PGOpsSyncKVStorage initialized for workspace '{self.workspace}'")
5757

5858
async def finalize(self):
5959
"""Clean up resources."""
@@ -240,12 +240,74 @@ class PGOpsSyncVectorStorage(BaseVectorStorage):
240240

241241
async def initialize(self):
242242
"""Initialize storage."""
243-
logger.info(f"PGOpsSyncVectorStorage initialized for workspace '{self.workspace}'")
243+
logger.debug(f"PGOpsSyncVectorStorage initialized for workspace '{self.workspace}'")
244244

245245
async def finalize(self):
246246
"""Clean up resources."""
247247
logger.debug(f"PGOpsSyncVectorStorage finalized for workspace '{self.workspace}'")
248248

249+
async def get_all(self) -> dict[str, Any]:
250+
"""Get all data from vector storage"""
251+
252+
def _sync_get_all():
253+
# Import here to avoid circular imports
254+
from aperag.db.ops import db_ops
255+
from aperag.graph.lightrag.namespace import NameSpace, is_namespace
256+
257+
# Determine which table to query based on namespace
258+
if is_namespace(self.namespace, NameSpace.VECTOR_STORE_CHUNKS):
259+
models = db_ops.query_lightrag_doc_chunks_all(self.workspace)
260+
return {
261+
chunk_id: {
262+
"id": chunk_id,
263+
"tokens": model.tokens,
264+
"content": model.content or "",
265+
"chunk_order_index": model.chunk_order_index,
266+
"full_doc_id": model.full_doc_id,
267+
"content_vector": model.content_vector,
268+
"file_path": model.file_path,
269+
"created_at": int(model.create_time.timestamp()) if model.create_time else None,
270+
}
271+
for chunk_id, model in models.items()
272+
}
273+
elif is_namespace(self.namespace, NameSpace.VECTOR_STORE_ENTITIES):
274+
models = db_ops.query_lightrag_vdb_entity_all(self.workspace)
275+
return {
276+
entity_id: {
277+
"id": entity_id,
278+
"entity_name": model.entity_name,
279+
"content": model.content or "",
280+
"content_vector": model.content_vector,
281+
"chunk_ids": model.chunk_ids or [],
282+
"file_path": model.file_path,
283+
"created_at": int(model.create_time.timestamp()) if model.create_time else None,
284+
}
285+
for entity_id, model in models.items()
286+
}
287+
elif is_namespace(self.namespace, NameSpace.VECTOR_STORE_RELATIONSHIPS):
288+
models = db_ops.query_lightrag_vdb_relation_all(self.workspace)
289+
return {
290+
relation_id: {
291+
"id": relation_id,
292+
"source_id": model.source_id,
293+
"target_id": model.target_id,
294+
"content": model.content or "",
295+
"content_vector": model.content_vector,
296+
"chunk_ids": model.chunk_ids or [],
297+
"file_path": model.file_path,
298+
"created_at": int(model.create_time.timestamp()) if model.create_time else None,
299+
# Add additional fields that might be expected
300+
"src_id": model.source_id,
301+
"tgt_id": model.target_id,
302+
}
303+
for relation_id, model in models.items()
304+
}
305+
else:
306+
logger.error(f"Unknown namespace for get_all: {self.namespace}")
307+
return {}
308+
309+
return await asyncio.to_thread(_sync_get_all)
310+
249311
def _prepare_vector_data(self, item: dict[str, Any], current_time: datetime.datetime) -> dict[str, Any]:
250312
"""Prepare vector data based on namespace."""
251313
from aperag.graph.lightrag.namespace import NameSpace, is_namespace

aperag/graph/lightrag/kg/qdrant_impl.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,19 +45,27 @@
4545
from ..utils import logger
4646

4747

48-
def compute_mdhash_id_for_qdrant(content: str, prefix: str = "", style: str = "simple") -> str:
48+
def compute_mdhash_id_for_qdrant(content: str, prefix: str = "", workspace: str = "", style: str = "simple") -> str:
4949
"""
50-
Generate a UUID based on the content and support multiple formats.
50+
Generate a UUID based on the content with workspace isolation and support multiple formats.
5151
5252
:param content: The content used to generate the UUID.
53+
:param prefix: The prefix to add to the hash
54+
:param workspace: The workspace identifier for data isolation
5355
:param style: The format of the UUID, optional values are "simple", "hyphenated", "urn".
5456
:return: A UUID that meets the requirements of Qdrant.
5557
"""
5658
if not content:
5759
raise ValueError("Content must not be empty.")
5860

61+
# Combine content with workspace to ensure isolation
62+
if workspace:
63+
hash_input = f"{workspace}::{content}"
64+
else:
65+
hash_input = content
66+
5967
# Use the hash value of the content to create a UUID.
60-
hashed_content = hashlib.sha256((prefix + content).encode("utf-8")).digest()
68+
hashed_content = hashlib.sha256((prefix + hash_input).encode("utf-8")).digest()
6169
generated_uuid = uuid.UUID(bytes=hashed_content[:16], version=4)
6270

6371
# Return the UUID according to the specified format.
@@ -122,7 +130,7 @@ async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
122130
for i, d in enumerate(list_data):
123131
list_points.append(
124132
models.PointStruct(
125-
id=compute_mdhash_id_for_qdrant(d["id"]),
133+
id=compute_mdhash_id_for_qdrant(d["id"], workspace=self.workspace),
126134
vector=embeddings[i],
127135
payload=d,
128136
)
@@ -160,7 +168,7 @@ async def delete(self, ids: List[str]) -> None:
160168
"""
161169
try:
162170
# Convert regular ids to Qdrant compatible ids
163-
qdrant_ids = [compute_mdhash_id_for_qdrant(id) for id in ids]
171+
qdrant_ids = [compute_mdhash_id_for_qdrant(id, workspace=self.workspace) for id in ids]
164172
# Delete points from the collection
165173
self._client.delete(
166174
collection_name=self._collection_name,
@@ -181,7 +189,7 @@ async def delete_entity(self, entity_name: str) -> None:
181189
"""
182190
try:
183191
# Generate the entity ID
184-
entity_id = compute_mdhash_id_for_qdrant(entity_name, prefix="ent-")
192+
entity_id = compute_mdhash_id_for_qdrant(entity_name, prefix="ent-", workspace=self.workspace)
185193
logger.debug(f"Attempting to delete entity {entity_name} with ID {entity_id}")
186194

187195
# Delete the entity point from the collection
@@ -246,7 +254,7 @@ async def get_by_id(self, id: str) -> dict[str, Any] | None:
246254
"""
247255
try:
248256
# Convert to Qdrant compatible ID
249-
qdrant_id = compute_mdhash_id_for_qdrant(id)
257+
qdrant_id = compute_mdhash_id_for_qdrant(id, workspace=self.workspace)
250258

251259
# Retrieve the point by ID
252260
result = self._client.retrieve(
@@ -282,7 +290,7 @@ async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
282290

283291
try:
284292
# Convert to Qdrant compatible IDs
285-
qdrant_ids = [compute_mdhash_id_for_qdrant(id) for id in ids]
293+
qdrant_ids = [compute_mdhash_id_for_qdrant(id, workspace=self.workspace) for id in ids]
286294

287295
# Retrieve the points by IDs
288296
results = self._client.retrieve(

0 commit comments

Comments
 (0)