1515import httpx
1616from chromadb .api import AsyncClientAPI
1717from chromadb .api .models .AsyncCollection import AsyncCollection
18- from chromadb .api .types import EmbeddingFunction , IncludeEnum , QueryResult
18+ from chromadb .api .types import IncludeEnum , QueryResult
1919from chromadb .config import APIVersion , Settings
2020from chromadb .errors import InvalidCollectionException
2121from tree_sitter import Point
2828 expand_globs ,
2929 expand_path ,
3030)
31- from vectorcode .common import get_embedding_function
3231from vectorcode .database .base import DatabaseConnectorBase
3332from vectorcode .database .errors import CollectionNotFoundError
3433from vectorcode .database .types import (
@@ -355,19 +354,15 @@ async def vectorise(
355354 self ,
356355 file_path : str ,
357356 chunker : TreeSitterChunker | None = None ,
358- embedding_function : EmbeddingFunction | None = None ,
359357 ) -> VectoriseStats :
360358 collection_path = str (self ._configs .project_root )
361359 collection = await self ._create_or_get_collection (
362360 collection_path , allow_create = True
363361 )
364362 chunker = chunker or TreeSitterChunker (self ._configs )
365- embedding_function = cast (
366- EmbeddingFunction ,
367- embedding_function or get_embedding_function (self ._configs ),
368- )
363+
369364 chunks = tuple (chunker .chunk (file_path ))
370- embeddings = embedding_function (list (i .text for i in chunks ))
365+ embeddings = self . get_embedding (list (i .text for i in chunks ))
371366
372367 file_hash = hash_file (file_path )
373368
@@ -414,7 +409,7 @@ async def list_collections(self):
414409 for col_name in await client .list_collections ():
415410 col = await client .get_collection (col_name )
416411 project_root = str (col .metadata .get ("path" ))
417- col_counts = await self .list ()
412+ col_counts = await self .list_collection_content ()
418413 result .append (
419414 CollectionInfo (
420415 id = col_name ,
@@ -430,7 +425,7 @@ async def list_collections(self):
430425 )
431426 return result
432427
433- async def list (self , what = None ) -> CollectionContent :
428+ async def list_collection_content (self , what = None ) -> CollectionContent :
434429 """
435430 When `what` is None, this method should populate both `CollectionContent.files` and `CollectionContent.chunks`.
436431 Otherwise, this method may populate only one of them to save waiting time.
@@ -494,7 +489,7 @@ async def delete(self) -> int:
494489 ]
495490 files_in_collection = set (
496491 str (expand_path (i .path , True ))
497- for i in (await self .list (ResultType .document )).files
492+ for i in (await self .list_collection_content (ResultType .document )).files
498493 )
499494
500495 rm_paths = {
@@ -516,3 +511,38 @@ async def drop(
516511 async with _Chroma0ClientManager ().get_client (self ._configs ) as client :
517512 await self ._create_or_get_collection (collection_path , False )
518513 await client .delete_collection (get_collection_id (collection_path ))
514+
async def get_chunks(self, file_path) -> list[Chunk]:
    """
    Return the chunks stored in the project's collection for `file_path`.

    `file_path` is made absolute with `os.path.abspath` and matched against
    the `path` metadata field of the stored records.

    Returns an empty list (after logging a warning) when no collection
    exists for the configured project root; any other error raised while
    opening the collection propagates to the caller.

    For each returned chunk, `start`/`end` are set only when the record has
    the corresponding metadata value; that value is used as the row of a
    `Point` whose column is always 0.
    """
    file_path = os.path.abspath(file_path)
    try:
        collection = await self._create_or_get_collection(
            collection_path=str(self._configs.project_root), allow_create=False
        )
    except CollectionNotFoundError:
        _logger.warning(
            f"There's no existing collection at {self._configs.project_root}."
        )
        return []

    raw_results = await collection.get(
        where={"path": file_path},
        include=[IncludeEnum.metadatas, IncludeEnum.documents],
    )
    metadatas = raw_results["metadatas"]
    documents = raw_results["documents"]
    # Both fields were requested via `include`; the asserts narrow the
    # optional types for the loop below.
    assert metadatas is not None
    assert documents is not None

    result: list[Chunk] = []
    # ids, documents and metadatas are parallel lists: entry i of each
    # describes the same stored record.
    for _id, text, meta in zip(raw_results["ids"], documents, metadatas):
        chunk = Chunk(text=text, id=_id)
        start_row = meta.get("start")
        if start_row is not None:
            chunk.start = Point(row=int(start_row), column=0)
        end_row = meta.get("end")
        if end_row is not None:
            chunk.end = Point(row=int(end_row), column=0)
        result.append(chunk)
    return result
0 commit comments