33import os
44from typing import Any , cast
55
6- from chromadb import GetResult , Where
6+ from chromadb import Where
77from chromadb .api .models .AsyncCollection import AsyncCollection
88from chromadb .api .types import IncludeEnum , QueryResult
99from chromadb .errors import InvalidCollectionException , InvalidDimensionException
@@ -39,19 +39,23 @@ def convert_query_results(
3939 assert chroma_result ["documents" ] is not None
4040 assert chroma_result ["distances" ] is not None
4141 assert chroma_result ["metadatas" ] is not None
42+ assert chroma_result ["ids" ] is not None
4243
4344 chroma_results_list : list [vectorcode_types .QueryResult ] = []
4445 for q_i in range (len (queries )):
4546 q = queries [q_i ]
4647 documents = chroma_result ["documents" ][q_i ]
4748 distances = chroma_result ["distances" ][q_i ]
4849 metadatas = chroma_result ["metadatas" ][q_i ]
49- for doc , dist , meta in zip (documents , distances , metadatas ):
50- chunk = Chunk (text = doc )
50+ ids = chroma_result ["ids" ][q_i ]
51+ for doc , dist , meta , _id in zip (documents , distances , metadatas , ids ):
52+ chunk = Chunk (text = doc , id = _id )
5153 if meta .get ("start" ):
5254 chunk .start = Point (int (meta .get ("start" , 0 )), 0 )
5355 if meta .get ("end" ):
54- chunk .end = Point (int (meta .get ("end" , 0 )) + 1 , 0 )
56+ chunk .end = Point (int (meta .get ("end" , 0 )), 0 )
57+ if meta .get ("path" ):
58+ chunk .path = str (meta ["path" ])
5559 chroma_results_list .append (
5660 vectorcode_types .QueryResult (
5761 chunk = chunk ,
@@ -65,7 +69,7 @@ def convert_query_results(
6569
6670async def get_query_result_files (
6771 collection : AsyncCollection , configs : Config
68- ) -> list [str ]:
72+ ) -> list [str | Chunk ]:
6973 query_chunks = []
7074 assert configs .query , "Query messages cannot be empty."
7175 chunker = StringChunker (configs )
@@ -126,63 +130,43 @@ async def get_query_result_files(
126130async def build_query_results (
127131 collection : AsyncCollection , configs : Config
128132) -> list [dict [str , str | int ]]:
129- structured_result = []
130- for identifier in await get_query_result_files (collection , configs ):
131- if os .path .isfile (identifier ):
132- if configs .use_absolute_path :
133- output_path = os .path .abspath (identifier )
134- else :
135- output_path = os .path .relpath (identifier , configs .project_root )
136- full_result = {"path" : output_path }
137- with open (identifier ) as fin :
138- document = fin .read ()
139- full_result ["document" ] = document
133+ assert configs .project_root
140134
141- structured_result .append (
142- {str (key ): full_result [str (key )] for key in configs .include }
143- )
144- elif QueryInclude .chunk in configs .include :
145- chunks : GetResult = await collection .get (
146- identifier , include = [IncludeEnum .metadatas , IncludeEnum .documents ]
147- )
148- meta = chunks .get (
149- "metadatas" ,
150- )
151- if meta is not None and len (meta ) != 0 :
152- chunk_texts = chunks .get ("documents" )
153- assert chunk_texts is not None , (
154- "QueryResult does not contain `documents`!"
155- )
156- full_result : dict [str , str | int ] = {
157- "chunk" : str (chunk_texts [0 ]),
158- "chunk_id" : identifier ,
159- }
160- if meta [0 ].get ("start" ) is not None and meta [0 ].get ("end" ) is not None :
161- path = str (meta [0 ].get ("path" ))
162- with open (path ) as fin :
163- start : int = int (meta [0 ]["start" ])
164- end : int = int (meta [0 ]["end" ])
165- full_result ["chunk" ] = "" .join (fin .readlines ()[start : end + 1 ])
166- full_result ["start_line" ] = start
167- full_result ["end_line" ] = end
168- if QueryInclude .path in configs .include :
169- full_result ["path" ] = str (
170- meta [0 ]["path" ]
171- if configs .use_absolute_path
172- else os .path .relpath (
173- str (meta [0 ]["path" ]), str (configs .project_root )
174- )
175- )
176-
177- structured_result .append (full_result )
178- else : # pragma: nocover
179- logger .error (
180- "This collection doesn't support chunk-mode output because it lacks the necessary metadata. Please re-vectorise it." ,
181- )
135+ def make_output_path (path : str , absolute : bool ) -> str :
136+ if absolute :
137+ if os .path .isabs (path ):
138+ return path
139+ return os .path .abspath (os .path .join (str (configs .project_root ), path ))
140+ else :
141+ rel_path = os .path .relpath (path , configs .project_root )
142+ if isinstance (rel_path , bytes ): # pragma: nocover
143+ # for some reason, some Python versions report that `os.path.relpath` returns bytes, so decode it to a string.
144+ rel_path = rel_path .decode ()
145+ return rel_path
182146
147+ structured_result = []
148+ for res in await get_query_result_files (collection , configs ):
149+ if isinstance (res , str ):
150+ output_path = make_output_path (res , configs .use_absolute_path )
151+ io_path = make_output_path (res , True )
152+ if not os .path .isfile (io_path ):
153+ logger .warning (f"{ io_path } is no longer a valid file." )
154+ continue
155+ with open (io_path ) as fin :
156+ structured_result .append ({"path" : output_path , "document" : fin .read ()})
183157 else :
184- logger .warning (
185- f"{ identifier } is no longer a valid file! Please re-run vectorcode vectorise to refresh the database." ,
158+ res = cast (Chunk , res )
159+ assert res .path , f"{ res } has no `path` attribute."
160+ structured_result .append (
161+ {
162+ "path" : make_output_path (res .path , configs .use_absolute_path )
163+ if res .path is not None
164+ else None ,
165+ "chunk" : res .text ,
166+ "start_line" : res .start .row if res .start is not None else None ,
167+ "end_line" : res .end .row if res .end is not None else None ,
168+ "chunk_id" : res .id ,
169+ }
186170 )
187171 for result in structured_result :
188172 if result .get ("path" ) is not None :
0 commit comments