11import asyncio
22import hashlib
3+ import logging
34import os
45import socket
56import subprocess
67import sys
8+ import traceback
79from typing import AsyncGenerator
810
911import chromadb
1517
1618from vectorcode .cli_utils import Config , expand_path
1719
20+ logger = logging .getLogger (name = __name__ )
21+
1822
1923async def get_collections (
2024 client : AsyncClientAPI ,
@@ -42,6 +46,7 @@ async def try_server(host: str, port: int):
4246 try :
4347 async with httpx .AsyncClient () as client :
4448 response = await client .get (url = url )
49+ logger .debug (f"Chromadb server at { host } :{ port } returned { response = } " )
4550 return response .status_code == 200
4651 except (httpx .ConnectError , httpx .ConnectTimeout ):
4752 return False
@@ -82,6 +87,9 @@ async def start_server(configs: Config):
8287 with socket .socket (socket .AF_INET , socket .SOCK_STREAM ) as s :
8388 s .bind (("" , 0 )) # OS selects a free ephemeral port
8489 configs .port = int (s .getsockname ()[1 ])
90+ logger .warning (
91+ f"Starting bundled ChromaDB server at { configs .host } :{ configs .port } ."
92+ )
8593 env .update ({"ANONYMIZED_TELEMETRY" : "False" })
8694 process = await asyncio .create_subprocess_exec (
8795 sys .executable ,
@@ -130,10 +138,12 @@ async def get_client(configs: Config) -> AsyncClientAPI:
def get_collection_name(full_path: str) -> str:
    """Derive the collection identifier for a project path.

    The path is first expanded to an absolute path with ``expand_path``. It is
    then combined with the current user name and the host name into a string
    of the form ``<user>@<hostname>:<absolute path>``, which is hashed with
    SHA-256.

    The user name is read from the ``USER`` environment variable, falling
    back to ``USERNAME`` and finally to the literal ``DEFAULT_USER``.

    Args:
        full_path: Path of the project the collection belongs to.

    Returns:
        The first 63 characters of the hexadecimal SHA-256 digest.
    """
    full_path = str(expand_path(full_path, absolute=True))
    user = os.environ.get("USER", os.environ.get("USERNAME", "DEFAULT_USER"))
    # NOTE: the layout of this string determines the hash; changing it would
    # make previously created collections unreachable.
    plain_collection_name = f"{user}@{socket.gethostname()}:{full_path}"
    hasher = hashlib.sha256()
    hasher.update(plain_collection_name.encode())
    # A SHA-256 hex digest is 64 characters long; one is dropped here,
    # presumably to respect ChromaDB's collection-name length limit (confirm).
    collection_id = hasher.hexdigest()[:63]
    # Lazy %-style arguments: the message is only formatted when DEBUG is on.
    logger.debug(
        "Hashing %s as the collection name for %s.",
        plain_collection_name,
        full_path,
    )
    return collection_id
138148
139149
@@ -143,20 +153,18 @@ def get_embedding_function(configs: Config) -> chromadb.EmbeddingFunction | None
143153 ** configs .embedding_params
144154 )
145155 except AttributeError :
146- print (
156+ logger . warning (
147157 f"Failed to use { configs .embedding_function } . Falling back to Sentence Transformer." ,
148- file = sys .stderr ,
149158 )
150159 return embedding_functions .SentenceTransformerEmbeddingFunction ()
151160 except Exception as e :
152- print (
153- f"Failed to use { configs .embedding_function } with the following error:" ,
154- file = sys .stderr ,
155- )
156161 e .add_note (
157162 "\n For errors caused by missing dependency, consult the documentation of pipx (or whatever package manager that you installed VectorCode with) for instructions to inject libraries into the virtual environment."
158163 )
159-
164+ logger .error (
165+ f"Failed to use { configs .embedding_function } with the following error:" ,
166+ )
167+ logger .error (traceback .format_exc ())
160168 raise
161169
162170
@@ -175,6 +183,7 @@ async def get_collection(
175183 if __COLLECTION_CACHE .get (full_path ) is None :
176184 collection_name = get_collection_name (full_path )
177185 embedding_function = get_embedding_function (configs )
186+
178187 collection_meta : dict [str , str | int ] = {
179188 "path" : full_path ,
180189 "hostname" : socket .gethostname (),
@@ -190,7 +199,9 @@ async def get_collection(
190199 if not key .startswith ("hnsw:" ):
191200 target_key = f"hnsw:{ key } "
192201 collection_meta [target_key ] = configs .hnsw [key ]
193-
202+ logger .debug (
203+ f"Getting/Creating collection with the following metadata: { collection_meta } "
204+ )
194205 if not make_if_missing :
195206 __COLLECTION_CACHE [full_path ] = await client .get_collection (
196207 collection_name , embedding_function
@@ -211,6 +222,9 @@ async def get_collection(
211222 )
212223 or not collection .metadata .get ("created-by" ) == "VectorCode"
213224 ):
225+ logger .error (
226+ f"Failed to use existing collection due to metadata mismatch: { collection_meta } "
227+ )
214228 raise IndexError (
215229 "Failed to create the collection due to hash collision. Please file a bug report."
216230 )
def verify_ef(collection: AsyncCollection, configs: Config) -> bool:
    """Check that a collection's embedding setup agrees with the configuration.

    The embedding function name and embedding parameters stored in
    ``collection.metadata`` are compared with ``configs.embedding_function``
    and ``configs.embedding_params``.

    Args:
        collection: Collection whose ``metadata`` mapping is inspected.
        configs: Active configuration to compare against.

    Returns:
        ``False`` when the collection records an embedding function that
        differs from the configured one (an error is logged). ``True``
        otherwise; if only the recorded embedding parameters differ, a warning
        is logged and ``True`` is still returned.
    """
    collection_ef = collection.metadata.get("embedding_function")
    collection_ep = collection.metadata.get("embedding_params")
    # Missing/empty metadata values are falsy and therefore never count as a
    # mismatch.
    if collection_ef and collection_ef != configs.embedding_function:
        logger.error("The collection was embedded using %s.", collection_ef)
        logger.error(
            "Embeddings and query must use the same embedding function and parameters. Please double-check your config."
        )
        return False
    elif collection_ep and collection_ep != configs.embedding_params:
        # Differing parameters are treated as non-fatal: warn and continue.
        logger.warning(
            "The collection was embedded with a different set of configurations: %s. The result may be inaccurate.",
            collection_ep,
        )
    return True
0 commit comments