11import asyncio
22import hashlib
3+ import logging
34import os
45import socket
56import subprocess
67import sys
8+ import traceback
79from typing import AsyncGenerator
810
911import chromadb
1517
1618from vectorcode .cli_utils import Config , expand_path
1719
20+ logger = logging .getLogger (name = __name__ )
21+
1822
1923async def get_collections (
2024 client : AsyncClientAPI ,
@@ -42,6 +46,7 @@ async def try_server(host: str, port: int):
4246 try :
4347 async with httpx .AsyncClient () as client :
4448 response = await client .get (url = url )
49+ logger .debug (f"Chromadb server at { host } :{ port } returned { response = } " )
4550 return response .status_code == 200
4651 except (httpx .ConnectError , httpx .ConnectTimeout ):
4752 return False
@@ -73,15 +78,17 @@ async def start_server(configs: Config):
7378 if not os .path .isdir (configs .db_log_path ):
7479 os .makedirs (configs .db_log_path )
7580 if not os .path .isdir (db_path ):
76- print (
81+ logger . warning (
7782 f"Using local database at { os .path .expanduser ('~/.local/share/vectorcode/chromadb/' )} ." ,
78- file = sys .stderr ,
7983 )
8084 db_path = os .path .expanduser ("~/.local/share/vectorcode/chromadb/" )
8185 env = os .environ .copy ()
8286 with socket .socket (socket .AF_INET , socket .SOCK_STREAM ) as s :
8387 s .bind (("" , 0 )) # OS selects a free ephemeral port
8488 configs .port = int (s .getsockname ()[1 ])
89+ logger .warning (
90+ f"Starting bundled ChromaDB server at { configs .host } :{ configs .port } ."
91+ )
8592 env .update ({"ANONYMIZED_TELEMETRY" : "False" })
8693 process = await asyncio .create_subprocess_exec (
8794 sys .executable ,
@@ -130,10 +137,12 @@ async def get_client(configs: Config) -> AsyncClientAPI:
def get_collection_name(full_path: str) -> str:
    """Derive the collection identifier for a path.

    The path is expanded to an absolute path, combined with the current
    user name and host name into ``<user>@<hostname>:<path>``, and hashed
    with SHA-256.

    Args:
        full_path: Path to derive the collection name from.

    Returns:
        The first 63 characters of the SHA-256 hex digest of the combined
        string.
    """
    full_path = str(expand_path(full_path, absolute=True))
    # USER is consulted first, then USERNAME; a fixed placeholder is used
    # when neither environment variable is set.
    user = os.environ.get("USER", os.environ.get("USERNAME", "DEFAULT_USER"))
    plain_collection_name = f"{user}@{socket.gethostname()}:{full_path}"
    hasher = hashlib.sha256()
    hasher.update(plain_collection_name.encode())
    collection_id = hasher.hexdigest()[:63]
    # Lazy %-style arguments: the message is only formatted when DEBUG
    # logging is actually enabled.
    logger.debug(
        "Hashing %s as the collection name for %s.",
        plain_collection_name,
        full_path,
    )
    return collection_id
138147
139148
@@ -143,20 +152,18 @@ def get_embedding_function(configs: Config) -> chromadb.EmbeddingFunction | None
143152 ** configs .embedding_params
144153 )
145154 except AttributeError :
146- print (
155+ logger . warning (
147156 f"Failed to use { configs .embedding_function } . Falling back to Sentence Transformer." ,
148- file = sys .stderr ,
149157 )
150158 return embedding_functions .SentenceTransformerEmbeddingFunction ()
151159 except Exception as e :
152- print (
153- f"Failed to use { configs .embedding_function } with the following error:" ,
154- file = sys .stderr ,
155- )
156160 e .add_note (
157161 "\n For errors caused by missing dependency, consult the documentation of pipx (or whatever package manager that you installed VectorCode with) for instructions to inject libraries into the virtual environment."
158162 )
159-
163+ logger .error (
164+ f"Failed to use { configs .embedding_function } with the following error:" ,
165+ )
166+ logger .error (traceback .format_exc ())
160167 raise
161168
162169
@@ -175,6 +182,7 @@ async def get_collection(
175182 if __COLLECTION_CACHE .get (full_path ) is None :
176183 collection_name = get_collection_name (full_path )
177184 embedding_function = get_embedding_function (configs )
185+
178186 collection_meta : dict [str , str | int ] = {
179187 "path" : full_path ,
180188 "hostname" : socket .gethostname (),
@@ -190,7 +198,9 @@ async def get_collection(
190198 if not key .startswith ("hnsw:" ):
191199 target_key = f"hnsw:{ key } "
192200 collection_meta [target_key ] = configs .hnsw [key ]
193-
201+ logger .debug (
202+ f"Getting/Creating collection with the following metadata: { collection_meta } "
203+ )
194204 if not make_if_missing :
195205 __COLLECTION_CACHE [full_path ] = await client .get_collection (
196206 collection_name , embedding_function
@@ -211,6 +221,9 @@ async def get_collection(
211221 )
212222 or not collection .metadata .get ("created-by" ) == "VectorCode"
213223 ):
224+ logger .error (
225+ f"Failed to use existing collection due to metadata mismatch: { collection_meta } "
226+ )
214227 raise IndexError (
215228 "Failed to create the collection due to hash collision. Please file a bug report."
216229 )
@@ -222,15 +235,13 @@ def verify_ef(collection: AsyncCollection, configs: Config):
222235 collection_ef = collection .metadata .get ("embedding_function" )
223236 collection_ep = collection .metadata .get ("embedding_params" )
224237 if collection_ef and collection_ef != configs .embedding_function :
225- print (f"The collection was embedded using { collection_ef } ." )
226- print (
238+ logger . error (f"The collection was embedded using { collection_ef } ." )
239+ logger . error (
227240 "Embeddings and query must use the same embedding function and parameters. Please double-check your config."
228241 )
229242 return False
230243 elif collection_ep and collection_ep != configs .embedding_params :
231- print (
232- f"The collection was embedded with a different set of configurations: { collection_ep } ." ,
233- file = sys .stderr ,
244+ logger .warning (
245+ f"The collection was embedded with a different set of configurations: { collection_ep } . The result may be inaccurate." ,
234246 )
235- print ("The result may be inaccurate." , file = sys .stderr )
236247 return True
0 commit comments