Skip to content

Commit 1d1a28d

Browse files
authored
chore(cli): add loggers. (#81)
* chore(cli): add logger for `main.py`. * configure the logging module. * logging for lsp * do not log to stdio for daemon processes. * fix broken test * logger for mcp server. * coverage for lsp * logger for chunkers. * documentation about logging. * logging for `common.py` * loggers for clean, drop, init and ls * logger for `query` * logger for reranker. * logger for `vectorise`. * logger for `update` * logger for cli_utils * coverage for `vectorise` * coverage for `query` * replace `print` with logger. * convert log levels to upper case
1 parent d71aeac commit 1d1a28d

21 files changed

Lines changed: 357 additions & 69 deletions

docs/cli.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
* [Removing a Collection](#removing-a-collection)
2222
* [Checking Project Setup](#checking-project-setup)
2323
* [Cleaning up](#cleaning-up)
24+
* [Debugging and Diagnosing](#debugging-and-diagnosing)
2425
* [Shell Completion](#shell-completion)
2526
* [Hardware Acceleration](#hardware-acceleration)
2627
* [For Developers](#for-developers)
@@ -417,6 +418,20 @@ some_message` and then getting an empty results.
417418
For empty collections and collections for removed projects, you can use the
418419
`vectorcode clean` command to remove them at once.
419420

421+
### Debugging and Diagnosing
422+
423+
When something doesn't work as expected, you can enable logging by setting the
424+
`VECTORCODE_LOG_LEVEL` variable to one of `ERROR`, `WARN` (`WARNING`), `INFO` or
425+
`DEBUG`. For the CLI that you interact with in your shell, this will output logs
426+
to `STDERR` and write a log file to `~/.local/share/vectorcode/logs/`. For LSP
427+
and MCP servers, because `STDIO` is used for the RPC, only the log file will be
428+
written.
429+
430+
For example:
431+
```bash
432+
VECTORCODE_LOG_LEVEL=INFO vectorcode vectorise file1.py file2.lua
433+
```
434+
420435
## Shell Completion
421436

422437
VectorCode supports shell completion for bash/zsh/tcsh. You can use `vectorcode -s {bash,zsh,tcsh}`

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ dependencies = [
1717
"pygments",
1818
"transformers>=4.36.0,!=4.51.0,!=4.51.1,!=4.51.2",
1919
"wheel<0.46.0",
20+
"colorlog",
2021
]
2122
requires-python = ">=3.11,<3.14"
2223
readme = "README.md"

src/vectorcode/chunking.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import logging
12
import os
23
import re
34
from abc import abstractmethod
@@ -14,6 +15,8 @@
1415

1516
from vectorcode.cli_utils import Config
1617

18+
logger = logging.getLogger(name=__name__)
19+
1720

1821
@dataclass
1922
class Chunk:
@@ -50,6 +53,8 @@ def __init__(self, config: Optional[Config] = None) -> None:
5053
super().__init__(config)
5154

5255
def chunk(self, data: str):
56+
logger.info("Started chunking with StringChunker.")
57+
logger.debug(f"{data=}")
5358
if self.config.chunk_size < 0:
5459
yield Chunk(
5560
text=data,
@@ -80,6 +85,7 @@ def __init__(self, config: Optional[Config] = None) -> None:
8085
super().__init__(config)
8186

8287
def chunk(self, data: TextIOWrapper) -> Generator[Chunk, None, None]:
88+
logger.info("Started chunking using FileChunker.", data.name)
8389
lines = data.readlines()
8490
if len(lines) == 0:
8591
return
@@ -137,6 +143,10 @@ def __init__(self, config: Optional[Config] = None):
137143
super().__init__(config)
138144

139145
def __chunk_node(self, node: Node, text: str) -> Generator[Chunk, None, None]:
146+
if node.text is not None:
147+
logger.debug(
148+
f"Traversing at node {node.text.decode()} at position {node.byte_range}"
149+
)
140150
current_chunk = ""
141151

142152
current_start = None
@@ -225,6 +235,9 @@ def __build_pattern(self, language: str):
225235
else:
226236
patterns.extend(self.config.chunk_filters.get("*", []))
227237
if len(patterns):
238+
logger.debug(
239+
f"Merging {len(patterns)} filter patterns for excluding chunks."
240+
)
228241
patterns = [f"(?:{i})" for i in patterns]
229242
return f"(?:{'|'.join(patterns)})"
230243
return ""
@@ -234,10 +247,15 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]:
234247
data: path to the file
235248
"""
236249
assert os.path.isfile(data)
250+
logger.info(f"Started chunking {data} with TreeSitterChunker.")
237251
with open(data) as fin:
238252
lines = fin.readlines()
239253
content = "".join(lines)
240254
if self.config.chunk_size < 0 and content:
255+
logger.info(
256+
"Skipping chunking %s because document is smaller than chunk_size.",
257+
data,
258+
)
241259
yield Chunk(content, Point(1, 0), Point(len(lines), len(lines[-1]) - 1))
242260
return
243261
parser = None
@@ -251,12 +269,17 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]:
251269
parser = get_parser(name.lower())
252270
if parser is not None:
253271
language = name.lower()
272+
logger.debug(
273+
"Detected %s filetype for treesitter chunking.", language
274+
)
254275
break
255276
except LookupError: # pragma: nocover
256277
pass
257278

258279
if parser is None:
259-
# fall back to naive chunking
280+
logger.debug(
281+
"Unable to pick a suitable parser. Fall back to naive chunking"
282+
)
260283
yield from StringChunker(self.config).chunk(content)
261284
else:
262285
pattern_str = self.__build_pattern(language=language)

src/vectorcode/cli_utils.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
import argparse
22
import glob
33
import json
4+
import logging
45
import os
6+
import sys
57
from dataclasses import dataclass, field, fields
8+
from datetime import datetime
69
from enum import Enum, StrEnum
710
from pathlib import Path
811
from typing import Any, Optional, Sequence, Union
@@ -11,6 +14,9 @@
1114

1215
from vectorcode import __version__
1316

17+
logger = logging.getLogger(name=__name__)
18+
19+
1420
PathLike = Union[str, Path]
1521

1622
GLOBAL_CONFIG_PATH = os.path.join(
@@ -395,10 +401,13 @@ async def load_config_file(path: Optional[PathLike] = None):
395401
if path is None:
396402
path = GLOBAL_CONFIG_PATH
397403
if os.path.isfile(path):
404+
logger.debug(f"Loading config from {path}")
398405
with open(path) as fin:
399406
config = json.load(fin)
400407
expand_envs_in_dict(config)
401408
return await Config.import_from(config)
409+
else:
410+
logger.warning("Loading default config.")
402411
return Config()
403412

404413

@@ -410,9 +419,13 @@ async def find_project_config_dir(start_from: PathLike = "."):
410419
for anchor in project_root_anchors:
411420
to_be_checked = os.path.join(current_dir, anchor)
412421
if os.path.isdir(to_be_checked):
422+
logger.debug(f"Found root anchor at {str(to_be_checked)}")
413423
return to_be_checked
414424
parent = current_dir.parent
415425
if parent.resolve() == current_dir:
426+
logger.debug(
427+
f"Couldn't find project root after reaching {str(current_dir)}"
428+
)
416429
return
417430
current_dir = parent.resolve()
418431

@@ -481,3 +494,58 @@ async def expand_globs(
481494
)
482495
)
483496
return list(result)
497+
498+
499+
def config_logging(
500+
name: str = "vectorcode", stdio: bool = True, log_file: bool = False
501+
): # pragma: nocover
502+
"""Configure the logging module. This should be called before a `main` function (CLI, LSP or MCP)."""
503+
504+
logging.root.handlers = []
505+
506+
level_from_env = os.environ.get("VECTORCODE_LOG_LEVEL")
507+
level = None
508+
if level_from_env:
509+
level = logging._nameToLevel.get(level_from_env.upper())
510+
if level is None:
511+
logging.warning(
512+
"Invalid log level: %s. Falling back to default levels.", level_from_env
513+
)
514+
515+
handlers = []
516+
if level is not None or log_file:
517+
log_dir = os.path.expanduser("~/.local/share/vectorcode/logs/")
518+
os.makedirs(log_dir, exist_ok=True)
519+
# File handler
520+
log_file_path = os.path.join(
521+
log_dir, f"{name}-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
522+
)
523+
file_handler = logging.FileHandler(log_file_path)
524+
file_handler.setLevel(level or logging.WARN)
525+
handlers.append(file_handler)
526+
527+
if stdio:
528+
import colorlog
529+
530+
# Console handler
531+
console_handler = colorlog.StreamHandler(sys.stderr)
532+
console_handler.setFormatter(
533+
colorlog.ColoredFormatter(
534+
fmt="%(log_color)s%(levelname)s%(reset)s: %(name)s : %(message)s",
535+
log_colors={
536+
"DEBUG": "cyan",
537+
"INFO": "green",
538+
"WARNING": "yellow",
539+
"ERROR": "red",
540+
"CRITICAL": "bold_red",
541+
},
542+
reset=True,
543+
)
544+
)
545+
console_handler.setLevel(level or logging.WARN)
546+
handlers.append(console_handler)
547+
548+
logging.basicConfig(
549+
handlers=handlers,
550+
level=level,
551+
)

src/vectorcode/common.py

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import asyncio
22
import hashlib
3+
import logging
34
import os
45
import socket
56
import subprocess
67
import sys
8+
import traceback
79
from typing import AsyncGenerator
810

911
import chromadb
@@ -15,6 +17,8 @@
1517

1618
from vectorcode.cli_utils import Config, expand_path
1719

20+
logger = logging.getLogger(name=__name__)
21+
1822

1923
async def get_collections(
2024
client: AsyncClientAPI,
@@ -42,6 +46,7 @@ async def try_server(host: str, port: int):
4246
try:
4347
async with httpx.AsyncClient() as client:
4448
response = await client.get(url=url)
49+
logger.debug(f"Chromadb server at {host}:{port} returned {response=}")
4550
return response.status_code == 200
4651
except (httpx.ConnectError, httpx.ConnectTimeout):
4752
return False
@@ -73,15 +78,17 @@ async def start_server(configs: Config):
7378
if not os.path.isdir(configs.db_log_path):
7479
os.makedirs(configs.db_log_path)
7580
if not os.path.isdir(db_path):
76-
print(
81+
logger.warning(
7782
f"Using local database at {os.path.expanduser('~/.local/share/vectorcode/chromadb/')}.",
78-
file=sys.stderr,
7983
)
8084
db_path = os.path.expanduser("~/.local/share/vectorcode/chromadb/")
8185
env = os.environ.copy()
8286
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
8387
s.bind(("", 0)) # OS selects a free ephemeral port
8488
configs.port = int(s.getsockname()[1])
89+
logger.warning(
90+
f"Starting bundled ChromaDB server at {configs.host}:{configs.port}."
91+
)
8592
env.update({"ANONYMIZED_TELEMETRY": "False"})
8693
process = await asyncio.create_subprocess_exec(
8794
sys.executable,
@@ -130,10 +137,12 @@ async def get_client(configs: Config) -> AsyncClientAPI:
130137
def get_collection_name(full_path: str) -> str:
131138
full_path = str(expand_path(full_path, absolute=True))
132139
hasher = hashlib.sha256()
133-
hasher.update(
134-
f"{os.environ.get('USER', os.environ.get('USERNAME', 'DEFAULT_USER'))}@{socket.gethostname()}:{full_path}".encode()
135-
)
140+
plain_collection_name = f"{os.environ.get('USER', os.environ.get('USERNAME', 'DEFAULT_USER'))}@{socket.gethostname()}:{full_path}"
141+
hasher.update(plain_collection_name.encode())
136142
collection_id = hasher.hexdigest()[:63]
143+
logger.debug(
144+
f"Hashing {plain_collection_name} as the collection name for {full_path}."
145+
)
137146
return collection_id
138147

139148

@@ -143,20 +152,18 @@ def get_embedding_function(configs: Config) -> chromadb.EmbeddingFunction | None
143152
**configs.embedding_params
144153
)
145154
except AttributeError:
146-
print(
155+
logger.warning(
147156
f"Failed to use {configs.embedding_function}. Falling back to Sentence Transformer.",
148-
file=sys.stderr,
149157
)
150158
return embedding_functions.SentenceTransformerEmbeddingFunction()
151159
except Exception as e:
152-
print(
153-
f"Failed to use {configs.embedding_function} with the following error:",
154-
file=sys.stderr,
155-
)
156160
e.add_note(
157161
"\nFor errors caused by missing dependency, consult the documentation of pipx (or whatever package manager that you installed VectorCode with) for instructions to inject libraries into the virtual environment."
158162
)
159-
163+
logger.error(
164+
f"Failed to use {configs.embedding_function} with the following error:",
165+
)
166+
logger.error(traceback.format_exc())
160167
raise
161168

162169

@@ -175,6 +182,7 @@ async def get_collection(
175182
if __COLLECTION_CACHE.get(full_path) is None:
176183
collection_name = get_collection_name(full_path)
177184
embedding_function = get_embedding_function(configs)
185+
178186
collection_meta: dict[str, str | int] = {
179187
"path": full_path,
180188
"hostname": socket.gethostname(),
@@ -190,7 +198,9 @@ async def get_collection(
190198
if not key.startswith("hnsw:"):
191199
target_key = f"hnsw:{key}"
192200
collection_meta[target_key] = configs.hnsw[key]
193-
201+
logger.debug(
202+
f"Getting/Creating collection with the following metadata: {collection_meta}"
203+
)
194204
if not make_if_missing:
195205
__COLLECTION_CACHE[full_path] = await client.get_collection(
196206
collection_name, embedding_function
@@ -211,6 +221,9 @@ async def get_collection(
211221
)
212222
or not collection.metadata.get("created-by") == "VectorCode"
213223
):
224+
logger.error(
225+
f"Failed to use existing collection due to metadata mismatch: {collection_meta}"
226+
)
214227
raise IndexError(
215228
"Failed to create the collection due to hash collision. Please file a bug report."
216229
)
@@ -222,15 +235,13 @@ def verify_ef(collection: AsyncCollection, configs: Config):
222235
collection_ef = collection.metadata.get("embedding_function")
223236
collection_ep = collection.metadata.get("embedding_params")
224237
if collection_ef and collection_ef != configs.embedding_function:
225-
print(f"The collection was embedded using {collection_ef}.")
226-
print(
238+
logger.error(f"The collection was embedded using {collection_ef}.")
239+
logger.error(
227240
"Embeddings and query must use the same embedding function and parameters. Please double-check your config."
228241
)
229242
return False
230243
elif collection_ep and collection_ep != configs.embedding_params:
231-
print(
232-
f"The collection was embedded with a different set of configurations: {collection_ep}.",
233-
file=sys.stderr,
244+
logger.warning(
245+
f"The collection was embedded with a different set of configurations: {collection_ep}. The result may be inaccurate.",
234246
)
235-
print("The result may be inaccurate.", file=sys.stderr)
236247
return True

0 commit comments

Comments
 (0)