Merge pull request #7 from VectifyAI/v0.3

carsontung666 · web-flow · commit 267613147f1d · 2026-03-31T17:32:35.000+08:00
update readme and config
diff --git a/README.md b/README.md
@@ -149,19 +149,19 @@ Run setup: `fs_query_order=prefix`, `beam_size=3`, `max_turns=10`, `5` filesyste
 
 | Retriever | Avg Time (s) | Avg LLM Calls | Hit@1 | Hit@10 | Total Cost (USD) |
 |---|---:|---:|---:|---:|---:|
-| **Block** | 9.27 | 2.6 | 1.00 | 1.00 | 0.1416 |
-| **Vertical** | 22.85 | 6.8 | 0.40 | 1.00 | 0.1682 |
-| **Beam** | 18.37 | 5.0 | 0.60 | 1.00 | 0.1331 |
+| **Block** | 8.44 | 2.4 | 1.00 | 1.00 | 0.2166 |
+| **Vertical** | 28.18 | 6.8 | 0.40 | 1.00 | 0.2900 |
+| **Beam** | 18.36 | 4.8 | 0.60 | 1.00 | 0.2091 |
 
 ### Claude Sonnet 4.6
 
 | Retriever | Avg Time (s) | Avg LLM Calls | Hit@1 | Hit@10 | Total Cost (USD) |
 |---|---:|---:|---:|---:|---:|
-| **Block** | 7.95 | 2.8 | 1.00 | 1.00 | 0.1670 |
-| **Vertical** | 17.85 | 5.8 | 0.40 | 0.80 | 0.1438 |
-| **Beam** | 17.41 | 4.8 | 0.60 | 1.00 | 0.1338 |
+| **Block** | 8.42 | 3.4 | 1.00 | 1.00 | 0.0643 |
+| **Vertical** | 20.78 | 7.0 | 0.40 | 0.80 | 0.1712 |
+| **Beam** | 17.84 | 4.8 | 0.40 | 1.00 | 0.1335 |
 
-`Block` is the best default: perfect Hit@1 across both models. `Beam` and `Vertical` are sensitive to model version — `Block` is the most robust choice.
+`Block` is the best default: perfect Hit@1 across both models, lowest cost on Sonnet 4.6 (prompt caching cuts its cost by ~60%), and lowest latency. `Beam` and `Vertical` are sensitive to model version — `Block` is the most robust choice.
 
 These numbers are benchmark snapshots, not hard guarantees; exact cost and latency will vary with model choice, provider pricing, prompt-cache behavior, and corpus shape.
 
diff --git a/bench/benchmark_retrievers.py b/bench/benchmark_retrievers.py
@@ -30,7 +30,7 @@
 sys.path.insert(0, str(Path(__file__).parent.parent))
 
 from contextdb import TreeDB
-from contextdb.config import Config
+from contextdb.config import Config, get_llm_config
 from contextdb.metrics import LLMWithStats, StatisticsRecorder
 from contextdb.retriever.algorithm.base_retriever import BaseRetriever
 
@@ -110,10 +110,16 @@ class BenchmarkResult:
     retriever_names: list[str] = field(default_factory=list)
     queries: list[QueryResult] = field(default_factory=list)
 
-    def summary(self) -> dict:
+    def summary(self, pricing: dict = None) -> dict:
         def total(items, attr):
             return sum(getattr(i, attr) for i in items) if items else 0
 
+        p = pricing or {}
+        pi = p.get("price_input", 3)
+        po = p.get("price_output", 15)
+        pcw = p.get("price_cache_write", 3.75)
+        pcr = p.get("price_cache_read", 0.30)
+
         summary = {"queries_run": len(self.queries)}
 
         for name in self.retriever_names:
@@ -128,7 +134,9 @@ def total(items, attr):
             total_cache_read = total(valid, "cache_read_tokens")
             total_cache_write = total(valid, "cache_creation_tokens")
 
-            cost = (total_input * 3 + total_output * 15 + total_cache_write * 3.75 + total_cache_read * 0.30) / 1_000_000
+            # litellm: input_tokens includes cache_read + cache_creation + uncached, so subtract the cached parts to bill each at its own rate
+            uncached_input = total_input - total_cache_read - total_cache_write
+            cost = (uncached_input * pi + total_output * po + total_cache_write * pcw + total_cache_read * pcr) / 1_000_000
 
             n = len(valid) if valid else 1
             s = {
@@ -471,7 +479,8 @@ def make_tree(db):
 
 
 def print_summary(result: BenchmarkResult):
-    summary = result.summary()
+    pricing = get_llm_config(Config.LLM_PROVIDER, Config.LLM_MODEL)
+    summary = result.summary(pricing)
 
     print("\n" + "=" * 70)
     title = "FILESYSTEM BENCHMARK SUMMARY" if result.mode == "fs" else "BENCHMARK SUMMARY"
@@ -626,7 +635,7 @@ def main():
                 {"query": q.query, "results": {n: asdict(m) for n, m in q.results.items()}}
                 for q in result.queries
             ],
-            "summary": result.summary(),
+            "summary": result.summary(get_llm_config(Config.LLM_PROVIDER, Config.LLM_MODEL)),
         }, indent=2))
     else:
         print_summary(result)
diff --git a/contextdb/config/llm/anthropic.yaml b/contextdb/config/llm/anthropic.yaml
@@ -1,13 +1,22 @@
 # Anthropic models configuration
+# Pricing in USD per million tokens
 
 claude-opus-4-6:
   context_limit: 200000
   max_concurrent: 50
   rpm_limit: 4000
   tpm_limit: 400000
+  price_input: 5
+  price_output: 25
+  price_cache_write: 6.25
+  price_cache_read: 0.50
 
 claude-sonnet-4-6:
   context_limit: 200000
   max_concurrent: 50
   rpm_limit: 4000
   tpm_limit: 400000
+  price_input: 3
+  price_output: 15
+  price_cache_write: 3.75
+  price_cache_read: 0.30
diff --git a/contextdb/config/llm/openai.yaml b/contextdb/config/llm/openai.yaml
@@ -1,25 +1,22 @@
 # OpenAI models configuration
+# Pricing in USD per million tokens
 
-gpt-4:
-  context_limit: 8192
-  max_concurrent: 10
-  rpm_limit: 500
-  tpm_limit: 30000
-
-gpt-4-turbo:
+gpt-4o:
   context_limit: 128000
   max_concurrent: 20
   rpm_limit: 500
   tpm_limit: 150000
+  price_input: 2.50
+  price_output: 10
+  price_cache_write: 2.50
+  price_cache_read: 1.25
 
-gpt-4o:
+gpt-4o-mini:
   context_limit: 128000
-  max_concurrent: 20
+  max_concurrent: 50
   rpm_limit: 500
   tpm_limit: 150000
-
-gpt-3.5-turbo:
-  context_limit: 16385
-  max_concurrent: 50
-  rpm_limit: 3500
-  tpm_limit: 90000
+  price_input: 0.15
+  price_output: 0.60
+  price_cache_write: 0.15
+  price_cache_read: 0.075