SWORDIntel
diff --git a/‎Makefile‎
Lines changed: 19 additions & 6 deletions b/‎Makefile‎
Lines changed: 19 additions & 6 deletions
diff --git a/‎README.md‎
Lines changed: 79 additions & 20 deletions b/‎README.md‎
Lines changed: 79 additions & 20 deletions
@@ -3,7 +3,7 @@
 
 CC=gcc
 
-CFLAGS_BASE=-std=c99 -Wall -Wextra -I. -I./core -I./algorithms -I./backends/cpu -I./backends/npu -I./orchestration/include -I./memory/include -I./quantization/include -I./ml/include -fPIC -lm -pthread -D_GNU_SOURCE -O3
+CFLAGS_BASE=-std=c99 -Wall -Wextra -I. -I./include -I./core -I./algorithms -I./backends/cpu -I./backends/npu -I./orchestration/include -I./memory/include -I./quantization/include -I./ml/include -fPIC -lm -pthread -D_GNU_SOURCE -O3
 QIHSE_CFLAGS_EXTRA?=
 
 # CPU-specific SIMD backend selection.
@@ -38,13 +38,14 @@ QIHSE_TRINARY_SWEEP_BENCH_ITERS?=1
 
 # Use the most complete set of sources WITHOUT duplicates
 # We use qihse_exports.c to fill in any missing gaps for the Python layer
-SRCS_BASE=core/qihse.c qihse_search.c qihse_math.c qihse_instr.c qihse_hetero.c qihse_vector_db.c qihse_exports.c \
+SRCS_BASE=core/qihse.c src/qihse_search.c src/qihse_math.c src/qihse_instr.c src/qihse_hetero.c src/qihse_vector_db.c src/qihse_exports.c \
      persistence/qihse_file_posix.c persistence/qihse_persist_format.c persistence/qihse_vector_store.c \
      algorithms/qihse_anchor_search.c algorithms/qihse_version.c \
      codecs/qihse_trinary_tryte_codec.c \
      core/qihse_helpers.c core/qihse_plugin.c \
      algorithms/qihse_dimensions.c algorithms/qihse_verification.c algorithms/qihse_amplification.c \
      backends/cpu/qihse_cpu_detect.c \
+     backends/cpu/qihse_cpu_distance.c \
      backends/npu/qihse_npu_openvino.c \
      backends/gpu/cuda/qihse_cuda_backend.c \
      memory/src/qihse_memory.c memory/src/qihse_hma.c memory/src/qihse_uma.c \
@@ -56,7 +57,7 @@ SRCS=$(SRCS_BASE)
 
 ifeq ($(QIHSE_ENABLE_AVX2),1)
 CFLAGS += -mavx2 -mfma
-SRCS += backends/cpu/qihse_cpu_avx2.c
+SRCS += backends/cpu/qihse_cpu_avx2.c backends/cpu/qihse_cpu_distance_avx2.c
 endif
 
 ifeq ($(QIHSE_ENABLE_AVX512),1)
@@ -68,14 +69,14 @@ endif
 # because their functionality is already partially in qihse_math.c / qihse_search.c 
 # or provided by qihse_exports.c stubs.
 
-.PHONY: all build build-native clean pristine workspace workspace-clean lib persistence persistence-check test benchmark install dev-setup docs test-persist test-trinary-codec test-memory-planner test-memory-topology-probe test-memory-planner-trace test-memory-allocation-policy test-memory-coherence test-memory-migration-policy test-memory-migration test-memory-device-placement test-memory-migration-backend test-memory-migration-scheduler bench-trinary-codec bench-trinary-db-candidate bench-trinary-search-path bench-trinary-search-sweep bench-trinary-random-sweep bench-trinary-weighted-sweep bench-trinary-magnitude-sweep bench-reference-workloads bench-reference-runner-smoke sample-vxug-pdf-workload bench-vxug-pdf-workload bench-reference-workload bench-reference-result-summary bench-sift1m-workload bench-sift1m-fallback-data calibrate-sift1m-workload validate-reference-workflow check-upstream-workflow check-upstream-workflow-strict check upstream-pr-loop test-all-isa test-vnni-bench test-vnni-only test-avx2-only test-avx512-direct test-amx-only test-direct-execution test-simple-exec
+.PHONY: all build build-native clean pristine workspace workspace-clean lib persistence persistence-check test benchmark install dev-setup docs test-persist test-trinary-codec test-memory-planner test-memory-topology-probe test-memory-planner-trace test-memory-allocation-policy test-memory-coherence test-memory-migration-policy test-memory-migration test-memory-device-placement test-memory-migration-backend test-memory-migration-scheduler bench-trinary-codec bench-trinary-db-candidate bench-micro bench-memory-hierarchy bench-trinary-search-path bench-trinary-search-sweep bench-trinary-random-sweep bench-trinary-weighted-sweep bench-trinary-magnitude-sweep bench-reference-workloads bench-reference-runner-smoke sample-vxug-pdf-workload bench-vxug-pdf-workload bench-reference-workload bench-reference-result-summary bench-sift1m-workload bench-sift1m-fallback-data calibrate-sift1m-workload validate-reference-workflow check-upstream-workflow check-upstream-workflow-strict check upstream-pr-loop test-all-isa test-vnni-bench test-vnni-only test-avx2-only test-avx512-direct test-amx-only test-direct-execution test-simple-exec
 .NOTPARALLEL: validate-reference-workflow
 
 all: lib
 build: lib
 
 build-native:
-	./build-native.sh
+	./scripts/build-native.sh
 
 lib: $(LIB_TARGET)
 
@@ -195,6 +196,18 @@ test-simple-exec:
 	$(CC) $(CFLAGS) -mavx2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfma -mamx-tile -mamx-int8 -mamx-bf16 -o tests/test_simple_exec tests/test_simple_exec.c $(LDFLAGS)
 	./tests/test_simple_exec
 
+bench-micro: lib  # Compile benchmarks/qihse_micro_bench.c, link it with -lqihse from the current directory, then run it with LD_LIBRARY_PATH=.
+	$(CC) $(CFLAGS) -o benchmarks/qihse_micro_bench \
+		benchmarks/qihse_micro_bench.c \
+		-L. -lqihse $(LDFLAGS)
+	LD_LIBRARY_PATH=. ./benchmarks/qihse_micro_bench
+
+bench-memory-hierarchy: lib  # Compile benchmarks/qihse_memory_hierarchy_bench.c, link it with -lqihse from the current directory, then run it with LD_LIBRARY_PATH=.
+	$(CC) $(CFLAGS) -o benchmarks/qihse_memory_hierarchy_bench \
+		benchmarks/qihse_memory_hierarchy_bench.c \
+		-L. -lqihse $(LDFLAGS)
+	LD_LIBRARY_PATH=. ./benchmarks/qihse_memory_hierarchy_bench
+
 benchmark: validate-reference-workflow
 
 dev-setup:
@@ -243,7 +256,7 @@ bench-trinary-search-sweep: lib
 	QIHSE_BENCH_SWEEP=1 QIHSE_BENCH_DATASET=near_tie LD_LIBRARY_PATH=. /tmp/qihse_trinary_search_path_bench
 
 bench-trinary-random-sweep: lib
-	./run-trinary-random-sweep.sh \
+	./scripts/run-trinary-random-sweep.sh \
 	  --iterations $(QIHSE_TRINARY_SWEEP_ITERS) \
 	  --iters-per-pass $(QIHSE_TRINARY_SWEEP_BENCH_ITERS) \
 	  --output-dir $(QIHSE_TRINARY_SWEEP_OUTPUT_DIR) \
 
@@ -1,12 +1,16 @@
 # QIHSE — Quantum Inspired Hilbert Space Expansion Search
-## (QIHSE): Vector Search with Exactness Contracts and Performance Escape Hatches
+## Vector Search with Exactness Contracts and Performance Escape Hatches
 
 [![License: AGPL v3](https://img.shields.io/badge/License-AGPL%20v3-black.svg)](LICENSE)
+[![C](https://img.shields.io/badge/C-00599C?logo=c&logoColor=white)](https://en.wikipedia.org/wiki/C_(programming_language))
+[![Python](https://img.shields.io/badge/Python-3776AB?logo=python&logoColor=white)](https://www.python.org/)
+![Quantum Inspired](https://img.shields.io/badge/Quantum%20Inspired-%E2%9C%A8-purple.svg)
 
-QIHSE (Quantum Inspired Hilbert Space Expansion Search) is built for teams that want ANN performance without surrendering
-correctness guarantees. The project is intentionally conservative in its default
-behavior and explicit about when it uses aggressive acceleration. In practice, this
-means you get a small number of clear knobs instead of implicit magic behavior.
+Most vector databases make a quiet deal with you: they will be fast, and you will stop asking whether the results are correct. QIHSE does not make that deal.
+
+QIHSE was built for teams that have been burned by silent approximation drift — where a deployment that passed yesterday's tests starts returning subtly wrong top-k because someone tuned a parameter three layers down that you never knew existed. It treats correctness as the default, not an opt-in, and treats speed as something you earn by understanding your data shape, not something you buy with hidden trade-offs.
+
+The core promise is simple: **exact float32 search is the only path that does not ask your permission.** Every acceleration layer — graph indexing, scalar quantization, binary compression, sparse inverted indices — is an explicit contract. You decide whether to engage it, the system validates that it is safe for your query shape, and the final ranking is still produced by the same authoritative distance computation that would have run if you had never turned the accelerator on at all. You get to have the conversation about speed *after* you have established that correctness is not up for negotiation.
 
 ## What makes QIHSE different
 
@@ -18,8 +22,15 @@ file-backed lifecycle controls and a query path model designed around two rules:
 - **Only use approximations when they are explicitly requested and validated.**
   Trinary-based acceleration is opt-in and enforced by explicit sidecar contracts.
 
-This gives you practical speedups in the common sparse/high-selectivity cases while
-preserving confidence that correctness has not been silently traded away.
+Most vector search libraries are designed around a single happy path: build an approximate index, query it, hope the recall is good enough. QIHSE is designed around a different assumption: you will eventually need to know *exactly* what the right answer is, and when that moment comes, the system should not have painted you into a corner.
+
+**Exactness is the default, not a debug mode.** Every query runs through float32 distance unless you explicitly ask for something else. The accelerators — graph indices, quantized sidecars, sparse inverted lists — are candidate *selectors*, not result *producers*. They narrow the field; the exact metric picks the winners. This means you can turn an accelerator on for speed, then turn it off for validation, and expect the same results.
+
+**Sidecars are first-class, not afterthoughts.** When you build a graph index or an INT8 quantization table, QIHSE tracks whether that artifact is valid, stale, or corrupt. It does not silently fall back to brute force because a sidecar disappeared. It tells you the sidecar is missing and lets you decide what to do.
+
+**The query planner knows when to say no.** The graph index is fast, but it is not always the right tool. QIHSE gates accelerator selection based on query dimensionality, top-k pressure, and dataset scale. Dense queries against small collections do not get pushed through a graph just because one exists. The system falls back to exact search by design when the overhead would not pay off.
+
+**Recovery is deterministic, not magical.** There is no background thread you are expected to trust. Snapshot, WAL, replay, checkpoint, compact — these are explicit operations you call when you are ready. If the process crashes, you know exactly what state you will find on restart because you decided when the last snapshot happened.
 
 ## Unique technical characteristics
 
@@ -77,6 +88,18 @@ Maintenance and scheduling calls are available and explicit. There is no require
 that hidden background threads be assumed for basic correctness; your host controls
 the maintenance cadence.
 
+### 7) Hierarchical memory storage with automatic hot/cold tiering
+QIHSE tracks per-vector access frequency and temperature, then promotes frequently-accessed
+vectors to faster memory tiers (HBM, NPU cache) and demotes cold vectors to DRAM.
+Access tracking is automatic across all query paths (exact, graph, INT8, sparse).
+Tier assignments are persisted in a `vectors.qtier` sidecar and recovered on restart.
+Configuration via `.qihse.conf` or environment variables:
+- `memory.hot_threshold` / `QIHSE_MEMORY_HOT_THRESHOLD` (default: 100 accesses/sec)
+- `memory.cold_threshold` / `QIHSE_MEMORY_COLD_THRESHOLD` (default: 5 accesses/sec)
+- `memory.maintenance_interval` / `QIHSE_MEMORY_MAINTENANCE_INTERVAL` (queries between maintenance runs, 0 = explicit only)
+
+Run `qihse_vector_db_run_memory_maintenance(db)` explicitly, or let batch search auto-trigger it.
+
 ## Build and run
 
 ```bash
@@ -91,14 +114,28 @@ make all
 flowchart TB
     A[Client Process] --> B[Query Ingestion]
     A --> C[Vector Mutations]
+    A --> P[Python / CLI Bindings]
     B --> D{qihse_vector_db_search}
     D --> E["Exact float32 rerank path<br/>(default)"]
     D --> F{Query mode}
     F -->|TRINARY_SCALAR| G["qtri sidecar shortlist"]
     F -->|TRINARY_MAGNITUDE| H["qmag sidecar shortlist"]
+    F -->|GRAPH| X["Graph index (HNSW)"]
+    F -->|INT8| Y["INT8 quantization"]
+    F -->|BINARY| Z["Binary quantization"]
+    F -->|SPARSE| W["Inverted index (BM25)"]
     G --> E
     H --> E
+    X --> E
+    Y --> E
+    Z --> E
+    W --> E
     E --> I["Returned ranked results"]
+    I --> Q[Query Result Cache]
+    Q --> I
+    I --> T[Hierarchical Storage<br/>Hot/cold tiering]
+    T --> S[Tier sidecar .qtier]
+    S --> T
     C --> J["WAL + snapshot metadata"]
     J --> K["checkpoint/compact"]
     K --> L["Restart-safe snapshot"]
@@ -154,21 +191,28 @@ For rawest speed (at the cost of recall guarantees), use:
 
 `QIHSE_VDB_QUERY_TRINARY_MAGNITUDE_BYPASS`.
 
-## Core vector DB API surface
+## What is actually in the box
+
+**Distance computation that uses your silicon.** On modern x86 CPUs with AVX2, QIHSE automatically selects vectorized implementations of cosine similarity, dot product, and Euclidean distance. On older hardware, it falls back to scalar loops without any code changes or recompilation. You do not configure this. It is simply a property of the hardware you are running on.
+
+**A graph index that knows when it is not needed.** The HNSW-style graph index is built and persisted automatically, but it is not used for every query. The system evaluates query dimensionality, top-k pressure, and dataset size before deciding whether the graph will actually be faster than a brute-force scan. For small collections or dense high-dimensional queries, it falls back to exact search — not because the graph is broken, but because the math says brute force is cheaper. The graph state is persisted to `index.qgraph` and loaded on restart, but it is an accelerator, not a crutch.
+
+**Quantization that does not quantize your results.** INT8 scalar quantization stores per-dimension min/max scaling factors and compresses vectors to one byte per dimension. Binary quantization goes further, packing each dimension to a single bit. Both are used exclusively as candidate selectors: they produce a shortlist of promising rows, and then the exact float32 metric runs against that shortlist to produce the final ranking. The quantized artifacts are persisted sidecars (`vectors.qint8`, `vectors.qbinary`) and validated on load. If they are stale or corrupt, the system tells you, not your users.
 
-- Lifecycle: `qihse_vector_db_open`, `qihse_vector_db_close`, `qihse_vector_db_flush`,
-  `qihse_vector_db_checkpoint`, `qihse_vector_db_compact`, `qihse_vector_db_destroy`
-- Mutations: `qihse_vector_db_add_vectors`, `qihse_vector_db_update_by_id`,
-  `qihse_vector_db_delete_by_id`, `qihse_vector_db_upsert_by_ids`
-- Search: `qihse_vector_db_search`, `qihse_vector_db_search_trinary_candidates`
-- Runtime diagnostics: `qihse_vector_db_get_persistence_stats`
+**Sparse vectors handled natively.** If your vectors are mostly zeros — think TF-IDF, think one-hot embeddings, think any high-dimensional space where most dimensions are inactive — QIHSE builds an inverted index with BM25 scoring. The sparse path is not an afterthought or a plugin. It is a first-class query mode, and sparse vectors coexist in the same database as dense ones.
+
+**A query cache with teeth.** Repeated identical queries are cached with FNV-1a hashing keyed on vector contents, top-k, and metric choice. The cache is invalidated automatically on any database mutation. There is no stale-cache bug where you delete a vector and still get it in results because the cache did not notice.
+
+**Configuration that respects your environment.** Drop a `.qihse.conf` in your working directory or home directory. Set `graph.M`, `cache.max_entries`, `search.default_k` — the usual suspects. Environment variables override file values for containerized deployments. No XML, no YAML, no ceremony.
+
+**Python and CLI interfaces.** The core is C, but you do not need to write C to use it. The Python bindings cover the full API, and the CLI tool handles database creation, bulk insertion, index building, and search from the shell.
 
 ## Randomized trinary / qmag benchmarks
 
 From the repo root:
 
 ```bash
-./run-trinary-random-sweep.sh --iterations 1000 --seed 1337 --output-dir results/sweep-1000
+./scripts/run-trinary-random-sweep.sh --iterations 1000 --seed 1337 --output-dir results/sweep-1000
 ```
 
 The same flow is available through `make`:
@@ -191,16 +235,29 @@ make bench-trinary-random-sweep QIHSE_TRINARY_SWEEP_ITERS=10000
 - `make benchmark`
 - `make bench-vxug-pdf-workload` (sample end-to-end flow)
 - `make bench-trinary-search-sweep` (acceleration shape behavior)
+- `make bench-micro` (micro-benchmarks for all query paths)
+- `make bench-memory-hierarchy` (hot/cold tiering behavior)
+
+## Benchmark chart
+
+Run the micro-benchmarks and generate a comparison chart:
+
+```bash
+make bench-micro 2>&1 | tee /tmp/bench_results.txt
+python3 scripts/generate_benchmark_chart.py /tmp/bench_results.txt benchmarks/qihse_benchmark_chart.png
+```
+
+![Benchmark Chart](benchmarks/qihse_benchmark_chart.png)
 
 ## Native build helper (one-line entrypoint)
 
-Use the root helper to auto-detect SIMD and build an optimized native binary safely.
+Use the build helper to auto-detect SIMD and build an optimized native binary safely.
 
 ```bash
-./build-native.sh
+./scripts/build-native.sh
 make build-native
-./build-native.sh --avx2
-./build-native.sh --avx512 --allow-unsupported --cflags "-O3 -DNDEBUG"
+./scripts/build-native.sh --avx2
+./scripts/build-native.sh --avx512 --allow-unsupported --cflags "-O3 -DNDEBUG"
 ```
 
 If you need custom flags, create `./.qihse-build-flags` and set:
@@ -221,4 +278,6 @@ QIHSE_BUILD_ALLOW_UNSUPPORTED=1
 
 ## License
 
-AGPL-3.0-or-later. See [LICENSE](LICENSE).
+**AGPL-3.0-or-later. This is strong copyleft. See [LICENSE](LICENSE) before any commercial use.**
+
+This project is published as a technical showcase and is available for home deployment if you so wish. Bear me in mind if you want a world-class database driving your fancy new framework. Failure to comply with the license terms will be treated as copyright infringement and pursued to the full extent of the law.