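Finish clostera rebrand cleanup

Replace the remaining `PQK` / `PQk-means` naming with `clostera` in docs, scripts, and Rust sources.
User-visible renames in this commit: the `PqkError` type becomes `ClosteraError`, and the
`PQK_ROTATION_BATCH_MIB` and `PQK_PROFILE_CLUSTER` environment variables become
`CLOSTERA_ROTATION_BATCH_MIB` and `CLOSTERA_PROFILE_CLUSTER`. The touched code paths read only the new names.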

ponythewhite · ponythewhite · commit f6e8a0a0f526 · 2026-04-23T22:59:43.000+02:00
diff --git a/README.md b/README.md
@@ -42,7 +42,7 @@ clusterer = clostera.Clusterer(k=256, fastest=True)  # K = number of clusters
 labels = clusterer.fit_transform(vectors)
 ```
 
-`fastest=True` turns off OPQ and uses the plain PQ path. That is the right choice when end-to-end throughput matters more than reconstruction quality. The main speed win is in encoder training and encoding; the final PQk-means assignment stage itself is already fast in both modes.
+`fastest=True` turns off OPQ and uses the plain PQ path. That is the right choice when end-to-end throughput matters more than reconstruction quality. The main speed win is in encoder training and encoding; the final compressed assignment stage itself is already fast in both modes.
 
 ### Out-of-core from parquet
 
@@ -59,7 +59,7 @@ The original repository proved a powerful idea: by clustering in PQ code space i
 
 `clostera` asks the obvious follow-up question:
 
-> what happens if you rebuild PQk-means properly for modern hardware and modern Python workflows?
+> what happens if you rebuild the original `pqkmeans` project properly for modern hardware and modern Python workflows?
 
 On the committed deterministic `10M x 2048` checkpoint, the answer is not subtle.
 
@@ -524,7 +524,7 @@ The classes below expose the encoder/clusterer split directly. Reach for them wh
 | --- | --- | --- | --- |
 | `encoder` | `PQEncoder` | `required` | Trained encoder that defines the codebooks. |
 | `k` | `int \| None` | `None` | Number of target clusters. Here `K` means the number of clusters. `None` enables Rust-side automatic number-of-clusters selection over candidate values in PQ code space. |
-| `iterations` | `int` | `20` | Number of PQk-means update rounds. |
+| `iterations` | `int` | `20` | Number of clustering update rounds. |
 | `seed` | `int` | `0` | Deterministic seed for cluster-center initialization. |
 | `verbose` | `bool` | `False` | Emit inertia diagnostics during fitting. |
 | `lookup_table_bytes` | `int` | `1 << 30` | Memory budget for code-domain lookup tables. Larger budgets favor faster assignment. |
@@ -545,10 +545,10 @@ The classes below expose the encoder/clusterer split directly. Reach for them wh
 | `num_subquantizers` | `int \| None` | `None` | Optional encoder-side PQ subspace count when `encoder` is omitted. |
 | `codebook_size` | `int` | `256` | Optional encoder-side codebook size when `encoder` is omitted. |
 | `encoder_iterations` | `int` | `20` | Encoder training iterations used when `encoder` is omitted. |
-| `seed` | `int` | `0` | Deterministic seed shared by the implicit encoder and the PQk-means clusterer. |
+| `seed` | `int` | `0` | Deterministic seed shared by the implicit encoder and the clusterer. |
 | `opq_iterations` | `int` | `3` | OPQ refinement steps used by the implicit encoder. |
 | `k` | `int \| None` | `None` | Number of target clusters. Here `K` means the number of clusters. `None` enables Rust-side automatic number-of-clusters selection over candidate values in PQ code space. |
-| `iterations` | `int` | `20` | Number of PQk-means update rounds. |
+| `iterations` | `int` | `20` | Number of clustering update rounds. |
 | `verbose` | `bool` | `False` | Emit inertia diagnostics during fitting. |
 | `lookup_table_bytes` | `int` | `1 << 30` | Memory budget for code-domain lookup tables. Larger budgets favor faster assignment. |
 | `auto_k_method` | `str` | `"centroid_silhouette"` | Automatic-number-of-clusters (`K`) scoring rule. Supported values are `"centroid_silhouette"`, `"davies_bouldin"`, `"elbow"`, and `"bic"`. |
@@ -579,7 +579,7 @@ When `k=None`, fitting also populates:
 
 | Environment variable | Meaning |
 | --- | --- |
-| `PQK_ROTATION_BATCH_MIB` | Override the default OPQ rotation batch target in MiB for benchmarking or machine-specific tuning. |
+| `CLOSTERA_ROTATION_BATCH_MIB` | Override the default OPQ rotation batch target in MiB for benchmarking or machine-specific tuning. |
 
 ## Reproducing the benchmark artifacts
 
diff --git a/docs/assets/benchmark_hero.png b/docs/assets/benchmark_hero.png
diff --git a/docs/assets/clostera_hero.png b/docs/assets/clostera_hero.png
diff --git a/docs/assets/clustering_visualization.png b/docs/assets/clustering_visualization.png
diff --git a/notebooks/clostera_showcase.ipynb b/notebooks/clostera_showcase.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "maturin"
 [project]
 name = "clostera"
 version = "1.0.0"
-description = "Modern Rust implementation of PQk-means for large-scale clustering with numpy and parquet workflows"
+description = "Modern Rust rewrite of the original pqkmeans project for large-scale clustering with numpy and parquet workflows"
 readme = "README.md"
 requires-python = ">=3.10"
 license = { file = "LICENSE" }
diff --git a/scripts/generate_demo_notebook.py b/scripts/generate_demo_notebook.py
@@ -57,7 +57,7 @@ def build_notebook() -> dict:
         markdown_cell(
             """# clostera Tutorial
 
-This notebook is a **hands-on tutorial** for using `clostera`, the Rust implementation of PQk-means. It focuses on the public API and the workflows you are most likely to use in practice:
+This notebook is a **hands-on tutorial** for using `clostera`, the Rust rewrite of the original `pqkmeans` project. It focuses on the public API and the workflows you are most likely to use in practice:
 
 1. Use the high-level `Clusterer` API
 2. Cluster with a known number of clusters (`K`)
@@ -185,7 +185,7 @@ def build_notebook() -> dict:
         markdown_cell(
             """## 4. Need maximum throughput? Use `fastest=True`
 
-`fastest=True` turns off OPQ and uses the plain PQ path. That usually gives the best end-to-end throughput, at the cost of somewhat worse reconstruction quality. The main speed win is in encoder training and encoding, not in the final PQk-means assignment loop itself.
+`fastest=True` turns off OPQ and uses the plain PQ path. That usually gives the best end-to-end throughput, at the cost of somewhat worse reconstruction quality. The main speed win is in encoder training and encoding, not in the final compressed assignment loop itself.
 """
         ),
         code_cell(
diff --git a/scripts/render_benchmark_assets.py b/scripts/render_benchmark_assets.py
@@ -615,7 +615,7 @@ def render_hero_asset(args: argparse.Namespace, suite_payload: dict, large_paylo
     ax.text(
         0.05,
         0.695,
-        "A from-scratch Rust rebuild of PQk-means with deterministic initialization,\n"
+        "A from-scratch Rust rebuild of the original pqkmeans project with deterministic initialization,\n"
         "full-core CPU execution, parquet-native ingestion, out-of-core raw-vector workflows, and automatic K selection.",
         fontsize=17,
         color=phosphor_dim,
@@ -968,7 +968,7 @@ def toy_visualization(output_path: Path) -> None:
     axes[0].scatter(vectors[:, 0], vectors[:, 1], c=truth, s=10, cmap=cmap, alpha=0.8)
     axes[0].set_title("Ground truth clusters")
     axes[1].scatter(vectors[:, 0], vectors[:, 1], c=predicted, s=10, cmap=cmap, alpha=0.8)
-    axes[1].set_title("PQKMeans prediction")
+    axes[1].set_title("clostera prediction")
     for axis in axes:
         axis.set_xlabel("x0")
         axis.set_ylabel("x1")
diff --git a/scripts/render_sexy_hero.py b/scripts/render_sexy_hero.py
@@ -53,10 +53,10 @@ def render_hero():
     ax.imshow(Z, extent=(0, 1, 0, 1), cmap="Blues", alpha=0.1, aspect="auto", zorder=0)
 
     # --- Header Area ---
-    ax.text(0.05, 0.88, "PQK ENGINE", color=THEME["blue"], fontsize=12, weight="bold")
+    ax.text(0.05, 0.88, "CLOSTERA ENGINE", color=THEME["blue"], fontsize=12, weight="bold")
     ax.text(0.05, 0.74, "Billion-Scale Clustering\nOn Your Laptop", 
             color=THEME["text"], fontsize=36, weight="bold", linespacing=1.1, va="top")
-    ax.text(0.05, 0.44, "The high-performance Rust rebuild of PQk-means.\nOptimized for 2026 hardware. Engineered for scale.", 
+    ax.text(0.05, 0.44, "The high-performance Rust rebuild of the original pqkmeans project.\nOptimized for 2026 hardware. Engineered for scale.", 
             color=THEME["subtext"], fontsize=14, va="top", linespacing=1.4)
 
     # Tech Stack Badges
diff --git a/scripts/run_impl_eval.py b/scripts/run_impl_eval.py
@@ -92,9 +92,9 @@ def apply_thread_settings(args: argparse.Namespace) -> dict[str, int | None]:
         os.environ.pop("RAYON_NUM_THREADS", None)
 
     if args.rotation_batch_mib > 0:
-        os.environ["PQK_ROTATION_BATCH_MIB"] = str(args.rotation_batch_mib)
+        os.environ["CLOSTERA_ROTATION_BATCH_MIB"] = str(args.rotation_batch_mib)
     else:
-        os.environ.pop("PQK_ROTATION_BATCH_MIB", None)
+        os.environ.pop("CLOSTERA_ROTATION_BATCH_MIB", None)
 
     return {
         "blas_threads": args.blas_threads if args.blas_threads > 0 else None,
diff --git a/scripts/run_k_sweep_impl.py b/scripts/run_k_sweep_impl.py
@@ -85,9 +85,9 @@ def apply_thread_settings(args: argparse.Namespace) -> dict[str, int | None]:
         os.environ.pop("RAYON_NUM_THREADS", None)
 
     if args.rotation_batch_mib > 0:
-        os.environ["PQK_ROTATION_BATCH_MIB"] = str(args.rotation_batch_mib)
+        os.environ["CLOSTERA_ROTATION_BATCH_MIB"] = str(args.rotation_batch_mib)
     else:
-        os.environ.pop("PQK_ROTATION_BATCH_MIB", None)
+        os.environ.pop("CLOSTERA_ROTATION_BATCH_MIB", None)
 
     return {
         "blas_threads": args.blas_threads if args.blas_threads > 0 else None,
diff --git a/scripts/run_n_sweep_impl.py b/scripts/run_n_sweep_impl.py
@@ -96,9 +96,9 @@ def apply_thread_settings(args: argparse.Namespace) -> dict[str, int | None]:
         os.environ.pop("RAYON_NUM_THREADS", None)
 
     if args.rotation_batch_mib > 0:
-        os.environ["PQK_ROTATION_BATCH_MIB"] = str(args.rotation_batch_mib)
+        os.environ["CLOSTERA_ROTATION_BATCH_MIB"] = str(args.rotation_batch_mib)
     else:
-        os.environ.pop("PQK_ROTATION_BATCH_MIB", None)
+        os.environ.pop("CLOSTERA_ROTATION_BATCH_MIB", None)
 
     return {
         "blas_threads": args.blas_threads if args.blas_threads > 0 else None,
diff --git a/src/error.rs b/src/error.rs
@@ -2,10 +2,10 @@ use ndarray::ShapeError;
 use ndarray_linalg::error::LinalgError;
 use thiserror::Error;
 
-pub type Result<T> = std::result::Result<T, PqkError>;
+pub type Result<T> = std::result::Result<T, ClosteraError>;
 
 #[derive(Debug, Error)]
-pub enum PqkError {
+pub enum ClosteraError {
     #[error("{0}")]
     InvalidArgument(String),
     #[error("shape error: {0}")]
@@ -14,6 +14,6 @@ pub enum PqkError {
     Linalg(#[from] LinalgError),
 }
 
-pub fn invalid_argument(message: impl Into<String>) -> PqkError {
-    PqkError::InvalidArgument(message.into())
+pub fn invalid_argument(message: impl Into<String>) -> ClosteraError {
+    ClosteraError::InvalidArgument(message.into())
 }
diff --git a/src/math.rs b/src/math.rs
@@ -138,7 +138,7 @@ pub fn rotation_batch_mib(default_target: usize) -> usize {
     static OVERRIDE: OnceLock<Option<usize>> = OnceLock::new();
     OVERRIDE
         .get_or_init(|| {
-            std::env::var("PQK_ROTATION_BATCH_MIB")
+            std::env::var("CLOSTERA_ROTATION_BATCH_MIB")
                 .ok()
                 .and_then(|value| value.parse::<usize>().ok())
                 .filter(|&value| value > 0)
diff --git a/src/pqkmeans.rs b/src/pqkmeans.rs
@@ -49,7 +49,7 @@ struct FitProfile {
 
 impl FitProfile {
     fn from_env() -> Self {
-        let enabled = std::env::var_os("PQK_PROFILE_CLUSTER").is_some();
+        let enabled = std::env::var_os("CLOSTERA_PROFILE_CLUSTER").is_some();
         Self {
             enabled,
             ..Self::default()
diff --git a/src/python_bindings.rs b/src/python_bindings.rs
@@ -5,10 +5,10 @@ use pyo3::prelude::*;
 use pyo3::types::PyDict;
 
 use crate::autok::{AutoKMethod, analyze_k_candidates as analyze_k_candidates_impl};
-use crate::error::PqkError;
+use crate::error::ClosteraError;
 use crate::{PqKMeans, ProductQuantizer};
 
-fn to_py_err(error: PqkError) -> PyErr {
+fn to_py_err(error: ClosteraError) -> PyErr {
     PyValueError::new_err(error.to_string())
 }