aaronsb
diff --git a/‎README.md‎
Lines changed: 3 additions & 0 deletions b/‎README.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎api/app/routes/projection.py‎
Lines changed: 5 additions & 0 deletions b/‎api/app/routes/projection.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎api/app/services/embedding_projection_service.py‎
Lines changed: 183 additions & 1 deletion b/‎api/app/services/embedding_projection_service.py‎
Lines changed: 183 additions & 1 deletion
diff --git a/‎docs/features/web-workstation.md‎
Lines changed: 6 additions & 4 deletions b/‎docs/features/web-workstation.md‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎docs/media/screenshots/web-embedding-landscape-clusters.png‎
552 KB b/‎docs/media/screenshots/web-embedding-landscape-clusters.png‎
552 KB
diff --git a/‎docs/media/screenshots/web-embedding-landscape.png‎
169 KB b/‎docs/media/screenshots/web-embedding-landscape.png‎
169 KB
@@ -47,6 +47,9 @@ See [Quick Start Guide](docs/operating/quick-start.md) for details.
 ![CLI with Inline Image Evidence](docs/media/screenshots/cli-search-with-images.png)
 *Command-line search returns concepts with source images rendered inline via chafa*
 
+![Embedding Landscape with DBSCAN Clusters](docs/media/screenshots/web-embedding-landscape-clusters.png)
+*t-SNE embedding landscape with auto-detected clusters, named by topic via TF-IDF*
+
 ## What You Can Do
 
 **Ingest documents** — PDFs, markdown, images, text. The system extracts concepts, relationships, and evidence automatically.
 
@@ -39,6 +39,7 @@ class ProjectionConceptResponse(BaseModel):
     diversity_related_count: Optional[int] = None
     ontology: Optional[str] = None  # Source ontology (for cross-ontology mode)
     item_type: Optional[str] = None  # Item type (for combined mode)
+    cluster_id: Optional[int] = None  # DBSCAN cluster assignment (None = noise)
 
 
 class ProjectionParametersResponse(BaseModel):
@@ -60,6 +61,10 @@ class ProjectionStatisticsResponse(BaseModel):
     embedding_dims: int
     grounding_range: Optional[List[float]] = None
     diversity_range: Optional[List[float]] = None
+    cluster_count: Optional[int] = None
+    cluster_sizes: Optional[Dict[str, int]] = None
+    cluster_names: Optional[Dict[str, str]] = None
+    cluster_noise_count: Optional[int] = None
 
 
 class ProjectionDatasetResponse(BaseModel):
 
@@ -14,6 +14,8 @@
 import json
 import logging
 import hashlib
+import math
+from collections import Counter
 from datetime import datetime
 from typing import Dict, List, Optional, Any, Literal
 import numpy as np
@@ -28,6 +30,14 @@
     TSNE_AVAILABLE = False
     logger.warning("sklearn.manifold.TSNE not available")
 
+try:
+    from sklearn.cluster import DBSCAN
+    from sklearn.neighbors import NearestNeighbors
+    DBSCAN_AVAILABLE = True
+except ImportError:
+    DBSCAN_AVAILABLE = False
+    logger.warning("sklearn.cluster.DBSCAN not available")
+
 try:
     from umap import UMAP
     UMAP_AVAILABLE = True
@@ -685,6 +695,163 @@ def compute_projection(
 
         return projection.astype(np.float32)
 
+    def _compute_clusters(self, projection: np.ndarray, min_samples: int = 5) -> Dict[str, Any]:
+        """Run DBSCAN on projected coordinates to identify spatial clusters.
+
+        Auto-tunes eps using the 40th percentile of k-NN distances. This
+        produces clusters where no single cluster dominates, giving a
+        "political map" coloring of the embedding space.
+
+        Args:
+            projection: (N, D) array of projected coordinates
+            min_samples: DBSCAN min_samples parameter
+
+        Returns:
+            Dict with cluster_labels, cluster_count, cluster_sizes,
+            eps_used, noise_count
+        """
+        if not DBSCAN_AVAILABLE or len(projection) < min_samples:
+            return {
+                "cluster_labels": np.full(len(projection), -1, dtype=int),
+                "cluster_count": 0,
+                "cluster_sizes": {},
+                "eps_used": 0.0,
+                "noise_count": len(projection),
+            }
+
+        # Compute k-NN distances for eps estimation
+        k = min_samples
+        nn = NearestNeighbors(n_neighbors=k)
+        nn.fit(projection)
+        distances, _ = nn.kneighbors(projection)
+        k_distances = np.sort(distances[:, -1])
+
+        # Use 40th percentile — empirically produces balanced clusters where
+        # no single cluster dominates (largest ~10% of points).
+        # Higher percentiles merge too aggressively; lower ones fragment.
+        eps = float(np.percentile(k_distances, 40))
+
+        # Floor at 1% of data range (minimum 1e-6) to avoid degenerate eps=0
+        data_range = float(np.max(projection.max(axis=0) - projection.min(axis=0)))
+        eps = max(eps, data_range * 0.01, 1e-6)
+
+        # Run DBSCAN
+        db = DBSCAN(eps=eps, min_samples=min_samples)
+        labels = db.fit_predict(projection)
+
+        # Compute stats
+        unique = set(labels)
+        unique.discard(-1)
+        cluster_sizes = {}
+        for label in unique:
+            cluster_sizes[str(int(label))] = int(np.sum(labels == label))
+        noise_count = int(np.sum(labels == -1))
+
+        logger.info(
+            f"DBSCAN clustering: {len(unique)} clusters, "
+            f"{noise_count} noise points, eps={eps:.3f}"
+        )
+
+        return {
+            "cluster_labels": labels,
+            "cluster_count": len(unique),
+            "cluster_sizes": cluster_sizes,
+            "eps_used": eps,
+            "noise_count": noise_count,
+        }
+
+    # Common English stop words for cluster naming
+    _STOP_WORDS = frozenset({
+        "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
+        "of", "with", "by", "from", "is", "are", "was", "were", "be", "been",
+        "being", "have", "has", "had", "do", "does", "did", "will", "would",
+        "could", "should", "may", "might", "shall", "can", "need", "must",
+        "not", "no", "nor", "so", "if", "then", "than", "that", "this",
+        "these", "those", "it", "its", "as", "up", "out", "about", "into",
+        "over", "after", "before", "between", "under", "above", "below",
+        "all", "each", "every", "both", "few", "more", "most", "other",
+        "some", "such", "only", "own", "same", "too", "very", "just",
+        "because", "through", "during", "while", "where", "when", "how",
+        "what", "which", "who", "whom", "why", "any", "many", "much",
+        "also", "back", "even", "still", "well", "way", "use", "her",
+        "his", "he", "she", "they", "we", "you", "your", "their", "our",
+        "us", "me", "my", "based", "using", "used", "via", "per", "vs",
+    })
+
+    def _name_clusters(
+        self,
+        labels: np.ndarray,
+        items: List[Dict[str, Any]],
+    ) -> Dict[int, str]:
+        """Derive descriptive names for clusters from concept labels.
+
+        Uses TF-IDF-style scoring: terms frequent within a cluster but rare
+        across other clusters get the highest score. Top 2 terms form the name.
+
+        Args:
+            labels: DBSCAN cluster assignment per item (-1 = noise)
+            items: List of item dicts with "label" keys
+
+        Returns:
+            Dict mapping cluster_id -> descriptive name string
+        """
+        unique = set(labels)
+        unique.discard(-1)
+        if not unique:
+            return {}
+
+        # Tokenize: collect word counts per cluster
+        cluster_words: Dict[int, Counter] = {}
+        for i, item in enumerate(items):
+            cid = int(labels[i])
+            if cid == -1:
+                continue
+            if cid not in cluster_words:
+                cluster_words[cid] = Counter()
+            words = item.get("label", "").lower().split()
+            for w in words:
+                # Strip non-alpha chars, skip short/stop words
+                w = w.strip("()-/,:;\"'")
+                if len(w) <= 2 or w in self._STOP_WORDS:
+                    continue
+                cluster_words[cid][w] += 1
+
+        # Document frequency: how many clusters contain each term
+        num_clusters = len(unique)
+        doc_freq: Counter = Counter()
+        for wc in cluster_words.values():
+            for w in wc:
+                doc_freq[w] += 1
+
+        # Score terms per cluster: tf * idf
+        # Use str keys to match Pydantic Dict[str, str] models
+        cluster_names: Dict[str, str] = {}
+        for cid in sorted(int(c) for c in unique):
+            wc = cluster_words.get(cid, Counter())
+            key = str(cid)
+            if not wc:
+                cluster_names[key] = f"Cluster {cid}"
+                continue
+
+            total = sum(wc.values())
+            scored = []
+            for w, count in wc.items():
+                tf = count / total
+                if num_clusters <= 1:
+                    # Single cluster: rank by frequency only
+                    scored.append((w, tf, count))
+                else:
+                    idf = math.log(num_clusters / doc_freq[w]) if doc_freq[w] < num_clusters else 0.1
+                    scored.append((w, tf * idf, count))
+
+            # Sort by score desc, break ties by raw count
+            scored.sort(key=lambda x: (-x[1], -x[2]))
+            # Take top 2 terms, title-case
+            top = [s[0].title() for s in scored[:2]]
+            cluster_names[key] = " ".join(top) if top else f"Cluster {cid}"
+
+        return cluster_names
+
     def generate_projection_dataset(
         self,
         ontology: str,
@@ -814,6 +981,14 @@ def generate_projection_dataset(
             center=center
         )
 
+        # Run DBSCAN clustering on projected coordinates
+        cluster_result = self._compute_clusters(projection)
+        cluster_labels = cluster_result["cluster_labels"]
+
+        # Derive descriptive names for each cluster from concept labels
+        cluster_names = self._name_clusters(cluster_labels, items)
+        cluster_result["cluster_names"] = cluster_names
+
         # Batch compute fresh grounding if requested (only for concepts)
         fresh_groundings = {}
         if include_grounding and refresh_grounding and embedding_source in ("concepts", "combined"):
@@ -846,7 +1021,8 @@ def generate_projection_dataset(
                 "label": item["label"],
                 "x": coords[0],
                 "y": coords[1],
-                "z": coords[2] if n_components == 3 else 0.0
+                "z": coords[2] if n_components == 3 else 0.0,
+                "cluster_id": int(cluster_labels[i]) if cluster_labels[i] != -1 else None
             }
 
             # Add item type for combined mode
@@ -892,6 +1068,12 @@ def generate_projection_dataset(
         if diversity_values:
             stats["diversity_range"] = [min(diversity_values), max(diversity_values)]
 
+        # Cluster statistics
+        stats["cluster_count"] = cluster_result["cluster_count"]
+        stats["cluster_sizes"] = cluster_result["cluster_sizes"]
+        stats["cluster_names"] = cluster_result["cluster_names"]
+        stats["cluster_noise_count"] = cluster_result["noise_count"]
+
         # Generate changelist ID for cache invalidation
         changelist_id = self._generate_changelist_id(f"{ontology}:{embedding_source}", len(items))
 
 
@@ -75,15 +75,17 @@ Project concepts onto a semantic spectrum between two poles.
 
 ![Embedding Landscape](../media/screenshots/web-embedding-landscape.png)
 
-3D visualization of all concept embeddings using t-SNE or UMAP.
+3D visualization of all concept embeddings using t-SNE or UMAP with automatic DBSCAN cluster detection.
 
 **What you can do:**
 - See the overall shape of your semantic space
-- Identify natural clusters before diving into details
-- Click two concepts to preview a polarity axis
+- View auto-detected clusters with TF-IDF-derived names
+- Toggle cluster visibility to focus on specific regions
+- Switch color palettes (Bold, Warm→Cool, Earth) and sort by name, count, or color
+- Right-click any concept for details and to examine in force graph
 - Plan analysis based on what you see
 
-**Best for:** Discovering semantic dimensions, validating embeddings, global overview before detailed exploration.
+**Best for:** Discovering semantic dimensions, identifying topic clusters, validating embeddings, global overview before detailed exploration.
 
 ---