Skip to content

Commit af95b3e

Browse files
committed
Merge branch 'feat/dbscan-cluster-coloring' into main
2 parents a53745d + 9407d37 commit af95b3e

11 files changed

Lines changed: 665 additions & 62 deletions

File tree

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ See [Quick Start Guide](docs/operating/quick-start.md) for details.
4747
![CLI with Inline Image Evidence](docs/media/screenshots/cli-search-with-images.png)
4848
*Command-line search returns concepts with source images rendered inline via chafa*
4949

50+
![Embedding Landscape with DBSCAN Clusters](docs/media/screenshots/web-embedding-landscape-clusters.png)
51+
*t-SNE embedding landscape with auto-detected clusters, named by topic via TF-IDF*
52+
5053
## What You Can Do
5154

5255
**Ingest documents** — PDFs, markdown, images, text. The system extracts concepts, relationships, and evidence automatically.

api/app/routes/projection.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ class ProjectionConceptResponse(BaseModel):
3939
diversity_related_count: Optional[int] = None
4040
ontology: Optional[str] = None # Source ontology (for cross-ontology mode)
4141
item_type: Optional[str] = None # Item type (for combined mode)
42+
cluster_id: Optional[int] = None # DBSCAN cluster assignment (None = noise)
4243

4344

4445
class ProjectionParametersResponse(BaseModel):
@@ -60,6 +61,10 @@ class ProjectionStatisticsResponse(BaseModel):
6061
embedding_dims: int
6162
grounding_range: Optional[List[float]] = None
6263
diversity_range: Optional[List[float]] = None
64+
cluster_count: Optional[int] = None
65+
cluster_sizes: Optional[Dict[str, int]] = None
66+
cluster_names: Optional[Dict[str, str]] = None
67+
cluster_noise_count: Optional[int] = None
6368

6469

6570
class ProjectionDatasetResponse(BaseModel):

api/app/services/embedding_projection_service.py

Lines changed: 183 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
import json
1515
import logging
1616
import hashlib
17+
import math
18+
from collections import Counter
1719
from datetime import datetime
1820
from typing import Dict, List, Optional, Any, Literal
1921
import numpy as np
@@ -28,6 +30,14 @@
2830
TSNE_AVAILABLE = False
2931
logger.warning("sklearn.manifold.TSNE not available")
3032

33+
try:
34+
from sklearn.cluster import DBSCAN
35+
from sklearn.neighbors import NearestNeighbors
36+
DBSCAN_AVAILABLE = True
37+
except ImportError:
38+
DBSCAN_AVAILABLE = False
39+
logger.warning("sklearn.cluster.DBSCAN not available")
40+
3141
try:
3242
from umap import UMAP
3343
UMAP_AVAILABLE = True
@@ -685,6 +695,163 @@ def compute_projection(
685695

686696
return projection.astype(np.float32)
687697

698+
def _compute_clusters(
    self,
    projection: np.ndarray,
    min_samples: int = 5,
    eps_percentile: float = 40.0,
) -> Dict[str, Any]:
    """Run DBSCAN on projected coordinates to identify spatial clusters.

    ``eps`` is auto-tuned from the data: for every point the distance to
    its ``min_samples``-th nearest neighbour (the query point itself is
    included in that count) is computed, and the ``eps_percentile``-th
    percentile of those distances is used. The value is then floored at
    1% of the widest coordinate extent (and at 1e-6) so that coincident
    points cannot produce ``eps == 0``.

    Args:
        projection: (N, D) array of projected coordinates.
        min_samples: DBSCAN ``min_samples``; also the ``k`` used for the
            k-NN distance estimate.
        eps_percentile: Percentile (0-100) of the k-NN distances used as
            ``eps``. The default of 40 was chosen empirically by the
            original author: higher values merge clusters too
            aggressively, lower values fragment them.

    Returns:
        Dict with:
            cluster_labels: per-point label array (-1 = noise)
            cluster_count: number of clusters found (noise excluded)
            cluster_sizes: mapping of ``str(cluster_id)`` -> point count
            eps_used: the ``eps`` passed to DBSCAN (0.0 if skipped)
            noise_count: number of points labelled as noise

        When DBSCAN is unavailable or there are fewer than
        ``min_samples`` points, every point is reported as noise.
    """
    n_points = len(projection)
    if not DBSCAN_AVAILABLE or n_points < min_samples:
        return {
            "cluster_labels": np.full(n_points, -1, dtype=int),
            "cluster_count": 0,
            "cluster_sizes": {},
            "eps_used": 0.0,
            "noise_count": n_points,
        }

    # Distance from each point to its k-th neighbour; the last column of
    # the kneighbors result is the farthest of the k returned neighbours.
    neighbors = NearestNeighbors(n_neighbors=min_samples)
    neighbors.fit(projection)
    distances, _ = neighbors.kneighbors(projection)
    eps = float(np.percentile(distances[:, -1], eps_percentile))

    # Floor at 1% of data range (minimum 1e-6) to avoid degenerate eps=0
    data_range = float(np.max(projection.max(axis=0) - projection.min(axis=0)))
    eps = max(eps, data_range * 0.01, 1e-6)

    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(projection)

    # Count members of every real cluster in a single pass.
    cluster_ids, member_counts = np.unique(labels[labels != -1], return_counts=True)
    cluster_sizes = {
        str(int(cid)): int(count)
        for cid, count in zip(cluster_ids, member_counts)
    }
    noise_count = int(np.count_nonzero(labels == -1))

    logger.info(
        "DBSCAN clustering: %d clusters, %d noise points, eps=%.3f",
        len(cluster_sizes),
        noise_count,
        eps,
    )

    return {
        "cluster_labels": labels,
        "cluster_count": len(cluster_sizes),
        "cluster_sizes": cluster_sizes,
        "eps_used": eps,
        "noise_count": noise_count,
    }
762+
763+
# Lowercase words ignored when deriving cluster names. _name_clusters
# lowercases each label token and skips it if it appears here, so that
# filler words cannot become part of a cluster's name. Besides common
# English stop words the set also holds a few generic terms ("based",
# "using", "used", "via", "per", "vs").
# NOTE: _name_clusters already drops tokens of length <= 2 before checking
# this set, so the one- and two-letter entries are redundant there.
764+
_STOP_WORDS = frozenset({
765+
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
766+
"of", "with", "by", "from", "is", "are", "was", "were", "be", "been",
767+
"being", "have", "has", "had", "do", "does", "did", "will", "would",
768+
"could", "should", "may", "might", "shall", "can", "need", "must",
769+
"not", "no", "nor", "so", "if", "then", "than", "that", "this",
770+
"these", "those", "it", "its", "as", "up", "out", "about", "into",
771+
"over", "after", "before", "between", "under", "above", "below",
772+
"all", "each", "every", "both", "few", "more", "most", "other",
773+
"some", "such", "only", "own", "same", "too", "very", "just",
774+
"because", "through", "during", "while", "where", "when", "how",
775+
"what", "which", "who", "whom", "why", "any", "many", "much",
776+
"also", "back", "even", "still", "well", "way", "use", "her",
777+
"his", "he", "she", "they", "we", "you", "your", "their", "our",
778+
"us", "me", "my", "based", "using", "used", "via", "per", "vs",
779+
})
780+
781+
def _name_clusters(
    self,
    labels: np.ndarray,
    items: List[Dict[str, Any]],
) -> Dict[str, str]:
    """Derive descriptive names for clusters from concept labels.

    Uses TF-IDF-style scoring: terms frequent within a cluster but rare
    across other clusters get the highest score. The top 2 terms,
    title-cased and joined by a space, form the name. A cluster whose
    items yield no usable terms is named ``"Cluster <id>"``.

    Args:
        labels: Cluster assignment per item (-1 = noise). Expected to be
            aligned with ``items`` (same length, same order).
        items: List of item dicts with "label" keys. A missing or
            ``None`` label is treated as an empty string.

    Returns:
        Dict mapping ``str(cluster_id)`` -> descriptive name string.
        Keys are strings so the result matches the ``Dict[str, str]``
        response model. Empty when there are no non-noise clusters.
    """
    cluster_ids = sorted({int(c) for c in labels} - {-1})
    if not cluster_ids:
        return {}

    # Tokenize: collect word counts per cluster
    cluster_words: Dict[int, Counter] = {}
    for raw_cid, item in zip(labels, items):
        cid = int(raw_cid)
        if cid == -1:
            continue
        counts = cluster_words.setdefault(cid, Counter())
        # `or ""` also covers a label that is present but None.
        text = item.get("label") or ""
        for token in text.lower().split():
            # Trim surrounding brackets/punctuation (only this fixed set),
            # then skip short tokens and stop words.
            token = token.strip("()-/,:;\"'")
            if len(token) <= 2 or token in self._STOP_WORDS:
                continue
            counts[token] += 1

    # Document frequency: how many clusters contain each term
    num_clusters = len(cluster_ids)
    doc_freq: Counter = Counter(
        word for counts in cluster_words.values() for word in counts
    )

    cluster_names: Dict[str, str] = {}
    for cid in cluster_ids:
        key = str(cid)
        counts = cluster_words.get(cid)
        if not counts:
            cluster_names[key] = f"Cluster {cid}"
            continue

        total = sum(counts.values())
        scored = []
        for word, count in counts.items():
            tf = count / total
            if num_clusters <= 1:
                # Single cluster: rank by frequency only
                score = tf
            elif doc_freq[word] < num_clusters:
                score = tf * math.log(num_clusters / doc_freq[word])
            else:
                # Term occurs in every cluster: log(1) would zero it out,
                # so give it a small fixed weight instead.
                score = tf * 0.1
            scored.append((word, score, count))

        # Sort by score desc, break ties by raw count; remaining ties keep
        # first-seen order because the sort is stable.
        scored.sort(key=lambda entry: (-entry[1], -entry[2]))
        cluster_names[key] = " ".join(word.title() for word, _, _ in scored[:2])

    return cluster_names
854+
688855
def generate_projection_dataset(
689856
self,
690857
ontology: str,
@@ -814,6 +981,14 @@ def generate_projection_dataset(
814981
center=center
815982
)
816983

984+
# Run DBSCAN clustering on projected coordinates
985+
cluster_result = self._compute_clusters(projection)
986+
cluster_labels = cluster_result["cluster_labels"]
987+
988+
# Derive descriptive names for each cluster from concept labels
989+
cluster_names = self._name_clusters(cluster_labels, items)
990+
cluster_result["cluster_names"] = cluster_names
991+
817992
# Batch compute fresh grounding if requested (only for concepts)
818993
fresh_groundings = {}
819994
if include_grounding and refresh_grounding and embedding_source in ("concepts", "combined"):
@@ -846,7 +1021,8 @@ def generate_projection_dataset(
8461021
"label": item["label"],
8471022
"x": coords[0],
8481023
"y": coords[1],
849-
"z": coords[2] if n_components == 3 else 0.0
1024+
"z": coords[2] if n_components == 3 else 0.0,
1025+
"cluster_id": int(cluster_labels[i]) if cluster_labels[i] != -1 else None
8501026
}
8511027

8521028
# Add item type for combined mode
@@ -892,6 +1068,12 @@ def generate_projection_dataset(
8921068
if diversity_values:
8931069
stats["diversity_range"] = [min(diversity_values), max(diversity_values)]
8941070

1071+
# Cluster statistics
1072+
stats["cluster_count"] = cluster_result["cluster_count"]
1073+
stats["cluster_sizes"] = cluster_result["cluster_sizes"]
1074+
stats["cluster_names"] = cluster_result["cluster_names"]
1075+
stats["cluster_noise_count"] = cluster_result["noise_count"]
1076+
8951077
# Generate changelist ID for cache invalidation
8961078
changelist_id = self._generate_changelist_id(f"{ontology}:{embedding_source}", len(items))
8971079

docs/features/web-workstation.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,15 +75,17 @@ Project concepts onto a semantic spectrum between two poles.
7575

7676
![Embedding Landscape](../media/screenshots/web-embedding-landscape.png)
7777

78-
3D visualization of all concept embeddings using t-SNE or UMAP.
78+
3D visualization of all concept embeddings using t-SNE or UMAP with automatic DBSCAN cluster detection.
7979

8080
**What you can do:**
8181
- See the overall shape of your semantic space
82-
- Identify natural clusters before diving into details
83-
- Click two concepts to preview a polarity axis
82+
- View auto-detected clusters with TF-IDF-derived names
83+
- Toggle cluster visibility to focus on specific regions
84+
- Switch color palettes (Bold, Warm→Cool, Earth) and sort by name, count, or color
85+
- Right-click any concept for details and to examine in force graph
8486
- Plan analysis based on what you see
8587

86-
**Best for:** Discovering semantic dimensions, validating embeddings, global overview before detailed exploration.
88+
**Best for:** Discovering semantic dimensions, identifying topic clusters, validating embeddings, global overview before detailed exploration.
8789

8890
---
8991

552 KB
Loading
169 KB
Loading

0 commit comments

Comments
 (0)