|
14 | 14 | import json |
15 | 15 | import logging |
16 | 16 | import hashlib |
| 17 | +import math |
| 18 | +from collections import Counter |
17 | 19 | from datetime import datetime |
18 | 20 | from typing import Dict, List, Optional, Any, Literal |
19 | 21 | import numpy as np |
|
28 | 30 | TSNE_AVAILABLE = False |
29 | 31 | logger.warning("sklearn.manifold.TSNE not available") |
30 | 32 |
|
| 33 | +try: |
| 34 | + from sklearn.cluster import DBSCAN |
| 35 | + from sklearn.neighbors import NearestNeighbors |
| 36 | + DBSCAN_AVAILABLE = True |
| 37 | +except ImportError: |
| 38 | + DBSCAN_AVAILABLE = False |
| 39 | + logger.warning("sklearn.cluster.DBSCAN not available") |
| 40 | + |
31 | 41 | try: |
32 | 42 | from umap import UMAP |
33 | 43 | UMAP_AVAILABLE = True |
@@ -685,6 +695,163 @@ def compute_projection( |
685 | 695 |
|
686 | 696 | return projection.astype(np.float32) |
687 | 697 |
|
| 698 | + def _compute_clusters(self, projection: np.ndarray, min_samples: int = 5) -> Dict[str, Any]: |
| 699 | + """Run DBSCAN on projected coordinates to identify spatial clusters. |
| 700 | +
|
| 701 | + Auto-tunes eps using the 40th percentile of k-NN distances. This |
| 702 | + produces clusters where no single cluster dominates, giving a |
| 703 | + "political map" coloring of the embedding space. |
| 704 | +
|
| 705 | + Args: |
| 706 | + projection: (N, D) array of projected coordinates |
| 707 | + min_samples: DBSCAN min_samples parameter |
| 708 | +
|
| 709 | + Returns: |
| 710 | + Dict with cluster_labels, cluster_count, cluster_sizes, |
| 711 | + eps_used, noise_count |
| 712 | + """ |
| 713 | + if not DBSCAN_AVAILABLE or len(projection) < min_samples: |
| 714 | + return { |
| 715 | + "cluster_labels": np.full(len(projection), -1, dtype=int), |
| 716 | + "cluster_count": 0, |
| 717 | + "cluster_sizes": {}, |
| 718 | + "eps_used": 0.0, |
| 719 | + "noise_count": len(projection), |
| 720 | + } |
| 721 | + |
| 722 | + # Compute k-NN distances for eps estimation |
| 723 | + k = min_samples |
| 724 | + nn = NearestNeighbors(n_neighbors=k) |
| 725 | + nn.fit(projection) |
| 726 | + distances, _ = nn.kneighbors(projection) |
| 727 | + k_distances = np.sort(distances[:, -1]) |
| 728 | + |
| 729 | + # Use 40th percentile — empirically produces balanced clusters where |
| 730 | + # no single cluster dominates (largest ~10% of points). |
| 731 | + # Higher percentiles merge too aggressively; lower ones fragment. |
| 732 | + eps = float(np.percentile(k_distances, 40)) |
| 733 | + |
| 734 | + # Floor at 1% of data range (minimum 1e-6) to avoid degenerate eps=0 |
| 735 | + data_range = float(np.max(projection.max(axis=0) - projection.min(axis=0))) |
| 736 | + eps = max(eps, data_range * 0.01, 1e-6) |
| 737 | + |
| 738 | + # Run DBSCAN |
| 739 | + db = DBSCAN(eps=eps, min_samples=min_samples) |
| 740 | + labels = db.fit_predict(projection) |
| 741 | + |
| 742 | + # Compute stats |
| 743 | + unique = set(labels) |
| 744 | + unique.discard(-1) |
| 745 | + cluster_sizes = {} |
| 746 | + for label in unique: |
| 747 | + cluster_sizes[str(int(label))] = int(np.sum(labels == label)) |
| 748 | + noise_count = int(np.sum(labels == -1)) |
| 749 | + |
| 750 | + logger.info( |
| 751 | + f"DBSCAN clustering: {len(unique)} clusters, " |
| 752 | + f"{noise_count} noise points, eps={eps:.3f}" |
| 753 | + ) |
| 754 | + |
| 755 | + return { |
| 756 | + "cluster_labels": labels, |
| 757 | + "cluster_count": len(unique), |
| 758 | + "cluster_sizes": cluster_sizes, |
| 759 | + "eps_used": eps, |
| 760 | + "noise_count": noise_count, |
| 761 | + } |
| 762 | + |
| 763 | + # Common English stop words for cluster naming |
| 764 | + _STOP_WORDS = frozenset({ |
| 765 | + "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", |
| 766 | + "of", "with", "by", "from", "is", "are", "was", "were", "be", "been", |
| 767 | + "being", "have", "has", "had", "do", "does", "did", "will", "would", |
| 768 | + "could", "should", "may", "might", "shall", "can", "need", "must", |
| 769 | + "not", "no", "nor", "so", "if", "then", "than", "that", "this", |
| 770 | + "these", "those", "it", "its", "as", "up", "out", "about", "into", |
| 771 | + "over", "after", "before", "between", "under", "above", "below", |
| 772 | + "all", "each", "every", "both", "few", "more", "most", "other", |
| 773 | + "some", "such", "only", "own", "same", "too", "very", "just", |
| 774 | + "because", "through", "during", "while", "where", "when", "how", |
| 775 | + "what", "which", "who", "whom", "why", "any", "many", "much", |
| 776 | + "also", "back", "even", "still", "well", "way", "use", "her", |
| 777 | + "his", "he", "she", "they", "we", "you", "your", "their", "our", |
| 778 | + "us", "me", "my", "based", "using", "used", "via", "per", "vs", |
| 779 | + }) |
| 780 | + |
| 781 | + def _name_clusters( |
| 782 | + self, |
| 783 | + labels: np.ndarray, |
| 784 | + items: List[Dict[str, Any]], |
| 785 | + ) -> Dict[int, str]: |
| 786 | + """Derive descriptive names for clusters from concept labels. |
| 787 | +
|
| 788 | + Uses TF-IDF-style scoring: terms frequent within a cluster but rare |
| 789 | + across other clusters get the highest score. Top 2 terms form the name. |
| 790 | +
|
| 791 | + Args: |
| 792 | + labels: DBSCAN cluster assignment per item (-1 = noise) |
| 793 | + items: List of item dicts with "label" keys |
| 794 | +
|
| 795 | + Returns: |
| 796 | + Dict mapping cluster_id -> descriptive name string |
| 797 | + """ |
| 798 | + unique = set(labels) |
| 799 | + unique.discard(-1) |
| 800 | + if not unique: |
| 801 | + return {} |
| 802 | + |
| 803 | + # Tokenize: collect word counts per cluster |
| 804 | + cluster_words: Dict[int, Counter] = {} |
| 805 | + for i, item in enumerate(items): |
| 806 | + cid = int(labels[i]) |
| 807 | + if cid == -1: |
| 808 | + continue |
| 809 | + if cid not in cluster_words: |
| 810 | + cluster_words[cid] = Counter() |
| 811 | + words = item.get("label", "").lower().split() |
| 812 | + for w in words: |
| 813 | + # Strip non-alpha chars, skip short/stop words |
| 814 | + w = w.strip("()-/,:;\"'") |
| 815 | + if len(w) <= 2 or w in self._STOP_WORDS: |
| 816 | + continue |
| 817 | + cluster_words[cid][w] += 1 |
| 818 | + |
| 819 | + # Document frequency: how many clusters contain each term |
| 820 | + num_clusters = len(unique) |
| 821 | + doc_freq: Counter = Counter() |
| 822 | + for wc in cluster_words.values(): |
| 823 | + for w in wc: |
| 824 | + doc_freq[w] += 1 |
| 825 | + |
| 826 | + # Score terms per cluster: tf * idf |
| 827 | + # Use str keys to match Pydantic Dict[str, str] models |
| 828 | + cluster_names: Dict[str, str] = {} |
| 829 | + for cid in sorted(int(c) for c in unique): |
| 830 | + wc = cluster_words.get(cid, Counter()) |
| 831 | + key = str(cid) |
| 832 | + if not wc: |
| 833 | + cluster_names[key] = f"Cluster {cid}" |
| 834 | + continue |
| 835 | + |
| 836 | + total = sum(wc.values()) |
| 837 | + scored = [] |
| 838 | + for w, count in wc.items(): |
| 839 | + tf = count / total |
| 840 | + if num_clusters <= 1: |
| 841 | + # Single cluster: rank by frequency only |
| 842 | + scored.append((w, tf, count)) |
| 843 | + else: |
| 844 | + idf = math.log(num_clusters / doc_freq[w]) if doc_freq[w] < num_clusters else 0.1 |
| 845 | + scored.append((w, tf * idf, count)) |
| 846 | + |
| 847 | + # Sort by score desc, break ties by raw count |
| 848 | + scored.sort(key=lambda x: (-x[1], -x[2])) |
| 849 | + # Take top 2 terms, title-case |
| 850 | + top = [s[0].title() for s in scored[:2]] |
| 851 | + cluster_names[key] = " ".join(top) if top else f"Cluster {cid}" |
| 852 | + |
| 853 | + return cluster_names |
| 854 | + |
688 | 855 | def generate_projection_dataset( |
689 | 856 | self, |
690 | 857 | ontology: str, |
@@ -814,6 +981,14 @@ def generate_projection_dataset( |
814 | 981 | center=center |
815 | 982 | ) |
816 | 983 |
|
| 984 | + # Run DBSCAN clustering on projected coordinates |
| 985 | + cluster_result = self._compute_clusters(projection) |
| 986 | + cluster_labels = cluster_result["cluster_labels"] |
| 987 | + |
| 988 | + # Derive descriptive names for each cluster from concept labels |
| 989 | + cluster_names = self._name_clusters(cluster_labels, items) |
| 990 | + cluster_result["cluster_names"] = cluster_names |
| 991 | + |
817 | 992 | # Batch compute fresh grounding if requested (only for concepts) |
818 | 993 | fresh_groundings = {} |
819 | 994 | if include_grounding and refresh_grounding and embedding_source in ("concepts", "combined"): |
@@ -846,7 +1021,8 @@ def generate_projection_dataset( |
846 | 1021 | "label": item["label"], |
847 | 1022 | "x": coords[0], |
848 | 1023 | "y": coords[1], |
849 | | - "z": coords[2] if n_components == 3 else 0.0 |
| 1024 | + "z": coords[2] if n_components == 3 else 0.0, |
| 1025 | + "cluster_id": int(cluster_labels[i]) if cluster_labels[i] != -1 else None |
850 | 1026 | } |
851 | 1027 |
|
852 | 1028 | # Add item type for combined mode |
@@ -892,6 +1068,12 @@ def generate_projection_dataset( |
892 | 1068 | if diversity_values: |
893 | 1069 | stats["diversity_range"] = [min(diversity_values), max(diversity_values)] |
894 | 1070 |
|
| 1071 | + # Cluster statistics |
| 1072 | + stats["cluster_count"] = cluster_result["cluster_count"] |
| 1073 | + stats["cluster_sizes"] = cluster_result["cluster_sizes"] |
| 1074 | + stats["cluster_names"] = cluster_result["cluster_names"] |
| 1075 | + stats["cluster_noise_count"] = cluster_result["noise_count"] |
| 1076 | + |
895 | 1077 | # Generate changelist ID for cache invalidation |
896 | 1078 | changelist_id = self._generate_changelist_id(f"{ontology}:{embedding_source}", len(items)) |
897 | 1079 |
|
|
0 commit comments