-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclustering.py
More file actions
51 lines (42 loc) · 1.86 KB
/
clustering.py
File metadata and controls
51 lines (42 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score
def kmeans_clustering(data_list, num_clusters, max_iter=10000, n_components=None, verbose=True):
    """Cluster embeddings with K-Means after L2 normalization.

    Every embedding is scaled to unit L2 norm before fitting, so the Euclidean
    distance K-Means minimizes orders points the same way cosine distance does.

    Args:
        data_list: Sequence of items where ``item[0]["data"][0]["embedding"]``
            is the embedding vector and ``item[1]`` is the score used to order
            members inside each cluster.
        num_clusters: Requested number of clusters. Lowered to the number of
            samples when fewer samples than clusters are supplied.
        max_iter: Maximum number of K-Means iterations per initialization.
        n_components: Currently unused; accepted for backward compatibility.
        verbose: When true, print a notice if ``num_clusters`` is lowered.

    Returns:
        A ``(cluster_result, sil_score)`` tuple. ``cluster_result`` maps each
        cluster id to a dict with ``"center_point"`` (the centroid as a list)
        and ``"labels"`` (indices into ``data_list`` belonging to the cluster,
        sorted by descending score). ``sil_score`` is the cosine silhouette
        score, or 0 when it is undefined for the resulting labelling. On any
        failure the function prints the error and returns ``(None, 0)``.
    """
    try:
        # Extract embeddings and similarity scores
        embeddings = np.array([item[0]["data"][0]["embedding"] for item in data_list])
        scores = [item[1] for item in data_list]
        embeddings = normalize(embeddings, norm="l2")

        num_samples = len(embeddings)
        if num_samples < num_clusters:
            if verbose:
                print(f"num_samples {num_samples} < num_clusters {num_clusters}; adjust to {num_samples}")
            num_clusters = num_samples

        kmeans = KMeans(
            n_clusters=num_clusters,
            init="k-means++",
            max_iter=max_iter,
            n_init=10,
            random_state=42,
        )
        cluster_labels = kmeans.fit_predict(embeddings)
        centroids = kmeans.cluster_centers_

        # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1.
        # Outside that range (e.g. one cluster per sample after the adjustment
        # above, or duplicate points collapsing into a single label) it raises
        # ValueError, which would otherwise discard a valid clustering.
        sil_score = 0
        distinct_labels = len(np.unique(cluster_labels))
        if 2 <= distinct_labels <= num_samples - 1:
            sil_score = silhouette_score(embeddings, cluster_labels, metric="cosine")

        # Group sample indices by cluster in one pass; indices are appended in
        # ascending order, so the stable sort below keeps ties in input order.
        members_by_cluster = {cluster_id: [] for cluster_id in range(num_clusters)}
        for idx, label in enumerate(cluster_labels):
            members_by_cluster[int(label)].append(idx)

        cluster_result = {}
        for cluster_id, cluster_members in members_by_cluster.items():
            sorted_members = sorted(cluster_members, key=lambda x: scores[x], reverse=True)
            cluster_result[cluster_id] = {
                "center_point": centroids[cluster_id].tolist(),
                "labels": sorted_members,
            }
        return cluster_result, sil_score
    except Exception as e:
        # Best-effort boundary: callers rely on (None, 0) instead of an exception.
        print(f"K-Means failed: {e}")
        return None, 0