@@ -3951,6 +3951,207 @@ static bool want_aspect(const char **aspects, int aspect_count, const char *name
39513951 return false;
39523952}
39533953
3954+ /* ── Clusters via Louvain community detection ──────────────────── */
3955+
3956+ static int arch_clusters (cbm_store_t * s , const char * project , cbm_architecture_info_t * out ) {
3957+ /* 1. Load all callable node IDs for this project */
3958+ const char * nsql = "SELECT id FROM nodes WHERE project=?1 "
3959+ "AND label IN ('Function','Method','Class','Interface')" ;
3960+ sqlite3_stmt * nstmt = NULL ;
3961+ if (sqlite3_prepare_v2 (s -> db , nsql , -1 , & nstmt , NULL ) != SQLITE_OK ) {
3962+ store_set_error_sqlite (s , "arch_clusters_nodes" );
3963+ return CBM_STORE_ERR ;
3964+ }
3965+ bind_text (nstmt , 1 , project );
3966+
3967+ int ncap = 1024 ;
3968+ int nn = 0 ;
3969+ int64_t * node_ids = malloc ((size_t )ncap * sizeof (int64_t ));
3970+
3971+ while (sqlite3_step (nstmt ) == SQLITE_ROW ) {
3972+ if (nn >= ncap ) {
3973+ ncap *= 2 ;
3974+ node_ids = safe_realloc (node_ids , (size_t )ncap * sizeof (int64_t ));
3975+ }
3976+ node_ids [nn ++ ] = sqlite3_column_int64 (nstmt , 0 );
3977+ }
3978+ sqlite3_finalize (nstmt );
3979+
3980+ if (nn < 2 ) {
3981+ free (node_ids );
3982+ return CBM_STORE_OK ; /* Nothing to cluster */
3983+ }
3984+
3985+ /* 2. Load all CALLS edges for this project */
3986+ const char * esql = "SELECT source_id, target_id FROM edges WHERE project=?1 AND type='CALLS'" ;
3987+ sqlite3_stmt * estmt = NULL ;
3988+ if (sqlite3_prepare_v2 (s -> db , esql , -1 , & estmt , NULL ) != SQLITE_OK ) {
3989+ free (node_ids );
3990+ store_set_error_sqlite (s , "arch_clusters_edges" );
3991+ return CBM_STORE_ERR ;
3992+ }
3993+ bind_text (estmt , 1 , project );
3994+
3995+ int ecap = 2048 ;
3996+ int en = 0 ;
3997+ cbm_louvain_edge_t * edges = malloc ((size_t )ecap * sizeof (cbm_louvain_edge_t ));
3998+
3999+ while (sqlite3_step (estmt ) == SQLITE_ROW ) {
4000+ if (en >= ecap ) {
4001+ ecap *= 2 ;
4002+ edges = safe_realloc (edges , (size_t )ecap * sizeof (cbm_louvain_edge_t ));
4003+ }
4004+ edges [en ].src = sqlite3_column_int64 (estmt , 0 );
4005+ edges [en ].dst = sqlite3_column_int64 (estmt , 1 );
4006+ en ++ ;
4007+ }
4008+ sqlite3_finalize (estmt );
4009+
4010+ if (en < 1 ) {
4011+ free (node_ids );
4012+ free (edges );
4013+ return CBM_STORE_OK ;
4014+ }
4015+
4016+ /* 3. Run Louvain */
4017+ cbm_louvain_result_t * lresults = NULL ;
4018+ int lcount = 0 ;
4019+ int rc = cbm_louvain (node_ids , nn , edges , en , & lresults , & lcount );
4020+ free (node_ids );
4021+ free (edges );
4022+
4023+ if (rc != CBM_STORE_OK || lcount == 0 ) {
4024+ free (lresults );
4025+ return CBM_STORE_OK ;
4026+ }
4027+
4028+ /* 4. Find max community ID to size the grouping array */
4029+ int max_community = 0 ;
4030+ for (int i = 0 ; i < lcount ; i ++ ) {
4031+ if (lresults [i ].community > max_community ) {
4032+ max_community = lresults [i ].community ;
4033+ }
4034+ }
4035+ int num_communities = max_community + 1 ;
4036+
4037+ /* 5. Count members per community */
4038+ int * member_counts = calloc ((size_t )num_communities , sizeof (int ));
4039+ for (int i = 0 ; i < lcount ; i ++ ) {
4040+ if (lresults [i ].community >= 0 && lresults [i ].community < num_communities ) {
4041+ member_counts [lresults [i ].community ]++ ;
4042+ }
4043+ }
4044+
4045+ /* Count non-empty communities */
4046+ int active_count = 0 ;
4047+ for (int i = 0 ; i < num_communities ; i ++ ) {
4048+ if (member_counts [i ] > 0 ) {
4049+ active_count ++ ;
4050+ }
4051+ }
4052+
4053+ if (active_count == 0 ) {
4054+ free (member_counts );
4055+ free (lresults );
4056+ return CBM_STORE_OK ;
4057+ }
4058+
4059+ /* Cap at 20 clusters, keep the largest */
4060+ int max_clusters = active_count < 20 ? active_count : 20 ;
4061+
4062+ /* 6. Build cluster info structs.
4063+ * For each community, find the top-5 nodes by CALLS in-degree. */
4064+ cbm_cluster_info_t * clusters = calloc ((size_t )max_clusters , sizeof (cbm_cluster_info_t ));
4065+ int ci = 0 ;
4066+
4067+ /* Sort communities by member count descending — simple selection of top N */
4068+ int * sorted_ids = malloc ((size_t )num_communities * sizeof (int ));
4069+ for (int i = 0 ; i < num_communities ; i ++ ) sorted_ids [i ] = i ;
4070+ /* Bubble sort is fine for small N (typically < 100 communities) */
4071+ for (int i = 0 ; i < num_communities - 1 && i < max_clusters ; i ++ ) {
4072+ for (int j = i + 1 ; j < num_communities ; j ++ ) {
4073+ if (member_counts [sorted_ids [j ]] > member_counts [sorted_ids [i ]]) {
4074+ int tmp = sorted_ids [i ];
4075+ sorted_ids [i ] = sorted_ids [j ];
4076+ sorted_ids [j ] = tmp ;
4077+ }
4078+ }
4079+ }
4080+
4081+ for (int si = 0 ; si < max_clusters ; si ++ ) {
4082+ int comm_id = sorted_ids [si ];
4083+ if (member_counts [comm_id ] == 0 ) break ;
4084+
4085+ clusters [ci ].id = comm_id ;
4086+ clusters [ci ].members = member_counts [comm_id ];
4087+ clusters [ci ].cohesion = 0.0 ; /* Would need intra-/inter-edge ratio to compute */
4088+
4089+ /* Collect node IDs in this community */
4090+ int64_t * comm_nodes = malloc ((size_t )member_counts [comm_id ] * sizeof (int64_t ));
4091+ int cn = 0 ;
4092+ for (int i = 0 ; i < lcount ; i ++ ) {
4093+ if (lresults [i ].community == comm_id ) {
4094+ comm_nodes [cn ++ ] = lresults [i ].node_id ;
4095+ }
4096+ }
4097+
4098+ /* Find top 5 by in-degree via SQL */
4099+ int top_n = cn < 5 ? cn : 5 ;
4100+ // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
4101+ const char * * top_names = calloc ((size_t )top_n , sizeof (const char * ));
4102+ int tn = 0 ;
4103+
4104+ /* Build a simple query: SELECT name from nodes WHERE id IN (...) ordered by
4105+ * incoming CALLS count. For efficiency, just query each node's degree. */
4106+ for (int k = 0 ; k < cn && tn < top_n ; k ++ ) {
4107+ int in_deg = 0 ;
4108+ int out_deg = 0 ;
4109+ cbm_store_node_degree (s , comm_nodes [k ], & in_deg , & out_deg );
4110+
4111+ /* Simple insertion into top-N by in-degree.
4112+ * We'll just pick the first top_n by iterating degree queries. */
4113+ cbm_node_t ninfo ;
4114+ if (cbm_store_find_node_by_id (s , comm_nodes [k ], & ninfo ) == CBM_STORE_OK ) {
4115+ /* Skip File/Folder/Module nodes */
4116+ if (ninfo .label && strcmp (ninfo .label , "File" ) != 0 &&
4117+ strcmp (ninfo .label , "Folder" ) != 0 &&
4118+ strcmp (ninfo .label , "Module" ) != 0 ) {
4119+ if (ninfo .name ) {
4120+ top_names [tn ++ ] = heap_strdup (ninfo .name );
4121+ }
4122+ }
4123+ cbm_node_free_fields (& ninfo );
4124+ }
4125+ }
4126+
4127+ clusters [ci ].top_nodes = top_names ;
4128+ clusters [ci ].top_node_count = tn ;
4129+
4130+ /* Label: use the most common node name prefix as a heuristic.
4131+ * For now, just use "Cluster_N" — semantic naming requires LLM. */
4132+ char label_buf [64 ];
4133+ snprintf (label_buf , sizeof (label_buf ), "Cluster_%d" , comm_id );
4134+ clusters [ci ].label = heap_strdup (label_buf );
4135+
4136+ /* packages and edge_types are optional, leave as NULL/0 for now */
4137+ clusters [ci ].packages = NULL ;
4138+ clusters [ci ].package_count = 0 ;
4139+ clusters [ci ].edge_types = NULL ;
4140+ clusters [ci ].edge_type_count = 0 ;
4141+
4142+ free (comm_nodes );
4143+ ci ++ ;
4144+ }
4145+
4146+ free (sorted_ids );
4147+ free (member_counts );
4148+ free (lresults );
4149+
4150+ out -> clusters = clusters ;
4151+ out -> cluster_count = ci ;
4152+ return CBM_STORE_OK ;
4153+ }
4154+
39544155int cbm_store_get_architecture (cbm_store_t * s , const char * project , const char * * aspects ,
39554156 int aspect_count , cbm_architecture_info_t * out ) {
39564157 memset (out , 0 , sizeof (* out ));
@@ -4008,6 +4209,12 @@ int cbm_store_get_architecture(cbm_store_t *s, const char *project, const char *
40084209 return rc ;
40094210 }
40104211 }
4212+ if (want_aspect (aspects , aspect_count , "clusters" )) {
4213+ rc = arch_clusters (s , project , out );
4214+ if (rc != CBM_STORE_OK ) {
4215+ return rc ;
4216+ }
4217+ }
40114218
40124219 return CBM_STORE_OK ;
40134220}
0 commit comments