Skip to content

Commit 328676a

Browse files
feat: readme and cluster label mode
1 parent f2dbda6 commit 328676a

7 files changed

Lines changed: 307 additions & 41 deletions

File tree

ConsoleApp2/AiClusterLabeler.cs

Lines changed: 215 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,27 @@
22
using System.Collections.Generic;
33
using System.IO;
44
using System.Linq;
5+
using System.Net.Http;
56
using System.Text.Json;
67
using System.Threading.Tasks;
78

89
namespace GitCommitAnalyser
910
{
11+
/// <summary>
/// Strategy used by <c>AiClusterLabeler.PredictClusterNamesAsync</c> to name commit clusters.
/// </summary>
public enum ClusterLabelingMode
{
    /// <summary>Send one Gemini request per cluster.</summary>
    PerCluster = 0,

    /// <summary>Send a single Gemini request that lists every cluster and asks for a JSON map of names.</summary>
    SinglePrompt = 1,

    /// <summary>Name clusters with the local keyword heuristic only; no API request is made.</summary>
    LocalOnly = 2,

    /// <summary>Try the local heuristic first, then send the still-unnamed clusters to Gemini in one prompt.</summary>
    Hybrid = 3
}
18+
1019
public class AiClusterLabeler
1120
{
12-
public static async Task<Dictionary<uint, string>> PredictClusterNamesAsync(IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters, string labelsFilePath, string modelName = "gemini-2.5-flash", int commitProcessCount = 20)
21+
public static async Task<Dictionary<uint, string>> PredictClusterNamesAsync(
22+
IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters,
23+
string labelsFilePath,
24+
ClusterLabelingMode mode = ClusterLabelingMode.SinglePrompt,
25+
string modelName = "gemini-2.5-flash")
1326
{
1427
if (File.Exists(labelsFilePath))
1528
{
@@ -22,74 +35,242 @@ public static async Task<Dictionary<uint, string>> PredictClusterNamesAsync(IEnu
2235
}
2336
}
2437

38+
var clusterNames = new Dictionary<uint, string>();
2539
var apiKey = Environment.GetEnvironmentVariable("GEMINI_API_KEY");
26-
if (string.IsNullOrEmpty(apiKey))
40+
41+
bool needsApi = mode == ClusterLabelingMode.PerCluster || mode == ClusterLabelingMode.SinglePrompt || mode == ClusterLabelingMode.Hybrid;
42+
43+
if (needsApi && string.IsNullOrEmpty(apiKey))
2744
{
28-
Console.WriteLine("GEMINI_API_KEY environment variable not found. Using default cluster numeric names.");
29-
return clusters.ToDictionary(g => g.Key, g => $"Cluster {g.Key}");
45+
Console.WriteLine("GEMINI_API_KEY not found. Falling back to LocalOnly mode.");
46+
mode = ClusterLabelingMode.LocalOnly;
3047
}
3148

32-
var clusterNames = new System.Collections.Generic.Dictionary<uint, string>();
33-
using var httpClient = new System.Net.Http.HttpClient();
49+
using var httpClient = new HttpClient();
50+
string url = $"https://generativelanguage.googleapis.com/v1beta/models/{modelName}:generateContent?key={apiKey}";
51+
52+
Console.WriteLine($"\nPredicting cluster names using {mode} mode...");
53+
54+
switch (mode)
55+
{
56+
case ClusterLabelingMode.LocalOnly:
57+
clusterNames = ProcessLocalOnly(clusters);
58+
break;
59+
case ClusterLabelingMode.SinglePrompt:
60+
clusterNames = await ProcessSinglePromptAsync(httpClient, url, clusters);
61+
break;
62+
case ClusterLabelingMode.PerCluster:
63+
clusterNames = await ProcessPerClusterAsync(httpClient, url, clusters);
64+
break;
65+
case ClusterLabelingMode.Hybrid:
66+
clusterNames = await ProcessHybridAsync(httpClient, url, clusters);
67+
break;
68+
}
3469

35-
Console.WriteLine("\nPredicting cluster names using Gemini API...");
70+
// Ensure all clusters have a valid label, otherwise halt execution to prevent using bad tags
3671
foreach (var cluster in clusters)
3772
{
38-
var commitsToUse = cluster.Take(commitProcessCount).Select(c => c.CommitName).ToList();
39-
var prompt = "Based on the following git commit messages, provide a short 1-3 word category name for this cluster.\n\n" +
40-
"Commits:\n" + string.Join("\n", commitsToUse) + "\n\nCategory name:";
73+
if (!clusterNames.ContainsKey(cluster.Key) || string.IsNullOrWhiteSpace(clusterNames[cluster.Key]))
74+
{
75+
Console.WriteLine($"\nCritical Error: Failed to resolve a valid label for Cluster {cluster.Key}. Stopping program to prevent using placeholder tags.");
76+
Environment.Exit(1);
77+
}
78+
}
4179

42-
var requestBody = new
80+
// Save for future runs
81+
var saveFormat = clusterNames.ToDictionary(k => k.Key.ToString(), v => v.Value);
82+
var jsonOut = JsonSerializer.Serialize(saveFormat, new JsonSerializerOptions { WriteIndented = true });
83+
File.WriteAllText(labelsFilePath, jsonOut);
84+
85+
return clusterNames;
86+
}
87+
88+
/// <summary>
/// Names clusters using only the local keyword heuristic.
/// </summary>
/// <param name="clusters">Commit clusters keyed by cluster id.</param>
/// <returns>
/// Labels for the clusters the heuristic could name. Clusters it could not
/// name are simply absent from the dictionary.
/// </returns>
private static Dictionary<uint, string> ProcessLocalOnly(IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters)
{
    var resolved = new Dictionary<uint, string>();

    foreach (var group in clusters)
    {
        string heuristicLabel = GetLocalHeuristicLabel(group);
        if (string.IsNullOrEmpty(heuristicLabel))
        {
            // Heuristic had no opinion; leave this cluster out of the result.
            continue;
        }

        resolved[group.Key] = heuristicLabel;
        Console.WriteLine($"Cluster {group.Key} resolved locally as: {heuristicLabel}");
    }

    return resolved;
}
102+
103+
/// <summary>
/// Names all clusters with a single Gemini request that asks for a JSON object
/// mapping cluster id to category name.
/// </summary>
/// <param name="httpClient">Client used for the request.</param>
/// <param name="url">Full generateContent endpoint URL.</param>
/// <param name="clusters">Commit clusters keyed by cluster id.</param>
/// <returns>
/// Labels for every cluster id the response named with a non-blank label.
/// Empty when there are no clusters, the request failed, or the response
/// could not be parsed.
/// </returns>
private static async Task<Dictionary<uint, string>> ProcessSinglePromptAsync(HttpClient httpClient, string url, IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters)
{
    var clusterNames = new Dictionary<uint, string>();

    // Materialize once so a deferred query passed by the caller is not
    // re-evaluated by both the emptiness check and the prompt loop.
    var clusterList = clusters.ToList();
    if (clusterList.Count == 0) return clusterNames;

    var promptBuilder = new System.Text.StringBuilder();
    promptBuilder.AppendLine("Analyze the following clusters of git commit messages and provide a short 1-3 word category name for each.");
    promptBuilder.AppendLine("Return ONLY a valid JSON object mapping the cluster ID (as a string) to the category name. Example: {\"1\": \"Bug Fixes\", \"2\": \"Merges\"}");
    promptBuilder.AppendLine("\nClusters:");

    foreach (var cluster in clusterList)
    {
        promptBuilder.AppendLine($"Cluster {cluster.Key}:");
        // Only the first 7 commit messages of each cluster are sent.
        foreach (var commitName in cluster.Take(7).Select(c => c.CommitName))
        {
            promptBuilder.AppendLine($"- {commitName}");
        }
    }

    var requestBody = new { contents = new[] { new { parts = new[] { new { text = promptBuilder.ToString() } } } } };

    string responseText = await CallGeminiWithRetryAsync(httpClient, url, requestBody);
    if (string.IsNullOrEmpty(responseText)) return clusterNames;

    // Keep only the outermost {...} span. This tolerates markdown fences of
    // any flavour as well as explanatory text before or after the JSON.
    int jsonStart = responseText.IndexOf('{');
    int jsonEnd = responseText.LastIndexOf('}');
    if (jsonStart < 0 || jsonEnd <= jsonStart)
    {
        Console.WriteLine("Failed to parse SinglePrompt JSON response: no JSON object found in the reply.");
        return clusterNames;
    }

    try
    {
        string cleanJson = responseText.Substring(jsonStart, jsonEnd - jsonStart + 1);
        var map = JsonSerializer.Deserialize<Dictionary<string, string>>(cleanJson);
        if (map == null) return clusterNames;

        foreach (var kvp in map)
        {
            if (!uint.TryParse(kvp.Key, out uint id)) continue;

            // Do not record blank labels: a missing key and a blank value are
            // both "unresolved", and storing null would log a bogus prediction.
            string label = CleanLabel(kvp.Value);
            if (string.IsNullOrWhiteSpace(label))
            {
                Console.WriteLine($"Cluster {id}: response contained an empty label; leaving it unresolved.");
                continue;
            }

            clusterNames[id] = label;
            Console.WriteLine($"Cluster {id} predicted as: {label}");
        }
    }
    catch (JsonException ex)
    {
        Console.WriteLine($"Failed to parse SinglePrompt JSON response: {ex.Message}");
    }

    return clusterNames;
}
52149

53-
var url = $"https://generativelanguage.googleapis.com/v1beta/models/{modelName}:generateContent?key={apiKey}";
54-
var jsonContent = new System.Net.Http.StringContent(JsonSerializer.Serialize(requestBody), System.Text.Encoding.UTF8, "application/json");
150+
/// <summary>
/// Names clusters by sending one Gemini request per cluster, each built from
/// up to 20 of that cluster's commit messages.
/// </summary>
/// <param name="httpClient">Client used for the requests.</param>
/// <param name="url">Full generateContent endpoint URL.</param>
/// <param name="clusters">Commit clusters keyed by cluster id.</param>
/// <returns>Labels for the clusters whose request returned text; failed clusters are absent.</returns>
private static async Task<Dictionary<uint, string>> ProcessPerClusterAsync(HttpClient httpClient, string url, IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters)
{
    // Fixed part of the prompt; the commit list is appended after "Commits:\n".
    const string instructions =
        "Based on the following git commit messages, provide a short 1-3 word category name for this cluster.\n\n" +
        "Return ONLY the category name.\n\n" +
        "Guidelines:\n" +
        "- Focus on the main action or intent shared across the commits.\n" +
        "- Ignore noisy identifiers such as issue numbers, dependency names, hashes, usernames, and version numbers.\n" +
        "- Prefer broad reusable engineering categories.\n\n" +
        "Commits:\n";
    const string answerCue = "\n\nCategory:";

    var labels = new Dictionary<uint, string>();

    foreach (var group in clusters)
    {
        string[] sampleMessages = group.Take(20).Select(c => c.CommitName).ToArray();
        string promptText = instructions + string.Join("\n", sampleMessages) + answerCue;

        var payload = new { contents = new[] { new { parts = new[] { new { text = promptText } } } } };
        string reply = await CallGeminiWithRetryAsync(httpClient, url, payload);

        if (string.IsNullOrEmpty(reply))
        {
            Console.WriteLine($"Failed to predict name for Cluster {group.Key} (API fallback).");
            continue;
        }

        string label = CleanLabel(reply);
        labels[group.Key] = label;
        Console.WriteLine($"Cluster {group.Key} predicted as: {label}");
    }

    return labels;
}
180+
181+
/// <summary>
/// Names clusters with the local heuristic first and sends only the clusters
/// it could not name to Gemini in a single prompt.
/// </summary>
/// <param name="httpClient">Client used for the fallback request.</param>
/// <param name="url">Full generateContent endpoint URL.</param>
/// <param name="clusters">Commit clusters keyed by cluster id.</param>
/// <returns>Local labels merged with whatever labels the single-prompt call returned.</returns>
private static async Task<Dictionary<uint, string>> ProcessHybridAsync(HttpClient httpClient, string url, IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters)
{
    var labels = ProcessLocalOnly(clusters);
    var pending = clusters.Where(group => !labels.ContainsKey(group.Key)).ToList();

    if (pending.Count == 0)
    {
        // Everything was named locally; no request is needed.
        return labels;
    }

    Console.WriteLine($"Hybrid mode: {pending.Count} clusters unresolved locally. Sending to Gemini via SinglePrompt...");

    var remoteLabels = await ProcessSinglePromptAsync(httpClient, url, pending);
    foreach (var entry in remoteLabels)
    {
        labels[entry.Key] = entry.Value;
    }

    return labels;
}
198+
199+
/// <summary>
/// Derives a label for a cluster from keywords in its commit messages.
/// </summary>
/// <remarks>
/// A category wins when more than 40% of the messages contain at least one of
/// its keywords (case-insensitive substring match). Categories are tested in
/// the order Merges, Dependencies, Documentation, Bug Fixes and the first
/// match is returned. A missing commit message counts towards the total but
/// matches no keyword.
/// </remarks>
/// <param name="commits">Commits belonging to one cluster.</param>
/// <returns>The category name, or <c>null</c> when the input is null/empty or no category dominates.</returns>
private static string GetLocalHeuristicLabel(IEnumerable<CommitPredictionWithData> commits)
{
    if (commits == null) return null;

    // Enumerate the source exactly once and tolerate commits without a message.
    var msgs = commits
        .Select(c => (c.CommitName ?? string.Empty).ToLowerInvariant())
        .ToList();
    if (msgs.Count == 0) return null;

    const double DominanceThreshold = 0.4;

    // True when the share of messages containing any keyword exceeds the threshold.
    bool Dominates(params string[] keywords) =>
        (double)msgs.Count(m => keywords.Any(k => m.Contains(k))) / msgs.Count > DominanceThreshold;

    if (Dominates("merge pull request", "merge branch")) return "Merges";
    if (Dominates("bump", "dependency", "npm", "yarn")) return "Dependencies";
    if (Dominates("doc", "readme")) return "Documentation";
    if (Dominates("fix", "bug", "patch")) return "Bug Fixes";

    return null; // Unresolved
}
220+
221+
/// <summary>
/// Posts <paramref name="requestBody"/> as JSON to <paramref name="url"/> and
/// returns the text of the first candidate's first part, retrying on failure.
/// </summary>
/// <remarks>
/// Retried: HTTP 429, any non-success status outside 400-499, and any exception
/// (including a success response whose JSON lacks
/// <c>candidates[0].content.parts[0].text</c>). Not retried: other 4xx statuses.
/// The delay between attempts is <c>2000 ms * attempt</c>.
/// </remarks>
/// <param name="httpClient">Client used for the request.</param>
/// <param name="url">Full generateContent endpoint URL.</param>
/// <param name="requestBody">Object serialized with <see cref="JsonSerializer"/> as the request payload.</param>
/// <param name="maxRetries">Maximum number of attempts.</param>
/// <returns>The response text, or <c>null</c> when the call was aborted or all attempts failed.</returns>
private static async Task<string> CallGeminiWithRetryAsync(HttpClient httpClient, string url, object requestBody, int maxRetries = 3)
{
    // Serialized lazily inside the try so a serialization failure is still
    // reported through the same catch, but the work is done only once.
    string payloadJson = null;

    for (int attempt = 1; attempt <= maxRetries; attempt++)
    {
        try
        {
            payloadJson ??= JsonSerializer.Serialize(requestBody);

            // A fresh StringContent per attempt; both it and the response are
            // disposed when the attempt ends so no connection or buffer is leaked.
            using var jsonContent = new StringContent(payloadJson, System.Text.Encoding.UTF8, "application/json");
            using var response = await httpClient.PostAsync(url, jsonContent);

            if (response.IsSuccessStatusCode)
            {
                var responseString = await response.Content.ReadAsStringAsync();
                using var doc = JsonDocument.Parse(responseString);
                return doc.RootElement
                    .GetProperty("candidates")[0]
                    .GetProperty("content")
                    .GetProperty("parts")[0]
                    .GetProperty("text").GetString();
            }

            var errorBody = await response.Content.ReadAsStringAsync();
            int status = (int)response.StatusCode;

            // Filter out non-retriable codes (like 400 Bad Request) but allow 429 and 5xx
            if (status >= 400 && status < 500 && response.StatusCode != System.Net.HttpStatusCode.TooManyRequests)
            {
                Console.WriteLine($"Client Error {response.StatusCode}. Aborting retry.");
                return null; // Don't retry a bad malformed json request
            }

            Console.WriteLine($"API Status {response.StatusCode} on attempt {attempt}: {errorBody}");
        }
        catch (Exception ex)
        {
            Console.WriteLine($"API Request Error on attempt {attempt}: {ex.Message}");
        }

        if (attempt < maxRetries)
        {
            Console.WriteLine("Waiting before retrying...");
            await Task.Delay(2000 * attempt);
        }
    }

    Console.WriteLine("Max retries reached. Gracefully falling back.");
    return null;
}
91269

92-
return clusterNames;
270+
/// <summary>
/// Normalizes a label returned by the model.
/// </summary>
/// <remarks>
/// Strips surrounding whitespace, wrapping quotes/backticks on both ends,
/// trailing periods and line breaks, and every apostrophe. A leading period is
/// kept so labels such as ".NET Upgrades" survive.
/// </remarks>
/// <param name="text">Raw label text; may be null.</param>
/// <returns>The cleaned label, or <c>null</c> when nothing usable remains.</returns>
private static string CleanLabel(string text)
{
    if (string.IsNullOrWhiteSpace(text)) return null;

    string label = text.Trim()
        .TrimEnd('\r', '\n', '.', '\"', '\'', '`')
        // Models often wrap the answer in quotes; remove the opening one too.
        .TrimStart('\"', '\'', '`')
        .Replace("'", "")
        // Whitespace that sat inside the quotes or before a trailing period.
        .Trim();

    return label.Length == 0 ? null : label;
}
94275
}
95276
}

ConsoleApp2/Analyser.cs

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,19 @@ public static int GetOrFindBestK(MLContext mlContext, IDataView trainData, IEsti
110110

111111
if (double.IsNaN(metrics.DaviesBouldinIndex)) continue;
112112

113-
// We prioritize Davies-Bouldin index for clustering quality
114-
if (metrics.DaviesBouldinIndex < bestMetric)
113+
// Apply a penalty multiplier to give preference to K values between 4 and 8
114+
// Since lower Davies-Bouldin index is better, we artificially inflate the score
115+
// for K values outside our preferred range to discourage the model from picking them
116+
// unless they are significantly better.
117+
double adjustedMetric = metrics.DaviesBouldinIndex;
118+
if (k < 4 || k > 8)
115119
{
116-
bestMetric = metrics.DaviesBouldinIndex;
120+
adjustedMetric *= 1.75; // 75% penalty
121+
}
122+
123+
if (adjustedMetric < bestMetric)
124+
{
125+
bestMetric = adjustedMetric;
117126
bestK = k;
118127
}
119128
}
@@ -136,14 +145,14 @@ public static IEnumerable<IGrouping<uint, CommitPredictionWithData>> GetClusters
136145
return results.GroupBy(x => x.PredictedClusterId).OrderBy(g => g.Key);
137146
}
138147

139-
public static void PrintClusterExamples(IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters, Dictionary<uint, string> clusterNames = null)
148+
public static void PrintClusterExamples(IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters, Dictionary<uint, string> clusterNames = null, int exampleCount = 2)
140149
{
141150
Console.WriteLine("\n--- Cluster Examples ---");
142151
foreach (var cluster in clusters)
143152
{
144153
var name = clusterNames != null && clusterNames.TryGetValue(cluster.Key, out var cn) ? cn: $"Cluster {cluster.Key}";
145154
Console.WriteLine($"\n{name}:");
146-
foreach (var example in cluster.Take(2)) // 2 examples each from each cluster
155+
foreach (var example in cluster.Take(exampleCount))
147156
{
148157
Console.WriteLine($" - [{example.Repository}] {example.CommitName}");
149158
}

0 commit comments

Comments
 (0)