Skip to content

Commit b57b7da

Browse files
feat: use hash to see if model+cluster prediction is to be reused
1 parent 2fbddf8 commit b57b7da

2 files changed

Lines changed: 136 additions & 12 deletions

File tree

ConsoleApp2/Analyser.cs

Lines changed: 95 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using Microsoft.ML.Data;
2+
using System.Security.Cryptography;
23
using System.Text.Json;
34
using Microsoft.ML;
45

@@ -25,6 +26,15 @@ internal class Analyser
2526
{
2627
private const string FeaturesColumnName = "Features";
2728

29+
/// <summary>
/// Computes the SHA-256 digest of a file's contents.
/// </summary>
/// <param name="filePath">Path of the file whose bytes are hashed.</param>
/// <returns>
/// The digest as a lowercase hexadecimal string without separators, or
/// <see cref="string.Empty"/> when no file exists at <paramref name="filePath"/>.
/// </returns>
public static string CalculateFileHash(string filePath)
{
    if (File.Exists(filePath))
    {
        using var hasher = SHA256.Create();
        using var input = File.OpenRead(filePath);
        byte[] digest = hasher.ComputeHash(input);

        // Upper-case hex from the framework, lowered to keep the stored format.
        return Convert.ToHexString(digest).ToLowerInvariant();
    }

    return string.Empty;
}
37+
2838
public static IDataView LoadJsonDataForML(MLContext mlContext, string jsonFilePath)
2939
{
3040
if (!File.Exists(jsonFilePath))
@@ -125,16 +135,97 @@ public static IEnumerable<IGrouping<uint, CommitPredictionWithData>> GetClusters
125135
var results = mlContext.Data.CreateEnumerable<CommitPredictionWithData>(predictions, reuseRowObject: false).ToList();
126136
return results.GroupBy(x => x.PredictedClusterId).OrderBy(g => g.Key);
127137
}
128-
public static IEnumerable<IGrouping<uint, CommitPredictionWithData>> PredictClusterNames(IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters)
138+
139+
/// <summary>
/// Resolves a display name for every cluster, reusing names cached in
/// <paramref name="labelsFilePath"/> and asking the Gemini API for the rest.
/// </summary>
/// <remarks>
/// <para>
/// Clusters that are already present in the cache file are never re-queried. Clusters that are
/// missing from the cache are sent to the API using up to ten of their commit messages.
/// </para>
/// <para>
/// When the <c>GEMINI_API_KEY</c> environment variable is not set, or a request fails, the
/// affected cluster gets the placeholder name <c>Cluster {id}</c>. Placeholders are returned to
/// the caller but are NOT written to the cache, so a later run can retry them.
/// </para>
/// </remarks>
/// <param name="clusters">Clusters to name; the grouping key is the cluster id.</param>
/// <param name="labelsFilePath">JSON file mapping cluster id (as string) to cluster name.</param>
/// <param name="modelName">Gemini model identifier used in the request URL.</param>
/// <returns>A dictionary from cluster id to name (cached entries plus one entry per cluster).</returns>
/// <exception cref="ArgumentNullException"><paramref name="clusters"/> is <c>null</c>.</exception>
public static async Task<Dictionary<uint, string>> PredictClusterNamesAsync(IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters, string labelsFilePath, string modelName = "gemini-2.5-flash")
{
    ArgumentNullException.ThrowIfNull(clusters);

    var clusterNames = await LoadCachedClusterNamesAsync(labelsFilePath);
    if (clusterNames.Count > 0)
    {
        Console.WriteLine($"Loaded cluster names from {labelsFilePath}.");
    }

    // Materialize once: only clusters without a cached name need any further work.
    var pending = clusters.Where(g => !clusterNames.ContainsKey(g.Key)).ToList();
    if (pending.Count == 0)
    {
        return clusterNames;
    }

    var apiKey = Environment.GetEnvironmentVariable("GEMINI_API_KEY");
    if (string.IsNullOrEmpty(apiKey))
    {
        Console.WriteLine("GEMINI_API_KEY environment variable not found. Using default cluster numeric names.");
        foreach (var cluster in pending)
        {
            clusterNames[cluster.Key] = $"Cluster {cluster.Key}";
        }

        return clusterNames;
    }

    // Only real names (cached or freshly predicted) are persisted; placeholders stay in memory
    // so that a transient API failure does not get frozen into the cache.
    var namesToPersist = new Dictionary<uint, string>(clusterNames);
    var predictedAny = false;

    using var httpClient = new System.Net.Http.HttpClient();

    Console.WriteLine("\nPredicting cluster names using Gemini API...");
    foreach (var cluster in pending)
    {
        var predictedName = await RequestClusterNameAsync(httpClient, cluster, apiKey, modelName);
        if (string.IsNullOrEmpty(predictedName))
        {
            clusterNames[cluster.Key] = $"Cluster {cluster.Key}";
            continue;
        }

        clusterNames[cluster.Key] = predictedName;
        namesToPersist[cluster.Key] = predictedName;
        predictedAny = true;
        Console.WriteLine($"Cluster {cluster.Key} predicted as: {predictedName}");
    }

    // Save for future runs
    if (predictedAny)
    {
        var saveFormat = namesToPersist.ToDictionary(
            entry => entry.Key.ToString(System.Globalization.CultureInfo.InvariantCulture),
            entry => entry.Value);
        var jsonOut = JsonSerializer.Serialize(saveFormat, new JsonSerializerOptions { WriteIndented = true });
        await File.WriteAllTextAsync(labelsFilePath, jsonOut);
    }

    return clusterNames;
}

/// <summary>
/// Reads the cluster-name cache, tolerating a missing, malformed, or partially invalid file.
/// </summary>
/// <param name="labelsFilePath">JSON file mapping cluster id (as string) to cluster name.</param>
/// <returns>The usable cached entries; empty when there is no readable cache.</returns>
private static async Task<Dictionary<uint, string>> LoadCachedClusterNamesAsync(string labelsFilePath)
{
    var cached = new Dictionary<uint, string>();
    if (!File.Exists(labelsFilePath))
    {
        return cached;
    }

    try
    {
        var json = await File.ReadAllTextAsync(labelsFilePath);
        var raw = JsonSerializer.Deserialize<Dictionary<string, string>>(json);
        if (raw is null)
        {
            return cached;
        }

        foreach (var (key, value) in raw)
        {
            // Skip entries that cannot be mapped back to a cluster id or carry no name,
            // rather than failing the whole run on a hand-edited or truncated cache.
            var keyIsId = uint.TryParse(
                key,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out var clusterId);
            if (keyIsId && !string.IsNullOrWhiteSpace(value))
            {
                cached[clusterId] = value;
            }
        }
    }
    catch (JsonException ex)
    {
        Console.WriteLine($"Ignoring unreadable cluster label cache {labelsFilePath}: {ex.Message}");
        cached.Clear();
    }

    return cached;
}

/// <summary>
/// Asks the Gemini API for a short category name describing one cluster.
/// </summary>
/// <param name="httpClient">Client used to send the request.</param>
/// <param name="cluster">Cluster whose first ten commit messages form the prompt.</param>
/// <param name="apiKey">Gemini API key, sent as a request header.</param>
/// <param name="modelName">Gemini model identifier used in the request URL.</param>
/// <returns>The cleaned name, or <see cref="string.Empty"/> when no usable name was obtained.</returns>
private static async Task<string> RequestClusterNameAsync(System.Net.Http.HttpClient httpClient, IGrouping<uint, CommitPredictionWithData> cluster, string apiKey, string modelName)
{
    var commitsToUse = cluster.Take(10).Select(c => c.CommitName).ToList();
    var prompt = "Based on the following git commit messages, provide a short 1-3 word category name for this cluster.\n\n" +
                 "Commits:\n" + string.Join("\n", commitsToUse) + "\n\nCategory name:";

    var requestBody = new
    {
        contents = new[]
        {
            new
            {
                parts = new[] { new { text = prompt } }
            }
        }
    };

    // Naming is best-effort: any failure (network, HTTP status, unexpected response shape)
    // is reported and turned into "no name" so the caller can fall back to a placeholder.
    try
    {
        var url = $"https://generativelanguage.googleapis.com/v1beta/models/{Uri.EscapeDataString(modelName)}:generateContent";

        using var request = new System.Net.Http.HttpRequestMessage(System.Net.Http.HttpMethod.Post, url)
        {
            Content = new System.Net.Http.StringContent(JsonSerializer.Serialize(requestBody), System.Text.Encoding.UTF8, "application/json")
        };

        // The key travels in a header instead of the query string so it does not end up
        // in URLs captured by logs, proxies, or exception messages.
        request.Headers.Add("x-goog-api-key", apiKey);

        using var response = await httpClient.SendAsync(request);
        if (!response.IsSuccessStatusCode)
        {
            Console.WriteLine($"Failed to predict name for Cluster {cluster.Key}. Status: {response.StatusCode}");
            return string.Empty;
        }

        var responseString = await response.Content.ReadAsStringAsync();
        using var doc = JsonDocument.Parse(responseString);
        var text = doc.RootElement
            .GetProperty("candidates")[0]
            .GetProperty("content")
            .GetProperty("parts")[0]
            .GetProperty("text").GetString();

        if (string.IsNullOrWhiteSpace(text))
        {
            return string.Empty;
        }

        // Strip whitespace, wrapping quotes and a trailing period from BOTH ends,
        // then drop any remaining single quotes.
        return text.Trim()
            .Trim(' ', '\t', '\r', '\n', '.', '"', '\'')
            .Replace("'", "");
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error predicting name for Cluster {cluster.Key}: {ex.Message}");
        return string.Empty;
    }
}
132-
public static void PrintClusterExamples(IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters)
221+
222+
public static void PrintClusterExamples(IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters, Dictionary<uint, string> clusterNames = null)
133223
{
134224
Console.WriteLine("\n--- Cluster Examples ---");
135225
foreach (var cluster in clusters)
136226
{
137-
Console.WriteLine($"\nCluster {cluster.Key}:");
227+
var name = clusterNames != null && clusterNames.TryGetValue(cluster.Key, out var cn) ? cn: $"Cluster {cluster.Key}";
228+
Console.WriteLine($"\n{name}:");
138229
foreach (var example in cluster.Take(2)) // 2 examples each from each cluster
139230
{
140231
Console.WriteLine($" - [{example.Repository}] {example.CommitName}");

ConsoleApp2/Program.cs

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,25 +24,58 @@ static async Task Main(string[] args)
2424
// 1. Load data
2525
var dataView = Analyser.LoadJsonDataForML(mlContext, outputPath);
2626

27+
var labelsFilePath = "cluster_labels.json";
28+
2729
if (dataView != null)
2830
{
2931
Console.WriteLine($"Loaded {dataView.GetRowCount()} rows into ML.NET IDataView.");
3032

33+
var hashFilePath = "data_hash.txt";
34+
var modelFilePath = "kmeans_model.zip";
35+
36+
string currentHash = Analyser.CalculateFileHash(outputPath);
37+
string savedHash = File.Exists(hashFilePath) ? File.ReadAllText(hashFilePath) : string.Empty;
38+
39+
if (currentHash != savedHash)
40+
{
41+
Console.WriteLine("Data file changed or hash not found. Invalidating cache (K, labels, model)...");
42+
if (File.Exists(kFilePath)) File.Delete(kFilePath);
43+
if (File.Exists(labelsFilePath)) File.Delete(labelsFilePath);
44+
if (File.Exists(modelFilePath)) File.Delete(modelFilePath);
45+
File.WriteAllText(hashFilePath, currentHash);
46+
}
47+
3148
// 2. Split data into 80% train and 20% test
3249
var split = Analyser.SplitData(mlContext, dataView);
3350

34-
// 3. Featurize Text
35-
var featurizer = Analyser.FeaturizeText(mlContext);
51+
ITransformer model;
52+
if (File.Exists(modelFilePath))
53+
{
54+
Console.WriteLine($"Loading existing model from {modelFilePath}...");
55+
model = mlContext.Model.Load(modelFilePath, out var schema);
56+
}
57+
else
58+
{
59+
// 3. Featurize Text
60+
var featurizer = Analyser.FeaturizeText(mlContext);
3661

37-
// 4. Find Best K using Grid Search or load from file
38-
int bestK = Analyser.GetOrFindBestK(mlContext, split.TrainSet, featurizer, kFilePath);
62+
// 4. Find Best K using Grid Search or load from file
63+
int bestK = Analyser.GetOrFindBestK(mlContext, split.TrainSet, featurizer, kFilePath);
3964

40-
// 5. Train KMeans Clusterer
41-
var model = Analyser.TrainKMeansClusterer(mlContext, split.TrainSet, featurizer, bestK);
65+
// 5. Train KMeans Clusterer
66+
model = Analyser.TrainKMeansClusterer(mlContext, split.TrainSet, featurizer, bestK);
67+
68+
mlContext.Model.Save(model, split.TrainSet.Schema, modelFilePath);
69+
Console.WriteLine($"Saved newly trained model to {modelFilePath}.");
70+
}
4271

43-
// 6. Print 2 examples from each cluster
4472
var clusters = Analyser.GetClusters(mlContext, split.TrainSet, model);
45-
Analyser.PrintClusterExamples(clusters);
73+
74+
// 6. Predict cluster names via Gemini (or load from cache)
75+
var clusterNames = await Analyser.PredictClusterNamesAsync(clusters, labelsFilePath);
76+
77+
// 7. Print examples from each cluster with readable names
78+
Analyser.PrintClusterExamples(clusters, clusterNames);
4679
}
4780
}
4881
}

0 commit comments

Comments
 (0)