22using System . Collections . Generic ;
33using System . IO ;
44using System . Linq ;
5+ using System . Net . Http ;
56using System . Text . Json ;
67using System . Threading . Tasks ;
78
89namespace GitCommitAnalyser
910{
/// <summary>
/// Selects how AiClusterLabeler produces a label for each commit cluster.
/// </summary>
public enum ClusterLabelingMode
{
    /// <summary>Each cluster is sent to Gemini in its own prompt.</summary>
    PerCluster = 0,

    /// <summary>All clusters are described in one prompt and labeled from a single JSON reply.</summary>
    SinglePrompt = 1,

    /// <summary>Labels come only from the local keyword heuristics; the API is not called.</summary>
    LocalOnly = 2,

    /// <summary>Local heuristics run first; clusters they leave unlabeled go to Gemini in a single prompt.</summary>
    Hybrid = 3,
}
18+
1019 public class AiClusterLabeler
1120 {
12- public static async Task < Dictionary < uint , string > > PredictClusterNamesAsync ( IEnumerable < IGrouping < uint , CommitPredictionWithData > > clusters , string labelsFilePath , string modelName = "gemini-2.5-flash" , int commitProcessCount = 20 )
21+ public static async Task < Dictionary < uint , string > > PredictClusterNamesAsync (
22+ IEnumerable < IGrouping < uint , CommitPredictionWithData > > clusters ,
23+ string labelsFilePath ,
24+ ClusterLabelingMode mode = ClusterLabelingMode . SinglePrompt ,
25+ string modelName = "gemini-2.5-flash" )
1326 {
1427 if ( File . Exists ( labelsFilePath ) )
1528 {
@@ -22,74 +35,242 @@ public static async Task<Dictionary<uint, string>> PredictClusterNamesAsync(IEnu
2235 }
2336 }
2437
38+ var clusterNames = new Dictionary < uint , string > ( ) ;
2539 var apiKey = Environment . GetEnvironmentVariable ( "GEMINI_API_KEY" ) ;
26- if ( string . IsNullOrEmpty ( apiKey ) )
40+
41+ bool needsApi = mode == ClusterLabelingMode . PerCluster || mode == ClusterLabelingMode . SinglePrompt || mode == ClusterLabelingMode . Hybrid ;
42+
43+ if ( needsApi && string . IsNullOrEmpty ( apiKey ) )
2744 {
28- Console . WriteLine ( "GEMINI_API_KEY environment variable not found. Using default cluster numeric names ." ) ;
29- return clusters . ToDictionary ( g => g . Key , g => $ "Cluster { g . Key } " ) ;
45+ Console . WriteLine ( "GEMINI_API_KEY not found. Falling back to LocalOnly mode ." ) ;
46+ mode = ClusterLabelingMode . LocalOnly ;
3047 }
3148
32- var clusterNames = new System . Collections . Generic . Dictionary < uint , string > ( ) ;
33- using var httpClient = new System . Net . Http . HttpClient ( ) ;
49+ using var httpClient = new HttpClient ( ) ;
50+ string url = $ "https://generativelanguage.googleapis.com/v1beta/models/{ modelName } :generateContent?key={ apiKey } ";
51+
52+ Console . WriteLine ( $ "\n Predicting cluster names using { mode } mode...") ;
53+
54+ switch ( mode )
55+ {
56+ case ClusterLabelingMode . LocalOnly :
57+ clusterNames = ProcessLocalOnly ( clusters ) ;
58+ break ;
59+ case ClusterLabelingMode . SinglePrompt :
60+ clusterNames = await ProcessSinglePromptAsync ( httpClient , url , clusters ) ;
61+ break ;
62+ case ClusterLabelingMode . PerCluster :
63+ clusterNames = await ProcessPerClusterAsync ( httpClient , url , clusters ) ;
64+ break ;
65+ case ClusterLabelingMode . Hybrid :
66+ clusterNames = await ProcessHybridAsync ( httpClient , url , clusters ) ;
67+ break ;
68+ }
3469
35- Console . WriteLine ( " \n Predicting cluster names using Gemini API..." ) ;
70+ // Ensure all clusters have a valid label, otherwise halt execution to prevent using bad tags
3671 foreach ( var cluster in clusters )
3772 {
38- var commitsToUse = cluster . Take ( commitProcessCount ) . Select ( c => c . CommitName ) . ToList ( ) ;
39- var prompt = "Based on the following git commit messages, provide a short 1-3 word category name for this cluster.\n \n " +
40- "Commits:\n " + string . Join ( "\n " , commitsToUse ) + "\n \n Category name:" ;
73+ if ( ! clusterNames . ContainsKey ( cluster . Key ) || string . IsNullOrWhiteSpace ( clusterNames [ cluster . Key ] ) )
74+ {
75+ Console . WriteLine ( $ "\n Critical Error: Failed to resolve a valid label for Cluster { cluster . Key } . Stopping program to prevent using placeholder tags.") ;
76+ Environment . Exit ( 1 ) ;
77+ }
78+ }
4179
42- var requestBody = new
80+ // Save for future runs
81+ var saveFormat = clusterNames . ToDictionary ( k => k . Key . ToString ( ) , v => v . Value ) ;
82+ var jsonOut = JsonSerializer . Serialize ( saveFormat , new JsonSerializerOptions { WriteIndented = true } ) ;
83+ File . WriteAllText ( labelsFilePath , jsonOut ) ;
84+
85+ return clusterNames ;
86+ }
87+
/// <summary>
/// Labels clusters using only the local keyword heuristic.
/// </summary>
/// <param name="clusters">Commit groups keyed by cluster id.</param>
/// <returns>
/// Map from cluster id to label. A cluster for which the heuristic produces
/// no label is left out of the map.
/// </returns>
private static Dictionary<uint, string> ProcessLocalOnly(IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters)
{
    var resolved = new Dictionary<uint, string>();

    foreach (var group in clusters)
    {
        string heuristicLabel = GetLocalHeuristicLabel(group);

        // No heuristic matched: leave the cluster absent so callers can detect it.
        if (string.IsNullOrEmpty(heuristicLabel))
        {
            continue;
        }

        resolved[group.Key] = heuristicLabel;
        Console.WriteLine($"Cluster {group.Key} resolved locally as: {heuristicLabel}");
    }

    return resolved;
}
102+
/// <summary>
/// Describes every cluster in one prompt and asks Gemini for a JSON object
/// mapping cluster id to a short category name.
/// </summary>
/// <param name="httpClient">Client used for the request.</param>
/// <param name="url">Full generateContent endpoint URL.</param>
/// <param name="clusters">Commit groups keyed by cluster id; up to 7 commit names per cluster go into the prompt.</param>
/// <returns>
/// Map from cluster id to cleaned label. Contains only ids that were part of
/// <paramref name="clusters"/> and whose label is non-blank after cleaning.
/// Empty when there are no clusters, the API call yields nothing, or the
/// reply cannot be parsed.
/// </returns>
private static async Task<Dictionary<uint, string>> ProcessSinglePromptAsync(HttpClient httpClient, string url, IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters)
{
    var clusterNames = new Dictionary<uint, string>();

    // Materialize once: the sequence is walked several times below and may be a deferred query.
    var clusterList = clusters.ToList();
    if (clusterList.Count == 0) return clusterNames;

    // Only ids we actually asked about are accepted from the reply.
    var requestedIds = new HashSet<uint>(clusterList.Select(c => c.Key));

    var promptBuilder = new System.Text.StringBuilder();
    promptBuilder.AppendLine("Analyze the following clusters of git commit messages and provide a short 1-3 word category name for each.");
    promptBuilder.AppendLine("Return ONLY a valid JSON object mapping the cluster ID (as a string) to the category name. Example: {\"1\": \"Bug Fixes\", \"2\": \"Merges\"}");
    promptBuilder.AppendLine("\nClusters:");

    foreach (var cluster in clusterList)
    {
        promptBuilder.AppendLine($"Cluster {cluster.Key}:");
        foreach (var commitName in cluster.Take(7).Select(c => c.CommitName))
        {
            promptBuilder.AppendLine($"- {commitName}");
        }
    }

    var requestBody = new { contents = new[] { new { parts = new[] { new { text = promptBuilder.ToString() } } } } };

    string responseText = await CallGeminiWithRetryAsync(httpClient, url, requestBody);
    if (string.IsNullOrEmpty(responseText)) return clusterNames;

    try
    {
        // Strip markdown code fences the model tends to wrap JSON in.
        string cleanJson = responseText.Replace("```json", "").Replace("```", "").Trim();

        // Tolerate prose before/after the object by keeping only the outermost {...} span.
        int objectStart = cleanJson.IndexOf('{');
        int objectEnd = cleanJson.LastIndexOf('}');
        if (objectStart >= 0 && objectEnd > objectStart)
        {
            cleanJson = cleanJson.Substring(objectStart, objectEnd - objectStart + 1);
        }

        var map = JsonSerializer.Deserialize<Dictionary<string, string>>(cleanJson);
        if (map != null)
        {
            foreach (var kvp in map)
            {
                if (!uint.TryParse(kvp.Key, out uint id) || !requestedIds.Contains(id))
                {
                    Console.WriteLine($"Ignoring label for unknown cluster id '{kvp.Key}'.");
                    continue;
                }

                string label = CleanLabel(kvp.Value);
                if (string.IsNullOrWhiteSpace(label))
                {
                    // Leave the cluster unresolved instead of recording an empty name.
                    Console.WriteLine($"Ignoring blank label for Cluster {id}.");
                    continue;
                }

                clusterNames[id] = label;
                Console.WriteLine($"Cluster {id} predicted as: {label}");
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: an unparsable reply leaves the affected clusters unresolved.
        Console.WriteLine($"Failed to parse SinglePrompt JSON response: {ex.Message}");
    }

    return clusterNames;
}
52149
53- var url = $ "https://generativelanguage.googleapis.com/v1beta/models/{ modelName } :generateContent?key={ apiKey } ";
54- var jsonContent = new System . Net . Http . StringContent ( JsonSerializer . Serialize ( requestBody ) , System . Text . Encoding . UTF8 , "application/json" ) ;
/// <summary>
/// Labels clusters one at a time, sending a separate prompt to Gemini for each.
/// </summary>
/// <param name="httpClient">Client used for the requests.</param>
/// <param name="url">Full generateContent endpoint URL.</param>
/// <param name="clusters">Commit groups keyed by cluster id; up to 20 commit names per cluster go into the prompt.</param>
/// <returns>
/// Map from cluster id to cleaned label. A cluster is left out when the API
/// call yields nothing or the reply is blank after cleaning.
/// </returns>
private static async Task<Dictionary<uint, string>> ProcessPerClusterAsync(HttpClient httpClient, string url, IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters)
{
    const int MaxCommitsPerPrompt = 20;

    var clusterNames = new Dictionary<uint, string>();
    foreach (var cluster in clusters)
    {
        var commitsToUse = cluster.Take(MaxCommitsPerPrompt).Select(c => c.CommitName).ToList();
        var prompt = "Based on the following git commit messages, provide a short 1-3 word category name for this cluster.\n\n" +
                     "Return ONLY the category name.\n\n" +
                     "Guidelines:\n" +
                     "- Focus on the main action or intent shared across the commits.\n" +
                     "- Ignore noisy identifiers such as issue numbers, dependency names, hashes, usernames, and version numbers.\n" +
                     "- Prefer broad reusable engineering categories.\n\n" +
                     "Commits:\n" + string.Join("\n", commitsToUse) + "\n\nCategory:";

        var requestBody = new { contents = new[] { new { parts = new[] { new { text = prompt } } } } };

        string responseText = await CallGeminiWithRetryAsync(httpClient, url, requestBody);

        // Cleaning can reduce a non-empty reply (e.g. "." or "''") to nothing,
        // so validate the cleaned label rather than the raw response.
        string label = string.IsNullOrEmpty(responseText) ? null : CleanLabel(responseText);

        if (!string.IsNullOrWhiteSpace(label))
        {
            clusterNames[cluster.Key] = label;
            Console.WriteLine($"Cluster {cluster.Key} predicted as: {label}");
        }
        else
        {
            Console.WriteLine($"Failed to predict name for Cluster {cluster.Key} (API fallback).");
        }
    }

    return clusterNames;
}
180+
/// <summary>
/// Labels clusters with the local heuristics first, then sends only the
/// clusters that remain unlabeled to Gemini in a single prompt.
/// </summary>
/// <param name="httpClient">Client used for the request.</param>
/// <param name="url">Full generateContent endpoint URL.</param>
/// <param name="clusters">Commit groups keyed by cluster id.</param>
/// <returns>
/// Map from cluster id to label. Locally resolved labels are never replaced
/// by API results; clusters neither source could label are left out.
/// </returns>
private static async Task<Dictionary<uint, string>> ProcessHybridAsync(HttpClient httpClient, string url, IEnumerable<IGrouping<uint, CommitPredictionWithData>> clusters)
{
    // Materialize once: the input is walked by the local pass and again to find leftovers.
    var clusterList = clusters.ToList();

    var clusterNames = ProcessLocalOnly(clusterList);
    var unresolved = clusterList.Where(c => !clusterNames.ContainsKey(c.Key)).ToList();

    if (unresolved.Count == 0)
    {
        return clusterNames;
    }

    Console.WriteLine($"Hybrid mode: {unresolved.Count} clusters unresolved locally. Sending to Gemini via SinglePrompt...");
    var geminiNames = await ProcessSinglePromptAsync(httpClient, url, unresolved);

    // Accept API labels only for ids that were actually sent, so a stray id in
    // the reply cannot overwrite a local label or introduce an unknown cluster.
    var unresolvedIds = new HashSet<uint>(unresolved.Select(c => c.Key));
    foreach (var kvp in geminiNames)
    {
        if (unresolvedIds.Contains(kvp.Key) && !string.IsNullOrWhiteSpace(kvp.Value))
        {
            clusterNames[kvp.Key] = kvp.Value;
        }
    }

    return clusterNames;
}
198+
/// <summary>
/// Guesses a category for a group of commits from keywords in their messages.
/// </summary>
/// <param name="commits">Commits of one cluster; may be null or empty.</param>
/// <returns>
/// "Merges", "Dependencies", "Documentation" or "Bug Fixes" when more than
/// 40% of the messages contain one of that category's keywords (checked in
/// that order, first match wins); otherwise null.
/// </returns>
private static string GetLocalHeuristicLabel(IEnumerable<CommitPredictionWithData> commits)
{
    // A category must cover strictly more than this share of the messages.
    const double MinShare = 0.4;

    if (commits == null) return null;

    // Enumerate once; a missing commit message counts as an empty one
    // instead of throwing.
    var msgs = commits.Select(c => (c.CommitName ?? string.Empty).ToLowerInvariant()).ToList();
    int count = msgs.Count;
    if (count == 0) return null;

    // True when the share of messages containing any keyword exceeds MinShare.
    bool Dominates(params string[] keywords) =>
        (double)msgs.Count(m => keywords.Any(k => m.Contains(k))) / count > MinShare;

    if (Dominates("merge pull request", "merge branch"))
        return "Merges";

    if (Dominates("bump", "dependency", "npm", "yarn"))
        return "Dependencies";

    if (Dominates("doc", "readme"))
        return "Documentation";

    if (Dominates("fix", "bug", "patch"))
        return "Bug Fixes";

    return null; // Unresolved
}
220+
/// <summary>
/// Posts <paramref name="requestBody"/> as JSON to the Gemini endpoint and
/// returns the text of the first part of the first candidate.
/// </summary>
/// <param name="httpClient">Client used for the request.</param>
/// <param name="url">Full generateContent endpoint URL.</param>
/// <param name="requestBody">Object serialized with System.Text.Json as the request payload.</param>
/// <param name="maxRetries">Maximum number of attempts.</param>
/// <returns>
/// The reply text, or null when a 4xx status other than 429 is returned or
/// when every attempt fails. Exceptions raised during an attempt are logged
/// and treated as a failed attempt.
/// </returns>
private static async Task<string> CallGeminiWithRetryAsync(HttpClient httpClient, string url, object requestBody, int maxRetries = 3)
{
    // Serialized lazily inside the try (so failures are handled like any other
    // attempt error) and reused across attempts since it never changes.
    string payload = null;

    for (int attempt = 1; attempt <= maxRetries; attempt++)
    {
        try
        {
            payload ??= JsonSerializer.Serialize(requestBody);

            // Content is recreated per attempt; content and response are both
            // disposed when the attempt ends so connections are released promptly.
            using var jsonContent = new StringContent(payload, System.Text.Encoding.UTF8, "application/json");
            using var response = await httpClient.PostAsync(url, jsonContent);

            if (response.IsSuccessStatusCode)
            {
                var responseString = await response.Content.ReadAsStringAsync();
                using var doc = JsonDocument.Parse(responseString);
                return doc.RootElement
                    .GetProperty("candidates")[0]
                    .GetProperty("content")
                    .GetProperty("parts")[0]
                    .GetProperty("text").GetString();
            }

            // 4xx responses other than 429 will not succeed on retry.
            int status = (int)response.StatusCode;
            bool isClientError = status >= 400 && status < 500;
            if (isClientError && response.StatusCode != System.Net.HttpStatusCode.TooManyRequests)
            {
                Console.WriteLine($"Client Error {response.StatusCode}. Aborting retry.");
                return null; // Don't retry a bad malformed json request
            }

            var errorBody = await response.Content.ReadAsStringAsync();
            Console.WriteLine($"API Status {response.StatusCode} on attempt {attempt}: {errorBody}");
        }
        catch (Exception ex)
        {
            Console.WriteLine($"API Request Error on attempt {attempt}: {ex.Message}");
        }

        if (attempt < maxRetries)
        {
            Console.WriteLine("Waiting before retrying...");
            // Linear backoff: 2s, 4s, ...
            await Task.Delay(2000 * attempt);
        }
    }

    Console.WriteLine("Max retries reached. Gracefully falling back.");
    return null;
}
91269
92- return clusterNames ;
/// <summary>
/// Normalizes a raw model reply into a plain label.
/// </summary>
/// <param name="text">Raw label text; may be null.</param>
/// <returns>
/// The text with surrounding whitespace, wrapping quotes and trailing periods
/// removed and every apostrophe dropped, or null when nothing is left.
/// </returns>
private static string CleanLabel(string text)
{
    if (string.IsNullOrWhiteSpace(text)) return null;

    string cleaned = text.Trim()
        .TrimEnd('\r', '\n', '.', '"', '\'')
        // Quotes usually wrap the whole label, so strip the opening one as well.
        .TrimStart('"', '\'')
        .Replace("'", "")
        // Removing punctuation can expose whitespace that was inside it.
        .Trim();

    return cleaned.Length == 0 ? null : cleaned;
}
94275 }
95276}
0 commit comments