@@ -69,7 +69,7 @@ public static IEstimator<ITransformer> FeaturizeText(MLContext mlContext)
6969 return mlContext . Transforms . Text . FeaturizeText ( FeaturesColumnName , nameof ( CommitMLData . CommitName ) ) ;
7070 }
7171
72- public static int GetOrFindBestK ( MLContext mlContext , IDataView trainData , IDataView testData , IEstimator < ITransformer > featurizer , string kFilePath )
72+ public static int GetOrFindBestK ( MLContext mlContext , IDataView trainData , IEstimator < ITransformer > featurizer , string kFilePath )
7373 {
7474 if ( File . Exists ( kFilePath ) )
7575 {
@@ -80,16 +80,20 @@ public static int GetOrFindBestK(MLContext mlContext, IDataView trainData, IData
8080 }
8181 }
8282
83- Console . WriteLine ( "Finding best K via Grid Search..." ) ;
83+ Console . WriteLine ( "Finding best K via Grid Search using validation split..." ) ;
84+ var split = mlContext . Data . TrainTestSplit ( trainData , testFraction : 0.2 ) ;
85+ var subTrainData = split . TrainSet ;
86+ var validationData = split . TestSet ;
87+
8488 int bestK = 2 ;
8589 double bestMetric = double . MaxValue ; // Lower Davies-Bouldin is better for measuring clustering quality
8690
8791 for ( int k = 2 ; k <= 10 ; k ++ )
8892 {
8993 var pipeline = featurizer . Append ( mlContext . Clustering . Trainers . KMeans ( featureColumnName : FeaturesColumnName , numberOfClusters : k ) ) ;
90- var model = pipeline . Fit ( trainData ) ;
91-
92- var predictions = model . Transform ( testData ) ;
94+ var model = pipeline . Fit ( subTrainData ) ;
95+
96+ var predictions = model . Transform ( validationData ) ;
9397 var metrics = mlContext . Clustering . Evaluate ( predictions , labelColumnName : null , scoreColumnName : "Score" , featureColumnName : FeaturesColumnName ) ;
9498
9599 Console . WriteLine ( $ "K = { k } | Davies-Bouldin: { metrics . DaviesBouldinIndex : F4} | Avg Distance: { metrics . AverageDistance : F4} ") ;
0 commit comments