11namespace Explorer
22{
33 using System ;
4+ using System . Collections . Generic ;
45 using System . Linq ;
56 using System . Threading ;
67 using System . Threading . Tasks ;
@@ -10,6 +11,8 @@ namespace Explorer
1011
1112 internal class TextColumnExplorer : ExplorerBase
1213 {
14+ private const double SuppressedRatioThreshold = 0.1 ;
15+
1316 public TextColumnExplorer ( IQueryResolver queryResolver , string tableName , string columnName )
1417 : base ( queryResolver )
1518 {
@@ -27,24 +30,22 @@ public override async Task Explore(CancellationToken cancellationToken)
2730 new DistinctColumnValues ( TableName , ColumnName ) ,
2831 cancellationToken ) ;
2932
30- var ( totalValueCount , suppressedValueCount ) = distinctValuesQ . ResultRows . CountTotalAndSuppressed ( ) ;
33+ var counts = distinctValuesQ . ResultRows . CountTotalAndSuppressed ( ) ;
3134
32- PublishMetric ( new UntypedMetric ( name : "distinct.suppressed_count" , metric : suppressedValueCount ) ) ;
35+ PublishMetric ( new UntypedMetric ( name : "distinct.suppressed_count" , metric : counts . SuppressedCount ) ) ;
3336
3437 // This shouldn't happen, but check anyway.
35- if ( totalValueCount == 0 )
38+ if ( counts . TotalCount == 0 )
3639 {
3740 throw new Exception (
3841 $ "Total value count for { TableName } , { ColumnName } is zero.") ;
3942 }
4043
41- PublishMetric ( new UntypedMetric ( name : "distinct.total_count" , metric : totalValueCount ) ) ;
42-
43- var suppressedValueRatio = ( double ) suppressedValueCount / totalValueCount ;
44+ PublishMetric ( new UntypedMetric ( name : "distinct.total_count" , metric : counts . TotalCount ) ) ;
4445
4546 var distinctValueCounts =
4647 from row in distinctValuesQ . ResultRows
47- where ! row . DistinctData . IsSuppressed && ! row . DistinctData . IsNull
48+ where row . DistinctData . HasValue
4849 orderby row . Count descending
4950 select new
5051 {
@@ -53,6 +54,69 @@ orderby row.Count descending
5354 } ;
5455
5556 PublishMetric ( new UntypedMetric ( name : "distinct.top_values" , metric : distinctValueCounts . Take ( 10 ) ) ) ;
57+
58+ if ( counts . SuppressedCountRatio >= SuppressedRatioThreshold )
59+ {
60+ // we compute the common prefixes only if the column is not categorical
61+ await ExplorePrefixes ( cancellationToken ) ;
62+ }
63+ }
64+
/// <summary>
/// Queries the column's prefixes at increasing lengths (1, 2, 3, ...) and collects the ones whose
/// count is above the average computed from <c>CountTotalAndSuppressed()</c>. The collected prefixes
/// are published as the "text.prefixes" metric and returned.
/// </summary>
/// <param name="cancellationToken">Token forwarded to every query and checked before each iteration.</param>
/// <returns>
/// The collected prefixes, ordered by prefix length ascending and then by frequency descending.
/// </returns>
private async Task<IEnumerable<Prefix>> ExplorePrefixes(CancellationToken cancellationToken)
{
    var allPrefixes = new List<Prefix>();

    for (var length = 1; ; length++)
    {
        // The loop has no fixed upper bound, so honor cancellation between queries.
        cancellationToken.ThrowIfCancellationRequested();

        var prefixesQ = await ResolveQuery<TextColumnPrefix.Result>(
            new TextColumnPrefix(TableName, ColumnName, length),
            cancellationToken);

        var counts = prefixesQ.ResultRows.CountTotalAndSuppressed();

        // Floating-point division: when NonSuppressedRows is zero this yields NaN or infinity,
        // no row passes the `row.Count > avgCount` filter below, and the loop ends.
        var avgCount = (double)counts.NonSuppressedCount / counts.NonSuppressedRows;

        // Materialize once; the result is inspected several times below and a deferred
        // query would re-run the filter and the sort on every enumeration.
        var prefixes = (
            from row in prefixesQ.ResultRows
            let frequency = (double)row.Count / counts.NonSuppressedCount
            where row.HasValue && row.Count > avgCount
            orderby frequency descending
            select new Prefix(row.Prefix, frequency)).ToList();

        if (prefixes.Count == 0)
        {
            break;
        }

        // Stop once the requested length exceeds the longest prefix actually returned.
        if (length > prefixes.Max(p => p.Value.Length))
        {
            break;
        }

        allPrefixes.AddRange(prefixes);
    }

    // Materialize so the published metric and the returned sequence share one sorted snapshot.
    var ret = allPrefixes
        .OrderBy(p => p.Value.Length)
        .ThenByDescending(p => p.Frequency)
        .ToList();

    PublishMetric(new UntypedMetric(name: "text.prefixes", metric: ret));

    return ret;
}
108+
/// <summary>
/// Pairs a prefix string with the frequency value computed for it.
/// </summary>
private struct Prefix
{
    /// <summary>
    /// Initializes both properties from the given arguments.
    /// </summary>
    /// <param name="value">The prefix text.</param>
    /// <param name="frequency">The frequency associated with the prefix.</param>
    public Prefix(string value, double frequency) => (Value, Frequency) = (value, frequency);

    /// <summary>Gets the prefix text.</summary>
    public string Value { get; }

    /// <summary>Gets the frequency associated with the prefix.</summary>
    public double Frequency { get; }
}
57121 }
58122}
0 commit comments