@@ -10,6 +10,8 @@ namespace Explorer.Explorers
1010 using Explorer . Common ;
1111 using Explorer . Queries ;
1212
13+ using SubstringWithCountList = Explorer . Common . ValueWithCountList < string > ;
14+
1315 internal class TextColumnExplorer : ExplorerBase
1416 {
1517 public const string EmailAddressChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_." ;
@@ -64,11 +66,7 @@ private static async Task<IEnumerable<string>> GenerateStrings(DConnection conn,
6466 var substrings = await ExploreSubstrings ( conn , ctx , substringLengths : new int [ ] { 3 , 4 } ) ;
6567 var rand = new Random ( Environment . TickCount ) ;
6668 return Enumerable . Range ( 0 , GeneratedValuesCount ) . Select ( _
67- => substrings . GenerateString (
68- minLength : 3 ,
69- minSubstringLength : 3 ,
70- maxSubstringLength : 4 ,
71- rand ) ) ;
69+ => GenerateString ( substrings , minLength : 3 , rand ) ) ;
7270 }
7371
7472 private static async Task < IEnumerable < string > > GenerateEmails ( DConnection conn , ExplorerContext ctx )
@@ -80,12 +78,7 @@ private static async Task<IEnumerable<string>> GenerateEmails(DConnection conn,
8078 var emails = new List < string > ( GeneratedValuesCount ) ;
8179 for ( var i = 0 ; emails . Count < GeneratedValuesCount && i < GeneratedValuesCount * 100 ; i ++ )
8280 {
83- var s = substrings . GenerateString (
84- minLength : 3 ,
85- minSubstringLength : 3 ,
86- maxSubstringLength : 4 ,
87- rand ) ;
88- var email = GenerateEmail ( s , domains , tlds , rand ) ;
81+ var email = GenerateEmail ( substrings , domains , tlds , rand ) ;
8982 if ( ! string . IsNullOrEmpty ( email ) )
9083 {
9184 emails . Add ( email ) ;
@@ -94,9 +87,27 @@ private static async Task<IEnumerable<string>> GenerateEmails(DConnection conn,
9487 return emails ;
9588 }
9689
97- private static string GenerateEmail ( string str , SubstringWithCountList domains , SubstringWithCountList tlds , Random rand )
/// <summary>
/// Builds a random string by concatenating substrings sampled at successive positions.
/// </summary>
/// <param name="substrings">Per-position substring data; its <c>Count</c> bounds both the target length and the positions visited.</param>
/// <param name="minLength">Inclusive lower bound for the randomly chosen target length.</param>
/// <param name="rand">Random source used for the target length and for each substring pick.</param>
/// <returns>The concatenated string; it can be shorter than the target length when the positions run out first.</returns>
private static string GenerateString(SubstringsData substrings, int minLength, Random rand)
{
    // Random.Next(min, max) throws ArgumentOutOfRangeException when min > max, which happens for
    // columns with fewer positions than minLength. Clamping the upper bound keeps the call valid
    // and leaves the drawn value unchanged whenever substrings.Count >= minLength.
    var maxLength = Math.Max(minLength, substrings.Count);
    var len = rand.Next(minLength, maxLength);
    var sb = new StringBuilder();
    for (var pos = 0; pos < substrings.Count && sb.Length < len; pos++)
    {
        var str = substrings.GetRandomSubstring(pos, rand);
        sb.Append(str);

        // Skip past the positions covered by the appended substring.
        // NOTE(review): combined with the loop's pos++ this advances by str.Length + 1 - confirm the extra step is intended.
        pos += str.Length;
    }
    return sb.ToString();
}
102+
103+ private static string GenerateEmail (
104+ SubstringsData substrings ,
105+ SubstringWithCountList domains ,
106+ SubstringWithCountList tlds ,
107+ Random rand )
98108 {
99109 // create local-part section
110+ var str = GenerateString ( substrings , minLength : 6 , rand ) ;
100111 var allParts = str . Split ( '@' , StringSplitOptions . RemoveEmptyEntries ) ;
101112 var sb = new StringBuilder ( ) ;
102113 var partIndex = 0 ;
@@ -107,12 +118,6 @@ private static string GenerateEmail(string str, SubstringWithCountList domains,
107118 pnext /= 2 ;
108119 partIndex ++ ;
109120 }
110- for ( var replaced = 1 ; replaced != 0 ; )
111- {
112- var oldlen = sb . Length ;
113- sb . Replace ( ".." , "." ) ;
114- replaced = oldlen - sb . Length ;
115- }
116121 var localParts = sb . ToString ( )
117122 . Split ( '.' , StringSplitOptions . RemoveEmptyEntries )
118123 . Where ( s => s . Length == 1 || s . Length > 3 )
@@ -125,7 +130,7 @@ private static string GenerateEmail(string str, SubstringWithCountList domains,
125130 if ( domains . Count >= EmailDomainsCountThreshold )
126131 {
127132 // if the number of distinct domains is big enough we select one from the extracted list
128- return localPart + domains . GetSubstring ( rand ) ;
133+ return localPart + domains . GetRandomValue ( rand , @default : string . Empty ) ;
129134 }
130135
131136 // create domain section
@@ -145,17 +150,17 @@ private static string GenerateEmail(string str, SubstringWithCountList domains,
145150 {
146151 return string . Empty ;
147152 }
148- return localPart + "@" + domain + tlds . GetSubstring ( rand ) ;
153+ return localPart + "@" + domain + tlds . GetRandomValue ( rand , @default : string . Empty ) ;
149154 }
150155
151156 /// <summary>
152157 /// Finds common substrings for each position in the texts of the specified column.
153158 /// It uses a batch approach to query for several positions (specified using SubstringQueryColumnCount)
154159 /// using a single query.
155160 /// </summary>
156- private static async Task < SubstringDataCollection > ExploreSubstrings ( DConnection conn , ExplorerContext ctx , params int [ ] substringLengths )
161+ private static async Task < SubstringsData > ExploreSubstrings ( DConnection conn , ExplorerContext ctx , params int [ ] substringLengths )
157162 {
158- var substrings = new SubstringDataCollection ( maxSubstringLength : substringLengths . Max ( ) ) ;
163+ var substrings = new SubstringsData ( ) ;
159164 foreach ( var length in substringLengths )
160165 {
161166 var hasRows = true ;
@@ -182,166 +187,54 @@ private static async Task<bool> CheckIsEmail(DConnection conn, ExplorerContext c
182187 var emailCheck = await conn . Exec (
183188 new TextColumnTrim ( ctx . Table , ctx . Column , TextColumnTrimType . Both , EmailAddressChars ) ) ;
184189
185- var counts = ValueCounts . Compute ( emailCheck . Rows ) ;
186-
187- return counts . TotalCount == emailCheck . Rows
188- . Where ( r => r . IsNull || r . Value == "@" )
189- . Sum ( r => r . Count ) ;
190+ return emailCheck . Rows . All ( r => r . IsNull || r . Value == "@" ) ;
190191 }
191192
192193 private static async Task < SubstringWithCountList > ExploreEmailDomains ( DConnection conn , ExplorerContext ctx )
193194 {
194195 var domains = await conn . Exec ( new TextColumnTrim ( ctx . Table , ctx . Column , TextColumnTrimType . Leading , EmailAddressChars ) ) ;
195- var totalCount = 0L ;
196- var domain = new SubstringWithCountList ( ) ;
197- foreach ( var row in domains . Rows )
198- {
199- if ( row . HasValue && row . Value . StartsWith ( "@" , StringComparison . InvariantCulture ) )
200- {
201- totalCount += row . Count ;
202- domain . Add ( ( row . Value , totalCount ) ) ;
203- }
204- }
205- return domain ;
196+
197+ return SubstringWithCountList . FromValueWithCountEnum (
198+ domains . Rows
199+ . Where ( r => r . HasValue && r . Value . StartsWith ( "@" , StringComparison . InvariantCulture ) ) ) ;
206200 }
207201
208202 private static async Task < SubstringWithCountList > ExploreEmailTopLevelDomains ( DConnection conn , ExplorerContext ctx )
209203 {
210204 var suffixes = await conn . Exec ( new TextColumnSuffix ( ctx . Table , ctx . Column , 3 , 7 ) ) ;
211- var totalCount = 0L ;
212- var tlds = new SubstringWithCountList ( ) ;
213- foreach ( var row in suffixes . Rows )
214- {
215- if ( row . HasValue && row . Value . StartsWith ( "." , StringComparison . InvariantCulture ) )
216- {
217- totalCount += row . Count ;
218- tlds . Add ( ( row . Value , totalCount ) ) ;
219- }
220- }
221- return tlds ;
222- }
223- }
224-
225- internal class SubstringWithCountList : List < ( string Value , long Count ) >
226- {
227- public long TotalCount => Count == 0 ? 0 : this [ ^ 1 ] . Count ;
228-
229- public string GetSubstring ( Random rand )
230- {
231- if ( Count == 0 )
232- {
233- return string . Empty ;
234- }
235- var rcount = rand . NextLong ( TotalCount ) ;
236- return FindSubstring ( rcount ) ;
237- }
238-
239- private string FindSubstring ( long count )
240- {
241- var left = 0 ;
242- var right = Count - 1 ;
243- while ( true )
244- {
245- var middle = ( left + right ) / 2 ;
246- if ( middle == 0 || middle == Count - 1 )
247- {
248- return this [ middle ] . Value ;
249- }
250- if ( count < this [ middle ] . Count )
251- {
252- if ( count >= this [ middle - 1 ] . Count )
253- {
254- return this [ middle - 1 ] . Value ;
255- }
256- right = middle ;
257- }
258- else if ( count > this [ middle ] . Count )
259- {
260- if ( count <= this [ middle + 1 ] . Count )
261- {
262- return this [ middle ] . Value ;
263- }
264- left = middle ;
265- }
266- else
267- {
268- return this [ middle ] . Value ;
269- }
270- }
271- }
272- }
273-
274- internal class SubstringDataCollection
275- {
276- public SubstringDataCollection ( int maxSubstringLength )
277- {
278- MaxSubstringLength = maxSubstringLength ;
279- Substrings = new List < Item > ( ) ;
280- }
281-
282- private List < Item > Substrings { get ; }
283-
284- private int MaxSubstringLength { get ; }
285-
286- public void Add ( int pos , string s , long count )
287- {
288- while ( Substrings . Count <= pos )
289- {
290- Substrings . Add ( new Item ( MaxSubstringLength ) ) ;
291- }
292- Substrings [ pos ] . Add ( s , count ) ;
293- }
294205
295- public string GenerateString ( int minLength , int minSubstringLength , int maxSubstringLength , Random rand )
296- {
297- var sb = new StringBuilder ( ) ;
298- var len = rand . Next ( minLength , Substrings . Count ) ;
299- for ( var pos = 0 ; pos < Substrings . Count && sb . Length < len ; pos ++ )
300- {
301- var str = Substrings [ pos ] . GetSubstring ( minSubstringLength , maxSubstringLength , rand ) ;
302- sb . Append ( str ) ;
303- pos += str . Length ;
304- }
305- return sb . ToString ( ) ;
206+ return SubstringWithCountList . FromValueWithCountEnum (
207+ suffixes . Rows
208+ . Where ( r => r . HasValue && r . Value . StartsWith ( "." , StringComparison . InvariantCulture ) ) ) ;
306209 }
307210
308211 /// <summary>
309- /// Stores the substrings from a certain position in a column,
212+ /// Stores the substrings at each position in a column,
310213 /// together with the number of occurrences (counts) for each substring.
311- /// The substrings are grouped separately by length.
312214 /// </summary>
313- internal class Item
215+ internal class SubstringsData
314216 {
315- public Item ( int maxSubstringLength )
316- {
317- Data = new List < SubstringWithCountList > ( maxSubstringLength )
217+ public SubstringsData ( )
318218 {
319- new SubstringWithCountList ( ) { ( string . Empty , 0 ) } ,
320- } ;
321- for ( var i = 1 ; i <= maxSubstringLength ; i ++ )
322- {
323- Data . Add ( new SubstringWithCountList ( ) ) ;
324- }
219+ Substrings = new List < SubstringWithCountList > ( ) ;
325220 }
326221
327- private List < SubstringWithCountList > Data { get ; }
222+ public int Count => Substrings . Count ;
328223
329- public void Add ( string s , long count )
330- {
331- var substrings = Data [ s . Length ] ;
332- substrings . Add ( ( s , substrings . TotalCount + count ) ) ;
333- }
224+ private List < SubstringWithCountList > Substrings { get ; }
334225
335- public string GetSubstring ( int minLength , int maxLength , Random rand )
226+ public void Add ( int pos , string s , long count )
336227 {
337- if ( maxLength >= Data . Count )
228+ while ( Substrings . Count <= pos )
338229 {
339- throw new ArgumentException ( $ " { nameof ( maxLength ) } should be smaller than { Data . Count } ." , nameof ( maxLength ) ) ;
230+ Substrings . Add ( new SubstringWithCountList ( ) ) ;
340231 }
341- // TODO: distribute value over all alternatives according to counts (not with the same probability)
342- var sslen = rand . Next ( minLength , maxLength + 1 ) ;
343- var substrings = Data [ sslen ] ;
344- return substrings . GetSubstring ( rand ) ;
232+ Substrings [ pos ] . AddValueCount ( s , count ) ;
233+ }
234+
235+ public string GetRandomSubstring ( int pos , Random rand )
236+ {
237+ return Substrings [ pos ] . GetRandomValue ( rand , string . Empty ) ;
345238 }
346239 }
347240 }
0 commit comments