Skip to content

Commit b17ab28

Browse files
author
Andrei Bozantan
authored
Merge pull request #140 from diffix/andrei/text-columns-pattern
More cleanup for TextColumnExplorer
2 parents a46b0c6 + cc0387c commit b17ab28

3 files changed

Lines changed: 123 additions & 159 deletions

File tree

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
namespace Explorer.Common
2+
{
3+
using System;
4+
using System.Collections.Generic;
5+
6+
/// <summary>
/// A list of values, each stored together with the cumulative (running) count of all
/// entries up to and including it. The cumulative layout allows drawing a random value
/// with probability proportional to its individual count via a binary search.
/// </summary>
/// <typeparam name="T">The type of the stored values.</typeparam>
internal class ValueWithCountList<T> : List<(T Value, long Count)>
{
    /// <summary>
    /// Gets the sum of all counts added so far, i.e. the cumulative count stored in the
    /// last entry, or 0 when the list is empty.
    /// </summary>
    public long TotalCount => Count == 0 ? 0 : this[^1].Count;

    /// <summary>
    /// Builds a list from a sequence of value/count pairs, accumulating the counts in order.
    /// </summary>
    /// <param name="valueCounts">The value/count pairs to add.</param>
    /// <returns>A new list holding every value with its cumulative count.</returns>
    public static ValueWithCountList<T> FromValueWithCountEnum(IEnumerable<ValueWithCount<T>> valueCounts)
    {
        var ret = new ValueWithCountList<T>();
        foreach (var vc in valueCounts)
        {
            ret.AddValueCount(vc.Value, vc.Count);
        }
        return ret;
    }

    /// <summary>
    /// Appends a value whose stored count is the current <see cref="TotalCount"/> plus <paramref name="count"/>.
    /// </summary>
    /// <param name="value">The value to append.</param>
    /// <param name="count">The individual (non-cumulative) count of the value.</param>
    public void AddValueCount(T value, long count)
    {
        Add((value, TotalCount + count));
    }

    /// <summary>
    /// Picks a random value, weighted by the individual counts of the entries.
    /// </summary>
    /// <param name="rand">The random number generator to draw from.</param>
    /// <param name="default">The value returned when the list is empty.</param>
    /// <returns>A randomly selected value, or <paramref name="default"/> for an empty list.</returns>
    public T GetRandomValue(Random rand, T @default)
    {
        if (Count == 0)
        {
            return @default;
        }

        // NOTE(review): NextLong is an extension defined elsewhere; presumably it returns a
        // value in [0, TotalCount) - confirm. FindValue clamps to the last entry regardless.
        var rcount = rand.NextLong(TotalCount);
        return FindValue(rcount);
    }

    /// <summary>
    /// Returns the value of the first entry whose cumulative count is strictly greater than
    /// <paramref name="count"/>. Entry i therefore owns the half-open range
    /// [cumulative(i - 1), cumulative(i)), whose width is exactly the entry's individual count.
    /// Falls back to the last entry when no cumulative count exceeds <paramref name="count"/>.
    /// The list must not be empty.
    /// </summary>
    private T FindValue(long count)
    {
        var low = 0;
        var high = Count - 1;
        while (low < high)
        {
            var mid = low + ((high - low) / 2);
            if (this[mid].Count > count)
            {
                // mid may be the answer; keep it inside the search window.
                high = mid;
            }
            else
            {
                // Everything up to and including mid is at or below the draw.
                low = mid + 1;
            }
        }
        return this[low].Value;
    }
}
69+
}

src/explorer/Explorers/TextColumnExplorer.cs

Lines changed: 50 additions & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ namespace Explorer.Explorers
1010
using Explorer.Common;
1111
using Explorer.Queries;
1212

13+
using SubstringWithCountList = Explorer.Common.ValueWithCountList<string>;
14+
1315
internal class TextColumnExplorer : ExplorerBase
1416
{
1517
public const string EmailAddressChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_.";
@@ -64,11 +66,7 @@ private static async Task<IEnumerable<string>> GenerateStrings(DConnection conn,
6466
var substrings = await ExploreSubstrings(conn, ctx, substringLengths: new int[] { 3, 4 });
6567
var rand = new Random(Environment.TickCount);
6668
return Enumerable.Range(0, GeneratedValuesCount).Select(_
67-
=> substrings.GenerateString(
68-
minLength: 3,
69-
minSubstringLength: 3,
70-
maxSubstringLength: 4,
71-
rand));
69+
=> GenerateString(substrings, minLength: 3, rand));
7270
}
7371

7472
private static async Task<IEnumerable<string>> GenerateEmails(DConnection conn, ExplorerContext ctx)
@@ -80,12 +78,7 @@ private static async Task<IEnumerable<string>> GenerateEmails(DConnection conn,
8078
var emails = new List<string>(GeneratedValuesCount);
8179
for (var i = 0; emails.Count < GeneratedValuesCount && i < GeneratedValuesCount * 100; i++)
8280
{
83-
var s = substrings.GenerateString(
84-
minLength: 3,
85-
minSubstringLength: 3,
86-
maxSubstringLength: 4,
87-
rand);
88-
var email = GenerateEmail(s, domains, tlds, rand);
81+
var email = GenerateEmail(substrings, domains, tlds, rand);
8982
if (!string.IsNullOrEmpty(email))
9083
{
9184
emails.Add(email);
@@ -94,9 +87,27 @@ private static async Task<IEnumerable<string>> GenerateEmails(DConnection conn,
9487
return emails;
9588
}
9689

97-
private static string GenerateEmail(string str, SubstringWithCountList domains, SubstringWithCountList tlds, Random rand)
90+
/// <summary>
/// Builds a synthetic string by concatenating randomly chosen substrings taken from
/// successive positions of <paramref name="substrings"/>, until a randomly drawn target
/// length is reached or the positions are exhausted.
/// </summary>
/// <param name="substrings">The per-position substring data to sample from.</param>
/// <param name="minLength">The lower bound for the randomly drawn target length.</param>
/// <param name="rand">The random number generator to draw from.</param>
/// <returns>
/// The generated string. It may be shorter than <paramref name="minLength"/> (or empty)
/// when fewer positions are available than requested.
/// </returns>
private static string GenerateString(SubstringsData substrings, int minLength, Random rand)
{
    var sb = new StringBuilder();

    // Random.Next(min, max) throws when min > max, so only draw a target length when
    // there are more positions than minLength; otherwise use every available position.
    var len = substrings.Count > minLength
        ? rand.Next(minLength, substrings.Count)
        : substrings.Count;

    for (var pos = 0; pos < substrings.Count && sb.Length < len; pos++)
    {
        var str = substrings.GetRandomSubstring(pos, rand);
        sb.Append(str);

        // NOTE(review): together with the loop's pos++ this advances by str.Length + 1,
        // skipping one position after each appended substring - confirm this is intended.
        pos += str.Length;
    }
    return sb.ToString();
}
102+
103+
private static string GenerateEmail(
104+
SubstringsData substrings,
105+
SubstringWithCountList domains,
106+
SubstringWithCountList tlds,
107+
Random rand)
98108
{
99109
// create local-part section
110+
var str = GenerateString(substrings, minLength: 6, rand);
100111
var allParts = str.Split('@', StringSplitOptions.RemoveEmptyEntries);
101112
var sb = new StringBuilder();
102113
var partIndex = 0;
@@ -107,12 +118,6 @@ private static string GenerateEmail(string str, SubstringWithCountList domains,
107118
pnext /= 2;
108119
partIndex++;
109120
}
110-
for (var replaced = 1; replaced != 0;)
111-
{
112-
var oldlen = sb.Length;
113-
sb.Replace("..", ".");
114-
replaced = oldlen - sb.Length;
115-
}
116121
var localParts = sb.ToString()
117122
.Split('.', StringSplitOptions.RemoveEmptyEntries)
118123
.Where(s => s.Length == 1 || s.Length > 3)
@@ -125,7 +130,7 @@ private static string GenerateEmail(string str, SubstringWithCountList domains,
125130
if (domains.Count >= EmailDomainsCountThreshold)
126131
{
127132
// if the number of distinct domains is big enough we select one from the extracted list
128-
return localPart + domains.GetSubstring(rand);
133+
return localPart + domains.GetRandomValue(rand, @default: string.Empty);
129134
}
130135

131136
// create domain section
@@ -145,17 +150,17 @@ private static string GenerateEmail(string str, SubstringWithCountList domains,
145150
{
146151
return string.Empty;
147152
}
148-
return localPart + "@" + domain + tlds.GetSubstring(rand);
153+
return localPart + "@" + domain + tlds.GetRandomValue(rand, @default: string.Empty);
149154
}
150155

151156
/// <summary>
152157
/// Finds common substrings for each position in the texts of the specified column.
153158
/// It uses a batch approach to query for several positions (specified using SubstringQueryColumnCount)
154159
/// using a single query.
155160
/// </summary>
156-
private static async Task<SubstringDataCollection> ExploreSubstrings(DConnection conn, ExplorerContext ctx, params int[] substringLengths)
161+
private static async Task<SubstringsData> ExploreSubstrings(DConnection conn, ExplorerContext ctx, params int[] substringLengths)
157162
{
158-
var substrings = new SubstringDataCollection(maxSubstringLength: substringLengths.Max());
163+
var substrings = new SubstringsData();
159164
foreach (var length in substringLengths)
160165
{
161166
var hasRows = true;
@@ -182,166 +187,54 @@ private static async Task<bool> CheckIsEmail(DConnection conn, ExplorerContext c
182187
var emailCheck = await conn.Exec(
183188
new TextColumnTrim(ctx.Table, ctx.Column, TextColumnTrimType.Both, EmailAddressChars));
184189

185-
var counts = ValueCounts.Compute(emailCheck.Rows);
186-
187-
return counts.TotalCount == emailCheck.Rows
188-
.Where(r => r.IsNull || r.Value == "@")
189-
.Sum(r => r.Count);
190+
return emailCheck.Rows.All(r => r.IsNull || r.Value == "@");
190191
}
191192

192193
private static async Task<SubstringWithCountList> ExploreEmailDomains(DConnection conn, ExplorerContext ctx)
193194
{
194195
var domains = await conn.Exec(new TextColumnTrim(ctx.Table, ctx.Column, TextColumnTrimType.Leading, EmailAddressChars));
195-
var totalCount = 0L;
196-
var domain = new SubstringWithCountList();
197-
foreach (var row in domains.Rows)
198-
{
199-
if (row.HasValue && row.Value.StartsWith("@", StringComparison.InvariantCulture))
200-
{
201-
totalCount += row.Count;
202-
domain.Add((row.Value, totalCount));
203-
}
204-
}
205-
return domain;
196+
197+
return SubstringWithCountList.FromValueWithCountEnum(
198+
domains.Rows
199+
.Where(r => r.HasValue && r.Value.StartsWith("@", StringComparison.InvariantCulture)));
206200
}
207201

208202
private static async Task<SubstringWithCountList> ExploreEmailTopLevelDomains(DConnection conn, ExplorerContext ctx)
209203
{
210204
var suffixes = await conn.Exec(new TextColumnSuffix(ctx.Table, ctx.Column, 3, 7));
211-
var totalCount = 0L;
212-
var tlds = new SubstringWithCountList();
213-
foreach (var row in suffixes.Rows)
214-
{
215-
if (row.HasValue && row.Value.StartsWith(".", StringComparison.InvariantCulture))
216-
{
217-
totalCount += row.Count;
218-
tlds.Add((row.Value, totalCount));
219-
}
220-
}
221-
return tlds;
222-
}
223-
}
224-
225-
internal class SubstringWithCountList : List<(string Value, long Count)>
226-
{
227-
public long TotalCount => Count == 0 ? 0 : this[^1].Count;
228-
229-
public string GetSubstring(Random rand)
230-
{
231-
if (Count == 0)
232-
{
233-
return string.Empty;
234-
}
235-
var rcount = rand.NextLong(TotalCount);
236-
return FindSubstring(rcount);
237-
}
238-
239-
private string FindSubstring(long count)
240-
{
241-
var left = 0;
242-
var right = Count - 1;
243-
while (true)
244-
{
245-
var middle = (left + right) / 2;
246-
if (middle == 0 || middle == Count - 1)
247-
{
248-
return this[middle].Value;
249-
}
250-
if (count < this[middle].Count)
251-
{
252-
if (count >= this[middle - 1].Count)
253-
{
254-
return this[middle - 1].Value;
255-
}
256-
right = middle;
257-
}
258-
else if (count > this[middle].Count)
259-
{
260-
if (count <= this[middle + 1].Count)
261-
{
262-
return this[middle].Value;
263-
}
264-
left = middle;
265-
}
266-
else
267-
{
268-
return this[middle].Value;
269-
}
270-
}
271-
}
272-
}
273-
274-
internal class SubstringDataCollection
275-
{
276-
public SubstringDataCollection(int maxSubstringLength)
277-
{
278-
MaxSubstringLength = maxSubstringLength;
279-
Substrings = new List<Item>();
280-
}
281-
282-
private List<Item> Substrings { get; }
283-
284-
private int MaxSubstringLength { get; }
285-
286-
public void Add(int pos, string s, long count)
287-
{
288-
while (Substrings.Count <= pos)
289-
{
290-
Substrings.Add(new Item(MaxSubstringLength));
291-
}
292-
Substrings[pos].Add(s, count);
293-
}
294205

295-
public string GenerateString(int minLength, int minSubstringLength, int maxSubstringLength, Random rand)
296-
{
297-
var sb = new StringBuilder();
298-
var len = rand.Next(minLength, Substrings.Count);
299-
for (var pos = 0; pos < Substrings.Count && sb.Length < len; pos++)
300-
{
301-
var str = Substrings[pos].GetSubstring(minSubstringLength, maxSubstringLength, rand);
302-
sb.Append(str);
303-
pos += str.Length;
304-
}
305-
return sb.ToString();
206+
return SubstringWithCountList.FromValueWithCountEnum(
207+
suffixes.Rows
208+
.Where(r => r.HasValue && r.Value.StartsWith(".", StringComparison.InvariantCulture)));
306209
}
307210

308211
/// <summary>
309-
/// Stores the substrings from a certain position in a column,
212+
/// Stores the substrings at each position in a column,
310213
/// together with the number of occurrences (counts) for each substring.
311-
/// The substrings are grouped separately by length.
312214
/// </summary>
313-
internal class Item
215+
internal class SubstringsData
314216
{
315-
public Item(int maxSubstringLength)
316-
{
317-
Data = new List<SubstringWithCountList>(maxSubstringLength)
217+
public SubstringsData()
318218
{
319-
new SubstringWithCountList() { (string.Empty, 0) },
320-
};
321-
for (var i = 1; i <= maxSubstringLength; i++)
322-
{
323-
Data.Add(new SubstringWithCountList());
324-
}
219+
Substrings = new List<SubstringWithCountList>();
325220
}
326221

327-
private List<SubstringWithCountList> Data { get; }
222+
public int Count => Substrings.Count;
328223

329-
public void Add(string s, long count)
330-
{
331-
var substrings = Data[s.Length];
332-
substrings.Add((s, substrings.TotalCount + count));
333-
}
224+
private List<SubstringWithCountList> Substrings { get; }
334225

335-
public string GetSubstring(int minLength, int maxLength, Random rand)
226+
public void Add(int pos, string s, long count)
336227
{
337-
if (maxLength >= Data.Count)
228+
while (Substrings.Count <= pos)
338229
{
339-
throw new ArgumentException($"{nameof(maxLength)} should be smaller than {Data.Count}.", nameof(maxLength));
230+
Substrings.Add(new SubstringWithCountList());
340231
}
341-
// TODO: distribute value over all alternatives according to counts (not with the same probability)
342-
var sslen = rand.Next(minLength, maxLength + 1);
343-
var substrings = Data[sslen];
344-
return substrings.GetSubstring(rand);
232+
Substrings[pos].AddValueCount(s, count);
233+
}
234+
235+
public string GetRandomSubstring(int pos, Random rand)
236+
{
237+
return Substrings[pos].GetRandomValue(rand, string.Empty);
345238
}
346239
}
347240
}

tests/explorer.api.tests/TextColumnExplorerTests.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ public async void TestClientsEmail()
2828
var metric_svalues = metrics.First(m => m.Name == "synthetic_values");
2929
var values = metric_svalues.Metric as IEnumerable<string>;
3030
Assert.True(values.All(v => v.Length >= 3));
31-
Assert.True(values.All(v => v.Contains('@', StringComparison.InvariantCulture)));
31+
Assert.True(values.All(v => v.Count(c => c == '@') == 1));
3232
Assert.True(values.All(v => v.Contains('.', StringComparison.InvariantCulture)));
33+
Assert.True(values.All(v => !v.Contains("..", StringComparison.InvariantCulture)));
3334
}
3435

3536
[Fact]
@@ -41,8 +42,9 @@ public async void TestCardsEmail()
4142
var metric_svalues = metrics.First(m => m.Name == "synthetic_values");
4243
var values = metric_svalues.Metric as IEnumerable<string>;
4344
Assert.True(values.All(v => v.Length >= 3));
44-
Assert.True(values.All(v => v.Contains('@', StringComparison.InvariantCulture)));
45+
Assert.True(values.All(v => v.Count(c => c == '@') == 1));
4546
Assert.True(values.All(v => v.Contains('.', StringComparison.InvariantCulture)));
47+
Assert.True(values.All(v => !v.Contains("..", StringComparison.InvariantCulture)));
4648
}
4749

4850
[Fact]

0 commit comments

Comments
 (0)