Skip to content

Commit 92aa3ed

Browse files
author
Andrei Bozantan
authored
Merge pull request #92 from diffix/andrei/text-columns-prefix
Andrei/text columns prefix
2 parents b75397f + 3dd2f99 commit 92aa3ed

8 files changed

Lines changed: 199 additions & 44 deletions

File tree

src/explorer.api/Explorers/BoolColumnExplorer.cs

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,32 +27,30 @@ public override async Task Explore(CancellationToken cancellationToken)
2727
new DistinctColumnValues(TableName, ColumnName),
2828
cancellationToken);
2929

30-
var (totalValueCount, suppressedValueCount) = distinctValuesQ.ResultRows.CountTotalAndSuppressed();
30+
var counts = distinctValuesQ.ResultRows.CountTotalAndSuppressed();
3131

32-
PublishMetric(new UntypedMetric(name: "distinct.suppressed_count", metric: suppressedValueCount));
32+
PublishMetric(new UntypedMetric(name: "distinct.suppressed_count", metric: counts.SuppressedCount));
3333

3434
// This shouldn't happen, but check anyway.
35-
if (totalValueCount == 0)
35+
if (counts.TotalCount == 0)
3636
{
3737
throw new Exception(
3838
$"Total value count for {TableName}, {ColumnName} is zero.");
3939
}
4040

41-
PublishMetric(new UntypedMetric(name: "distinct.total_count", metric: totalValueCount));
42-
43-
var suppressedValueRatio = (double)suppressedValueCount / totalValueCount;
41+
PublishMetric(new UntypedMetric(name: "distinct.total_count", metric: counts.TotalCount));
4442

4543
var distinctValueCounts =
4644
from row in distinctValuesQ.ResultRows
47-
where !row.DistinctData.IsSuppressed
45+
where row.DistinctData.HasValue
4846
orderby row.Count descending
4947
select new
5048
{
5149
row.DistinctData.Value,
5250
row.Count,
5351
};
5452

55-
PublishMetric(new UntypedMetric(name: "distinct.values", metric: distinctValueCounts));
53+
PublishMetric(new UntypedMetric(name: "distinct.top_values", metric: distinctValueCounts.Take(10)));
5654
}
5755
}
5856
}

src/explorer.api/Explorers/CategoricalColumnExplorer.cs

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,24 +27,22 @@ public override async Task Explore(CancellationToken cancellationToken)
2727
new DistinctColumnValues(TableName, ColumnName),
2828
cancellationToken);
2929

30-
var (totalValueCount, suppressedValueCount) = distinctValuesQ.ResultRows.CountTotalAndSuppressed();
30+
var counts = distinctValuesQ.ResultRows.CountTotalAndSuppressed();
3131

32-
PublishMetric(new UntypedMetric(name: "distinct.suppressed_count", metric: suppressedValueCount));
32+
PublishMetric(new UntypedMetric(name: "distinct.suppressed_count", metric: counts.SuppressedCount));
3333

3434
// This shouldn't happen, but check anyway.
35-
if (totalValueCount == 0)
35+
if (counts.TotalCount == 0)
3636
{
3737
throw new Exception(
3838
$"Total value count for {TableName}, {ColumnName} is zero.");
3939
}
4040

41-
PublishMetric(new UntypedMetric(name: "distinct.total_count", metric: totalValueCount));
42-
43-
var suppressedValueRatio = (double)suppressedValueCount / totalValueCount;
41+
PublishMetric(new UntypedMetric(name: "distinct.total_count", metric: counts.TotalCount));
4442

4543
var distinctValueCounts =
4644
from row in distinctValuesQ.ResultRows
47-
where !row.DistinctData.IsSuppressed && !row.IsNull
45+
where row.DistinctData.HasValue
4846
orderby row.Count descending
4947
select new
5048
{

src/explorer.api/Explorers/DatetimeColumnExplorer.cs

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -146,17 +146,15 @@ private void ProcessLinearBuckets(
146146
var valueCounts = group
147147
.Select(row => new AircloakValueCount<DateTime>(row.GroupingValue, row.Count, row.CountNoise));
148148

149-
var (totalCount, suppressedCount) = valueCounts.CountTotalAndSuppressed();
149+
var counts = valueCounts.CountTotalAndSuppressed();
150150

151-
var suppressedRatio = (double)suppressedCount / totalCount;
152-
153-
if (suppressedRatio > SuppressedRatioThreshold)
151+
if (counts.SuppressedCountRatio > SuppressedRatioThreshold)
154152
{
155153
break;
156154
}
157155

158156
PublishMetric(new UntypedMetric(name: $"dates_linear.{label}", metric: DatetimeMetric(
159-
totalCount, suppressedCount, valueCounts)));
157+
counts.TotalCount, counts.SuppressedCount, valueCounts)));
160158
}
161159
}
162160

@@ -189,17 +187,15 @@ private void ProcessCyclicalBuckets(
189187
var valueCounts = group
190188
.Select(row => new AircloakValueCount<int>(row.GroupingValue, row.Count, row.CountNoise));
191189

192-
var (totalCount, suppressedCount) = valueCounts.CountTotalAndSuppressed();
193-
194-
var suppressedRatio = (double)suppressedCount / totalCount;
190+
var counts = valueCounts.CountTotalAndSuppressed();
195191

196-
if (suppressedRatio > SuppressedRatioThreshold)
192+
if (counts.SuppressedCountRatio > SuppressedRatioThreshold)
197193
{
198194
break;
199195
}
200196

201197
PublishMetric(new UntypedMetric(name: $"dates_cyclical.{label}", metric: DatetimeMetric(
202-
totalCount, suppressedCount, valueCounts)));
198+
counts.TotalCount, counts.SuppressedCount, valueCounts)));
203199
}
204200
}
205201

src/explorer.api/Explorers/EmailColumnExplorer.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ public override async Task Explore(CancellationToken cancellationToken)
3030
new TextColumnTrim(TableName, ColumnName, TextColumnTrimType.Both, EmailAddressChars),
3131
cancellationToken);
3232

33-
var (totalValueCount, suppressedValueCount) = emailCheckQ.ResultRows.CountTotalAndSuppressed();
33+
var counts = emailCheckQ.ResultRows.CountTotalAndSuppressed();
3434

35-
var isEmail = totalValueCount == emailCheckQ.ResultRows
35+
var isEmail = counts.TotalCount == emailCheckQ.ResultRows
3636
.Where(r => r.TrimmedText == "@" || r.IsNull)
3737
.Sum(r => r.Count);
3838

src/explorer.api/Explorers/TextColumnExplorer.cs

Lines changed: 71 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
namespace Explorer
22
{
33
using System;
4+
using System.Collections.Generic;
45
using System.Linq;
56
using System.Threading;
67
using System.Threading.Tasks;
@@ -10,6 +11,8 @@ namespace Explorer
1011

1112
internal class TextColumnExplorer : ExplorerBase
1213
{
14+
private const double SuppressedRatioThreshold = 0.1;
15+
1316
public TextColumnExplorer(IQueryResolver queryResolver, string tableName, string columnName)
1417
: base(queryResolver)
1518
{
@@ -27,24 +30,22 @@ public override async Task Explore(CancellationToken cancellationToken)
2730
new DistinctColumnValues(TableName, ColumnName),
2831
cancellationToken);
2932

30-
var (totalValueCount, suppressedValueCount) = distinctValuesQ.ResultRows.CountTotalAndSuppressed();
33+
var counts = distinctValuesQ.ResultRows.CountTotalAndSuppressed();
3134

32-
PublishMetric(new UntypedMetric(name: "distinct.suppressed_count", metric: suppressedValueCount));
35+
PublishMetric(new UntypedMetric(name: "distinct.suppressed_count", metric: counts.SuppressedCount));
3336

3437
// This shouldn't happen, but check anyway.
35-
if (totalValueCount == 0)
38+
if (counts.TotalCount == 0)
3639
{
3740
throw new Exception(
3841
$"Total value count for {TableName}, {ColumnName} is zero.");
3942
}
4043

41-
PublishMetric(new UntypedMetric(name: "distinct.total_count", metric: totalValueCount));
42-
43-
var suppressedValueRatio = (double)suppressedValueCount / totalValueCount;
44+
PublishMetric(new UntypedMetric(name: "distinct.total_count", metric: counts.TotalCount));
4445

4546
var distinctValueCounts =
4647
from row in distinctValuesQ.ResultRows
47-
where !row.DistinctData.IsSuppressed && !row.DistinctData.IsNull
48+
where row.DistinctData.HasValue
4849
orderby row.Count descending
4950
select new
5051
{
@@ -53,6 +54,69 @@ orderby row.Count descending
5354
};
5455

5556
PublishMetric(new UntypedMetric(name: "distinct.top_values", metric: distinctValueCounts.Take(10)));
57+
58+
if (counts.SuppressedCountRatio >= SuppressedRatioThreshold)
59+
{
60+
// we compute the common prefixes only if the column is not categorical
61+
await ExplorePrefixes(cancellationToken);
62+
}
63+
}
64+
65+
/// <summary>
/// Queries the column's prefixes for increasing prefix lengths (1, 2, 3, ...) and collects, for each
/// length, the prefixes whose count is above the average count of the non-suppressed rows.
/// The collected prefixes are published as the "text.prefixes" metric.
/// </summary>
/// <param name="cancellationToken">Token forwarded to every prefix query.</param>
/// <returns>The collected prefixes, ordered by length ascending and then by frequency descending.</returns>
private async Task<IEnumerable<Prefix>> ExplorePrefixes(CancellationToken cancellationToken)
{
    var allPrefixes = new List<Prefix>();

    for (var length = 1; ; length++)
    {
        var prefixesQ = await ResolveQuery<TextColumnPrefix.Result>(
            new TextColumnPrefix(TableName, ColumnName, length),
            cancellationToken);

        var counts = prefixesQ.ResultRows.CountTotalAndSuppressed();

        // Floating-point division: with no non-suppressed rows this is NaN, every comparison below
        // is false, no prefix is selected and the loop ends.
        var avgCount = (double)counts.NonSuppressedCount / counts.NonSuppressedRows;

        // Materialize once. The result is inspected several times below, and a deferred query
        // would re-run the filter, projection and sort on every enumeration.
        var prefixes = prefixesQ.ResultRows
            .Where(row => row.HasValue && row.Count > avgCount)
            .Select(row => new Prefix(row.Prefix, (double)row.Count / counts.NonSuppressedCount))
            .OrderByDescending(prefix => prefix.Frequency)
            .ToList();

        // Stop as soon as no prefix of this length stands out from the average.
        if (prefixes.Count == 0)
        {
            break;
        }

        // Stop when the requested length exceeds the longest prefix actually returned.
        if (length > prefixes.Max(p => p.Value.Length))
        {
            break;
        }

        allPrefixes.AddRange(prefixes);
    }

    // Materialized so the published metric and the returned sequence share one evaluated result.
    var ret = allPrefixes
        .OrderBy(prefix => prefix.Value.Length)
        .ThenByDescending(prefix => prefix.Frequency)
        .ToList();

    PublishMetric(new UntypedMetric(name: "text.prefixes", metric: ret));

    return ret;
}
108+
109+
/// <summary>
/// A prefix value paired with the frequency supplied at construction.
/// </summary>
private struct Prefix
{
    /// <summary>Creates a prefix entry from its text and its frequency.</summary>
    /// <param name="value">The prefix text.</param>
    /// <param name="frequency">The frequency associated with the prefix.</param>
    public Prefix(string value, double frequency) => (Value, Frequency) = (value, frequency);

    /// <summary>Gets the prefix text.</summary>
    public string Value { get; }

    /// <summary>Gets the frequency associated with the prefix.</summary>
    public double Frequency { get; }
}
57121
}
58122
}

src/explorer.api/Extensions/DiffixExtensions.cs

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,47 @@ namespace Explorer.Diffix.Extensions
77

88
internal static class DiffixExtensions
99
{
10-
public static (long Total, long Suppressed) CountTotalAndSuppressed<T>(this IEnumerable<T> valueCounts)
11-
where T : ICountAggregate, INullable, ISuppressible
10+
public static CountResultType CountTotalAndSuppressed<T>(this IEnumerable<T> valueCounts)
11+
where T : ICountAggregate, ISuppressible
1212
=> valueCounts.Aggregate(
13-
(0L, 0L),
14-
(acc, next) => (
15-
acc.Item1 + next.Count,
16-
acc.Item2 + (next.IsSuppressed ? next.Count : 0L)));
13+
default(CountResultType),
14+
(acc, row) => new CountResultType(acc, row.Count, row.IsSuppressed));
1715
}
16+
17+
#pragma warning disable CA1815 // Struct type should override Equals
18+
#pragma warning disable SA1201 // A struct should not follow a class
19+
/// <summary>
/// Immutable running totals over a sequence of count rows: how many rows were seen, the sum of
/// their counts, and the same two figures restricted to suppressed rows.
/// <c>default(CountResultType)</c> is the all-zero starting value for an aggregation.
/// </summary>
public readonly struct CountResultType
{
    /// <summary>
    /// Creates the totals that result from adding one more row to <paramref name="cr"/>.
    /// </summary>
    /// <param name="cr">The totals accumulated so far.</param>
    /// <param name="count">The count carried by the row being added.</param>
    /// <param name="isSuppressed">Whether the row being added is suppressed.</param>
    public CountResultType(CountResultType cr, long count, bool isSuppressed)
    {
        TotalCount = cr.TotalCount + count;
        TotalRows = cr.TotalRows + 1;

        // A suppressed row contributes to the suppressed totals as well; any other row leaves them as-is.
        SuppressedCount = cr.SuppressedCount + (isSuppressed ? count : 0L);
        SuppressedRows = cr.SuppressedRows + (isSuppressed ? 1L : 0L);
    }

    /// <summary>Gets the sum of the counts of all rows.</summary>
    public long TotalCount { get; }

    /// <summary>Gets the sum of the counts of the suppressed rows.</summary>
    public long SuppressedCount { get; }

    /// <summary>Gets the number of rows.</summary>
    public long TotalRows { get; }

    /// <summary>Gets the number of suppressed rows.</summary>
    public long SuppressedRows { get; }

    /// <summary>Gets the number of rows that are not suppressed.</summary>
    public long NonSuppressedRows => TotalRows - SuppressedRows;

    /// <summary>Gets the sum of the counts of the rows that are not suppressed.</summary>
    public long NonSuppressedCount => TotalCount - SuppressedCount;

    /// <summary>
    /// Gets the share of the total count that belongs to suppressed rows.
    /// Returns 0 when <see cref="TotalCount"/> is zero, instead of the NaN that 0/0 would produce.
    /// </summary>
    public double SuppressedCountRatio => TotalCount == 0 ? 0.0 : (double)SuppressedCount / TotalCount;
}
51+
#pragma warning restore CA1815 // Struct type should override Equals
52+
#pragma warning restore SA1201 // A struct should not follow a class
1853
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
namespace Explorer.Queries
2+
{
3+
using System.Linq;
4+
using System.Text.Json;
5+
6+
using Aircloak.JsonApi;
7+
using Aircloak.JsonApi.JsonConversion;
8+
using Aircloak.JsonApi.ResponseTypes;
9+
10+
using Explorer.Diffix.Interfaces;
11+
12+
/// <summary>
/// Query that groups the values of a text column by their prefix of a fixed length and returns,
/// for each prefix, its count and count noise. Values shorter than the requested length are
/// excluded by the <c>having</c> clause.
/// </summary>
internal class TextColumnPrefix :
    IQuerySpec<TextColumnPrefix.Result>
{
    /// <summary>
    /// Creates a prefix query for the given column.
    /// </summary>
    /// <param name="tableName">Name of the table to query.</param>
    /// <param name="columnName">Name of the text column whose prefixes are grouped.</param>
    /// <param name="length">Prefix length in characters; must be positive.</param>
    /// <exception cref="System.ArgumentOutOfRangeException">
    /// Thrown when <paramref name="length"/> is zero or negative.
    /// </exception>
    public TextColumnPrefix(string tableName, string columnName, int length)
    {
        if (length <= 0)
        {
            // A non-positive length would yield a query over empty prefixes only.
            throw new System.ArgumentOutOfRangeException(
                nameof(length), length, "Prefix length must be positive.");
        }

        TableName = tableName;
        ColumnName = columnName;
        Length = length;
    }

    /// <summary>
    /// Gets the query text.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the table and column names are interpolated into the statement as-is, so they
    /// must come from trusted metadata and never from raw user input - confirm against callers.
    /// </remarks>
    public string QueryStatement => $@"
        select
            left({ColumnName}, {Length}),
            count(*),
            count_noise(*)
        from {TableName}
        group by 1
        having length(left({ColumnName}, {Length})) = {Length}";

    private string TableName { get; }

    private string ColumnName { get; }

    private int Length { get; }

    /// <summary>Builds one result row from the JSON reader.</summary>
    /// <param name="reader">Reader passed on to the <see cref="Result"/> constructor.</param>
    /// <returns>The parsed result row.</returns>
    public Result FromJsonArray(ref Utf8JsonReader reader) => new Result(ref reader);

    /// <summary>
    /// One row of the prefix query: the prefix column followed by the count and the count noise.
    /// </summary>
    public class Result : ICountAggregate, INullable, ISuppressible
    {
        private readonly AircloakValue<string> prefixColumn;

        /// <summary>
        /// Parses the row's columns in query order: prefix, count, count noise.
        /// </summary>
        /// <param name="reader">Reader to parse the row from.</param>
        public Result(ref Utf8JsonReader reader)
        {
            prefixColumn = reader.ParseAircloakResultValue<string>();
            Count = reader.ParseCount();
            CountNoise = reader.ParseCountNoise();
        }

        /// <summary>Gets the prefix text, or an empty string when the prefix column has no value.</summary>
        public string Prefix => prefixColumn.HasValue ? prefixColumn.Value : string.Empty;

        /// <summary>Gets or sets the count reported for this prefix.</summary>
        public long Count { get; set; }

        /// <summary>Gets or sets the count noise reported for this prefix, if any.</summary>
        public double? CountNoise { get; set; }

        /// <summary>Gets a value indicating whether the prefix column is null.</summary>
        public bool IsNull => prefixColumn.IsNull;

        /// <summary>Gets a value indicating whether the prefix column is suppressed.</summary>
        public bool IsSuppressed => prefixColumn.IsSuppressed;

        /// <summary>Gets a value indicating whether the prefix column carries a value.</summary>
        public bool HasValue => prefixColumn.HasValue;
    }
}
64+
}

tests/explorer.api.tests/TextColumnTrimTests.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@ public async void TestEmailPositive()
2525
TestDataSource,
2626
nameof(TextColumnTrimTests));
2727

28-
var (totalValueCount, suppressedValueCount) = query.ResultRows.CountTotalAndSuppressed();
28+
var counts = query.ResultRows.CountTotalAndSuppressed();
2929

30-
var isEmail = totalValueCount == query.ResultRows
30+
var isEmail = counts.TotalCount == query.ResultRows
3131
.Where(r => r.TrimmedText == "@")
3232
.Sum(r => r.Count);
3333

@@ -42,9 +42,9 @@ public async void TestEmailNegative()
4242
TestDataSource,
4343
nameof(TextColumnTrimTests));
4444

45-
var (totalValueCount, suppressedValueCount) = query.ResultRows.CountTotalAndSuppressed();
45+
var counts = query.ResultRows.CountTotalAndSuppressed();
4646

47-
var isEmail = totalValueCount == query.ResultRows
47+
var isEmail = counts.TotalCount == query.ResultRows
4848
.Where(r => r.TrimmedText == "@")
4949
.Sum(r => r.Count);
5050

0 commit comments

Comments
 (0)