Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ obj
/*.vspx

/TestResults
.nuget/nuget.exe
36 changes: 21 additions & 15 deletions CsvQuery/Csv/CsvAnalyzer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

public class CsvAnalyzer
{
private const int MaxLinesToAnalyze = 20;

private class Stat
{
public int Occurances;
Expand All @@ -32,7 +34,6 @@ public static CsvSettings Analyze(string csvString)
}

// First do a letter frequency analysis on each row
var s = new StringReader(csvString);
string line;
int lineCount = 0, linesQuoted=0;
var frequencies = new List<Dictionary<char, int>>();
Expand All @@ -44,6 +45,8 @@ public static CsvSettings Analyze(string csvString)
var inQuotes = false;
var letterFrequencyQuoted = new Dictionary<char, int>();

using (var s = new StringReader(csvString))
{
while ((line = s.ReadLine()) != null)
{
if (line.Length == 0)
Expand Down Expand Up @@ -83,7 +86,8 @@ public static CsvSettings Analyze(string csvString)
linesQuoted++;
}

if (lineCount++ > 20) break;
if (lineCount++ > MaxLinesToAnalyze) break;
}
}

// Then check the variance on the frequency of each char
Expand Down Expand Up @@ -113,16 +117,16 @@ public static CsvSettings Analyze(string csvString)
if (frequency.ContainsKey(c)) f = frequency[c];
variance += (f - mean) * (f - mean);
}
variance /= lineCount;
variance /= linesQuoted;
variancesQuoted.Add(c, variance);
}

// The char with lowest variance is most likely the separator
result = new CsvSettings { Separator = GetSeparatorFromVariance(variances, occurrences, lineCount, out var uncertancy) };
var separatorQuoted = GetSeparatorFromVariance(variancesQuoted, occurrencesQuoted, linesQuoted, out var uncertancyQuoted);
if (uncertancyQuoted < uncertancy)
result = new CsvSettings { Separator = GetSeparatorFromVariance(variances, occurrences, lineCount, out var uncertainty) };
var separatorQuoted = GetSeparatorFromVariance(variancesQuoted, occurrencesQuoted, linesQuoted, out var uncertaintyQuoted);
if (uncertaintyQuoted < uncertainty)
result.Separator = separatorQuoted;
else if (uncertancy < uncertancyQuoted || (uncertancy == uncertancyQuoted && lineCount > linesQuoted)) // It was better ignoring quotes!
else if (uncertainty < uncertaintyQuoted || (uncertainty == uncertaintyQuoted && lineCount > linesQuoted)) // It was better ignoring quotes!
result.UseQuotes = false;

if (result.Separator != default(char)) return result;
Expand Down Expand Up @@ -189,7 +193,8 @@ private static CsvSettings DetectW3C(string csvString)

private static Dictionary<char, Stat> CalcVariances(string csvString, char textQualifyer, char escapeChar)
{
var s = new StringReader(csvString);
using (var s = new StringReader(csvString))
{
string line;
int lineCount = 0;
var statistics = new Dictionary<char, Stat>();
Expand All @@ -211,7 +216,7 @@ private static Dictionary<char, Stat> CalcVariances(string csvString, char textQ
}

frequencies.Add(letterFrequency);
if (lineCount++ > 20) break;
if (lineCount++ > MaxLinesToAnalyze) break;
}

// Then check the variance on the frequency of each char
Expand All @@ -230,12 +235,13 @@ private static Dictionary<char, Stat> CalcVariances(string csvString, char textQ
}

return statistics;
}
}

private static char GetSeparatorFromVariance(Dictionary<char, float> variances, Dictionary<char, int> occurrences, int lineCount, out int uncertancy)
private static char GetSeparatorFromVariance(Dictionary<char, float> variances, Dictionary<char, int> occurrences, int lineCount, out int uncertainty)
{
var preferredSeparators = Main.Settings.Separators.Replace("\\t", "\t");
uncertancy = 0;
uncertainty = 0;

// The char with lowest variance is most likely the separator
// Optimistic: check preferred with 0 variance
Expand All @@ -248,21 +254,21 @@ private static char GetSeparatorFromVariance(Dictionary<char, float> variances,
if (separator != null)
return separator.Value;

uncertancy++;
uncertainty++;
var defaultKV = default(KeyValuePair<char, float>);

// Ok, no perfect separator. Check if the best char that exists on all lines is a preferred separator
var sortedVariances = variances.OrderBy(x => x.Value).ToList();
var best = sortedVariances.FirstOrDefault(x => occurrences[x.Key] >= lineCount);
if (!best.Equals(defaultKV) && preferredSeparators.IndexOf(best.Key) != -1)
return best.Key;
uncertancy++;
uncertainty++;

// No? Second best?
best = sortedVariances.Where(x => occurrences[x.Key] >= lineCount).Skip(1).FirstOrDefault();
if (!best.Equals(defaultKV) && preferredSeparators.IndexOf(best.Key) != -1)
return best.Key;
uncertancy++;
uncertainty++;

// Ok, screw the preferred separators, is any other char a perfect separator? (and common, i.e. at least 3 per line)
separator = variances
Expand All @@ -273,7 +279,7 @@ private static char GetSeparatorFromVariance(Dictionary<char, float> variances,
if (separator != null)
return separator.Value;

uncertancy++;
uncertainty++;
// Ok, I have no idea
return '\0';
}
Expand Down
16 changes: 11 additions & 5 deletions CsvQuery/Forms/QueryWindow.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@
/// </summary><inheritdoc />
public partial class QueryWindow : Form
{
/// <summary> Maximum number of characters read from the document for auto-detection </summary>
private const int MaxAnalysisTextLength = 100000;

/// <summary> Maximum number of rows shown in results when data is very large </summary>
private const int MaxResultRows = 10000;

/// <summary> Background worker </summary>
private Task _worker = Task.CompletedTask;
private Color[] _winColors = null;
Expand Down Expand Up @@ -131,7 +137,7 @@ void SafeAction()
}
catch (Exception e)
{
Trace.TraceError("CSV Action failed: {0}", e.Message);
Trace.TraceError("CSV Action failed: {0}", e.ToString());
this.Message("Error when executing an action: " + e.Message, Resources.Title_CSV_Query_Error);
}
finally
Expand Down Expand Up @@ -172,7 +178,7 @@ private void Analyze(bool silent)
var bufferId = NotepadPPGateway.GetCurrentBufferId();

var textLength = PluginBase.CurrentScintillaGateway.GetTextLength();
var text = PluginBase.CurrentScintillaGateway.GetTextRange(0, Math.Min(100000, textLength));
var text = PluginBase.CurrentScintillaGateway.GetTextRange(0, Math.Min(MaxAnalysisTextLength, textLength));

watch.Checkpoint("GetText");

Expand Down Expand Up @@ -249,7 +255,7 @@ private void Parse(CsvSettings csvSettings, DiagnosticTimer watch, TextReader te
|| this._lastRunQuery.bufferId != previousBufferId)
{
var selectQuery = "SELECT * FROM THIS";
if (count > 10000) selectQuery = Main.DataStorage.CreateLimitedSelect(10000);
if (count > MaxResultRows) selectQuery = Main.DataStorage.CreateLimitedSelect(MaxResultRows);
this.UiThread(() => this.txbQuery.Text = selectQuery);
}
else if (this._lastRunQuery.bufferId == previousBufferId && this._lastRunQuery.query != null)
Expand Down Expand Up @@ -295,9 +301,9 @@ private void Execute(IntPtr bufferId, DiagnosticTimer watch)
this.Message("Could not execute query:\n" + e.Message, Resources.Title_CSV_Query_Error);
return;
}
catch (Exception)
catch (Exception e)
{
this.Message("Could not execute query", Resources.Title_CSV_Query_Error);
this.Message("Could not execute query: " + e.Message, Resources.Title_CSV_Query_Error);
return;
}
watch.Checkpoint("Execute query");
Expand Down
6 changes: 2 additions & 4 deletions CsvQuery/Tools/Extensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,8 @@ public static class Extensions
/// <param name="c">Occurrence that should be counted</param>
public static void Increase<T>(this Dictionary<T, int> counts, T c)
{
    // Exactly one increment per call. TryGetValue leaves `current` at
    // default(int) == 0 when the key is absent, so the first occurrence is
    // stored as 1 and every later one as previous + 1.
    // Running this together with a separate ContainsKey/Add/++ branch would
    // count each occurrence twice.
    counts.TryGetValue(c, out var current);
    counts[c] = current + 1;
}

/// <summary>
Expand Down
1 change: 0 additions & 1 deletion CsvQuery/Tools/JsonParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,6 @@ private string ReadPropertyName(TextReader reader)
return propertyName;
}

if (ch == '\'') return this.ReadString(reader, '\'');
// technically not allowed with unquoted prop-names, but wtf
if ((ch < 'a' || ch > 'z') && (ch < 'A' || ch > 'Z') && ch != '_' && ch != '$')
throw new JsonException($"Unexpected character '{ch}' starting propertyname");
Expand Down