Skip to content

Commit ff2b191

Browse files
authored
Add more precise tests for CR/LF characters (#60)
Fix inconsistent CRLF handling by switching from line-by-line reading to buffered reading. Add more tests and automate deployment.
1 parent 55833f5 commit ff2b191

12 files changed

Lines changed: 222 additions & 58 deletions
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
name: NuGet Publish
2+
3+
on:
4+
push:
5+
branches: [ main ]
6+
7+
# Allows you to run this workflow manually from the Actions tab
8+
workflow_dispatch:
9+
10+
jobs:
11+
build:
12+
runs-on: ubuntu-latest
13+
name: Update NuGet package
14+
steps:
15+
- name: Checkout repository
16+
uses: actions/checkout@v1
17+
18+
- name: Setup .NET Core @ Latest
19+
uses: actions/setup-dotnet@v1
20+
with:
21+
dotnet-version: |
22+
5.0.x
23+
6.0.x
24+
7.0.x
25+
26+
- name: Build (Framework 2.0)
27+
run: msbuild ./src/net20/src.net20.csproj /property:Configuration=Release
28+
- name: Build (Framework 4.0)
29+
run: msbuild ./src/net40/src.net40.csproj /property:Configuration=Release
30+
- name: Build (Framework 4.5)
31+
run: msbuild ./src/net45/src.net45.csproj /property:Configuration=Release
32+
- name: Build (DotNetCore 5.0)
33+
run: dotnet build -c Release ./src/net50/src.net50.csproj
34+
- name: Build (NetStandard 2.0)
35+
run: dotnet build -c Release ./src/netstandard20/src.netstandard20.csproj
36+
37+
- name: Setup Nuget
38+
uses: nuget/setup-nuget@v1
39+
with:
40+
nuget-api-key: ${{ secrets.NUGET_API_KEY }}
41+
nuget-version: "5.x"
42+
43+
- name: Run Nuget pack
44+
run: nuget pack CSVFile.nuspec
45+
46+
- name: Push generated package to NuGet.org
47+
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
48+
run: nuget push *.nupkg -Source 'https://api.nuget.org/v3/index.json' -ApiKey ${{secrets.NUGET_API_KEY}}

CSVFile.nuspec

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,20 @@
22
<package >
33
<metadata>
44
<id>CSVFile</id>
5-
<version>3.1.1</version>
5+
<version>3.1.2</version>
66
<title>CSVFile</title>
77
<authors>Ted Spence</authors>
88
<owners>Ted Spence</owners>
99
<license type="file">docs/LICENSE</license>
1010
<projectUrl>https://github.com/tspence/csharp-csv-reader</projectUrl>
1111
<requireLicenseAcceptance>false</requireLicenseAcceptance>
12-
<summary>Tiny and fast CSV and TSV parsing library (40KB) with zero dependencies. Compatible with most dot net versions.</summary>
13-
<description>Tiny and fast CSV and TSV parsing library (40KB) with zero dependencies. Compatible with most dot net versions.</description>
12+
<summary>Tiny and fast CSV and TSV parsing library (40KB) with zero dependencies. Compatible with DotNetFramework (2.0 onwards) and DotNetCore.</summary>
13+
<description>Tiny and fast CSV and TSV parsing library (40KB) with zero dependencies. Compatible with DotNetFramework (2.0 onwards) and DotNetCore.</description>
1414
<icon>docs/icons8-spreadsheet-96.png</icon>
1515
<releaseNotes>
16-
March 7, 2023
16+
July 18, 2023
1717

18-
* Fix issue when reading a stream with a text qualified field that ends with a newline
18+
* Fix issue with inconsistent handling of embedded newlines in the streaming version of the reader
1919
</releaseNotes>
2020
<readme>docs/README.md</readme>
2121
<copyright>Copyright 2006 - 2023</copyright>
@@ -33,10 +33,10 @@
3333
<file src=".\LICENSE" target="docs/LICENSE"/>
3434
<file src=".\README.md" target="docs/README.md"/>
3535
<file src=".\icons8-spreadsheet-96.png" target="docs/icons8-spreadsheet-96.png"/>
36-
<file src="src\net20\bin\release\net20\*" target="lib\net20" />
37-
<file src="src\net40\bin\release\net40\*" target="lib\net40" />
38-
<file src="src\net45\bin\release\net45\*" target="lib\net45" />
39-
<file src="src\netstandard20\bin\release\netstandard2.0\*" target="lib\netstandard20" />
40-
<file src="src\net50\bin\release\net5.0\*" target="lib\net5.0" />
36+
<file src="src\net20\bin\Release\*" target="lib\net20" />
37+
<file src="src\net40\bin\Release\*" target="lib\net40" />
38+
<file src="src\net45\bin\Release\*" target="lib\net45" />
39+
<file src="src\netstandard20\bin\Release\netstandard2.0\*" target="lib\netstandard20" />
40+
<file src="src\net50\bin\Release\net5.0\*" target="lib\net5.0" />
4141
</files>
4242
</package>

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
[![NuGet](https://img.shields.io/nuget/v/CSVFile.svg?style=plastic)](https://www.nuget.org/packages/CSVFile/)
2-
![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/tspence/csharp-csv-reader/dotnet.yml?branch=main)
3-
[![SonarCloud Coverage](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=coverage)](https://sonarcloud.io/component_measures?id=tspence_csharp-csv-reader&metric=coverage&view=list)
4-
[![SonarCloud Bugs](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=bugs)](https://sonarcloud.io/project/issues?resolved=false&types=BUG&id=tspence_csharp-csv-reader)
5-
[![SonarCloud Vulnerabilities](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=vulnerabilities)](https://sonarcloud.io/project/issues?resolved=false&types=VULNERABILITY&id=tspence_csharp-csv-reader)
2+
[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/tspence/csharp-csv-reader/dotnet.yml?branch=main)](https://github.com/tspence/csharp-csv-reader/actions/workflows/dotnet.yml)
3+
[![SonarCloud Coverage](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=coverage)](https://sonarcloud.io/summary/overall?id=tspence_csharp-csv-reader)
4+
[![SonarCloud Bugs](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=bugs)](https://sonarcloud.io/summary/overall?id=tspence_csharp-csv-reader)
5+
[![SonarCloud Vulnerabilities](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=vulnerabilities)](https://sonarcloud.io/summary/overall?id=tspence_csharp-csv-reader)
66

77
# CSVFile
88
This library is a series of unit tested, thoroughly commented CSV parsing functions which I have developed off and on since 2006. Extremely small and easy to implement; includes unit tests for the majority of odd CSV edge cases. Library supports different delimiters, qualifiers, and embedded newlines. Can read and write from data tables.

src/CSV.cs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,19 +56,22 @@ public static class CSV
5656
/// <returns>An enumerable object that can be examined to retrieve rows from the stream.</returns>
5757
public static IEnumerable<string[]> ParseStream(StreamReader inStream, CSVSettings settings = null)
5858
{
59+
int bufferSize = settings?.BufferSize ?? CSVSettings.DEFAULT_BUFFER_SIZE;
60+
var buffer = new char[bufferSize];
5961
var machine = new CSVStateMachine(settings);
6062
while (machine.State == CSVState.CanKeepGoing)
6163
{
6264
var line = string.Empty;
63-
if (!inStream.EndOfStream)
65+
if (machine.NeedsMoreText() && !inStream.EndOfStream)
6466
{
65-
line = inStream.ReadLine();
67+
var readChars = inStream.ReadBlock(buffer, 0, bufferSize);
68+
line = new string(buffer, 0, readChars);
6669
}
67-
var row = machine.ParseLine(line, inStream.EndOfStream);
70+
var row = machine.ParseChunk(line, inStream.EndOfStream);
6871
if (row != null)
6972
{
7073
yield return row;
71-
}
74+
}
7275
}
7376
}
7477

@@ -81,15 +84,18 @@ public static IEnumerable<string[]> ParseStream(StreamReader inStream, CSVSettin
8184
/// <returns>An enumerable object that can be examined to retrieve rows from the stream.</returns>
8285
public static async IAsyncEnumerable<string[]> ParseStreamAsync(StreamReader inStream, CSVSettings settings = null)
8386
{
87+
int bufferSize = settings?.BufferSize ?? CSVSettings.DEFAULT_BUFFER_SIZE;
88+
var buffer = new char[bufferSize];
8489
var machine = new CSVStateMachine(settings);
8590
while (machine.State == CSVState.CanKeepGoing)
8691
{
8792
var line = string.Empty;
88-
if (!inStream.EndOfStream)
93+
if (machine.NeedsMoreText() && !inStream.EndOfStream)
8994
{
90-
line = await inStream.ReadLineAsync();
95+
var readChars = await inStream.ReadBlockAsync(buffer, 0, bufferSize);
96+
line = new string(buffer, 0, readChars);
9197
}
92-
var row = machine.ParseLine(line, inStream.EndOfStream);
98+
var row = machine.ParseChunk(line, inStream.EndOfStream);
9399
if (row != null)
94100
{
95101
yield return row;

src/CSVSettings.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,12 @@ public class CSVSettings
117117
/// </summary>
118118
public bool IgnoreEmptyLineForDeserialization { get; set; }
119119

120+
/// <summary>
121+
/// When reading data from a stream, this is the number of characters to read in each block (default 65536).
122+
/// </summary>
123+
public int BufferSize { get; set; } = DEFAULT_BUFFER_SIZE;
124+
internal static readonly int DEFAULT_BUFFER_SIZE = 65536;
125+
120126
/// <summary>
121127
/// The encoding for converting streams of bytes to strings
122128
/// </summary>

src/CSVStateMachine.cs

Lines changed: 36 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,15 @@ public class CSVStateMachine
5959
/// </summary>
6060
public CSVState State { get; private set; }
6161

62+
/// <summary>
63+
/// Returns true if the buffered text is empty or has been consumed up to its end, meaning another chunk must be supplied
64+
/// </summary>
65+
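/// <returns>True when there is no buffered text or the current position has reached the end of it; false otherwise</returns>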
66+
public bool NeedsMoreText()
67+
{
68+
return String.IsNullOrEmpty(_line) || _position >= _line.Length;
69+
}
70+
6271
/// <summary>
6372
/// Constructs a new state machine to begin processing CSV text
6473
/// </summary>
@@ -78,24 +87,6 @@ public CSVStateMachine(CSVSettings settings)
7887
State = CSVState.CanKeepGoing;
7988
}
8089

81-
/// <summary>
82-
/// Parse a single line when read from a stream.
83-
///
84-
/// Call this function when you are using the "ReadLine" or "ReadLineAsync" functions so that
85-
/// each line will obey the CSV Settings rules for line separators.
86-
/// </summary>
87-
/// <param name="line"></param>
88-
/// <param name="reachedEnd"></param>
89-
/// <returns></returns>
90-
public string[] ParseLine(string line, bool reachedEnd)
91-
{
92-
if (!string.IsNullOrEmpty(line))
93-
{
94-
line += _settings.LineSeparator;
95-
}
96-
return ParseChunk(line, reachedEnd);
97-
}
98-
9990
/// <summary>
10091
/// Parse a new chunk of text, whether read from a stream in blocks or retrieved via some other means.
10192
///
@@ -108,12 +99,18 @@ public string[] ParseLine(string line, bool reachedEnd)
10899
public string[] ParseChunk(string chunk, bool reachedEnd)
109100
{
110101
// Detect end of stream
111-
if (reachedEnd && string.IsNullOrEmpty(chunk) && _position == -1)
102+
if (reachedEnd && string.IsNullOrEmpty(chunk) && _position == -1 && string.IsNullOrEmpty(_line))
112103
{
113104
State = CSVState.Done;
114105
return null;
115106
}
116107

108+
// If we're at the end of the line, remember to backtrack one because we increment immediately
109+
if (_position == _line.Length)
110+
{
111+
_position -= 1;
112+
}
113+
117114
// Add this chunk to the current processing logic
118115
_line += chunk;
119116

@@ -199,10 +196,22 @@ public string[] ParseChunk(string chunk, bool reachedEnd)
199196
_position--;
200197
}
201198
// Are we at a line separator? Let's do a quick test first
202-
else if (c == _settings.LineSeparator[0] && _position + _settings.LineSeparator.Length <= _line.Length)
199+
else if (c == _settings.LineSeparator[0])
203200
{
204-
if (string.Equals(_line.Substring(_position, _settings.LineSeparator.Length),
205-
_settings.LineSeparator))
201+
// If we don't have enough characters left to test the line separator properly, ask for more
202+
var notEnoughChars = _position + _settings.LineSeparator.Length > _line.Length;
203+
if (notEnoughChars && !reachedEnd)
204+
{
205+
return null;
206+
}
207+
208+
// If we have reached the end, but this isn't a complete line separator, it's just text
209+
if (notEnoughChars && reachedEnd)
210+
{
211+
_work.Append(c);
212+
}
213+
// OK, we have enough characters, see if this is a line separator
214+
else if (string.Equals(_line.Substring(_position, _settings.LineSeparator.Length), _settings.LineSeparator))
206215
{
207216
_line = _line.Substring(_position + _settings.LineSeparator.Length);
208217
_position = -1;
@@ -212,6 +221,11 @@ public string[] ParseChunk(string chunk, bool reachedEnd)
212221
_work.Length = 0;
213222
return row;
214223
}
224+
// It's not a line separator, it's just a normal character
225+
else
226+
{
227+
_work.Append(c);
228+
}
215229
}
216230
// Does this start a new field?
217231
else if (c == _delimiter)

tests/AsyncReaderTest.cs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ public async Task TestBasicReader()
2727
// Skip header row
2828
var settings = new CSVSettings()
2929
{
30-
HeaderRowIncluded = false
30+
HeaderRowIncluded = false,
31+
LineSeparator = "\n",
3132
};
3233

3334
// Convert into stream
@@ -88,7 +89,8 @@ public async Task TestDanglingFields()
8889
// Skip header row
8990
var settings = new CSVSettings()
9091
{
91-
HeaderRowIncluded = false
92+
HeaderRowIncluded = false,
93+
LineSeparator = "\n",
9294
};
9395

9496
// Convert into stream
@@ -156,7 +158,7 @@ public async Task TestAlternateDelimiterQualifiers()
156158
"Dr. Kelso\tChief of Medicine\tx100";
157159

158160
// Convert into stream
159-
var settings = new CSVSettings() { HeaderRowIncluded = true, FieldDelimiter = '\t' };
161+
var settings = new CSVSettings() { HeaderRowIncluded = true, FieldDelimiter = '\t', LineSeparator = "\n" };
160162
using (var cr = CSVReader.FromString(source, settings))
161163
{
162164
Assert.AreEqual("Name", cr.Headers[0]);

tests/BasicParseTests.cs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,5 +236,24 @@ public void TestIssue53()
236236
Assert.AreEqual("Normal", line3[9]);
237237
Assert.AreEqual("", line3[10]);
238238
}
239+
240+
[Test]
241+
public void TestMultipleNewlines()
242+
{
243+
// Specific issue reported by domdere
244+
var line1 = CSV.ParseLine("\"test\",\"blah\r\n\r\n\r\nfoo\",\"Normal\"");
245+
Assert.AreEqual("test", line1[0]);
246+
Assert.AreEqual("blah\r\n\r\n\r\nfoo", line1[1]);
247+
Assert.AreEqual("Normal", line1[2]);
248+
249+
// Test a few potential use cases here
250+
var line2 = CSV.ParseLine("\"test\",\"\n\n\",\"\r\n\r\n\r\n\",\"Normal\",\"\",\"\r\r\r\r\r\"");
251+
Assert.AreEqual("test", line2[0]);
252+
Assert.AreEqual("\n\n", line2[1]);
253+
Assert.AreEqual("\r\n\r\n\r\n", line2[2]);
254+
Assert.AreEqual("Normal", line2[3]);
255+
Assert.AreEqual("", line2[4]);
256+
Assert.AreEqual("\r\r\r\r\r", line2[5]);
257+
}
239258
}
240259
}

tests/ChopTest.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@ public void DataTableChopTest()
9797
Assert.AreEqual(list[i].email, results[i].email);
9898
}
9999
}
100-
// Clean up
101100
finally
102101
{
103102
if (Directory.Exists(dirname))

tests/DataTableReaderTest.cs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,11 @@ public class DataTableReaderTest
3333
[Test]
3434
public void TestBasicDataTable()
3535
{
36-
var dt = CSVDataTable.FromString(source);
36+
var settings = new CSVSettings()
37+
{
38+
LineSeparator = "\n",
39+
};
40+
var dt = CSVDataTable.FromString(source, settings);
3741
Assert.AreEqual(3, dt.Columns.Count);
3842
Assert.AreEqual(4, dt.Rows.Count);
3943
Assert.AreEqual("JD", dt.Rows[0].ItemArray[0]);
@@ -53,12 +57,16 @@ public void TestBasicDataTable()
5357
[Test]
5458
public void TestDataTableWithEmbeddedNewlines()
5559
{
56-
var dt = CSVDataTable.FromString(source_embedded_newlines);
60+
var settings = new CSVSettings()
61+
{
62+
LineSeparator = "\n",
63+
};
64+
var dt = CSVDataTable.FromString(source_embedded_newlines, settings);
5765
Assert.AreEqual(3, dt.Columns.Count);
5866
Assert.AreEqual(4, dt.Rows.Count);
5967
Assert.AreEqual("JD", dt.Rows[0].ItemArray[0]);
6068
Assert.AreEqual("Janitor", dt.Rows[1].ItemArray[0]);
61-
Assert.AreEqual("Dr. Reed, " + Environment.NewLine + "Eliot", dt.Rows[2].ItemArray[0]);
69+
Assert.AreEqual("Dr. Reed, \nEliot", dt.Rows[2].ItemArray[0]);
6270
Assert.AreEqual("Dr. Kelso", dt.Rows[3].ItemArray[0]);
6371
Assert.AreEqual("Doctor", dt.Rows[0].ItemArray[1]);
6472
Assert.AreEqual("Janitor", dt.Rows[1].ItemArray[1]);

0 commit comments

Comments
 (0)