Skip to content

Commit

Permalink
Add more precise tests for CR/LF characters (tspence#60)
Browse files Browse the repository at this point in the history
Fix inconsistent CRLF handling by switching the stream reader from line-by-line reading to buffered reading.
Add more tests and automate deployment.
  • Loading branch information
tspence committed Jul 19, 2023
1 parent 55833f5 commit ff2b191
Show file tree
Hide file tree
Showing 12 changed files with 222 additions and 58 deletions.
48 changes: 48 additions & 0 deletions .github/workflows/nuget-publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
name: NuGet Publish

on:
push:
branches: [ main ]

# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

jobs:
build:
runs-on: ubuntu-latest
name: Update NuGet package
steps:
- name: Checkout repository
uses: actions/checkout@v1

- name: Setup .NET Core @ Latest
uses: actions/setup-dotnet@v1
with:
dotnet-version: |
5.0.x
6.0.x
7.0.x
- name: Build (Framework 2.0)
run: msbuild ./src/net20/src.net20.csproj /property:Configuration=Release
- name: Build (Framework 4.0)
run: msbuild ./src/net40/src.net40.csproj /property:Configuration=Release
- name: Build (Framework 4.5)
run: msbuild ./src/net45/src.net45.csproj /property:Configuration=Release
- name: Build (DotNetCore 5.0)
run: dotnet build -c Release ./src/net50/src.net50.csproj
- name: Build (NetStandard 2.0)
run: dotnet build -c Release ./src/netstandard20/src.netstandard20.csproj

- name: Setup Nuget
uses: nuget/setup-nuget@v1
with:
nuget-api-key: ${{ secrets.NUGET_API_KEY }}
nuget-version: "5.x"

- name: Run Nuget pack
run: nuget pack CSVFile.nuspec

- name: Push generated package to NuGet.org
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
run: nuget push *.nupkg -Source 'https://api.nuget.org/v3/index.json' -ApiKey ${{secrets.NUGET_API_KEY}}
20 changes: 10 additions & 10 deletions CSVFile.nuspec
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@
<package >
<metadata>
<id>CSVFile</id>
<version>3.1.1</version>
<version>3.1.2</version>
<title>CSVFile</title>
<authors>Ted Spence</authors>
<owners>Ted Spence</owners>
<license type="file">docs/LICENSE</license>
<projectUrl>https://github.com/tspence/csharp-csv-reader</projectUrl>
<requireLicenseAcceptance>false</requireLicenseAcceptance>
<summary>Tiny and fast CSV and TSV parsing library (40KB) with zero dependencies. Compatible with most dot net versions.</summary>
<description>Tiny and fast CSV and TSV parsing library (40KB) with zero dependencies. Compatible with most dot net versions.</description>
<summary>Tiny and fast CSV and TSV parsing library (40KB) with zero dependencies. Compatible with DotNetFramework (2.0 onwards) and DotNetCore.</summary>
<description>Tiny and fast CSV and TSV parsing library (40KB) with zero dependencies. Compatible with DotNetFramework (2.0 onwards) and DotNetCore.</description>
<icon>docs/icons8-spreadsheet-96.png</icon>
<releaseNotes>
March 7, 2023
July 18, 2023

* Fix issue when reading a stream with a text qualified field that ends with a newline
* Fix issue with inconsistent handling of embedded newlines in the streaming version of the reader
</releaseNotes>
<readme>docs/README.md</readme>
<copyright>Copyright 2006 - 2023</copyright>
Expand All @@ -33,10 +33,10 @@
<file src=".\LICENSE" target="docs/LICENSE"/>
<file src=".\README.md" target="docs/README.md"/>
<file src=".\icons8-spreadsheet-96.png" target="docs/icons8-spreadsheet-96.png"/>
<file src="src\net20\bin\release\net20\*" target="lib\net20" />
<file src="src\net40\bin\release\net40\*" target="lib\net40" />
<file src="src\net45\bin\release\net45\*" target="lib\net45" />
<file src="src\netstandard20\bin\release\netstandard2.0\*" target="lib\netstandard20" />
<file src="src\net50\bin\release\net5.0\*" target="lib\net5.0" />
<file src="src\net20\bin\Release\*" target="lib\net20" />
<file src="src\net40\bin\Release\*" target="lib\net40" />
<file src="src\net45\bin\Release\*" target="lib\net45" />
<file src="src\netstandard20\bin\Release\netstandard2.0\*" target="lib\netstandard20" />
<file src="src\net50\bin\Release\net5.0\*" target="lib\net5.0" />
</files>
</package>
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[![NuGet](https://img.shields.io/nuget/v/CSVFile.svg?style=plastic)](https://www.nuget.org/packages/CSVFile/)
![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/tspence/csharp-csv-reader/dotnet.yml?branch=main)
[![SonarCloud Coverage](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=coverage)](https://sonarcloud.io/component_measures?id=tspence_csharp-csv-reader&metric=coverage&view=list)
[![SonarCloud Bugs](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=bugs)](https://sonarcloud.io/project/issues?resolved=false&types=BUG&id=tspence_csharp-csv-reader)
[![SonarCloud Vulnerabilities](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=vulnerabilities)](https://sonarcloud.io/project/issues?resolved=false&types=VULNERABILITY&id=tspence_csharp-csv-reader)
[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/tspence/csharp-csv-reader/dotnet.yml?branch=main)](https://github.com/tspence/csharp-csv-reader/actions/workflows/dotnet.yml)
[![SonarCloud Coverage](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=coverage)](https://sonarcloud.io/summary/overall?id=tspence_csharp-csv-reader)
[![SonarCloud Bugs](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=bugs)](https://sonarcloud.io/summary/overall?id=tspence_csharp-csv-reader)
[![SonarCloud Vulnerabilities](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=vulnerabilities)](https://sonarcloud.io/summary/overall?id=tspence_csharp-csv-reader)

# CSVFile
This library is a series of unit-tested, thoroughly commented CSV parsing functions which I have developed off and on since 2006. It is extremely small and easy to implement, and it includes unit tests for the majority of odd CSV edge cases. The library supports different delimiters, qualifiers, and embedded newlines, and it can read from and write to data tables.
Expand Down
20 changes: 13 additions & 7 deletions src/CSV.cs
Original file line number Diff line number Diff line change
Expand Up @@ -56,19 +56,22 @@ public static class CSV
/// <returns>An enumerable object that can be examined to retrieve rows from the stream.</returns>
public static IEnumerable<string[]> ParseStream(StreamReader inStream, CSVSettings settings = null)
{
int bufferSize = settings?.BufferSize ?? CSVSettings.DEFAULT_BUFFER_SIZE;
var buffer = new char[bufferSize];
var machine = new CSVStateMachine(settings);
while (machine.State == CSVState.CanKeepGoing)
{
var line = string.Empty;
if (!inStream.EndOfStream)
if (machine.NeedsMoreText() && !inStream.EndOfStream)
{
line = inStream.ReadLine();
var readChars = inStream.ReadBlock(buffer, 0, bufferSize);
line = new string(buffer, 0, readChars);
}
var row = machine.ParseLine(line, inStream.EndOfStream);
var row = machine.ParseChunk(line, inStream.EndOfStream);
if (row != null)
{
yield return row;
}
}
}
}

Expand All @@ -81,15 +84,18 @@ public static IEnumerable<string[]> ParseStream(StreamReader inStream, CSVSettin
/// <returns>An enumerable object that can be examined to retrieve rows from the stream.</returns>
public static async IAsyncEnumerable<string[]> ParseStreamAsync(StreamReader inStream, CSVSettings settings = null)
{
int bufferSize = settings?.BufferSize ?? CSVSettings.DEFAULT_BUFFER_SIZE;
var buffer = new char[bufferSize];
var machine = new CSVStateMachine(settings);
while (machine.State == CSVState.CanKeepGoing)
{
var line = string.Empty;
if (!inStream.EndOfStream)
if (machine.NeedsMoreText() && !inStream.EndOfStream)
{
line = await inStream.ReadLineAsync();
var readChars = await inStream.ReadBlockAsync(buffer, 0, bufferSize);
line = new string(buffer, 0, readChars);
}
var row = machine.ParseLine(line, inStream.EndOfStream);
var row = machine.ParseChunk(line, inStream.EndOfStream);
if (row != null)
{
yield return row;
Expand Down
6 changes: 6 additions & 0 deletions src/CSVSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,12 @@ public class CSVSettings
/// </summary>
public bool IgnoreEmptyLineForDeserialization { get; set; }

/// <summary>
/// When reading data from a stream, this is the block size, in characters, to read at once.
/// Defaults to <see cref="DEFAULT_BUFFER_SIZE"/>.
/// </summary>
/// <remarks>
/// NOTE(review): no validation of this value is visible here; confirm how zero or negative
/// values are handled by the stream parsing functions that allocate a buffer of this size.
/// </remarks>
public int BufferSize { get; set; } = DEFAULT_BUFFER_SIZE;
/// <summary>
/// The default value of <see cref="BufferSize"/> (65536 characters).
/// </summary>
internal static readonly int DEFAULT_BUFFER_SIZE = 65536;

/// <summary>
/// The encoding for converting streams of bytes to strings
/// </summary>
Expand Down
58 changes: 36 additions & 22 deletions src/CSVStateMachine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,15 @@ public class CSVStateMachine
/// </summary>
public CSVState State { get; private set; }

/// <summary>
/// Indicates whether the state machine has no unprocessed text left and therefore
/// needs another chunk before it can continue parsing.
/// </summary>
/// <returns>
/// True if the buffered text is null or empty, or if the current position is at or
/// beyond the end of the buffered text; otherwise false.
/// </returns>
public bool NeedsMoreText()
{
return String.IsNullOrEmpty(_line) || _position >= _line.Length;
}

/// <summary>
/// Constructs a new state machine to begin processing CSV text
/// </summary>
Expand All @@ -78,24 +87,6 @@ public CSVStateMachine(CSVSettings settings)
State = CSVState.CanKeepGoing;
}

/// <summary>
/// Parse a single line when read from a stream.
///
/// Call this function when you are using the "ReadLine" or "ReadLineAsync" functions so that
/// each line will obey the CSV Settings rules for line separators.
/// </summary>
/// <param name="line"></param>
/// <param name="reachedEnd"></param>
/// <returns></returns>
public string[] ParseLine(string line, bool reachedEnd)
{
if (!string.IsNullOrEmpty(line))
{
line += _settings.LineSeparator;
}
return ParseChunk(line, reachedEnd);
}

/// <summary>
/// Parse a new chunk of text retrieved via some other means than a stream.
///
Expand All @@ -108,12 +99,18 @@ public string[] ParseLine(string line, bool reachedEnd)
public string[] ParseChunk(string chunk, bool reachedEnd)
{
// Detect end of stream
if (reachedEnd && string.IsNullOrEmpty(chunk) && _position == -1)
if (reachedEnd && string.IsNullOrEmpty(chunk) && _position == -1 && string.IsNullOrEmpty(_line))
{
State = CSVState.Done;
return null;
}

// If we're at the end of the line, remember to backtrack one because we increment immediately
if (_position == _line.Length)
{
_position -= 1;
}

// Add this chunk to the current processing logic
_line += chunk;

Expand Down Expand Up @@ -199,10 +196,22 @@ public string[] ParseChunk(string chunk, bool reachedEnd)
_position--;
}
// Are we at a line separator? Let's do a quick test first
else if (c == _settings.LineSeparator[0] && _position + _settings.LineSeparator.Length <= _line.Length)
else if (c == _settings.LineSeparator[0])
{
if (string.Equals(_line.Substring(_position, _settings.LineSeparator.Length),
_settings.LineSeparator))
// If we don't have enough characters left to test the line separator properly, ask for more
var notEnoughChars = _position + _settings.LineSeparator.Length > _line.Length;
if (notEnoughChars && !reachedEnd)
{
return null;
}

// If we have reached the end, but this isn't a complete line separator, it's just text
if (notEnoughChars && reachedEnd)
{
_work.Append(c);
}
// OK, we have enough characters, see if this is a line separator
else if (string.Equals(_line.Substring(_position, _settings.LineSeparator.Length), _settings.LineSeparator))
{
_line = _line.Substring(_position + _settings.LineSeparator.Length);
_position = -1;
Expand All @@ -212,6 +221,11 @@ public string[] ParseChunk(string chunk, bool reachedEnd)
_work.Length = 0;
return row;
}
// It's not a line separator, it's just a normal character
else
{
_work.Append(c);
}
}
// Does this start a new field?
else if (c == _delimiter)
Expand Down
8 changes: 5 additions & 3 deletions tests/AsyncReaderTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ public async Task TestBasicReader()
// Skip header row
var settings = new CSVSettings()
{
HeaderRowIncluded = false
HeaderRowIncluded = false,
LineSeparator = "\n",
};

// Convert into stream
Expand Down Expand Up @@ -88,7 +89,8 @@ public async Task TestDanglingFields()
// Skip header row
var settings = new CSVSettings()
{
HeaderRowIncluded = false
HeaderRowIncluded = false,
LineSeparator = "\n",
};

// Convert into stream
Expand Down Expand Up @@ -156,7 +158,7 @@ public async Task TestAlternateDelimiterQualifiers()
"Dr. Kelso\tChief of Medicine\tx100";

// Convert into stream
var settings = new CSVSettings() { HeaderRowIncluded = true, FieldDelimiter = '\t' };
var settings = new CSVSettings() { HeaderRowIncluded = true, FieldDelimiter = '\t', LineSeparator = "\n" };
using (var cr = CSVReader.FromString(source, settings))
{
Assert.AreEqual("Name", cr.Headers[0]);
Expand Down
19 changes: 19 additions & 0 deletions tests/BasicParseTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -236,5 +236,24 @@ public void TestIssue53()
Assert.AreEqual("Normal", line3[9]);
Assert.AreEqual("", line3[10]);
}

// Verifies that CR, LF, and CRLF sequences inside text-qualified fields are returned
// verbatim as part of the field value, and do not split or alter the fields.
[Test]
public void TestMultipleNewlines()
{
// Specific issue reported by domdere
// A qualified field containing three consecutive CRLF pairs must come back unchanged
var line1 = CSV.ParseLine("\"test\",\"blah\r\n\r\n\r\nfoo\",\"Normal\"");
Assert.AreEqual("test", line1[0]);
Assert.AreEqual("blah\r\n\r\n\r\nfoo", line1[1]);
Assert.AreEqual("Normal", line1[2]);

// Test a few potential use cases here
// Covers fields made up only of LF characters, only of CRLF pairs, and only of CR
// characters, alongside an ordinary field and an empty qualified field
var line2 = CSV.ParseLine("\"test\",\"\n\n\",\"\r\n\r\n\r\n\",\"Normal\",\"\",\"\r\r\r\r\r\"");
Assert.AreEqual("test", line2[0]);
Assert.AreEqual("\n\n", line2[1]);
Assert.AreEqual("\r\n\r\n\r\n", line2[2]);
Assert.AreEqual("Normal", line2[3]);
Assert.AreEqual("", line2[4]);
Assert.AreEqual("\r\r\r\r\r", line2[5]);
}
}
}
1 change: 0 additions & 1 deletion tests/ChopTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ public void DataTableChopTest()
Assert.AreEqual(list[i].email, results[i].email);
}
}
// Clean up
finally
{
if (Directory.Exists(dirname))
Expand Down
14 changes: 11 additions & 3 deletions tests/DataTableReaderTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,11 @@ public class DataTableReaderTest
[Test]
public void TestBasicDataTable()
{
var dt = CSVDataTable.FromString(source);
var settings = new CSVSettings()
{
LineSeparator = "\n",
};
var dt = CSVDataTable.FromString(source, settings);
Assert.AreEqual(3, dt.Columns.Count);
Assert.AreEqual(4, dt.Rows.Count);
Assert.AreEqual("JD", dt.Rows[0].ItemArray[0]);
Expand All @@ -53,12 +57,16 @@ public void TestBasicDataTable()
[Test]
public void TestDataTableWithEmbeddedNewlines()
{
var dt = CSVDataTable.FromString(source_embedded_newlines);
var settings = new CSVSettings()
{
LineSeparator = "\n",
};
var dt = CSVDataTable.FromString(source_embedded_newlines, settings);
Assert.AreEqual(3, dt.Columns.Count);
Assert.AreEqual(4, dt.Rows.Count);
Assert.AreEqual("JD", dt.Rows[0].ItemArray[0]);
Assert.AreEqual("Janitor", dt.Rows[1].ItemArray[0]);
Assert.AreEqual("Dr. Reed, " + Environment.NewLine + "Eliot", dt.Rows[2].ItemArray[0]);
Assert.AreEqual("Dr. Reed, \nEliot", dt.Rows[2].ItemArray[0]);
Assert.AreEqual("Dr. Kelso", dt.Rows[3].ItemArray[0]);
Assert.AreEqual("Doctor", dt.Rows[0].ItemArray[1]);
Assert.AreEqual("Janitor", dt.Rows[1].ItemArray[1]);
Expand Down
Loading

0 comments on commit ff2b191

Please sign in to comment.