From ff2b19174012a4ac838243c76ca0beb04a985e98 Mon Sep 17 00:00:00 2001 From: Ted Spence Date: Tue, 18 Jul 2023 23:24:18 -0700 Subject: [PATCH] Add more precise tests for CR/LF characters (#60) Fix issues with inconsistent CRLF handling with buffered reading rather than line-by-line reading. Add more tests and automate deployment. --- .github/workflows/nuget-publish.yml | 48 +++++++++++++++++++++ CSVFile.nuspec | 20 ++++----- README.md | 8 ++-- src/CSV.cs | 20 +++++---- src/CSVSettings.cs | 6 +++ src/CSVStateMachine.cs | 58 +++++++++++++++---------- tests/AsyncReaderTest.cs | 8 ++-- tests/BasicParseTests.cs | 19 +++++++++ tests/ChopTest.cs | 1 - tests/DataTableReaderTest.cs | 14 +++++-- tests/ReaderTest.cs | 65 ++++++++++++++++++++++++++--- tests/SerializationTest.cs | 13 ++++-- 12 files changed, 222 insertions(+), 58 deletions(-) create mode 100644 .github/workflows/nuget-publish.yml diff --git a/.github/workflows/nuget-publish.yml b/.github/workflows/nuget-publish.yml new file mode 100644 index 0000000..391a618 --- /dev/null +++ b/.github/workflows/nuget-publish.yml @@ -0,0 +1,48 @@ +name: NuGet Publish + +on: + push: + branches: [ main ] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + name: Update NuGet package + steps: + - name: Checkout repository + uses: actions/checkout@v1 + + - name: Setup .NET Core @ Latest + uses: actions/setup-dotnet@v1 + with: + dotnet-version: | + 5.0.x + 6.0.x + 7.0.x + + - name: Build (Framework 2.0) + run: msbuild ./src/net20/src.net20.csproj /property:Configuration=Release + - name: Build (Framework 4.0) + run: msbuild ./src/net40/src.net40.csproj /property:Configuration=Release + - name: Build (Framework 4.5) + run: msbuild ./src/net45/src.net45.csproj /property:Configuration=Release + - name: Build (DotNetCore 5.0) + run: dotnet build -c Release ./src/net50/src.net50.csproj + - name: Build (NetStandard 2.0) + run: dotnet build -c Release ./src/netstandard20/src.netstandard20.csproj + + - name: Setup Nuget + uses: nuget/setup-nuget@v1 + with: + nuget-api-key: ${{ secrets.NUGET_API_KEY }} + nuget-version: "5.x" + + - name: Run Nuget pack + run: nuget pack CSVFile.nuspec + + - name: Push generated package to GitHub registry + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + run: nuget push *.nupkg -Source 'https://api.nuget.org/v3/index.json' -ApiKey ${{secrets.NUGET_API_KEY}} \ No newline at end of file diff --git a/CSVFile.nuspec b/CSVFile.nuspec index 484f608..86f6ac8 100644 --- a/CSVFile.nuspec +++ b/CSVFile.nuspec @@ -2,20 +2,20 @@ CSVFile - 3.1.1 + 3.1.2 CSVFile Ted Spence Ted Spence docs/LICENSE https://github.com/tspence/csharp-csv-reader false - Tiny and fast CSV and TSV parsing library (40KB) with zero dependencies. Compatible with most dot net versions. - Tiny and fast CSV and TSV parsing library (40KB) with zero dependencies. Compatible with most dot net versions. + Tiny and fast CSV and TSV parsing library (40KB) with zero dependencies. Compatible with DotNetFramework (2.0 onwards) and DotNetCore. + Tiny and fast CSV and TSV parsing library (40KB) with zero dependencies. Compatible with DotNetFramework (2.0 onwards) and DotNetCore. 
docs/icons8-spreadsheet-96.png - March 7, 2023 + July 18, 2023 - * Fix issue when reading a stream with a text qualified field that ends with a newline + * Fix issue with inconsistent handling of embedded newlines in the streaming version of the reader docs/README.md Copyright 2006 - 2023 @@ -33,10 +33,10 @@ - - - - - + + + + + diff --git a/README.md b/README.md index 0817d85..9bf952a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ [![NuGet](https://img.shields.io/nuget/v/CSVFile.svg?style=plastic)](https://www.nuget.org/packages/CSVFile/) -![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/tspence/csharp-csv-reader/dotnet.yml?branch=main) -[![SonarCloud Coverage](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=coverage)](https://sonarcloud.io/component_measures?id=tspence_csharp-csv-reader&metric=coverage&view=list) -[![SonarCloud Bugs](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=bugs)](https://sonarcloud.io/project/issues?resolved=false&types=BUG&id=tspence_csharp-csv-reader) -[![SonarCloud Vulnerabilities](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=vulnerabilities)](https://sonarcloud.io/project/issues?resolved=false&types=VULNERABILITY&id=tspence_csharp-csv-reader) +[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/tspence/csharp-csv-reader/dotnet.yml?branch=main)](https://github.com/tspence/csharp-csv-reader/actions/workflows/dotnet.yml) +[![SonarCloud Coverage](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=coverage)](https://sonarcloud.io/summary/overall?id=tspence_csharp-csv-reader) +[![SonarCloud Bugs](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=bugs)](https://sonarcloud.io/summary/overall?id=tspence_csharp-csv-reader) +[![SonarCloud Vulnerabilities](https://sonarcloud.io/api/project_badges/measure?project=tspence_csharp-csv-reader&metric=vulnerabilities)](https://sonarcloud.io/summary/overall?id=tspence_csharp-csv-reader) # CSVFile This library is a series of unit tested, thoroughly commented CSV parsing functions which I have developed off and on since 2006. Extremely small and easy to implement; includes unit tests for the majority of odd CSV edge cases. Library supports different delimiters, qualifiers, and embedded newlines. Can read and write from data tables. diff --git a/src/CSV.cs b/src/CSV.cs index 5d1eace..cbe28d5 100644 --- a/src/CSV.cs +++ b/src/CSV.cs @@ -56,19 +56,22 @@ public static class CSV /// An enumerable object that can be examined to retrieve rows from the stream. public static IEnumerable ParseStream(StreamReader inStream, CSVSettings settings = null) { + int bufferSize = settings?.BufferSize ?? 
CSVSettings.DEFAULT_BUFFER_SIZE; + var buffer = new char[bufferSize]; var machine = new CSVStateMachine(settings); while (machine.State == CSVState.CanKeepGoing) { var line = string.Empty; - if (!inStream.EndOfStream) + if (machine.NeedsMoreText() && !inStream.EndOfStream) { - line = inStream.ReadLine(); + var readChars = inStream.ReadBlock(buffer, 0, bufferSize); + line = new string(buffer, 0, readChars); } - var row = machine.ParseLine(line, inStream.EndOfStream); + var row = machine.ParseChunk(line, inStream.EndOfStream); if (row != null) { yield return row; - } + } } } @@ -81,15 +84,18 @@ public static IEnumerable ParseStream(StreamReader inStream, CSVSettin /// An enumerable object that can be examined to retrieve rows from the stream. public static async IAsyncEnumerable ParseStreamAsync(StreamReader inStream, CSVSettings settings = null) { + int bufferSize = settings?.BufferSize ?? CSVSettings.DEFAULT_BUFFER_SIZE; + var buffer = new char[bufferSize]; var machine = new CSVStateMachine(settings); while (machine.State == CSVState.CanKeepGoing) { var line = string.Empty; - if (!inStream.EndOfStream) + if (machine.NeedsMoreText() && !inStream.EndOfStream) { - line = await inStream.ReadLineAsync(); + var readChars = await inStream.ReadBlockAsync(buffer, 0, bufferSize); + line = new string(buffer, 0, readChars); } - var row = machine.ParseLine(line, inStream.EndOfStream); + var row = machine.ParseChunk(line, inStream.EndOfStream); if (row != null) { yield return row; diff --git a/src/CSVSettings.cs b/src/CSVSettings.cs index b5624bd..dd350f7 100644 --- a/src/CSVSettings.cs +++ b/src/CSVSettings.cs @@ -117,6 +117,12 @@ public class CSVSettings /// public bool IgnoreEmptyLineForDeserialization { get; set; } + /// + /// When reading data from a stream, this is the block size to read at once. + /// + public int BufferSize { get; set; } = DEFAULT_BUFFER_SIZE; + internal static readonly int DEFAULT_BUFFER_SIZE = 65536; + /// /// The encoding for converting streams of bytes to strings /// diff --git a/src/CSVStateMachine.cs b/src/CSVStateMachine.cs index 4ff29f6..65c0265 100644 --- a/src/CSVStateMachine.cs +++ b/src/CSVStateMachine.cs @@ -59,6 +59,15 @@ public class CSVStateMachine /// public CSVState State { get; private set; } + /// + /// Returns true if we need more text + /// + /// + public bool NeedsMoreText() + { + return String.IsNullOrEmpty(_line) || _position >= _line.Length; + } + /// /// Constructs a new state machine to begin processing CSV text /// @@ -78,24 +87,6 @@ public CSVStateMachine(CSVSettings settings) State = CSVState.CanKeepGoing; } - /// - /// Parse a single line when read from a stream. - /// - /// Call this function when you are using the "ReadLine" or "ReadLineAsync" functions so that - /// each line will obey the CSV Settings rules for line separators. - /// - /// - /// - /// - public string[] ParseLine(string line, bool reachedEnd) - { - if (!string.IsNullOrEmpty(line)) - { - line += _settings.LineSeparator; - } - return ParseChunk(line, reachedEnd); - } - /// /// Parse a new chunk of text retrieved via some other means than a stream. 
/// @@ -108,12 +99,18 @@ public string[] ParseLine(string line, bool reachedEnd) public string[] ParseChunk(string chunk, bool reachedEnd) { // Detect end of stream - if (reachedEnd && string.IsNullOrEmpty(chunk) && _position == -1) + if (reachedEnd && string.IsNullOrEmpty(chunk) && _position == -1 && string.IsNullOrEmpty(_line)) { State = CSVState.Done; return null; } + // If we're at the end of the line, remember to backtrack one because we increment immediately + if (_position == _line.Length) + { + _position -= 1; + } + // Add this chunk to the current processing logic _line += chunk; @@ -199,10 +196,22 @@ public string[] ParseChunk(string chunk, bool reachedEnd) _position--; } // Are we at a line separator? Let's do a quick test first - else if (c == _settings.LineSeparator[0] && _position + _settings.LineSeparator.Length <= _line.Length) + else if (c == _settings.LineSeparator[0]) { - if (string.Equals(_line.Substring(_position, _settings.LineSeparator.Length), - _settings.LineSeparator)) + // If we don't have enough characters left to test the line separator properly, ask for more + var notEnoughChars = _position + _settings.LineSeparator.Length > _line.Length; + if (notEnoughChars && !reachedEnd) + { + return null; + } + + // If we have reached the end, but this isn't a complete line separator, it's just text + if (notEnoughChars && reachedEnd) + { + _work.Append(c); + } + // OK, we have enough characters, see if this is a line separator + else if (string.Equals(_line.Substring(_position, _settings.LineSeparator.Length), _settings.LineSeparator)) { _line = _line.Substring(_position + _settings.LineSeparator.Length); _position = -1; @@ -212,6 +221,11 @@ public string[] ParseChunk(string chunk, bool reachedEnd) _work.Length = 0; return row; } + // It's not a line separator, it's just a normal character + else + { + _work.Append(c); + } } // Does this start a new field? else if (c == _delimiter) diff --git a/tests/AsyncReaderTest.cs b/tests/AsyncReaderTest.cs index 50396fb..7bcfdc0 100644 --- a/tests/AsyncReaderTest.cs +++ b/tests/AsyncReaderTest.cs @@ -27,7 +27,8 @@ public async Task TestBasicReader() // Skip header row var settings = new CSVSettings() { - HeaderRowIncluded = false + HeaderRowIncluded = false, + LineSeparator = "\n", }; // Convert into stream @@ -88,7 +89,8 @@ public async Task TestDanglingFields() // Skip header row var settings = new CSVSettings() { - HeaderRowIncluded = false + HeaderRowIncluded = false, + LineSeparator = "\n", }; // Convert into stream @@ -156,7 +158,7 @@ public async Task TestAlternateDelimiterQualifiers() "Dr. 
Kelso\tChief of Medicine\tx100"; // Convert into stream - var settings = new CSVSettings() { HeaderRowIncluded = true, FieldDelimiter = '\t' }; + var settings = new CSVSettings() { HeaderRowIncluded = true, FieldDelimiter = '\t', LineSeparator = "\n" }; using (var cr = CSVReader.FromString(source, settings)) { Assert.AreEqual("Name", cr.Headers[0]); diff --git a/tests/BasicParseTests.cs b/tests/BasicParseTests.cs index e929b79..39c5eb5 100644 --- a/tests/BasicParseTests.cs +++ b/tests/BasicParseTests.cs @@ -236,5 +236,24 @@ public void TestIssue53() Assert.AreEqual("Normal", line3[9]); Assert.AreEqual("", line3[10]); } + + [Test] + public void TestMultipleNewlines() + { + // Specific issue reported by domdere + var line1 = CSV.ParseLine("\"test\",\"blah\r\n\r\n\r\nfoo\",\"Normal\""); + Assert.AreEqual("test", line1[0]); + Assert.AreEqual("blah\r\n\r\n\r\nfoo", line1[1]); + Assert.AreEqual("Normal", line1[2]); + + // Test a few potential use cases here + var line2 = CSV.ParseLine("\"test\",\"\n\n\",\"\r\n\r\n\r\n\",\"Normal\",\"\",\"\r\r\r\r\r\""); + Assert.AreEqual("test", line2[0]); + Assert.AreEqual("\n\n", line2[1]); + Assert.AreEqual("\r\n\r\n\r\n", line2[2]); + Assert.AreEqual("Normal", line2[3]); + Assert.AreEqual("", line2[4]); + Assert.AreEqual("\r\r\r\r\r", line2[5]); + } } } diff --git a/tests/ChopTest.cs b/tests/ChopTest.cs index 182ca3b..2b5acf1 100644 --- a/tests/ChopTest.cs +++ b/tests/ChopTest.cs @@ -97,7 +97,6 @@ public void DataTableChopTest() Assert.AreEqual(list[i].email, results[i].email); } } - // Clean up finally { if (Directory.Exists(dirname)) diff --git a/tests/DataTableReaderTest.cs b/tests/DataTableReaderTest.cs index ceb6334..e4c5096 100644 --- a/tests/DataTableReaderTest.cs +++ b/tests/DataTableReaderTest.cs @@ -33,7 +33,11 @@ public class DataTableReaderTest [Test] public void TestBasicDataTable() { - var dt = CSVDataTable.FromString(source); + var settings = new CSVSettings() + { + LineSeparator = "\n", + }; + var dt = CSVDataTable.FromString(source, settings); Assert.AreEqual(3, dt.Columns.Count); Assert.AreEqual(4, dt.Rows.Count); Assert.AreEqual("JD", dt.Rows[0].ItemArray[0]); @@ -53,12 +57,16 @@ public void TestBasicDataTable() [Test] public void TestDataTableWithEmbeddedNewlines() { - var dt = CSVDataTable.FromString(source_embedded_newlines); + var settings = new CSVSettings() + { + LineSeparator = "\n", + }; + var dt = CSVDataTable.FromString(source_embedded_newlines, settings); Assert.AreEqual(3, dt.Columns.Count); Assert.AreEqual(4, dt.Rows.Count); Assert.AreEqual("JD", dt.Rows[0].ItemArray[0]); Assert.AreEqual("Janitor", dt.Rows[1].ItemArray[0]); - Assert.AreEqual("Dr. Reed, " + Environment.NewLine + "Eliot", dt.Rows[2].ItemArray[0]); + Assert.AreEqual("Dr. Reed, \nEliot", dt.Rows[2].ItemArray[0]); Assert.AreEqual("Dr. 
Kelso", dt.Rows[3].ItemArray[0]); Assert.AreEqual("Doctor", dt.Rows[0].ItemArray[1]); Assert.AreEqual("Janitor", dt.Rows[1].ItemArray[1]); diff --git a/tests/ReaderTest.cs b/tests/ReaderTest.cs index 5545452..df00d63 100644 --- a/tests/ReaderTest.cs +++ b/tests/ReaderTest.cs @@ -29,7 +29,8 @@ public void TestBasicReader() // Skip header row var settings = new CSVSettings() { - HeaderRowIncluded = false + HeaderRowIncluded = false, + LineSeparator = "\n", }; // Convert into stream @@ -88,7 +89,8 @@ public void TestDanglingFields() // Skip header row var settings = new CSVSettings() { - HeaderRowIncluded = false + HeaderRowIncluded = false, + LineSeparator = "\n", }; // Convert into stream @@ -156,7 +158,7 @@ public void TestAlternateDelimiterQualifiers() "Dr. Kelso\tChief of Medicine\tx100"; // Convert into stream - var settings = new CSVSettings() { HeaderRowIncluded = true, FieldDelimiter = '\t' }; + var settings = new CSVSettings() { HeaderRowIncluded = true, FieldDelimiter = '\t', LineSeparator = "\n", }; using (var cr = CSVReader.FromString(source, settings)) { Assert.AreEqual("Name", cr.Headers[0]); @@ -210,7 +212,7 @@ public void TestSepLineOverride() "Dr. Kelso|Chief of Medicine|x100"; // Convert into stream - var settings = new CSVSettings() { HeaderRowIncluded = true, FieldDelimiter = '\t', AllowSepLine = true }; + var settings = new CSVSettings() { HeaderRowIncluded = true, FieldDelimiter = '\t', AllowSepLine = true, LineSeparator = "\n", }; using (var cr = CSVReader.FromString(source, settings)) { // The field delimiter should have been changed, but the original object should remain the same @@ -281,6 +283,59 @@ public void TestIssue53() } } + [Test] + public void TestMultipleNewlines() + { + var settings = new CSVSettings() + { + HeaderRowIncluded = false, + LineSeparator = "\r\n", + }; + + // This use case was reported by domdere as https://github.com/tspence/csharp-csv-reader/issues/59 + var source = "\"test\",\"blah\r\n\r\n\r\nfoo\",\"Normal\""; + using (var cr = CSVReader.FromString(source, settings)) + { + foreach (var line in cr.Lines()) + { + Assert.AreEqual("test", line[0]); + Assert.AreEqual("blah\r\n\r\n\r\nfoo", line[1]); + Assert.AreEqual("Normal", line[2]); + } + } + + // Test a few potential use cases here + var source2 = "\"test\",\"\n\n\",\"\r\n\r\n\r\n\",\"Normal\",\"\",\"\r\r\r\r\r\""; + using (var cr = CSVReader.FromString(source2, settings)) + { + foreach (var line in cr.Lines()) + { + Assert.AreEqual("test", line[0]); + Assert.AreEqual("\n\n", line[1]); + Assert.AreEqual("\r\n\r\n\r\n", line[2]); + Assert.AreEqual("Normal", line[3]); + Assert.AreEqual("", line[4]); + Assert.AreEqual("\r\r\r\r\r", line[5]); + } + } + + // Test a false single CR within the text + var source3 = "\"test\",\"\n\n\",\"\r\n\r\n\r\n\",\"Normal\",\"\",\"\r\r\r\r\r\",\r\r\r\n"; + using (var cr = CSVReader.FromString(source3, settings)) + { + foreach (var line in cr.Lines()) + { + Assert.AreEqual("test", line[0]); + Assert.AreEqual("\n\n", line[1]); + Assert.AreEqual("\r\n\r\n\r\n", line[2]); + Assert.AreEqual("Normal", line[3]); + Assert.AreEqual("", line[4]); + Assert.AreEqual("\r\r\r\r\r", line[5]); + Assert.AreEqual("\r\r", line[6]); + } + } + } + #if HAS_ASYNC_IENUM [Test] public async Task TestAsyncReader() @@ -296,7 +351,7 @@ public async Task TestAsyncReader() "Dr. 
Kelso|Chief of Medicine|x100"; // Convert into stream - var settings = new CSVSettings() { HeaderRowIncluded = true, FieldDelimiter = '\t', AllowSepLine = true }; + var settings = new CSVSettings() { HeaderRowIncluded = true, FieldDelimiter = '\t', LineSeparator = "\n", AllowSepLine = true }; using (var cr = CSVReader.FromString(source, settings)) { // The field delimiter should have been changed, but the original object should remain the same diff --git a/tests/SerializationTest.cs b/tests/SerializationTest.cs index c13c752..a50ea8f 100644 --- a/tests/SerializationTest.cs +++ b/tests/SerializationTest.cs @@ -141,7 +141,8 @@ public void TestCaseInsensitiveDeserializer() "NULL,7,Fourth"; // Deserialize back from a CSV string - should not throw any errors! - var newList = CSV.Deserialize(csv, CSVSettings.CSV_PERMIT_NULL).ToList(); + var permitNull = new CSVSettings() { AllowNull = true, LineSeparator = "\n", NullToken = "NULL" }; + var newList = CSV.Deserialize(csv, permitNull).ToList(); // Compare original objects to new ones for (var i = 0; i < list.Count; i++) @@ -164,6 +165,7 @@ public void TestNullDeserialization() { AllowNull = true, NullToken = string.Empty, + LineSeparator = "\n", }; // Deserialize back from a CSV string - should not throw any errors! @@ -186,6 +188,7 @@ public void TestNullDeserializationWithToken() { AllowNull = true, NullToken = "SPECIAL_NULL", + LineSeparator = "\n", }; // Deserialize back from a CSV string - should not throw any errors! @@ -223,6 +226,7 @@ public void DeserializeLastColumnNullableSingle() settings.AllowNull = true; settings.NullToken = string.Empty; settings.IgnoreEmptyLineForDeserialization = true; + settings.LineSeparator = "\n"; // Deserialize back from a CSV string - should not throw any errors! var list = CSV.Deserialize(csv, settings).ToList(); @@ -261,6 +265,7 @@ public async Task DeserializeLastColumnNullableSingleAsync() settings.AllowNull = true; settings.NullToken = string.Empty; settings.IgnoreEmptyLineForDeserialization = true; + settings.LineSeparator = "\n"; // Try deserializing using the async version var list = await CSV.DeserializeAsync(csv, settings).ToListAsync(); @@ -306,7 +311,8 @@ public void DeserializeReadOnlyProperty() { AllowNull = true, NullToken = string.Empty, - IgnoreEmptyLineForDeserialization = true + IgnoreEmptyLineForDeserialization = true, + LineSeparator = "\n", }; // Deserialize back from a CSV string - should throw an error because "ReadOnlySingle" is read only @@ -376,7 +382,8 @@ public void DeserializeExcludeColumns() AllowNull = true, NullToken = string.Empty, IgnoreEmptyLineForDeserialization = true, - ExcludedColumns = new []{ "TestString" } + ExcludedColumns = new []{ "TestString" }, + LineSeparator = "\n", }; // Try deserializing - we should see nulls in the TestString column