From 06cb0e9b538e5f16fc1b6803706c363fbee3d50e Mon Sep 17 00:00:00 2001 From: Kona Farry Date: Fri, 16 Jan 2026 11:29:34 -0800 Subject: [PATCH 01/10] fix import in test --- Tests/swift-csvTests/swift_csvTests.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/swift-csvTests/swift_csvTests.swift b/Tests/swift-csvTests/swift_csvTests.swift index 66ce56c..c77f1f4 100644 --- a/Tests/swift-csvTests/swift_csvTests.swift +++ b/Tests/swift-csvTests/swift_csvTests.swift @@ -1,5 +1,5 @@ import XCTest -@testable import swift_csv +@testable import SwiftCSV final class swift_csvTests: XCTestCase { func testExample() throws { From dd829ee824cc165784db2da9e820cd13f5aca487 Mon Sep 17 00:00:00 2001 From: Kona Farry Date: Fri, 16 Jan 2026 11:37:10 -0800 Subject: [PATCH 02/10] support for reading local files on Linux --- Sources/swift-csv/AsyncRawCSVIterator.swift | 11 +++- Sources/swift-csv/LinuxAsyncFileBytes.swift | 62 +++++++++++++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) mode change 100644 => 100755 Sources/swift-csv/AsyncRawCSVIterator.swift create mode 100755 Sources/swift-csv/LinuxAsyncFileBytes.swift diff --git a/Sources/swift-csv/AsyncRawCSVIterator.swift b/Sources/swift-csv/AsyncRawCSVIterator.swift old mode 100644 new mode 100755 index cd0550d..fd076d8 --- a/Sources/swift-csv/AsyncRawCSVIterator.swift +++ b/Sources/swift-csv/AsyncRawCSVIterator.swift @@ -10,8 +10,11 @@ import Foundation /// A CSV iterator can lazily parse a CSV file. The whole file is not loaded into memory. Instead, it is parsed when the data is requested. If the data is not stored outside the iterator, the file can be parsed without using a lot of memory. The iterator can parse local and remote data. public struct AsyncRawCSVIterator: AsyncIteratorProtocol where Encoding.CodeUnit == UInt8 { public typealias Element = [String] - + #if os(Linux) + var iterator: LinuxAsyncFileBytes.Iterator + #else var iterator: URL.AsyncBytes.AsyncIterator + #endif @usableFromInline var pieces: [String] = [] @@ -29,7 +32,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt /// Create a new CSV iterator for the given URL. /// - Parameters: - /// - url: The CSV source. This can be a URL to a local or remote file. + /// - url: The CSV source. On Linux, this MUST be a file URL (`URL.isFileURL`). On all other systems, this can be a URL to a local or remote file. /// - as: The type to decode to. /// - hasHeaders: Mark whether the CSV file has a header. If true, the header will be used to check if each row has a valid length. If false, the first row length will be used instead. /// - skipInvalidRows: If enabled, no errors will be thrown for rows that have an incorrect amount of columns. @@ -44,7 +47,11 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt escapeCharacter: Character = "\"", encoding: Encoding.Type = UTF8.self ) async throws { + #if os(Linux) + let iterator = try LinuxAsyncFileBytes(url: url, bufferSize: 64 * 1024).makeAsyncIterator() + #else let iterator = url.resourceBytes.makeAsyncIterator() + #endif self.skipInvalidRows = skipInvalidRows self.iterator = iterator diff --git a/Sources/swift-csv/LinuxAsyncFileBytes.swift b/Sources/swift-csv/LinuxAsyncFileBytes.swift new file mode 100755 index 0000000..230554f --- /dev/null +++ b/Sources/swift-csv/LinuxAsyncFileBytes.swift @@ -0,0 +1,62 @@ +#if os(Linux) +import Foundation +import Glibc + +struct LinuxAsyncFileBytes: AsyncSequence { + typealias Element = UInt8 + + let fd: Int32 + let bufferSize: Int + + struct Iterator: AsyncIteratorProtocol { + let fd: Int32 + let bufferSize: Int + var buffer: [UInt8] = [] + var index: Int = 0 + + mutating func next() async throws -> UInt8? { + if index < buffer.count { + //return unconsumed buffer bytes if the exist + let b = buffer[index] + index += 1 + return b + } + + //if there are no buffered bytes, read the next batch and return first + buffer = [UInt8](repeating: 0, count: bufferSize) + let count = read(fd, &buffer, bufferSize) + + if count > 0 { + buffer.removeLast(buffer.count - count) + index = 1 + return buffer[0] + } else if count == 0 { + close(fd) + return nil //eof + } else { + throw POSIXError(POSIXError.Code(rawValue: errno) ?? .EIO) + } + } + } + + func makeAsyncIterator() -> Iterator { + Iterator(fd: fd, bufferSize: bufferSize) + } +} + +extension LinuxAsyncFileBytes { + init(url: URL, bufferSize: Int) throws { + precondition(url.isFileURL, "URL must be a file:// URL") + + let path = url.path + + let fd = open(path, O_RDONLY) + guard fd >= 0 else { + throw POSIXError(POSIXError.Code(rawValue: errno) ?? .EIO) + } + + self.fd = fd + self.bufferSize = bufferSize + } +} +#endif \ No newline at end of file From f6ad8a296cf5ef1432fb3020abeaba5178364c93 Mon Sep 17 00:00:00 2001 From: Kona Farry Date: Thu, 23 Apr 2026 11:47:09 -0700 Subject: [PATCH 03/10] Handle case with empty last column on last row with no newline after --- Sources/swift-csv/AsyncRawCSVIterator.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Sources/swift-csv/AsyncRawCSVIterator.swift b/Sources/swift-csv/AsyncRawCSVIterator.swift index fd076d8..83ed906 100755 --- a/Sources/swift-csv/AsyncRawCSVIterator.swift +++ b/Sources/swift-csv/AsyncRawCSVIterator.swift @@ -100,6 +100,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt @usableFromInline mutating func readLine() async throws -> Bool { var isEscaped = false + var hasDelimeter = false var startIndex: Int = 0 @@ -117,6 +118,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt } case delimiter where !isEscaped: // comma + hasDelimeter = true pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self)) bytes.removeAll(keepingCapacity: true) startIndex = 0 @@ -136,7 +138,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt } } - if !bytes.isEmpty { + if !bytes.isEmpty || hasDelimeter { pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self)) } From d4f3a99a2904c1338ee2d9c88d7acbf051808b9e Mon Sep 17 00:00:00 2001 From: Kona Farry Date: Sat, 9 May 2026 16:13:40 -0700 Subject: [PATCH 04/10] Add option to ignore leading whitespace (space or tab) --- Sources/swift-csv/AsyncCodableCSVIterator.swift | 3 ++- Sources/swift-csv/AsyncRawAsDictCSVIterator.swift | 3 ++- Sources/swift-csv/AsyncRawCSVIterator.swift | 8 ++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Sources/swift-csv/AsyncCodableCSVIterator.swift b/Sources/swift-csv/AsyncCodableCSVIterator.swift index 527668c..f211fae 100644 --- a/Sources/swift-csv/AsyncCodableCSVIterator.swift +++ b/Sources/swift-csv/AsyncCodableCSVIterator.swift @@ -42,10 +42,11 @@ public struct AsyncCodableCSVIterator: skipInvalidRows: Bool = false, delimiter: Character = ",", escapeCharacter: Character = "\"", + ignoreLeadingWhitespace: Bool = false, encoding: Encoding.Type = UTF8.self, booleanDecodingBehavior: BooleanDecodingBehavior = .disabled ) async throws { - let iterator = try await AsyncRawCSVIterator(url: url, hasHeaders: hasHeaders, skipInvalidRows: skipInvalidRows, delimiter: delimiter, escapeCharacter: escapeCharacter, encoding: encoding) + let iterator = try await AsyncRawCSVIterator(url: url, hasHeaders: hasHeaders, skipInvalidRows: skipInvalidRows, delimiter: delimiter, escapeCharacter: escapeCharacter, ignoreLeadingWhitespace: ignoreLeadingWhitespace, encoding: encoding) self.iterator = iterator self.headers = iterator.headers diff --git a/Sources/swift-csv/AsyncRawAsDictCSVIterator.swift b/Sources/swift-csv/AsyncRawAsDictCSVIterator.swift index 98f111f..8e3cbbb 100644 --- a/Sources/swift-csv/AsyncRawAsDictCSVIterator.swift +++ b/Sources/swift-csv/AsyncRawAsDictCSVIterator.swift @@ -28,9 +28,10 @@ public struct AsyncRawAsDictCSVIterator: AsyncIterat skipInvalidRows: Bool = false, delimiter: Character = ",", escapeCharacter: Character = "\"", + ignoreLeadingWhitespace: Bool = false, encoding: Encoding.Type = UTF8.self ) async throws { - let iterator = try await AsyncRawCSVIterator(url: url, hasHeaders: true, skipInvalidRows: skipInvalidRows, delimiter: delimiter, escapeCharacter: escapeCharacter, encoding: encoding) + let iterator = try await AsyncRawCSVIterator(url: url, hasHeaders: true, skipInvalidRows: skipInvalidRows, delimiter: delimiter, escapeCharacter: escapeCharacter, ignoreLeadingWhitespace: ignoreLeadingWhitespace, encoding: encoding) self.iterator = iterator self.headers = iterator.headers! diff --git a/Sources/swift-csv/AsyncRawCSVIterator.swift b/Sources/swift-csv/AsyncRawCSVIterator.swift index 83ed906..03f89f4 100755 --- a/Sources/swift-csv/AsyncRawCSVIterator.swift +++ b/Sources/swift-csv/AsyncRawCSVIterator.swift @@ -29,6 +29,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt let skipInvalidRows: Bool let delimiter: UInt8 let escapeCharacter: UInt8 + let ignoreLeadingWhitespace: Bool /// Create a new CSV iterator for the given URL. /// - Parameters: @@ -45,6 +46,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt skipInvalidRows: Bool = false, delimiter: Character = ",", escapeCharacter: Character = "\"", + ignoreLeadingWhitespace: Bool = false, encoding: Encoding.Type = UTF8.self ) async throws { #if os(Linux) @@ -66,6 +68,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt self.delimiter = delimiter self.escapeCharacter = escapeCharacter + self.ignoreLeadingWhitespace = ignoreLeadingWhitespace if hasHeaders { _ = try await readLine() @@ -133,6 +136,11 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self)) return true + case 32 where ignoreLeadingWhitespace && !isEscaped && bytes.isEmpty: // leading space + continue + case 9 where ignoreLeadingWhitespace && !isEscaped && bytes.isEmpty: // leading tab + continue + default: bytes.append(value) } From 43e8f5780205fdf0e94f6eaa54670aaa40501ba1 Mon Sep 17 00:00:00 2001 From: Kona Farry Date: Mon, 25 May 2026 20:53:50 -0700 Subject: [PATCH 05/10] Attempt to fix stale open files issue --- Sources/swift-csv/LinuxAsyncFileBytes.swift | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/Sources/swift-csv/LinuxAsyncFileBytes.swift b/Sources/swift-csv/LinuxAsyncFileBytes.swift index 230554f..f5c11da 100755 --- a/Sources/swift-csv/LinuxAsyncFileBytes.swift +++ b/Sources/swift-csv/LinuxAsyncFileBytes.swift @@ -13,6 +13,7 @@ struct LinuxAsyncFileBytes: AsyncSequence { let bufferSize: Int var buffer: [UInt8] = [] var index: Int = 0 + var didClose = false mutating func next() async throws -> UInt8? { if index < buffer.count { @@ -22,6 +23,10 @@ struct LinuxAsyncFileBytes: AsyncSequence { return b } + guard !didClose else { + return nil + } + //if there are no buffered bytes, read the next batch and return first buffer = [UInt8](repeating: 0, count: bufferSize) let count = read(fd, &buffer, bufferSize) @@ -31,12 +36,19 @@ struct LinuxAsyncFileBytes: AsyncSequence { index = 1 return buffer[0] } else if count == 0 { - close(fd) + closeIfNeeded() return nil //eof } else { + closeIfNeeded() throw POSIXError(POSIXError.Code(rawValue: errno) ?? .EIO) } } + + private mutating func closeIfNeeded() { + guard !didClose else { return } + close(fd) + didClose = true + } } func makeAsyncIterator() -> Iterator { From e16bf5142826a375b8f97f5e1d34cd9829586adc Mon Sep 17 00:00:00 2001 From: Kona Farry Date: Mon, 25 May 2026 21:05:40 -0700 Subject: [PATCH 06/10] Change unsafe buffer call --- Sources/swift-csv/LinuxAsyncFileBytes.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Sources/swift-csv/LinuxAsyncFileBytes.swift b/Sources/swift-csv/LinuxAsyncFileBytes.swift index f5c11da..92b8e84 100755 --- a/Sources/swift-csv/LinuxAsyncFileBytes.swift +++ b/Sources/swift-csv/LinuxAsyncFileBytes.swift @@ -29,7 +29,9 @@ struct LinuxAsyncFileBytes: AsyncSequence { //if there are no buffered bytes, read the next batch and return first buffer = [UInt8](repeating: 0, count: bufferSize) - let count = read(fd, &buffer, bufferSize) + let count = buffer.withUnsafeMutableBytes { rawBuffer in + read(fd, rawBuffer.baseAddress, bufferSize) + } if count > 0 { buffer.removeLast(buffer.count - count) From 9e72b4765f31a81eb31faf9ddf00ef610e1648c8 Mon Sep 17 00:00:00 2001 From: Kona Farry Date: Tue, 26 May 2026 12:19:11 -0700 Subject: [PATCH 07/10] Fix null bytes issue from attempted open fix --- Sources/swift-csv/LinuxAsyncFileBytes.swift | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Sources/swift-csv/LinuxAsyncFileBytes.swift b/Sources/swift-csv/LinuxAsyncFileBytes.swift index 92b8e84..285edc1 100755 --- a/Sources/swift-csv/LinuxAsyncFileBytes.swift +++ b/Sources/swift-csv/LinuxAsyncFileBytes.swift @@ -16,6 +16,10 @@ struct LinuxAsyncFileBytes: AsyncSequence { var didClose = false mutating func next() async throws -> UInt8? { + guard !didClose else { + return nil + } + if index < buffer.count { //return unconsumed buffer bytes if the exist let b = buffer[index] @@ -23,10 +27,6 @@ struct LinuxAsyncFileBytes: AsyncSequence { return b } - guard !didClose else { - return nil - } - //if there are no buffered bytes, read the next batch and return first buffer = [UInt8](repeating: 0, count: bufferSize) let count = buffer.withUnsafeMutableBytes { rawBuffer in @@ -38,6 +38,8 @@ struct LinuxAsyncFileBytes: AsyncSequence { index = 1 return buffer[0] } else if count == 0 { + buffer.removeAll(keepingCapacity: true) + index = 0 closeIfNeeded() return nil //eof } else { From f2be8f10fdc103d5fc319b45e062db638329ae53 Mon Sep 17 00:00:00 2001 From: Kona Farry Date: Thu, 4 Jun 2026 10:26:02 -0700 Subject: [PATCH 08/10] Ignore byte-order mark at start of file --- Sources/swift-csv/AsyncRawCSVIterator.swift | 23 +++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/Sources/swift-csv/AsyncRawCSVIterator.swift b/Sources/swift-csv/AsyncRawCSVIterator.swift index 03f89f4..4a7f7cb 100755 --- a/Sources/swift-csv/AsyncRawCSVIterator.swift +++ b/Sources/swift-csv/AsyncRawCSVIterator.swift @@ -30,6 +30,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt let delimiter: UInt8 let escapeCharacter: UInt8 let ignoreLeadingWhitespace: Bool + private var isAtStartOfFile: Bool = false /// Create a new CSV iterator for the given URL. /// - Parameters: @@ -122,18 +123,18 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt case delimiter where !isEscaped: // comma hasDelimeter = true - pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self)) + appendCurrentPiece(from: startIndex) bytes.removeAll(keepingCapacity: true) startIndex = 0 case 10 where !isEscaped: // line feed - pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self)) + appendCurrentPiece(from: startIndex) return true case 13 where !isEscaped: // carriage return _ = try await iterator.next() - pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self)) + appendCurrentPiece(from: startIndex) return true case 32 where ignoreLeadingWhitespace && !isEscaped && bytes.isEmpty: // leading space @@ -147,11 +148,25 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt } if !bytes.isEmpty || hasDelimeter { - pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self)) + appendCurrentPiece(from: startIndex) } return !pieces.isEmpty } + + mutating private func appendCurrentPiece(from startIndex: Int) { + var fieldBytes = Array(bytes[startIndex...]) + + if isAtStartOfFile { + isAtStartOfFile = false + + if fieldBytes.starts(with: [0xEF, 0xBB, 0xBF]) { //byte-order mark + fieldBytes.removeFirst(3) + } + } + + pieces.append(String(decoding: fieldBytes, as: Encoding.self)) + } } extension AsyncRawCSVIterator: AsyncSequence { From cf09f449e41849bc6ae87af915cbabc0795d8418 Mon Sep 17 00:00:00 2001 From: Kona Farry Date: Thu, 4 Jun 2026 10:31:54 -0700 Subject: [PATCH 09/10] Fix boolean init --- Sources/swift-csv/AsyncRawCSVIterator.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/swift-csv/AsyncRawCSVIterator.swift b/Sources/swift-csv/AsyncRawCSVIterator.swift index 4a7f7cb..67b0a76 100755 --- a/Sources/swift-csv/AsyncRawCSVIterator.swift +++ b/Sources/swift-csv/AsyncRawCSVIterator.swift @@ -30,7 +30,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt let delimiter: UInt8 let escapeCharacter: UInt8 let ignoreLeadingWhitespace: Bool - private var isAtStartOfFile: Bool = false + private var isAtStartOfFile: Bool = true /// Create a new CSV iterator for the given URL. /// - Parameters: From c03b4179336dc74c3cfc8d62abc7eb40651562a0 Mon Sep 17 00:00:00 2001 From: Kona Farry Date: Thu, 18 Jun 2026 15:35:31 -0700 Subject: [PATCH 10/10] Fix BOM before quoted header column --- Sources/swift-csv/AsyncRawCSVIterator.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Sources/swift-csv/AsyncRawCSVIterator.swift b/Sources/swift-csv/AsyncRawCSVIterator.swift index 67b0a76..b1d61b5 100755 --- a/Sources/swift-csv/AsyncRawCSVIterator.swift +++ b/Sources/swift-csv/AsyncRawCSVIterator.swift @@ -155,7 +155,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt } mutating private func appendCurrentPiece(from startIndex: Int) { - var fieldBytes = Array(bytes[startIndex...]) + var fieldBytes = bytes if isAtStartOfFile { isAtStartOfFile = false @@ -165,6 +165,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt } } + fieldBytes = Array(fieldBytes[startIndex...]) pieces.append(String(decoding: fieldBytes, as: Encoding.self)) } }