diff --git a/Sources/swift-csv/AsyncCodableCSVIterator.swift b/Sources/swift-csv/AsyncCodableCSVIterator.swift index 527668c..f211fae 100644 --- a/Sources/swift-csv/AsyncCodableCSVIterator.swift +++ b/Sources/swift-csv/AsyncCodableCSVIterator.swift @@ -42,10 +42,11 @@ public struct AsyncCodableCSVIterator: skipInvalidRows: Bool = false, delimiter: Character = ",", escapeCharacter: Character = "\"", + ignoreLeadingWhitespace: Bool = false, encoding: Encoding.Type = UTF8.self, booleanDecodingBehavior: BooleanDecodingBehavior = .disabled ) async throws { - let iterator = try await AsyncRawCSVIterator(url: url, hasHeaders: hasHeaders, skipInvalidRows: skipInvalidRows, delimiter: delimiter, escapeCharacter: escapeCharacter, encoding: encoding) + let iterator = try await AsyncRawCSVIterator(url: url, hasHeaders: hasHeaders, skipInvalidRows: skipInvalidRows, delimiter: delimiter, escapeCharacter: escapeCharacter, ignoreLeadingWhitespace: ignoreLeadingWhitespace, encoding: encoding) self.iterator = iterator self.headers = iterator.headers diff --git a/Sources/swift-csv/AsyncRawAsDictCSVIterator.swift b/Sources/swift-csv/AsyncRawAsDictCSVIterator.swift index 98f111f..8e3cbbb 100644 --- a/Sources/swift-csv/AsyncRawAsDictCSVIterator.swift +++ b/Sources/swift-csv/AsyncRawAsDictCSVIterator.swift @@ -28,9 +28,10 @@ public struct AsyncRawAsDictCSVIterator: AsyncIterat skipInvalidRows: Bool = false, delimiter: Character = ",", escapeCharacter: Character = "\"", + ignoreLeadingWhitespace: Bool = false, encoding: Encoding.Type = UTF8.self ) async throws { - let iterator = try await AsyncRawCSVIterator(url: url, hasHeaders: true, skipInvalidRows: skipInvalidRows, delimiter: delimiter, escapeCharacter: escapeCharacter, encoding: encoding) + let iterator = try await AsyncRawCSVIterator(url: url, hasHeaders: true, skipInvalidRows: skipInvalidRows, delimiter: delimiter, escapeCharacter: escapeCharacter, ignoreLeadingWhitespace: ignoreLeadingWhitespace, encoding: encoding) self.iterator = 
iterator self.headers = iterator.headers! diff --git a/Sources/swift-csv/AsyncRawCSVIterator.swift b/Sources/swift-csv/AsyncRawCSVIterator.swift old mode 100644 new mode 100755 index cd0550d..b1d61b5 --- a/Sources/swift-csv/AsyncRawCSVIterator.swift +++ b/Sources/swift-csv/AsyncRawCSVIterator.swift @@ -10,8 +10,11 @@ import Foundation /// A CSV iterator can lazily parse a CSV file. The whole file is not loaded into memory. Instead, it is parsed when the data is requested. If the data is not stored outside the iterator, the file can be parsed without using a lot of memory. The iterator can parse local and remote data. public struct AsyncRawCSVIterator: AsyncIteratorProtocol where Encoding.CodeUnit == UInt8 { public typealias Element = [String] - + #if os(Linux) + var iterator: LinuxAsyncFileBytes.Iterator + #else var iterator: URL.AsyncBytes.AsyncIterator + #endif @usableFromInline var pieces: [String] = [] @@ -26,10 +29,12 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt let skipInvalidRows: Bool let delimiter: UInt8 let escapeCharacter: UInt8 + let ignoreLeadingWhitespace: Bool + private var isAtStartOfFile: Bool = true /// Create a new CSV iterator for the given URL. /// - Parameters: - /// - url: The CSV source. This can be a URL to a local or remote file. + /// - url: The CSV source. On Linux, this MUST be a file URL (`URL.isFileURL`). On all other systems, this can be a URL to a local or remote file. /// - as: The type to decode to. /// - hasHeaders: Mark whether the CSV file has a header. If true, the header will be used to check if each row has a valid length. If false, the first row length will be used instead. /// - skipInvalidRows: If enabled, no errors will be thrown for rows that have an incorrect amount of columns. 
@@ -42,9 +47,14 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt skipInvalidRows: Bool = false, delimiter: Character = ",", escapeCharacter: Character = "\"", + ignoreLeadingWhitespace: Bool = false, encoding: Encoding.Type = UTF8.self ) async throws { + #if os(Linux) + let iterator = try LinuxAsyncFileBytes(url: url, bufferSize: 64 * 1024).makeAsyncIterator() + #else let iterator = url.resourceBytes.makeAsyncIterator() + #endif self.skipInvalidRows = skipInvalidRows self.iterator = iterator @@ -59,6 +69,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt self.delimiter = delimiter self.escapeCharacter = escapeCharacter + self.ignoreLeadingWhitespace = ignoreLeadingWhitespace if hasHeaders { _ = try await readLine() @@ -93,6 +104,7 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt @usableFromInline mutating func readLine() async throws -> Bool { var isEscaped = false + var hasDelimeter = false var startIndex: Int = 0 @@ -110,31 +122,52 @@ public struct AsyncRawCSVIterator: AsyncIteratorProt } case delimiter where !isEscaped: // comma - pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self)) + hasDelimeter = true + appendCurrentPiece(from: startIndex) bytes.removeAll(keepingCapacity: true) startIndex = 0 case 10 where !isEscaped: // line feed - pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self)) + appendCurrentPiece(from: startIndex) return true case 13 where !isEscaped: // carriage return _ = try await iterator.next() - pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self)) + appendCurrentPiece(from: startIndex) return true + case 32 where ignoreLeadingWhitespace && !isEscaped && bytes.isEmpty: // leading space + continue + case 9 where ignoreLeadingWhitespace && !isEscaped && bytes.isEmpty: // leading tab + continue + default: bytes.append(value) } } - if !bytes.isEmpty { - pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self)) + if !bytes.isEmpty || hasDelimeter { + 
#if os(Linux)
import Foundation
import Glibc

/// A byte-by-byte `AsyncSequence` over a local file, backed by a POSIX file descriptor.
///
/// - Important: The descriptor is closed by an iterator once it reaches end-of-file or a read fails.
///   Every iterator made from the same sequence shares that descriptor (and therefore its file offset),
///   and an iterator that is dropped before it is exhausted leaves the descriptor open.
struct LinuxAsyncFileBytes: AsyncSequence {
    typealias Element = UInt8

    /// The open, read-only file descriptor the bytes are read from.
    let fd: Int32
    /// The number of bytes requested from the file per `read` call.
    let bufferSize: Int

    /// Hands out the file's bytes one at a time, refilling an in-memory buffer whenever it runs dry.
    struct Iterator: AsyncIteratorProtocol {
        let fd: Int32
        let bufferSize: Int
        /// The bytes delivered by the most recent `read`; `buffer.count` is the number of valid bytes.
        var buffer: [UInt8] = []
        /// The position in `buffer` of the next byte to return.
        var index: Int = 0
        /// Set once the descriptor has been closed; every later call to `next()` returns `nil`.
        var didClose = false

        /// Returns the next byte of the file, or `nil` once the end of the file has been reached.
        ///
        /// - Throws: `POSIXError` carrying the `errno` of a failed `read`. The descriptor is closed before throwing.
        mutating func next() async throws -> UInt8? {
            guard !didClose else {
                return nil
            }

            if index >= buffer.count {
                // No unconsumed bytes are left, so fetch the next batch.
                try refill()

                guard !buffer.isEmpty else {
                    // A successful read of zero bytes means end-of-file.
                    closeIfNeeded()
                    return nil
                }
            }

            defer { index += 1 }
            return buffer[index]
        }

        /// Replaces `buffer` with the next batch of bytes from the file and rewinds `index`.
        ///
        /// The buffer is left empty when the end of the file has been reached.
        private mutating func refill() throws {
            let descriptor = fd
            // A zero-byte request would make `read` return 0, which is indistinguishable from end-of-file.
            let capacity = max(bufferSize, 1)
            var failure: Int32?

            // `read` overwrites the storage it reports as filled, so zero-filling it first is unnecessary.
            buffer = [UInt8](unsafeUninitializedCapacity: capacity) { storage, initializedCount in
                while true {
                    let count = read(descriptor, storage.baseAddress, capacity)

                    if count >= 0 {
                        initializedCount = count
                        return
                    }

                    // Capture errno right away; later calls such as `close` may overwrite it.
                    let code = errno

                    // A read interrupted by a signal transferred nothing and can simply be retried.
                    if code != EINTR {
                        failure = code
                        return
                    }
                }
            }
            index = 0

            if let code = failure {
                closeIfNeeded()
                throw POSIXError(POSIXError.Code(rawValue: code) ?? .EIO)
            }
        }

        /// Closes the descriptor unless this iterator has already closed it.
        private mutating func closeIfNeeded() {
            guard !didClose else { return }
            close(fd)
            didClose = true
        }
    }

    func makeAsyncIterator() -> Iterator {
        Iterator(fd: fd, bufferSize: bufferSize)
    }
}

extension LinuxAsyncFileBytes {
    /// Opens the file at `url` for reading.
    ///
    /// - Parameters:
    ///   - url: The file to read. Must be a `file://` URL.
    ///   - bufferSize: The number of bytes to request per `read` call. Values below 1 are treated as 1.
    /// - Throws: `POSIXError(.EINVAL)` if `url` is not a file URL, or a `POSIXError` carrying the `errno`
    ///   reported by `open` if the file cannot be opened.
    init(url: URL, bufferSize: Int) throws {
        // Report an unsupported URL as an error: this initializer already throws, and the URL is caller-supplied input.
        guard url.isFileURL else {
            throw POSIXError(.EINVAL)
        }

        let fd = open(url.path, O_RDONLY)
        guard fd >= 0 else {
            throw POSIXError(POSIXError.Code(rawValue: errno) ?? .EIO)
        }

        self.fd = fd
        self.bufferSize = bufferSize
    }
}
#endif