Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Sources/swift-csv/AsyncCodableCSVIterator.swift
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,11 @@ public struct AsyncCodableCSVIterator<T: Decodable, Encoding: _UnicodeEncoding>:
skipInvalidRows: Bool = false,
delimiter: Character = ",",
escapeCharacter: Character = "\"",
ignoreLeadingWhitespace: Bool = false,
encoding: Encoding.Type = UTF8.self,
booleanDecodingBehavior: BooleanDecodingBehavior = .disabled
) async throws {
let iterator = try await AsyncRawCSVIterator(url: url, hasHeaders: hasHeaders, skipInvalidRows: skipInvalidRows, delimiter: delimiter, escapeCharacter: escapeCharacter, encoding: encoding)
let iterator = try await AsyncRawCSVIterator(url: url, hasHeaders: hasHeaders, skipInvalidRows: skipInvalidRows, delimiter: delimiter, escapeCharacter: escapeCharacter, ignoreLeadingWhitespace: ignoreLeadingWhitespace, encoding: encoding)

self.iterator = iterator
self.headers = iterator.headers
Expand Down
3 changes: 2 additions & 1 deletion Sources/swift-csv/AsyncRawAsDictCSVIterator.swift
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@ public struct AsyncRawAsDictCSVIterator<Encoding: _UnicodeEncoding>: AsyncIterat
skipInvalidRows: Bool = false,
delimiter: Character = ",",
escapeCharacter: Character = "\"",
ignoreLeadingWhitespace: Bool = false,
encoding: Encoding.Type = UTF8.self
) async throws {
let iterator = try await AsyncRawCSVIterator(url: url, hasHeaders: true, skipInvalidRows: skipInvalidRows, delimiter: delimiter, escapeCharacter: escapeCharacter, encoding: encoding)
let iterator = try await AsyncRawCSVIterator(url: url, hasHeaders: true, skipInvalidRows: skipInvalidRows, delimiter: delimiter, escapeCharacter: escapeCharacter, ignoreLeadingWhitespace: ignoreLeadingWhitespace, encoding: encoding)

self.iterator = iterator
self.headers = iterator.headers!
Expand Down
47 changes: 40 additions & 7 deletions Sources/swift-csv/AsyncRawCSVIterator.swift
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,11 @@ import Foundation
/// A CSV iterator can lazily parse a CSV file. The whole file is not loaded into memory. Instead, it is parsed when the data is requested. If the data is not stored outside the iterator, the file can be parsed without using a lot of memory. The iterator can parse local and remote data.
public struct AsyncRawCSVIterator<Encoding: _UnicodeEncoding>: AsyncIteratorProtocol where Encoding.CodeUnit == UInt8 {
public typealias Element = [String]

#if os(Linux)
var iterator: LinuxAsyncFileBytes.Iterator
#else
var iterator: URL.AsyncBytes.AsyncIterator
#endif

@usableFromInline
var pieces: [String] = []
Expand All @@ -26,10 +29,12 @@ public struct AsyncRawCSVIterator<Encoding: _UnicodeEncoding>: AsyncIteratorProt
let skipInvalidRows: Bool
let delimiter: UInt8
let escapeCharacter: UInt8
let ignoreLeadingWhitespace: Bool
private var isAtStartOfFile: Bool = true

/// Create a new CSV iterator for the given URL.
/// - Parameters:
/// - url: The CSV source. This can be a URL to a local or remote file.
/// - url: The CSV source. On Linux, this MUST be a file URL (`URL.isFileURL`). On all other systems, this can be a URL to a local or remote file.
/// - as: The type to decode to.
/// - hasHeaders: Mark whether the CSV file has a header. If true, the header will be used to check if each row has a valid length. If false, the first row length will be used instead.
/// - skipInvalidRows: If enabled, no errors will be thrown for rows that have an incorrect amount of columns.
Expand All @@ -42,9 +47,14 @@ public struct AsyncRawCSVIterator<Encoding: _UnicodeEncoding>: AsyncIteratorProt
skipInvalidRows: Bool = false,
delimiter: Character = ",",
escapeCharacter: Character = "\"",
ignoreLeadingWhitespace: Bool = false,
encoding: Encoding.Type = UTF8.self
) async throws {
#if os(Linux)
let iterator = try LinuxAsyncFileBytes(url: url, bufferSize: 64 * 1024).makeAsyncIterator()
#else
let iterator = url.resourceBytes.makeAsyncIterator()
#endif

self.skipInvalidRows = skipInvalidRows
self.iterator = iterator
Expand All @@ -59,6 +69,7 @@ public struct AsyncRawCSVIterator<Encoding: _UnicodeEncoding>: AsyncIteratorProt

self.delimiter = delimiter
self.escapeCharacter = escapeCharacter
self.ignoreLeadingWhitespace = ignoreLeadingWhitespace

if hasHeaders {
_ = try await readLine()
Expand Down Expand Up @@ -93,6 +104,7 @@ public struct AsyncRawCSVIterator<Encoding: _UnicodeEncoding>: AsyncIteratorProt
@usableFromInline
mutating func readLine() async throws -> Bool {
var isEscaped = false
var hasDelimeter = false

var startIndex: Int = 0

Expand All @@ -110,31 +122,52 @@ public struct AsyncRawCSVIterator<Encoding: _UnicodeEncoding>: AsyncIteratorProt
}

case delimiter where !isEscaped: // comma
pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self))
hasDelimeter = true
appendCurrentPiece(from: startIndex)
bytes.removeAll(keepingCapacity: true)
startIndex = 0


case 10 where !isEscaped: // line feed
pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self))
appendCurrentPiece(from: startIndex)
return true

case 13 where !isEscaped: // carriage return
_ = try await iterator.next()
pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self))
appendCurrentPiece(from: startIndex)
return true

case 32 where ignoreLeadingWhitespace && !isEscaped && bytes.isEmpty: // leading space
continue
case 9 where ignoreLeadingWhitespace && !isEscaped && bytes.isEmpty: // leading tab
continue

default:
bytes.append(value)
}
}

if !bytes.isEmpty {
pieces.append(String(decoding: bytes[startIndex...], as: Encoding.self))
if !bytes.isEmpty || hasDelimeter {
appendCurrentPiece(from: startIndex)
}

return !pieces.isEmpty
}

/// Decodes the field currently accumulated in `bytes` and appends it to `pieces`.
///
/// The very first field of the file is checked for a UTF-8 byte-order mark (`EF BB BF`);
/// when present, the mark is removed before `startIndex` is applied.
///
/// - Parameter startIndex: Offset of the first byte to decode, counted from the start of
///   the field after any byte-order mark has been removed.
mutating private func appendCurrentPiece(from startIndex: Int) {
    // A slice view lets the mark be skipped and the offset applied without building temporary arrays.
    var remaining = bytes[...]

    if isAtStartOfFile {
        isAtStartOfFile = false

        let byteOrderMark: [UInt8] = [0xEF, 0xBB, 0xBF]
        if remaining.starts(with: byteOrderMark) {
            remaining = remaining.dropFirst(byteOrderMark.count)
        }
    }

    // Slices keep their parent's indices, so the offset is relative to the slice's own start.
    let firstByte = remaining.startIndex + startIndex
    pieces.append(String(decoding: remaining[firstByte...], as: Encoding.self))
}
}

extension AsyncRawCSVIterator: AsyncSequence {
Expand Down
78 changes: 78 additions & 0 deletions Sources/swift-csv/LinuxAsyncFileBytes.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#if os(Linux)
import Foundation
import Glibc

/// A byte-by-byte `AsyncSequence` over an already-open file descriptor.
///
/// Bytes are fetched from the descriptor with `read` in chunks of up to `bufferSize`
/// and handed out one at a time. Every iterator created from this sequence uses the same
/// descriptor and closes it when it reaches end of file or hits a read error.
struct LinuxAsyncFileBytes: AsyncSequence {
    typealias Element = UInt8

    /// The open file descriptor that bytes are read from.
    let fd: Int32
    /// Maximum number of bytes requested from the descriptor per `read` call.
    let bufferSize: Int

    /// Iterator that buffers one chunk at a time and yields its bytes in order.
    struct Iterator: AsyncIteratorProtocol {
        let fd: Int32
        let bufferSize: Int
        /// Bytes of the most recently read chunk.
        var buffer: [UInt8] = []
        /// Position in `buffer` of the next byte to return.
        var index: Int = 0
        /// Set once the descriptor has been closed; afterwards `next()` always returns `nil`.
        var didClose = false

        /// Returns the next byte of the file, or `nil` at end of file.
        ///
        /// - Throws: `POSIXError` built from the `errno` of the failed `read`
        ///   (`.EIO` if that value has no matching code).
        mutating func next() async throws -> UInt8? {
            guard !didClose else {
                return nil
            }

            if index < buffer.count {
                // Return unconsumed buffered bytes if they exist.
                let byte = buffer[index]
                index += 1
                return byte
            }

            // No buffered bytes are left: read the next chunk.
            // Locals keep the closure below from touching `self` while `buffer` is being mutated.
            let descriptor = fd
            let capacity = bufferSize
            buffer = [UInt8](repeating: 0, count: capacity)

            var count = 0
            var failure: Int32 = 0
            repeat {
                count = buffer.withUnsafeMutableBytes { rawBuffer in
                    read(descriptor, rawBuffer.baseAddress, capacity)
                }
                // Capture errno right away: the later close() may overwrite it.
                failure = count < 0 ? errno : 0
            } while failure == EINTR // an interrupted read is not a failure; try again

            if count > 0 {
                // Trim the buffer down to the bytes that were actually read.
                buffer.removeLast(buffer.count - count)
                index = 1
                return buffer[0]
            }

            // End of file or error: drop the scratch bytes and release the descriptor.
            buffer.removeAll(keepingCapacity: true)
            index = 0
            closeIfNeeded()

            if count < 0 {
                throw POSIXError(POSIXError.Code(rawValue: failure) ?? .EIO)
            }
            return nil // eof
        }

        /// Closes the descriptor the first time it is called; later calls do nothing.
        private mutating func closeIfNeeded() {
            guard !didClose else { return }
            close(fd)
            didClose = true
        }
    }

    /// Creates an iterator that reads from this sequence's descriptor.
    func makeAsyncIterator() -> Iterator {
        Iterator(fd: fd, bufferSize: bufferSize)
    }
}

extension LinuxAsyncFileBytes {
    /// Opens the file at `url` read-only and prepares it for chunked reading.
    ///
    /// - Parameters:
    ///   - url: Location of the file. Must be a file URL (`URL.isFileURL`).
    ///   - bufferSize: Number of bytes to request per `read` call. Must be greater than zero.
    /// - Throws: `POSIXError` with code `.EINVAL` when `url` is not a file URL or `bufferSize`
    ///   is not positive; otherwise a `POSIXError` built from the `errno` of the failed `open`
    ///   (`.EIO` if that value has no matching code).
    init(url: URL, bufferSize: Int) throws {
        // Invalid arguments are reported as errors: this initializer already throws,
        // so callers can handle them instead of the process being aborted.
        guard url.isFileURL, bufferSize > 0 else {
            throw POSIXError(.EINVAL)
        }

        let descriptor = open(url.path, O_RDONLY)
        guard descriptor >= 0 else {
            throw POSIXError(POSIXError.Code(rawValue: errno) ?? .EIO)
        }

        self.fd = descriptor
        self.bufferSize = bufferSize
    }
}
#endif
2 changes: 1 addition & 1 deletion Tests/swift-csvTests/swift_csvTests.swift
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import XCTest
@testable import swift_csv
@testable import SwiftCSV

final class swift_csvTests: XCTestCase {
func testExample() throws {
Expand Down