Skip to content

Commit c4c42a6

Browse files
committed
Enforce trim characters to not contain delimiter scalars
1 parent 511096c commit c4c42a6

7 files changed

Lines changed: 132 additions & 70 deletions

File tree

README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,13 +142,17 @@ A `CSVReader` parses CSV data from a given input (`String`, or `Data`, or file)
142142

143143
CSV fields are separated within a row with _field delimiters_ (commonly a "comma"). CSV rows are separated through _row delimiters_ (commonly a "line feed"). You can specify any unicode scalar, `String` value, or `nil` for unknown delimiters.
144144

145+
- `escapingStrategy` (default: `.doubleQuote`) specifies the Unicode scalar used to escape fields.
146+
147+
CSV fields can be escaped in case they contain privileged characters, such as field/row delimiters. Commonly the escaping character is a double quote (i.e. `"`); by setting this configuration value you can change it (e.g. to a single quote) or disable the escaping functionality.
148+
145149
- `headerStrategy` (default: `.none`) indicates whether the CSV data has a header row or not.
146150
147151
CSV files may contain an optional header row at the very beginning. This configuration value lets you specify whether the file has a header row or not, or whether you want the library to figure it out.
148152
149153
- `trimStrategy` (default: empty set) trims the given characters at the beginning and end of each parsed field.
150154
151-
The trim characters are applied for the escaped and unescaped fields.
155+
The trim characters are applied for the escaped and unescaped fields. The set cannot include any of the delimiter characters or the escaping scalar. If it does, an error will be thrown during initialization.
152156
153157
- `presample` (default: `false`) indicates whether the CSV data should be completely loaded into memory before parsing begins.
154158
@@ -236,6 +240,10 @@ A `CSVWriter` encodes CSV information into a specified target (i.e. a `String`,
236240
237241
CSV fields are separated within a row with _field delimiters_ (commonly a "comma"). CSV rows are separated through _row delimiters_ (commonly a "line feed"). You can specify any unicode scalar, `String` value, or `nil` for unknown delimiters.
238242
243+
- `escapingStrategy` (default: `.doubleQuote`) specifies the Unicode scalar used to escape fields.
244+
245+
CSV fields can be escaped in case they contain privileged characters, such as field/row delimiters. Commonly the escaping character is a double quote (i.e. `"`); by setting this configuration value you can change it (e.g. to a single quote) or disable the escaping functionality.
246+
239247
- `headers` (default: `[]`) indicates whether the CSV data has a header row or not.
240248

241249
CSV files may contain an optional header row at the very beginning. If this configuration value is empty, no header row is written.

Sources/Active/Reader/Reader.swift

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,9 @@ extension CSVReader {
133133

134134
extension CSVReader {
135135
/// Creates the lookup dictionary from the headers row.
136+
///
137+
/// Although it is officially allowed that two CSV headers have the same value, this method will throw an error if that is the case.
138+
/// - throws: `CSVError<CSVReader>` exclusively.
136139
internal func makeHeaderLookup() throws -> [Int:Int] {
137140
var result: [Int:Int] = .init(minimumCapacity: self.headers.count)
138141
for (index, header) in self.headers.enumerated() {
@@ -162,13 +165,11 @@ extension CSVReader {
162165
case false: result.append(""); break loop
163166
}
164167
}
165-
166168
// 3. Check for characters to trim before a field is parsed.
167169
if !self.settings.trimCharacters.isEmpty, self.settings.trimCharacters.contains(scalar) {
168170
continue loop
169171
}
170-
171-
// 4. If the unicode scalar retrieved is a double quote, an escaped field is awaiting for parsing.
172+
// 4. If the unicode scalar retrieved is the escaping scalar, an escaped field is awaiting parsing.
172173
if let escapingScalar = self.settings.escapingScalar, scalar == escapingScalar {
173174
let field = try self.parseEscapedField(rowIndex: rowIndex, escaping: escapingScalar)
174175
result.append(field.value)
@@ -200,26 +201,30 @@ extension CSVReader {
200201
var field: String.UnicodeScalarView = .init(repeating: starting, count: 1)
201202
var reachedRowsEnd = false
202203

204+
// 1. This loop continues parsing an unescaped field until the field end is reached.
203205
fieldLoop: while true {
204-
// Try to retrieve an scalar (if not, it is the EOF).
205-
guard let scalar = try self.buffer.next() ?? self.decoder() else { reachedRowsEnd = true; break fieldLoop }
206-
// There cannot be double quotes on unescaped fields. If one is encountered, an error is thrown.
206+
// 2. Try to retrieve a scalar (if not, it is the EOF).
207+
guard let scalar = try self.buffer.next() ?? self.decoder() else {
208+
reachedRowsEnd = true
209+
break fieldLoop
210+
}
211+
// 3. An escaping scalar cannot appear on unescaped fields. If one is encountered, an error is thrown.
207212
if scalar == self.settings.escapingScalar {
208213
throw Error.invalidUnescapedField(rowIndex: rowIndex)
209-
// If the field delimiter is encountered, return the already parsed characters.
214+
// 4. If the field delimiter is encountered, return the already parsed characters.
210215
} else if try self.isFieldDelimiter(scalar) {
211216
reachedRowsEnd = false
212217
break fieldLoop
213-
// If the row delimiter is encountered, return the already parsed characters.
218+
// 5. If the row delimiter is encountered, return the already parsed characters.
214219
} else if try self.isRowDelimiter(scalar) {
215220
reachedRowsEnd = true
216221
break fieldLoop
217-
// If it is a regular unicode scalar, just store it and continue parsing.
222+
// 6. If it is a regular unicode scalar, just store it and continue parsing.
218223
} else {
219224
field.append(scalar)
220225
}
221226
}
222-
227+
// 7. Once the end has been reached, a field look-back (starting from the end) is performed to check if there are trim characters.
223228
if !self.settings.trimCharacters.isEmpty {
224229
while let lastScalar = field.last, self.settings.trimCharacters.contains(lastScalar) {
225230
field.removeLast()
@@ -242,13 +247,24 @@ extension CSVReader {
242247

243248
fieldLoop: while true {
244249
// 1. Retrieve a scalar (if not there, it means EOF). This case is not allowed without closing the escaping field first.
245-
guard let scalar = try self.buffer.next() ?? self.decoder() else { throw Error.invalidEOF(rowIndex: rowIndex) }
246-
// 2. If the retrieved scalar is not a quote (i.e. "), just store it and continue parsing.
247-
guard scalar == escapingScalar else { field.append(scalar); continue fieldLoop }
248-
// 3. If the retrieved scalar was a quote, retrieve the following scalar and check if it is EOF. If so, the field has finished and also the row and the file.
249-
guard var followingScalar = try self.buffer.next() ?? self.decoder() else { reachedRowsEnd = true; break fieldLoop }
250-
// 4. If the second retrieved scalar is another quote, the data is escaping a single quote scalar (quotes are escaped with other quotes).
251-
guard followingScalar != escapingScalar else { field.append(escapingScalar); continue fieldLoop }
250+
guard let scalar = try self.buffer.next() ?? self.decoder() else {
251+
throw Error.invalidEOF(rowIndex: rowIndex)
252+
}
253+
// 2. If the retrieved scalar is not the escaping scalar, just store it and continue parsing.
254+
guard scalar == escapingScalar else {
255+
field.append(scalar)
256+
continue fieldLoop
257+
}
258+
// 3. If the retrieved scalar was an escaping scalar, retrieve the following scalar and check if it is EOF. If so, the field has finished and also the row and the file.
259+
guard var followingScalar = try self.buffer.next() ?? self.decoder() else {
260+
reachedRowsEnd = true
261+
break fieldLoop
262+
}
263+
// 4. If the second retrieved scalar is another escaping scalar, the data is escaping the escaping scalar.
264+
guard followingScalar != escapingScalar else {
265+
field.append(escapingScalar)
266+
continue fieldLoop
267+
}
252268
// 5. Once this point is reached, the field has been properly escaped.
253269
if !self.settings.trimCharacters.isEmpty {
254270
// 6. Trim any character after the quote if necessary.

Sources/Active/Reader/ReaderConfiguration.swift

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@ extension CSVReader {
55
public struct Configuration {
66
/// The field and row delimiters.
77
public var delimiters: Delimiter.Pair
8+
/// The strategy to allow/disable escaped fields and how.
9+
public var escapingStrategy: Strategy.Escaping
810
/// Indication on whether the CSV will contain a header row or not, or that information is unknown and it should try to be inferred.
911
public var headerStrategy: Strategy.Header
1012
/// Trims the given characters at the beginning and end of each row, and between fields.
1113
public var trimStrategry: CharacterSet
12-
/// The strategy for escaping quoted fields.
13-
public var escapingStrategy: Strategy.Escaping
1414
/// The encoding used to identify the underlying data or `nil` if you want the CSV reader to try to figure it out.
1515
///
1616
/// If no encoding is provided and the input data doesn't contain a Byte Order Marker (BOM), UTF8 is presumed.
@@ -24,9 +24,9 @@ extension CSVReader {
2424
/// Designated initializer setting the default values.
2525
public init() {
2626
self.delimiters = (field: ",", row: "\n")
27+
self.escapingStrategy = .doubleQuote
2728
self.headerStrategy = .none
2829
self.trimStrategry = .init()
29-
self.escapingStrategy = .doubleQuote
3030
self.encoding = nil
3131
self.presample = false
3232
}
@@ -38,10 +38,10 @@ extension CSVReader {
3838
internal struct Settings {
3939
/// The unicode scalar delimiters for fields and rows.
4040
let delimiters: Delimiter.RawPair
41-
/// The characters set to be trimmed at the beginning and ending of each field.
42-
let trimCharacters: CharacterSet
4341
/// The unicode scalar used as encapsulator and escaping character (when printed two times).
4442
let escapingScalar: Unicode.Scalar?
43+
/// The characters set to be trimmed at the beginning and ending of each field.
44+
let trimCharacters: CharacterSet
4545

4646
/// Creates the immutable reader settings from the user provided configuration values.
4747
/// - parameter configuration: The configuration values provided by the API user.
@@ -62,13 +62,21 @@ extension CSVReader {
6262
case (let delimiter, _):
6363
throw Error.invalidDelimiters(delimiter)
6464
}
65-
// 2. Set the trim characters set.
66-
self.trimCharacters = configuration.trimStrategry
67-
// 3. Set the escaping scalar.
65+
// 2. Set the escaping scalar.
6866
self.escapingScalar = configuration.escapingStrategy.scalar
69-
// 4. Ensure trim character set does not include escaping scalar
70-
if let escapingScalar = escapingScalar, trimCharacters.contains(escapingScalar) {
71-
throw Error.invalidTrimCharacter(escapingScalar: escapingScalar, trimCharacters: trimCharacters)
67+
// 3. Set the trim characters set.
68+
self.trimCharacters = configuration.trimStrategry
69+
// 4. Ensure trim character set doesn't contain the field delimiter.
70+
guard delimiters.field.allSatisfy({ !self.trimCharacters.contains($0) }) else {
71+
throw Error.invalidTrimCharacters(self.trimCharacters, delimiter: configuration.delimiters.field.rawValue)
72+
}
73+
// 5. Ensure trim character set doesn't contain the row delimiter.
74+
guard delimiters.row.allSatisfy({ !self.trimCharacters.contains($0) }) else {
75+
throw Error.invalidTrimCharacters(self.trimCharacters, delimiter: configuration.delimiters.row.rawValue)
76+
}
77+
// 6. Ensure trim character set does not include escaping scalar
78+
if let escapingScalar = self.escapingScalar, self.trimCharacters.contains(escapingScalar) {
79+
throw Error.invalidTrimCharacters(self.trimCharacters, escapingScalar: escapingScalar)
7280
}
7381
}
7482
}
@@ -83,10 +91,19 @@ fileprivate extension CSVReader.Error {
8391
help: "Set different delimiters for field and rows.",
8492
userInfo: ["Delimiter": delimiter])
8593
}
86-
87-
static func invalidTrimCharacter(escapingScalar: Unicode.Scalar, trimCharacters: CharacterSet) -> CSVError<CSVReader> {
94+
/// Error raised when a delimiter (whether row or field) is included in the trim character set.
95+
static func invalidTrimCharacters(_ trimCharacters: CharacterSet, delimiter: String.UnicodeScalarView) -> CSVError<CSVReader> {
96+
.init(.invalidConfiguration,
97+
reason: "The trim character set includes delimiter characters.",
98+
help: "Remove the delimiter scalars from the trim character set.",
99+
userInfo: ["Delimiter": delimiter, "Trim characters": trimCharacters])
100+
}
101+
/// Error raised when the escaping scalar has been included in the trim character set.
102+
/// - parameter escapingScalar: The selected escaping scalar.
103+
/// - parameter trimCharacters: The character set selected for trimming.
104+
static func invalidTrimCharacters(_ trimCharacters: CharacterSet, escapingScalar: Unicode.Scalar) -> CSVError<CSVReader> {
88105
.init(.invalidConfiguration,
89-
reason: "The trim characters set can not include the escaping scalar.",
106+
reason: "The trim characters set includes the escaping scalar.",
90107
help: "Remove the escaping scalar from the trim characters set.",
91108
userInfo: ["Escaping scalar": escapingScalar, "Trim characters": trimCharacters])
92109
}

Sources/Active/Writer/Writer.swift

Lines changed: 44 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -177,42 +177,59 @@ extension CSVWriter {
177177
/// - parameter field: The field to be checked for characters to escape and subsequently written.
178178
/// - throws: `CSVError<CSVWriter>` exclusively.
179179
private func lowlevelWrite(field: String) throws {
180-
let escapingScalar = self.settings.escapingScalar
181180
var result: [Unicode.Scalar]
182181

182+
// 1. If the field is empty, just write two escaping scalars.
183183
if field.isEmpty {
184-
if let escapingScalar = escapingScalar {
185-
result = .init(repeating: escapingScalar, count: 2)
186-
} else {
187-
result = []
184+
switch self.settings.escapingScalar {
185+
case let s?: result = .init(repeating: s, count: 2)
186+
case .none: result = .init()
188187
}
188+
// 2. If the field contains characters...
189189
} else {
190190
let input: [Unicode.Scalar] = .init(field.unicodeScalars)
191191
result = .init()
192-
result.reserveCapacity(input.count + 2)
193-
var index = 0
194-
var needsEscaping: Unicode.Scalar?
192+
// 3. Reserve space for all field scalars plus a bit more in case escaping is needed.
193+
result.reserveCapacity(input.count + 3)
195194

196-
while index < input.endIndex {
197-
let scalar = input[index]
198-
199-
if scalar == escapingScalar {
200-
needsEscaping = scalar
201-
} else if self.isFieldDelimiter(input, &index, &result) || self.isRowDelimiter(input, &index, &result) {
202-
needsEscaping = scalar
203-
continue
195+
// 4.A. If escaping is allowed.
196+
if let escapingScalar = self.settings.escapingScalar {
197+
var (index, needsEscaping) = (0, false)
198+
// 5. Iterate through all the input's Unicode scalars.
199+
while index < input.endIndex {
200+
let scalar = input[index]
201+
// 6. If the escaping character appears, the field needs escaping, but also the escaping character is duplicated.
202+
if scalar == escapingScalar {
203+
needsEscaping = true
204+
result.append(escapingScalar)
205+
// 7. If there is a field or row delimiter, the field needs escaping.
206+
} else if self.isFieldDelimiter(input, &index, &result) || self.isRowDelimiter(input, &index, &result) {
207+
needsEscaping = true
208+
continue
209+
}
210+
211+
result.append(scalar)
212+
index += 1
204213
}
205214

206-
index += 1
207-
result.append(scalar)
208-
}
209-
210-
if let needsEscaping = needsEscaping {
211-
guard let escapingScalar = escapingScalar else {
212-
throw Error.unescapedDelimiter(needsEscaping)
215+
// 8. If the field needed escaping, insert the escaping scalar at the beginning and end of the field.
216+
if needsEscaping {
217+
result.insert(escapingScalar, at: result.startIndex)
218+
result.append(escapingScalar)
219+
}
220+
// 4.B. If escaping is not allowed.
221+
} else {
222+
var index = 0
223+
// 5. Iterate through all the input's Unicode scalars.
224+
while index < input.endIndex {
225+
// 6. If the input data contains a delimiter, throw an error.
226+
guard !self.isFieldDelimiter(input, &index, &result), !self.isRowDelimiter(input, &index, &result) else {
227+
throw Error.invalidPriviledgeCharacter(on: field)
228+
}
229+
230+
result.append(input[index])
231+
index += 1
213232
}
214-
result.insert(escapingScalar, at: result.startIndex)
215-
result.append(escapingScalar)
216233
}
217234
}
218235

@@ -228,11 +245,11 @@ extension CSVWriter {
228245
}
229246

230247
fileprivate extension CSVWriter.Error {
231-
static func unescapedDelimiter(_ delimiter: Unicode.Scalar) -> CSVError<CSVWriter> {
248+
static func invalidPriviledgeCharacter(on field: String) -> CSVError<CSVWriter> {
232249
.init(.invalidInput,
233250
reason: "A field cannot include a delimiter if escaping strategy is disabled.",
234251
help: "Remove delimiter from field or set an escaping strategy.",
235-
userInfo: ["Invalid character": delimiter])
252+
userInfo: ["Field": field])
236253

237254
}
238255
/// Error raised when a field is being written and it overflows the expected number of fields per row.

Sources/Active/Writer/WriterConfiguration.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ extension CSVWriter {
33
public struct Configuration {
44
/// The field and row delimiters.
55
public var delimiters: Delimiter.Pair
6-
/// The strategy for escaping quoted fields.
6+
/// The strategy to allow/disable escaped fields and how.
77
public var escapingStrategy: Strategy.Escaping
88
/// The row of headers to write at the beginning of the CSV data.
99
///
@@ -53,10 +53,10 @@ extension CSVWriter {
5353
internal struct Settings {
5454
/// The unicode scalar delimiters for fields and rows.
5555
let delimiters: Delimiter.RawPair
56-
/// Boolean indicating whether the received CSV contains a header row or not.
57-
let headers: [String]
5856
/// The unicode scalar used as encapsulator and escaping character (when printed two times).
5957
let escapingScalar: Unicode.Scalar?
58+
/// Boolean indicating whether the received CSV contains a header row or not.
59+
let headers: [String]
6060
/// The encoding used to identify the underlying data.
6161
let encoding: String.Encoding
6262

0 commit comments

Comments
 (0)