Skip to content

Commit 67cde82

Browse files
authored
Merge pull request #601 from garydgregory/fix/CSV323_ExtendedBufferedReader_byte_pos
[CSV-323] ExtendedBufferedReader byte tracking leads to an incorrect CSVRecord.getBytePosition()
2 parents add38c3 + 23eb602 commit 67cde82

2 files changed

Lines changed: 60 additions & 34 deletions

File tree

src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java

Lines changed: 42 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -37,26 +37,30 @@
3737
/**
3838
* A special buffered reader which supports sophisticated read access.
3939
* <p>
40-
* In particular the reader supports a look-ahead option, which allows you to see the next char returned by
41-
* {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}.
40+
* In particular the reader supports a look-ahead option, which allows you to see the next char returned by {@link #read()}. This reader also tracks how many
41+
* characters have been read with {@link #getPosition()}.
4242
* </p>
4343
*/
4444
final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
4545

4646
/** The last char returned */
4747
private int lastChar = UNDEFINED;
48+
4849
private int lastCharMark = UNDEFINED;
4950

5051
/** The count of EOLs (CR/LF/CRLF) seen so far */
5152
private long lineNumber;
53+
5254
private long lineNumberMark;
5355

5456
/** The position, which is the number of characters read so far */
5557
private long position;
58+
5659
private long positionMark;
5760

5861
/** The number of bytes read so far. */
5962
private long bytesRead;
63+
6064
private long bytesReadMark;
6165

6266
/** Encoder for calculating the number of bytes for each character read. */
@@ -70,12 +74,11 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
7074
}
7175

7276
/**
73-
* Constructs a new instance with the specified reader, character set,
74-
* and byte tracking option. Initializes an encoder if byte tracking is enabled
75-
* and a character set is provided.
77+
* Constructs a new instance with the specified reader, character set, and byte tracking option. Initializes an encoder if byte tracking is enabled and a
78+
* character set is provided.
7679
*
77-
* @param reader the reader supports a look-ahead option.
78-
* @param charset the character set for encoding, or {@code null} if not applicable.
80+
* @param reader the reader supports a look-ahead option.
81+
* @param charset the character set for encoding, or {@code null} if not applicable.
7982
* @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
8083
*/
8184
ExtendedBufferedReader(final Reader reader, final Charset charset, final boolean trackBytes) {
@@ -86,8 +89,7 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
8689
/**
8790
* Closes the stream.
8891
*
89-
* @throws IOException
90-
* If an I/O error occurs
92+
* @throws IOException If an I/O error occurs
9193
*/
9294
@Override
9395
public void close() throws IOException {
@@ -105,26 +107,33 @@ long getBytesRead() {
105107
return this.bytesRead;
106108
}
107109

110+
private long getEncodedCharLength(final char[] buf, final int offset, final int length) throws CharacterCodingException {
111+
int len = 0;
112+
for (int i = offset; i < length; i++) {
113+
len += getEncodedCharLength(buf[i]);
114+
}
115+
return len;
116+
}
117+
108118
/**
109-
* Gets the byte length of the given character based on the original Unicode
110-
* specification, which defined characters as fixed-width 16-bit entities.
119+
* Gets the byte length of the given character based on the original Unicode specification, which defined characters as fixed-width 16-bit entities.
111120
* <p>
112121
* The Unicode characters are divided into two main ranges:
113122
* <ul>
114-
* <li><strong>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</strong>
115-
* <ul>
116-
* <li>Represented using a single 16-bit {@code char}.</li>
117-
* <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li>
118-
* </ul>
119-
* </li>
120-
* <li><strong>U+10000 to U+10FFFF (Supplementary Characters):</strong>
121-
* <ul>
122-
* <li>Represented as a pair of {@code char}s:</li>
123-
* <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
124-
* <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
125-
* <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li>
126-
* </ul>
127-
* </li>
123+
* <li><strong>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</strong>
124+
* <ul>
125+
* <li>Represented using a single 16-bit {@code char}.</li>
126+
* <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li>
127+
* </ul>
128+
* </li>
129+
* <li><strong>U+10000 to U+10FFFF (Supplementary Characters):</strong>
130+
* <ul>
131+
* <li>Represented as a pair of {@code char}s:</li>
132+
* <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
133+
* <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
134+
* <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li>
135+
* </ul>
136+
* </li>
128137
* </ul>
129138
*
130139
* @param current the current character to process.
@@ -148,10 +157,9 @@ private int getEncodedCharLength(final int current) throws CharacterCodingExcept
148157
}
149158

150159
/**
151-
* Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by
152-
* any of the read methods. This will not include a character read using the {@link #peek()} method. If no
153-
* character has been read then this will return {@link Constants#UNDEFINED}. If the end of the stream was reached
154-
* on the last read then this will return {@link IOUtils#EOF}.
160+
* Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by any of the read methods. This will not
161+
* include a character read using the {@link #peek()} method. If no character has been read then this will return {@link Constants#UNDEFINED}. If the end of
162+
* the stream was reached on the last read then this will return {@link IOUtils#EOF}.
155163
*
156164
* @return the last character that was read
157165
*/
@@ -193,8 +201,7 @@ public void mark(final int readAheadLimit) throws IOException {
193201
@Override
194202
public int read() throws IOException {
195203
final int current = super.read();
196-
if (current == CR || current == LF && lastChar != CR ||
197-
current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
204+
if (current == CR || current == LF && lastChar != CR || current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
198205
lineNumber++;
199206
}
200207
if (encoder != null) {
@@ -226,13 +233,15 @@ public int read(final char[] buf, final int offset, final int length) throws IOE
226233
} else if (len == EOF) {
227234
lastChar = EOF;
228235
}
236+
if (encoder != null) {
237+
this.bytesRead += getEncodedCharLength(buf, offset, len);
238+
}
229239
position += len;
230240
return len;
231241
}
232242

233243
/**
234-
* Gets the next line, dropping the line terminator(s). This method should only be called when processing a
235-
* comment, otherwise, information can be lost.
244+
* Gets the next line, dropping the line terminator(s). This method should only be called when processing a comment, otherwise, information can be lost.
236245
* <p>
237246
* Increments {@link #lineNumber} and updates {@link #position}.
238247
* </p>
@@ -272,5 +281,4 @@ public void reset() throws IOException {
272281
bytesRead = bytesReadMark;
273282
super.reset();
274283
}
275-
276284
}

src/test/java/org/apache/commons/csv/CSVParserTest.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,24 @@ void testForEach() throws Exception {
648648
}
649649
}
650650

651+
@Test
652+
void testGetBytePositionMultiCharacterDelimiter() throws IOException {
653+
final String code = "aa[|]bb\ncc[|]dd\n";
654+
final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter("[|]").get();
655+
try (CSVParser parser = CSVParser.builder()
656+
.setReader(new StringReader(code))
657+
.setFormat(format)
658+
.setCharset(StandardCharsets.UTF_8)
659+
.setTrackBytes(true)
660+
.get()) {
661+
final Iterator<CSVRecord> it = parser.iterator();
662+
final CSVRecord first = it.next();
663+
final CSVRecord second = it.next();
664+
assertEquals(0, first.getBytePosition());
665+
assertEquals(8, second.getBytePosition());
666+
}
667+
}
668+
651669
@Test
652670
void testGetHeaderComment_HeaderComment1() throws IOException {
653671
try (CSVParser parser = CSVParser.parse(CSV_INPUT_HEADER_COMMENT, FORMAT_AUTO_HEADER)) {

0 commit comments

Comments
 (0)