Skip to content

Commit bd832ff

Browse files
authored
Merge pull request #1665 from Kotlin/csv-charsets
Custom Csv charsets support
2 parents f1fcc82 + dfcdb9c commit bd832ff

19 files changed

Lines changed: 829 additions & 20 deletions

File tree

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/common.kt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package org.jetbrains.kotlinx.dataframe.io
22

3+
import org.apache.commons.io.ByteOrderMark
34
import org.apache.commons.io.input.BOMInputStream
45
import org.jetbrains.kotlinx.dataframe.AnyFrame
56
import org.jetbrains.kotlinx.dataframe.DataFrame
@@ -88,5 +89,12 @@ public fun asUrl(fileOrUrl: String): URL =
8889
public fun InputStream.skippingBomCharacters(): InputStream =
8990
BOMInputStream.builder()
9091
.setInputStream(this)
92+
.setByteOrderMarks(
93+
ByteOrderMark.UTF_8,
94+
ByteOrderMark.UTF_16LE,
95+
ByteOrderMark.UTF_16BE,
96+
ByteOrderMark.UTF_32LE,
97+
ByteOrderMark.UTF_32BE,
98+
)
9199
.setInclude(false)
92100
.get()

dataframe-csv/api/dataframe-csv.api

Lines changed: 45 additions & 15 deletions
Large diffs are not rendered by default.

dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentationCsv/CommonReadDelimDocs.kt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
package org.jetbrains.kotlinx.dataframe.documentationCsv
44

55
import org.jetbrains.kotlinx.dataframe.DataFrame
6+
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.CHARSET
67
import org.jetbrains.kotlinx.dataframe.io.ColType
78
import java.io.File
89
import java.io.InputStream
@@ -83,6 +84,7 @@ internal interface CommonReadDelimDocs {
8384
interface DelimDocs
8485

8586
/**
87+
* @include [CHARSET]
8688
* @include [DelimParams.HEADER]
8789
* @include [DelimParams.HAS_FIXED_WIDTH_COLUMNS]
8890
* @include [DelimParams.FIXED_COLUMN_WIDTHS]

dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentationCsv/DelimParams.kt

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import org.jetbrains.kotlinx.dataframe.io.ColType
1111
import org.jetbrains.kotlinx.dataframe.io.Compression
1212
import org.jetbrains.kotlinx.dataframe.io.DefaultNullStringsContentLink
1313
import org.jetbrains.kotlinx.dataframe.io.QuoteMode
14+
import java.nio.charset.Charset
1415

1516
/**
1617
* Contains both the default values of csv/tsv parameters and the parameter KDocs.
@@ -20,29 +21,36 @@ internal object DelimParams {
2021

2122
/**
2223
* @param path The file path to read.
24+
* Use [charset\] to specify the encoding.
2325
* Can also be compressed as `.gz` or `.zip`, see [Compression][Compression].
2426
*/
2527
interface PATH_READ
2628

2729
/**
2830
* @param file The file to read.
31+
* Use [charset\] to specify the encoding.
2932
* Can also be compressed as `.gz` or `.zip`, see [Compression][Compression].
3033
*/
3134
interface FILE_READ
3235

3336
/**
3437
* @param url The URL from which to fetch the data.
38+
* Use [charset\] to specify the encoding.
3539
* Can also be compressed as `.gz` or `.zip`, see [Compression][Compression].
3640
*/
3741
interface URL_READ
3842

3943
/**
4044
* @param fileOrUrl The file path or URL to read the data from.
45+
* Use [charset\] to specify the encoding.
4146
* Can also be compressed as `.gz` or `.zip`, see [Compression][Compression].
4247
*/
4348
interface FILE_OR_URL_READ
4449

45-
/** @param inputStream Represents the file to read. */
50+
/**
51+
* @param inputStream Represents the file to read.
52+
* Use [charset\] to specify the encoding.
53+
*/
4654
interface INPUT_STREAM_READ
4755

4856
/** @param text The raw data to read in the form of a [String]. */
@@ -57,6 +65,15 @@ internal object DelimParams {
5765
/** @param writer The [Appendable] to write to. */
5866
interface WRITER_WRITE
5967

68+
/**
69+
* @param charset The [character set][java.nio.charset.Charset] the input is encoded in.
70+
* Default: `null`
71+
*
72+
* If `null`, the Charset will be read from the BOM of the provided input,
73+
* defaulting to [UTF-8][Charsets.UTF_8] if no BOM is found.
74+
*/
75+
val CHARSET: Charset? = null
76+
6077
/**
6178
* @param delimiter The field delimiter character. Default: ','.
6279
*

dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import io.deephaven.csv.util.CsvReaderException
2222
import kotlinx.datetime.LocalDate
2323
import kotlinx.datetime.LocalDateTime
2424
import kotlinx.datetime.LocalTime
25+
import org.apache.commons.io.input.BOMInputStream
2526
import org.jetbrains.kotlinx.dataframe.DataColumn
2627
import org.jetbrains.kotlinx.dataframe.DataFrame
2728
import org.jetbrains.kotlinx.dataframe.DataRow
@@ -34,6 +35,7 @@ import org.jetbrains.kotlinx.dataframe.api.tryParse
3435
import org.jetbrains.kotlinx.dataframe.columns.ValueColumn
3536
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ADJUST_CSV_SPECS
3637
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ALLOW_MISSING_COLUMNS
38+
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.CHARSET
3739
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COL_TYPES
3840
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COMPRESSION
3941
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.FIXED_COLUMN_WIDTHS
@@ -61,6 +63,7 @@ import java.io.InputStream
6163
import java.math.BigDecimal
6264
import java.math.BigInteger
6365
import java.net.URL
66+
import java.nio.charset.Charset
6467
import kotlin.reflect.KType
6568
import kotlin.reflect.full.withNullability
6669
import kotlin.reflect.typeOf
@@ -72,6 +75,7 @@ import kotlinx.datetime.Instant as DeprecatedInstant
7275
* Implementation to read delimiter-separated data from an [InputStream] based on the Deephaven CSV library.
7376
*
7477
* @include [INPUT_STREAM_READ]
78+
* @include [CHARSET]
7579
* @param delimiter The field delimiter character. The default is ',' for CSV, '\t' for TSV.
7680
* @include [HEADER]
7781
* @include [COL_TYPES]
@@ -94,6 +98,7 @@ internal fun readDelimImpl(
9498
inputStream: InputStream,
9599
delimiter: Char,
96100
header: List<String>,
101+
charset: Charset?,
97102
hasFixedWidthColumns: Boolean,
98103
fixedColumnWidths: List<Int>,
99104
colTypes: Map<String, ColType>,
@@ -139,10 +144,18 @@ internal fun readDelimImpl(
139144
val csvReaderResult = inputStream.useDecompressed(compression) { decompressedInputStream ->
140145
// read the csv
141146
try {
147+
val deBommedInputString = decompressedInputStream.skippingBomCharacters()
148+
149+
// charset precedence: explicitly provided charset -> charset detected from the BOM -> UTF-8 fallback
150+
val streamCharset = charset
151+
?: (deBommedInputString as? BOMInputStream)?.bom?.let { Charset.forName(it.charsetName) }
152+
?: Charsets.UTF_8
153+
142154
@Suppress("ktlint:standard:comment-wrapping")
143155
CsvReader.read(
144156
/* specs = */ csvSpecs,
145-
/* stream = */ decompressedInputStream.skippingBomCharacters(),
157+
/* stream = */ deBommedInputString,
158+
/* streamCharset = */ streamCharset,
146159
/* sinkFactory = */ ListSink.SINK_FACTORY,
147160
)
148161
} catch (e: CsvReaderException) {

dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeDelim.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ internal fun writeDelimImpl(
4848
setCommentMarker(commentChar)
4949
setHeaderComments(*headerComments.toTypedArray())
5050
}.let { adjustCsvFormat(it, it) }
51-
.build()
51+
.get()
5252

5353
// let the format handle the writing, only converting AnyRow and AnyFrame to JSON
5454
format.print(writer).use { printer ->

dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ public class CsvDeephaven(private val delimiter: Char = DelimParams.CSV_DELIMITE
1919
DataFrame.readCsv(file = file, header = header, delimiter = delimiter)
2020

2121
override fun readDataFrame(path: Path, header: List<String>): DataFrame<*> =
22-
DataFrame.readCsv(path = path, header = header, delimiter = delimiter)
22+
DataFrame.readCsv(path = path, delimiter = delimiter, header = header)
2323

2424
override fun acceptsExtension(ext: String): Boolean = ext == "csv"
2525

0 commit comments

Comments
 (0)