Skip to content

Commit 42b7f90

Browse files
committed
Fix concatKeepingSchema turning ColumnGroups into DataColumns
Fixes #1763
1 parent 33e2e48 commit 42b7f90

3 files changed

Lines changed: 28 additions & 5 deletions

File tree

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReadingImpl.kt

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -60,10 +60,14 @@ import org.jetbrains.kotlinx.dataframe.api.NullabilityException
6060
import org.jetbrains.kotlinx.dataframe.api.NullabilityOptions
6161
import org.jetbrains.kotlinx.dataframe.api.applyNullability
6262
import org.jetbrains.kotlinx.dataframe.api.cast
63+
import org.jetbrains.kotlinx.dataframe.api.count
6364
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
6465
import org.jetbrains.kotlinx.dataframe.api.emptyDataFrame
6566
import org.jetbrains.kotlinx.dataframe.api.getColumn
67+
import org.jetbrains.kotlinx.dataframe.api.getColumnsWithPaths
68+
import org.jetbrains.kotlinx.dataframe.api.isColumnGroup
6669
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
70+
import org.jetbrains.kotlinx.dataframe.api.toDataFrameFromPairs
6771
import org.jetbrains.kotlinx.dataframe.impl.asList
6872
import java.io.File
6973
import java.math.BigDecimal
@@ -89,13 +93,16 @@ internal fun <T> Iterable<DataFrame<T>>.concatKeepingSchema(): DataFrame<T> {
8993
1 -> return dataFrames[0]
9094
}
9195

92-
val columnNames = dataFrames.first().columnNames()
96+
val columnPaths = dataFrames.first()
97+
.getColumnsWithPaths { colsAtAnyDepth().filter { !it.isColumnGroup() } }
98+
.map { it.path }
9399

94-
val columns = columnNames.map { name ->
95-
val values = dataFrames.flatMap { it.getColumn(name).values() }
96-
DataColumn.createValueColumn(name, values, dataFrames.first().getColumn(name).type())
100+
val totalRows = dataFrames.sumOf { it.count() }
101+
val columns = columnPaths.map { path ->
102+
val values = dataFrames.flatMapTo(ArrayList(totalRows)) { it.getColumn(path).values() }
103+
path to DataColumn.createValueColumn(path.name(), values, dataFrames.first().getColumn(path).type())
97104
}
98-
return dataFrameOf(columns).cast()
105+
return columns.toDataFrameFromPairs()
99106
}
100107

101108
private fun BitVector.values(range: IntRange): List<Boolean?> = range.map { getObject(it) }

dataframe-arrow/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowKtTest.kt

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,10 @@
11
package org.jetbrains.kotlinx.dataframe.io
22

3+
import io.kotest.assertions.asClue
34
import io.kotest.assertions.throwables.shouldThrow
45
import io.kotest.matchers.collections.shouldContain
56
import io.kotest.matchers.shouldBe
7+
import io.kotest.matchers.types.shouldBeInstanceOf
68
import kotlinx.datetime.LocalDate
79
import kotlinx.datetime.LocalDateTime
810
import kotlinx.datetime.UtcOffset
@@ -37,7 +39,9 @@ import org.jetbrains.kotlinx.dataframe.api.convertToBoolean
3739
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
3840
import org.jetbrains.kotlinx.dataframe.api.map
3941
import org.jetbrains.kotlinx.dataframe.api.pathOf
42+
import org.jetbrains.kotlinx.dataframe.api.print
4043
import org.jetbrains.kotlinx.dataframe.api.remove
44+
import org.jetbrains.kotlinx.dataframe.api.schema
4145
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
4246
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
4347
import org.junit.Assert
@@ -77,6 +81,18 @@ internal class ArrowKtTest {
7781
df shouldBe expected
7882
}
7983

84+
@Test
85+
fun testReadingMultipleBatches() {
86+
val df = DataFrame.readArrowFeather(testArrowFeather("multiple_batches_concat"))
87+
df.schema().print()
88+
df.schema().asClue {
89+
df["id"].type() shouldBe typeOf<Int>()
90+
val person = df["person"].shouldBeInstanceOf<ColumnGroup<*>>()
91+
person["name"].type() shouldBe typeOf<String>()
92+
person["age"].type() shouldBe typeOf<Int>()
93+
}
94+
}
95+
8096
@Test
8197
fun testReadingAllTypesAsEstimated() {
8298
assertEstimations(
Binary file not shown.

0 commit comments

Comments
 (0)