From 5b6ba3f967ed98082d3a371d78f718ad48bee616 Mon Sep 17 00:00:00 2001 From: Aleksandr Nikolaev Date: Wed, 27 May 2026 16:45:27 +0200 Subject: [PATCH 01/10] Add `countDistinct` overloads on `GroupBy` and KDocs for them --- core/api/core.api | 4 + .../kotlinx/dataframe/api/countDistinct.kt | 86 +++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/core/api/core.api b/core/api/core.api index bf5b7adf46..dcbf911126 100644 --- a/core/api/core.api +++ b/core/api/core.api @@ -1714,6 +1714,10 @@ public final class org/jetbrains/kotlinx/dataframe/api/CountDistinctKt { public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Ljava/lang/String;)I public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Lkotlin/reflect/KProperty;)I public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Lorg/jetbrains/kotlinx/dataframe/columns/ColumnReference;)I + public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;Lkotlin/jvm/functions/Function2;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static synthetic fun countDistinct$default (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static synthetic fun countDistinct$default (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;Lkotlin/jvm/functions/Function2;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; } public final class org/jetbrains/kotlinx/dataframe/api/CountKt { diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt index f3c407c3ee..99dcdd7f90 100644 --- 
a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt @@ -4,10 +4,18 @@ import org.jetbrains.kotlinx.dataframe.AnyColumnReference import org.jetbrains.kotlinx.dataframe.ColumnsSelector import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload +import org.jetbrains.kotlinx.dataframe.annotations.Interpretable +import org.jetbrains.kotlinx.dataframe.annotations.Refine +import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COLUMNS_PARAM +import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COLUMN_SELECTION_DSL +import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COMPARISON_OBJECT +import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.EXAMPLE +import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.SCOPE import org.jetbrains.kotlinx.dataframe.columns.toColumnSet import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns +import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateValue import org.jetbrains.kotlinx.dataframe.indices import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API import kotlin.reflect.KProperty @@ -93,3 +101,81 @@ public fun DataFrame.countDistinct(vararg columns: AnyColumnReference): I countDistinct { columns.toColumnSet() } // endregion + +// region GroupBy + +/** + * Aggregates this [GroupBy] by counting the number of distinct {@get [COMPARISON_OBJECT] rows} in each group. + * + * Compares rows in each group based on the values in {@get [SCOPE] all} columns. + * Returns a new [DataFrame] where each row corresponds to a group. 
+ * The resulting [DataFrame] contains: + * - the original group key columns, + * - a new column (named [resultName\], default is `"countDistinct"`) + * that contains the number of distinct {@get [COMPARISON_OBJECT] rows} in each group. + * + * See also: + * - [aggregate][Grouped.aggregate], which aggregates a [GroupBy] using the provided statistics. + * - [count][Grouped.count], which counts the number of rows in each group. + * - [distinct][DataFrame.distinct], which removes duplicate rows and returns a new [DataFrame]. + * - [groupBy][DataFrame.groupBy], which groups the rows of a [DataFrame] + * based on the values in one or more specified columns. + * + * For more information: {@include [DocumentationUrls.CountDistinct]} + * + * {@get [COLUMN_SELECTION_DSL]} + * + * ### Example + * ```kotlin + * {@get [EXAMPLE]} + * ``` + * + * @param [resultName\] The name of the result column that will store the number + * of distinct {@get [COMPARISON_OBJECT] rows} in each group. Defaults to `"countDistinct"`. + * @get [COLUMNS_PARAM] + * @return A new [DataFrame] with group keys and corresponding numbers of distinct {@get [COMPARISON_OBJECT] rows}. 
+ */ +@ExcludeFromSources +private interface CountDistinctOnGroupByDocs { + typealias COMPARISON_OBJECT = Nothing + typealias SCOPE = Nothing + typealias EXAMPLE = Nothing + typealias COLUMN_SELECTION_DSL = Nothing + typealias COLUMNS_PARAM = Nothing +} + +/** + * @include [CountDistinctOnGroupByDocs] + * @set [EXAMPLE] + * // Counts the number of distinct rows for each city, returning + * // a new DataFrame with columns "city" and "countDistinct" + * df.groupBy { city }.countDistinct() + */ +@Refine +@Interpretable("GroupByCountDistinct0") +public fun Grouped.countDistinct(resultName: String = "countDistinct"): DataFrame = + countDistinct(resultName) { all() } + +/** + * @include [CountDistinctOnGroupByDocs] + * @set [COMPARISON_OBJECT] combinations of values in the selected [columns] + * @set [SCOPE] the selected + * @set [COLUMN_SELECTION_DSL] {@include [SelectingColumns.ColumnsSelectionDsl]} + * @set [EXAMPLE] + * // Counts unique combinations of values in the "year" and "title" columns + * // for each city, returning a new DataFrame with columns "city" and "countDistinct" + * df.groupBy { city }.countDistinct { year and title } + * @set [COLUMNS_PARAM] @param [columns\] The [ColumnsSelector] used to select columns + * that will be considered for evaluating whether the rows are distinct. 
+ */ +@Refine +@Interpretable("GroupByCountDistinct0") +public fun Grouped.countDistinct( + resultName: String = "countDistinct", + columns: ColumnsSelector, +): DataFrame = + aggregateValue(resultName) { + countDistinct(columns) default 0 + } + +// endregion From 90daaeb932c58a7edd3bf9419578b5f71c7f9216 Mon Sep 17 00:00:00 2001 From: Aleksandr Nikolaev Date: Wed, 27 May 2026 16:46:21 +0200 Subject: [PATCH 02/10] Add tests for `countDistinct` overloads on `GroupBy` --- .../kotlinx/dataframe/api/countDistinct.kt | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt new file mode 100644 index 0000000000..fd83ad0a0f --- /dev/null +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt @@ -0,0 +1,139 @@ +package org.jetbrains.kotlinx.dataframe.api + +import io.kotest.matchers.shouldBe +import org.jetbrains.kotlinx.dataframe.nrow +import org.junit.Test + +class CountDistinctTests { + + private val df = dataFrameOf( + "name" to columnOf("Alice", "Alice", "Bob", "Charlie"), + "age" to columnOf(15, 15, 20, 25), + "group" to columnOf(1, 1, 1, 2), + ) + + @Test + fun `countDistinct on GroupBy`() { + val result = df.groupBy("group").countDistinct() + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "countDistinct" to columnOf(2, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on GroupBy with custom result name`() { + val result = df.groupBy("group").countDistinct("unique") + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "unique" to columnOf(2, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on GroupBy with one unique row`() { + val df = dataFrameOf( + "name" to columnOf("Alice", "Alice", "Alice"), + "age" to columnOf(15, 15, 15), 
+ "group" to columnOf(1, 1, 1), + ) + val result = df.groupBy("group").countDistinct() + val expected = dataFrameOf( + "group" to columnOf(1), + "countDistinct" to columnOf(1), + ) + result shouldBe expected + } + + // TODO: check columns as well when #1531 is fixed + @Test + fun `countDistinct on empty GroupBy`() { + df.drop(df.nrow).groupBy("group").countDistinct().count() shouldBe 0 + } + + @Test + fun `countDistinct on GroupBy with nulls`() { + val result = df.append(null, null, 1).groupBy("group").countDistinct() + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "countDistinct" to columnOf(3, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on GroupBy with null group key`() { + val result = df.append("Dave", 30, null).groupBy("group").countDistinct() + val expected = dataFrameOf( + "group" to columnOf(1, 2, null), + "countDistinct" to columnOf(2, 1, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on GroupBy with columns selector`() { + val result = df.groupBy("group").countDistinct { "name"() } + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "countDistinct" to columnOf(2, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on GroupBy with columns selector (not distinct only by selected column)`() { + val df = dataFrameOf( + "name" to columnOf("Alice", "Bob", "Charlie"), + "age" to columnOf(15, 15, 20), + "group" to columnOf(1, 1, 2), + ) + val result = df.groupBy("group").countDistinct { "age"() } + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "countDistinct" to columnOf(1, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on GroupBy with multiple columns selector`() { + val df = dataFrameOf( + "name" to columnOf("Alice", "Alice", "Bob", "Charlie"), + "age" to columnOf(15, 15, 20, 25), + "group" to columnOf(1, 1, 1, 2), + "city" to columnOf("London", "Moscow", "London", "Paris"), + ) + val result = df.groupBy("group").countDistinct { "name"() 
and "age"() } + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "countDistinct" to columnOf(2, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on grouped DataFrame with columns selector and custom result name`() { + val result = df.groupBy("group").countDistinct(resultName = "unique") { "name"() } + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "unique" to columnOf(2, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on grouped DataFrame with multiple columns selector with nulls`() { + val result = df + .append(null, null, 1) + .groupBy("group") + .countDistinct { "name"() and "age"() } + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "countDistinct" to columnOf(3, 1), + ) + result shouldBe expected + } +} From 0284a7ecf079c6366e77113ab98865d15682c589 Mon Sep 17 00:00:00 2001 From: Aleksandr Nikolaev Date: Wed, 27 May 2026 16:49:41 +0200 Subject: [PATCH 03/10] Update website documentation for the `countDistinct` function because of new overloads on `GroupBy` --- .../kotlinx/dataframe/samples/api/Access.kt | 24 - ...ColumnsCustomNameOnGroupBy_properties.html | 511 +++++++++++++++++ .../countDistinct/countDistinctColumnsDf.html | 516 +++++++++++++++++ .../countDistinctColumnsGroupBy.html | 533 ++++++++++++++++++ ...ntDistinctColumnsOnGroupBy_properties.html | 511 +++++++++++++++++ .../api/countDistinct/countDistinctDf.html | 516 +++++++++++++++++ .../countDistinct/countDistinctGroupBy.html | 533 ++++++++++++++++++ .../countDistinctOnGroupBy_properties.html | 511 +++++++++++++++++ docs/StardustDocs/topics/_shadow_resources.md | 7 + docs/StardustDocs/topics/countDistinct.md | 110 +++- samples/build.gradle.kts | 1 + .../samples/api/CountDistinctSamples.kt | 139 +++++ 12 files changed, 3881 insertions(+), 31 deletions(-) create mode 100644 docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsCustomNameOnGroupBy_properties.html create mode 100644 
docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsDf.html create mode 100644 docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html create mode 100644 docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsOnGroupBy_properties.html create mode 100644 docs/StardustDocs/resources/api/countDistinct/countDistinctDf.html create mode 100644 docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html create mode 100644 docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBy_properties.html create mode 100644 samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt index 9b4ecf5bfc..2ff7dd90d9 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt @@ -431,30 +431,6 @@ class Access : TestBase() { // SampleEnd } - @Test - @TransformDataFrameExpressions - fun countDistinct() { - // SampleStart - df.countDistinct() - // SampleEnd - } - - @Test - @TransformDataFrameExpressions - fun countDistinctColumns_properties() { - // SampleStart - df.countDistinct { age and name } - // SampleEnd - } - - @Test - @TransformDataFrameExpressions - fun countDistinctColumns_strings() { - // SampleStart - df.countDistinct("age", "name") - // SampleEnd - } - @Test @TransformDataFrameExpressions fun distinctColumns_strings() { diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsCustomNameOnGroupBy_properties.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsCustomNameOnGroupBy_properties.html new file mode 100644 index 0000000000..610f3b6c63 --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsCustomNameOnGroupBy_properties.html @@ 
-0,0 +1,511 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsDf.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsDf.html new file mode 100644 index 0000000000..1a5c11b426 --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsDf.html @@ -0,0 +1,516 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html new file mode 100644 index 0000000000..de9c330b40 --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html @@ -0,0 +1,533 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsOnGroupBy_properties.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsOnGroupBy_properties.html new file mode 100644 index 0000000000..30991af112 --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsOnGroupBy_properties.html @@ -0,0 +1,511 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctDf.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctDf.html new file mode 100644 index 0000000000..bf549bf28c --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctDf.html @@ -0,0 +1,516 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html new file mode 100644 index 0000000000..577ae92a09 --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html @@ -0,0 +1,533 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBy_properties.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBy_properties.html new file mode 100644 index 0000000000..0affb1cbcd --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBy_properties.html @@ -0,0 +1,511 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/topics/_shadow_resources.md b/docs/StardustDocs/topics/_shadow_resources.md index c9a1d3b506..6eaa0e1b48 100644 --- a/docs/StardustDocs/topics/_shadow_resources.md +++ b/docs/StardustDocs/topics/_shadow_resources.md @@ -318,3 +318,10 @@ + + + + + + + diff --git a/docs/StardustDocs/topics/countDistinct.md b/docs/StardustDocs/topics/countDistinct.md index 5c2a406761..eb4864e8b8 100644 --- a/docs/StardustDocs/topics/countDistinct.md +++ b/docs/StardustDocs/topics/countDistinct.md @@ -1,33 +1,129 @@ [//]: # (title: countDistinct) - + -Returns number of distinct combinations of values in selected columns of [`DataFrame`](DataFrame.md). + +Counts distinct rows or distinct combinations of values in selected columns. + +When `countDistinct` is used on a [`DataFrame`](DataFrame.md), +it returns the number of distinct rows in this [`DataFrame`](DataFrame.md). + + + +```kotlin +df +``` + + + + + + +```kotlin +df.countDistinct() // the result is 10 +``` + + + +You can also specify which columns to use when counting distinct combinations of values. + + ```kotlin -df.countDistinct { age and name } +df.countDistinct { name.firstName and city } // the result is 9 ``` ```kotlin -df.countDistinct("age", "name") +df.countDistinct { "name"["firstName"] and "city" } // the result is 9 ``` -When `columns` are not specified, returns number of distinct rows in [`DataFrame`](DataFrame.md). +When `countDistinct` is used on a `GroupBy`, it counts distinct rows within each group. +That is, this function returns a [`DataFrame`](DataFrame.md) where each row corresponds to a group +from the original `GroupBy`. The result contains the original group key columns +and a new column with the number of distinct rows (or combinations of values in selected columns) in each group. 
- +Let's take this `GroupBy` as an example: + + + +```kotlin +df.groupBy { isHappy } +``` + + + + +Applying `countDistinct` to this `GroupBy` yields the following result: + + + + + +```kotlin +df.groupBy { isHappy }.countDistinct() +``` + + + ```kotlin -df.countDistinct() +df.groupBy("isHappy").countDistinct() ``` + + + + +You can also specify which columns in the groups should be used to determine distinctness. + + + + + + + +```kotlin +df.groupBy { isHappy }.countDistinct { name.firstName } +``` + + + + +```kotlin +df.groupBy("isHappy").countDistinct { "name"["firstName"] } +``` + + + + + +The default name of the new column is `countDistinct`, but you can choose a different one. + + + + + +```kotlin +df.groupBy { isHappy }.countDistinct("uniqueFirstNames") { name.firstName } +``` + + + + +```kotlin +df.groupBy("isHappy").countDistinct("uniqueFirstNames") { "name"["firstName"] } +``` + + + diff --git a/samples/build.gradle.kts b/samples/build.gradle.kts index 5e3051ad85..7269a5e63b 100644 --- a/samples/build.gradle.kts +++ b/samples/build.gradle.kts @@ -114,6 +114,7 @@ korro { include("columnArithmetics.md") include("groupBy.md") include("pivot.md") + include("countDistinct.md") }, ) baseDir = rootProject.file("docs/StardustDocs/topics") diff --git a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt new file mode 100644 index 0000000000..210e9e074e --- /dev/null +++ b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt @@ -0,0 +1,139 @@ +package org.jetbrains.kotlinx.dataframe.samples.api + +import org.jetbrains.kotlinx.dataframe.api.RgbColor +import org.jetbrains.kotlinx.dataframe.api.and +import org.jetbrains.kotlinx.dataframe.api.convert +import org.jetbrains.kotlinx.dataframe.api.countDistinct +import org.jetbrains.kotlinx.dataframe.api.format +import 
org.jetbrains.kotlinx.dataframe.api.groupBy +import org.jetbrains.kotlinx.dataframe.api.perRowCol +import org.jetbrains.kotlinx.dataframe.api.with +import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper +import org.jetbrains.kotlinx.dataframe.util.defaultHeaderFormatting +import org.junit.Test + +class CountDistinctSamples : DataFrameSampleHelper("countDistinct", "api") { + val df = peopleDf + + private fun lastNameToColor(name: String): RgbColor = + when (name) { + "Byrd", "Daniels" -> RgbColor(210, 229, 199) + else -> RgbColor(255, 255, 255) + } + + private fun firstNameToColor(name: String): RgbColor = + when (name) { + "Alice" -> RgbColor(189, 206, 233) + "Bob" -> RgbColor(198, 224, 198) + "Charlie" -> RgbColor(219, 198, 230) + else -> RgbColor(255, 255, 255) + } + + @Test + fun countDistinctDf() { + // SampleStart + df + // SampleEnd + .saveDfHtmlSample() + } + + @Test + fun countDistinct() { + // SampleStart + df.countDistinct() // the result is 10 + // SampleEnd + } + + @Test + fun countDistinctColumns_properties() { + // SampleStart + df.countDistinct { name.firstName and city } // the result is 9 + // SampleEnd + } + + @Test + fun countDistinctColumns_strings() { + // SampleStart + df.countDistinct { "name"["firstName"] and "city" } // the result is 9 + // SampleEnd + } + + @Test + fun countDistinctColumnsDf() { + df.format().perRowCol { row, _ -> + val lastName = df[row.index()].name.lastName + background(lastNameToColor(lastName)) and textColor(black) + } + .saveDfHtmlSample() + } + + @Test + fun countDistinctGroupBy() { + // SampleStart + df.groupBy { isHappy } + // SampleEnd + .saveDfHtmlSample() + } + + @Test + fun countDistinctOnGroupBy_properties() { + // SampleStart + df.groupBy { isHappy }.countDistinct() + // SampleEnd + .defaultHeaderFormatting { "countDistinct"() } + .saveDfHtmlSample() + } + + @Test + fun countDistinctOnGroupBy_strings() { + // SampleStart + df.groupBy("isHappy").countDistinct() + // SampleEnd + } + + @Test + fun 
countDistinctColumnsOnGroupBy_properties() { + // SampleStart + df.groupBy { isHappy }.countDistinct { name.firstName } + // SampleEnd + .defaultHeaderFormatting { "countDistinct"() } + .saveDfHtmlSample() + } + + @Test + fun countDistinctColumnsOnGroupBy_strings() { + // SampleStart + df.groupBy("isHappy").countDistinct { "name"["firstName"] } + // SampleEnd + } + + @Test + fun countDistinctColumnsCustomNameOnGroupBy_properties() { + // SampleStart + df.groupBy { isHappy }.countDistinct("uniqueFirstNames") { name.firstName } + // SampleEnd + .defaultHeaderFormatting { "uniqueFirstNames"() } + .saveDfHtmlSample() + } + + @Test + fun countDistinctColumnsCustomNameOnGroupBy_strings() { + // SampleStart + df.groupBy("isHappy").countDistinct("uniqueFirstNames") { "name"["firstName"] } + // SampleEnd + } + + @Test + fun countDistinctColumnsGroupBy() { + df.groupBy { isHappy } + .toDataFrame() + .convert { group }.with { group -> + val firstNameCol = group["name"]["firstName"] + group.format().perRowCol { row, _ -> + val firstName = firstNameCol[row.index()] as String + background(firstNameToColor(firstName)) and textColor(black) + } + } + .saveDfHtmlSample() + } +} From 1f30935f8bb34f18b5c16efef355f0150b7a10a2 Mon Sep 17 00:00:00 2001 From: Aleksandr Nikolaev Date: Wed, 27 May 2026 16:53:06 +0200 Subject: [PATCH 04/10] Update `groupBy` website documentation and KDocs because of new `countDistinct` overloads --- .../kotlinx/dataframe/api/groupBy.kt | 5 ++ docs/StardustDocs/topics/groupBy.md | 58 +++++++++++++++---- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt index 32c1c5bf81..f3cb96149c 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt @@ -156,6 +156,9 @@ internal interface GroupByDocs { * `| 
`__`.`__[**`count`**][Grouped.count]**`() `** * * {@include [Indent]} + * `| `__`.`__[**`countDistinct`**][Grouped.countDistinct]**`() `** + * + * {@include [Indent]} * `| `__`.`__[**`aggregate`**][Grouped.aggregate]**` { `**`aggregations: `[`AggregateDsl`][AggregateDsl]**` }`** * * {@include [Indent]} @@ -301,6 +304,8 @@ internal interface GroupByDocs { * from all rows of each group for the selected columns. * * [count][Grouped.count] — creates a [DataFrame] containing the grouping key columns and an additional column * with the number of rows in each corresponding group; + * * [countDistinct][Grouped.countDistinct] — creates a [DataFrame] containing the grouping key columns + * and an additional column with the number of distinct rows in each corresponding group; * * [aggregate][Grouped.aggregate] — performs a set of custom aggregations using [AggregateDsl], * allowing you to compute one or more derived values per group; * * [Various aggregation statistics][AggregationStatistics] — predefined shortcuts diff --git a/docs/StardustDocs/topics/groupBy.md b/docs/StardustDocs/topics/groupBy.md index b710b53e81..0f4f2eb09d 100644 --- a/docs/StardustDocs/topics/groupBy.md +++ b/docs/StardustDocs/topics/groupBy.md @@ -1,6 +1,7 @@ [//]: # (title: groupBy) + Splits the rows of [`DataFrame`](DataFrame.md) into groups using one or several columns as grouping keys. @@ -23,7 +24,7 @@ transformations = [ .sortByGroup { expression } | .sortByGroupDesc { expression reducer = .minBy { column } | .maxBy { column } | .medianBy { rowExpression } | .percentileBy(percentile) { rowExpression } | .first [ { rowCondition } ] | .last [ { rowCondition } ] .concat() | .into([column]) [{ rowExpression }] | .values { valueColumns } -aggregator = .count() | .concat() | .concatWithKeys() | .toDataFrame() | .into([column]) [{ rowExpression }] | .values { valueColumns } | .aggregate { aggregations } | . 
[ { columns } ] +aggregator = .count() | .countDistinct() | .concat() | .concatWithKeys() | .toDataFrame() | .into([column]) [{ rowExpression }] | .values { valueColumns } | .aggregate { aggregations } | . [ { columns } ] pivot = .pivot { columns } [ .default(defaultValue) ] @@ -602,22 +603,35 @@ A `GroupBy` can be directly transformed into a new [`DataFrame`](DataFrame.md) b Aggregation is a generalization of [`reducing`](groupBy.md#reducing). The following aggregation methods are available: -* [`concat`](concat.md) — concatenates all [`rows`](DataRow.md) from all groups into a single [`DataFrame`](DataFrame.md), without preserving grouping keys. -* [`toDataFrame`](createDataFrame.md#todataframe) — returns this `GroupBy` as a [`DataFrame`](DataFrame.md) with the grouping keys and corresponding groups in [FrameColumn](DataColumn.md#framecolumn). -* `concatWithKeys` — a variant of [`concat`](concat.md) that also includes grouping keys that were not present in the original [`DataFrame`](DataFrame.md). -* `into` — creates a new [`column`](DataColumn.md) containing a list of values computed with a `RowExpression` for each group, or a new [FrameColumn](DataColumn.md#framecolumn) containing the groups themselves. -* [`values`](values.md) — collects all column values for every group without aggregation. For a [ValueColumn](DataColumn.md#valuecolumn) of type `T` it will gather group values into lists of type `List`. -For a [ColumnGroup](DataColumn.md#columngroup) it will gather group values into a [`DataFrame`](DataFrame.md) and convert that [ColumnGroup](DataColumn.md#columngroup) into a [FrameColumn](DataColumn.md#framecolumn). -* [`count`](count.md) — creates a [`DataFrame`](DataFrame.md) containing the grouping key columns and an additional [`column`](DataColumn.md) with the number of rows in each corresponding group. 
-* `aggregate` — performs a set of custom aggregations using `AggregateDsl`, allowing you to compute one or more [statistics](summaryStatistics.md) per every group of `GroupBy`. -The body if this function will be executed for every data group and has a receiver of type [`DataFrame`](DataFrame.md) that represents the current data group being aggregated. +* [`concat`](concat.md) — concatenates all [`rows`](DataRow.md) from all groups into a single [`DataFrame`](DataFrame.md), +without preserving grouping keys. +* [`toDataFrame`](createDataFrame.md#todataframe) — returns this `GroupBy` as a [`DataFrame`](DataFrame.md) +with the grouping keys and corresponding groups in [FrameColumn](DataColumn.md#framecolumn). +* `concatWithKeys` — a variant of [`concat`](concat.md) +that also includes grouping keys that were not present in the original [`DataFrame`](DataFrame.md). +* `into` — creates a new [`column`](DataColumn.md) containing a list of values computed with a `RowExpression` for each group, +or a new [FrameColumn](DataColumn.md#framecolumn) containing the groups themselves. +* [`values`](values.md) — collects all column values for every group without aggregation. +For a [ValueColumn](DataColumn.md#valuecolumn) of type `T` it will gather group values into lists of type `List<T>`. +For a [ColumnGroup](DataColumn.md#columngroup) it will gather group values into a [`DataFrame`](DataFrame.md) +and convert that [ColumnGroup](DataColumn.md#columngroup) into a [FrameColumn](DataColumn.md#framecolumn). +* [`count`](count.md) — creates a [`DataFrame`](DataFrame.md) containing the grouping key columns +and an additional [`column`](DataColumn.md) with the number of rows in each corresponding group. +* [`countDistinct`](countDistinct.md) — creates a [`DataFrame`](DataFrame.md) containing the grouping key columns +and an additional [`column`](DataColumn.md) with the number of distinct rows in each corresponding group. 
+* `aggregate` — performs a set of custom aggregations using `AggregateDsl`, +allowing you to compute one or more [statistics](summaryStatistics.md) per every group of `GroupBy`. +The body of this function will be executed for every data group and has a receiver of type [`DataFrame`](DataFrame.md) +that represents the current data group being aggregated. To add a new column to the resulting [`DataFrame`](DataFrame.md), pass the name of the new column to infix function `into`. -Each of these methods returns a new DataFrame that includes the grouping key columns (except for [`concat`](concat.md)) along with the columns of values aggregated from the corresponding groups. +Each of these methods returns a new DataFrame that includes the grouping key columns (except for [`concat`](concat.md)) +along with the columns of values aggregated from the corresponding groups. ### Examples of aggregation #### concat on GroupBy {collapsible="true"} -[`concat`](concat.md) can be used to union all data groups of `GroupBy` into the original [`DataFrame`](DataFrame.md) preserving the new order of rows produced by grouping: +[`concat`](concat.md) can be used to union all data groups of `GroupBy` into the original [`DataFrame`](DataFrame.md) +preserving the new order of rows produced by grouping: @@ -781,6 +795,26 @@ df.groupBy("city").count() +#### countDistinct on GroupBy {collapsible="true"} + + + + +```kotlin +df.groupBy { isHappy }.countDistinct() +``` + + + + +```kotlin +df.groupBy("isHappy").countDistinct() +``` + + + + + #### aggregate on GroupBy {collapsible="true"} From 070f21b203fb8bb3e4f0e35b72afb47a31e59d28 Mon Sep 17 00:00:00 2001 From: Aleksandr Nikolaev Date: Wed, 27 May 2026 16:55:21 +0200 Subject: [PATCH 05/10] Remove unused import in `Access.kt` --- .../kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt 
b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt index 2ff7dd90d9..e8afdc8450 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt @@ -6,7 +6,6 @@ import org.jetbrains.kotlinx.dataframe.api.add import org.jetbrains.kotlinx.dataframe.api.after import org.jetbrains.kotlinx.dataframe.api.chunked import org.jetbrains.kotlinx.dataframe.api.colsOf -import org.jetbrains.kotlinx.dataframe.api.countDistinct import org.jetbrains.kotlinx.dataframe.api.distinct import org.jetbrains.kotlinx.dataframe.api.distinctBy import org.jetbrains.kotlinx.dataframe.api.drop From 70b1648fb026d85c06297de9b8c82413ac254e04 Mon Sep 17 00:00:00 2001 From: Aleksandr Nikolaev Date: Mon, 1 Jun 2026 17:32:49 +0200 Subject: [PATCH 06/10] Fixes to `countDistinct` and `groupBy` documentation after review --- .../kotlinx/dataframe/api/groupBy.kt | 2 + .../kotlinx/dataframe/api/countDistinct.kt | 13 +- ...ColumnsCustomNameOnGroupBy_properties.html | 6 +- .../countDistinct/countDistinctColumnsDf.html | 16 +- .../countDistinctColumnsGroupBy.html | 49 +- ...ntDistinctColumnsOnGroupBy_properties.html | 6 +- .../api/countDistinct/countDistinctDf.html | 16 +- .../countDistinct/countDistinctGroupBy.html | 49 +- ...istinctOnGroupBySmallTable_properties.html | 511 ++++++++++++++++++ .../countDistinctOnGroupBy_properties.html | 6 +- docs/StardustDocs/topics/_shadow_resources.md | 3 +- docs/StardustDocs/topics/countDistinct.md | 22 +- docs/StardustDocs/topics/groupBy.md | 6 +- .../samples/api/CountDistinctSamples.kt | 40 +- .../dataframe/samples/api/GroupBySamples.kt | 17 + 15 files changed, 659 insertions(+), 103 deletions(-) create mode 100644 docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBySmallTable_properties.html rename docs/StardustDocs/resources/api/{countDistinct => groupBy}/countDistinctOnGroupBy_properties.html (96%) diff --git 
a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt index f3cb96149c..0b1a9a2da2 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt @@ -199,6 +199,8 @@ internal interface GroupByDocs { * * * [count][Grouped.count] — calculate the number of rows in each group * (optionally counting only rows that satisfy the given predicate); + * * [`countDistinct`][Grouped.countDistinct] — calculate the number of distinct rows in each group + * (or distinct combinations of values in selected columns); * * [max][Grouped.max] / [maxOf][Grouped.maxOf] / [maxFor][Grouped.maxFor] — * calculate the maximum of all values on the selected columns / by a row expression / * for each of the selected columns within each group; diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt index fd83ad0a0f..0e0810c9ee 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt @@ -50,12 +50,17 @@ class CountDistinctTests { // TODO: check columns as well when #1531 is fixed @Test fun `countDistinct on empty GroupBy`() { - df.drop(df.nrow).groupBy("group").countDistinct().count() shouldBe 0 + df + .drop(df.nrow) + .groupBy("group").countDistinct() + .count() shouldBe 0 } @Test fun `countDistinct on GroupBy with nulls`() { - val result = df.append(null, null, 1).groupBy("group").countDistinct() + val result = df + .append(null, null, 1) + .groupBy("group").countDistinct() val expected = dataFrameOf( "group" to columnOf(1, 2), "countDistinct" to columnOf(3, 1), @@ -65,7 +70,9 @@ class CountDistinctTests { @Test fun `countDistinct on GroupBy with null group key`() { - val result = df.append("Dave", 30, 
null).groupBy("group").countDistinct() + val result = df + .append("Dave", 30, null) + .groupBy("group").countDistinct() val expected = dataFrameOf( "group" to columnOf(1, 2, null), "countDistinct" to columnOf(2, 1, 1), diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsCustomNameOnGroupBy_properties.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsCustomNameOnGroupBy_properties.html index 610f3b6c63..22abe47ddc 100644 --- a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsCustomNameOnGroupBy_properties.html +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsCustomNameOnGroupBy_properties.html @@ -457,9 +457,9 @@ })() /**/ call_DataFrame(function() { DataFrame.renderTable(0) }); diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsDf.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsDf.html index 1a5c11b426..8a2594d162 100644 --- a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsDf.html +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsDf.html @@ -457,14 +457,14 @@ })() /**/ call_DataFrame(function() { DataFrame.renderTable(0) }); diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html index de9c330b40..486586343f 100644 --- a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html @@ -457,31 +457,42 @@ })() /**/ /**/ /**/ + +/**/ call_DataFrame(function() { DataFrame.renderTable(0) }); diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsOnGroupBy_properties.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsOnGroupBy_properties.html index 30991af112..33f67fa573 100644 --- 
a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsOnGroupBy_properties.html +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsOnGroupBy_properties.html @@ -457,9 +457,9 @@ })() /**/ call_DataFrame(function() { DataFrame.renderTable(0) }); diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctDf.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctDf.html index bf549bf28c..402c557208 100644 --- a/docs/StardustDocs/resources/api/countDistinct/countDistinctDf.html +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctDf.html @@ -457,14 +457,14 @@ })() /**/ call_DataFrame(function() { DataFrame.renderTable(0) }); diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html index 577ae92a09..d2e393ac8a 100644 --- a/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html @@ -457,31 +457,42 @@ })() /**/ /**/ /**/ + +/**/ call_DataFrame(function() { DataFrame.renderTable(0) }); diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBySmallTable_properties.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBySmallTable_properties.html new file mode 100644 index 0000000000..f84d1e8140 --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBySmallTable_properties.html @@ -0,0 +1,511 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBy_properties.html b/docs/StardustDocs/resources/api/groupBy/countDistinctOnGroupBy_properties.html similarity index 96% rename from docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBy_properties.html rename to docs/StardustDocs/resources/api/groupBy/countDistinctOnGroupBy_properties.html index 0affb1cbcd..d8e535cdb1 100644 --- a/docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBy_properties.html +++ b/docs/StardustDocs/resources/api/groupBy/countDistinctOnGroupBy_properties.html @@ -457,9 +457,9 @@ })() /**/ call_DataFrame(function() { DataFrame.renderTable(0) }); diff --git a/docs/StardustDocs/topics/_shadow_resources.md b/docs/StardustDocs/topics/_shadow_resources.md index 6eaa0e1b48..e000fc5e0e 100644 --- a/docs/StardustDocs/topics/_shadow_resources.md +++ b/docs/StardustDocs/topics/_shadow_resources.md @@ -202,6 +202,7 @@ + @@ -320,7 +321,7 @@ - + diff --git a/docs/StardustDocs/topics/countDistinct.md b/docs/StardustDocs/topics/countDistinct.md index eb4864e8b8..b1b01039c6 100644 --- a/docs/StardustDocs/topics/countDistinct.md +++ b/docs/StardustDocs/topics/countDistinct.md @@ -20,7 +20,7 @@ df ```kotlin -df.countDistinct() // the result is 10 +df.countDistinct() // the result is 4 ``` @@ -34,14 +34,14 @@ You can also specify which columns to use when counting distinct combinations of ```kotlin -df.countDistinct { name.firstName and city } // the result is 9 +df.countDistinct { name.firstName and city } // the result is 3 ``` ```kotlin -df.countDistinct { "name"["firstName"] and "city" } // the result is 9 +df.countDistinct { "name"["firstName"] and "city" } // the result is 3 ```
@@ -57,7 +57,7 @@ Let's take this `GroupBy` as an example: ```kotlin -df.groupBy { isHappy } +df.groupBy { city } ``` @@ -70,19 +70,19 @@ Applying `countDistinct` to this `GroupBy` yields the following result: ```kotlin -df.groupBy { isHappy }.countDistinct() +df.groupBy { city }.countDistinct() ``` ```kotlin -df.groupBy("isHappy").countDistinct() +df.groupBy("city").countDistinct() ```
- + You can also specify which columns in the groups should be used to determine distinctness. @@ -93,14 +93,14 @@ You can also specify which columns in the groups should be used to determine dis ```kotlin -df.groupBy { isHappy }.countDistinct { name.firstName } +df.groupBy { city }.countDistinct { name.firstName } ``` ```kotlin -df.groupBy("isHappy").countDistinct { "name"["firstName"] } +df.groupBy("city").countDistinct { "name"["firstName"] } ``` @@ -114,14 +114,14 @@ The default name of the new column is `countDistinct`, but you can choose a diff ```kotlin -df.groupBy { isHappy }.countDistinct("uniqueFirstNames") { name.firstName } +df.groupBy { city }.countDistinct("uniqueFirstNames") { name.firstName } ``` ```kotlin -df.groupBy("isHappy").countDistinct("uniqueFirstNames") { "name"["firstName"] } +df.groupBy("city").countDistinct("uniqueFirstNames") { "name"["firstName"] } ``` diff --git a/docs/StardustDocs/topics/groupBy.md b/docs/StardustDocs/topics/groupBy.md index 0f4f2eb09d..122fa8ace0 100644 --- a/docs/StardustDocs/topics/groupBy.md +++ b/docs/StardustDocs/topics/groupBy.md @@ -1,7 +1,6 @@ [//]: # (title: groupBy) - Splits the rows of [`DataFrame`](DataFrame.md) into groups using one or several columns as grouping keys. 
@@ -801,14 +800,14 @@ df.groupBy("city").count() ```kotlin -df.groupBy { isHappy }.countDistinct() +df.groupBy { city }.countDistinct { name.firstName } ``` ```kotlin -df.groupBy("isHappy").countDistinct() +df.groupBy("city").countDistinct { "name"["firstName"] } ``` @@ -883,6 +882,7 @@ Each function computes a statistic across the [`rows`](DataRow.md) of a group an The following aggregation statistics are available: * [`count`](count.md); +* [`countDistinct`](countDistinct.md); * [`max / maxOf / maxFor`](minmax.md); * [`min / minOf / minFor`](minmax.md); * [`sum / sumOf / sumFor`](sum.md); diff --git a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt index 210e9e074e..c778f9bae5 100644 --- a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt +++ b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt @@ -4,22 +4,18 @@ import org.jetbrains.kotlinx.dataframe.api.RgbColor import org.jetbrains.kotlinx.dataframe.api.and import org.jetbrains.kotlinx.dataframe.api.convert import org.jetbrains.kotlinx.dataframe.api.countDistinct +import org.jetbrains.kotlinx.dataframe.api.filter import org.jetbrains.kotlinx.dataframe.api.format import org.jetbrains.kotlinx.dataframe.api.groupBy import org.jetbrains.kotlinx.dataframe.api.perRowCol +import org.jetbrains.kotlinx.dataframe.api.take import org.jetbrains.kotlinx.dataframe.api.with import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper import org.jetbrains.kotlinx.dataframe.util.defaultHeaderFormatting import org.junit.Test class CountDistinctSamples : DataFrameSampleHelper("countDistinct", "api") { - val df = peopleDf - - private fun lastNameToColor(name: String): RgbColor = - when (name) { - "Byrd", "Daniels" -> RgbColor(210, 229, 199) - else -> RgbColor(255, 255, 255) - } + val df = 
peopleDf.take(7).filter { index() in setOf(0, 1, 2, 6) } private fun firstNameToColor(name: String): RgbColor = when (name) { @@ -40,29 +36,29 @@ class CountDistinctSamples : DataFrameSampleHelper("countDistinct", "api") { @Test fun countDistinct() { // SampleStart - df.countDistinct() // the result is 10 + df.countDistinct() // the result is 4 // SampleEnd } @Test fun countDistinctColumns_properties() { // SampleStart - df.countDistinct { name.firstName and city } // the result is 9 + df.countDistinct { name.firstName and city } // the result is 3 // SampleEnd } @Test fun countDistinctColumns_strings() { // SampleStart - df.countDistinct { "name"["firstName"] and "city" } // the result is 9 + df.countDistinct { "name"["firstName"] and "city" } // the result is 3 // SampleEnd } @Test fun countDistinctColumnsDf() { df.format().perRowCol { row, _ -> - val lastName = df[row.index()].name.lastName - background(lastNameToColor(lastName)) and textColor(black) + val firstName = df[row.index()].name.firstName + background(firstNameToColor(firstName)) and textColor(black) } .saveDfHtmlSample() } @@ -70,31 +66,31 @@ class CountDistinctSamples : DataFrameSampleHelper("countDistinct", "api") { @Test fun countDistinctGroupBy() { // SampleStart - df.groupBy { isHappy } + df.groupBy { city } // SampleEnd .saveDfHtmlSample() } @Test - fun countDistinctOnGroupBy_properties() { + fun countDistinctOnGroupBySmallTable_properties() { // SampleStart - df.groupBy { isHappy }.countDistinct() + df.groupBy { city }.countDistinct() // SampleEnd .defaultHeaderFormatting { "countDistinct"() } .saveDfHtmlSample() } @Test - fun countDistinctOnGroupBy_strings() { + fun countDistinctOnGroupBySmallTable_strings() { // SampleStart - df.groupBy("isHappy").countDistinct() + df.groupBy("city").countDistinct() // SampleEnd } @Test fun countDistinctColumnsOnGroupBy_properties() { // SampleStart - df.groupBy { isHappy }.countDistinct { name.firstName } + df.groupBy { city }.countDistinct { name.firstName 
} // SampleEnd .defaultHeaderFormatting { "countDistinct"() } .saveDfHtmlSample() @@ -103,14 +99,14 @@ class CountDistinctSamples : DataFrameSampleHelper("countDistinct", "api") { @Test fun countDistinctColumnsOnGroupBy_strings() { // SampleStart - df.groupBy("isHappy").countDistinct { "name"["firstName"] } + df.groupBy("city").countDistinct { "name"["firstName"] } // SampleEnd } @Test fun countDistinctColumnsCustomNameOnGroupBy_properties() { // SampleStart - df.groupBy { isHappy }.countDistinct("uniqueFirstNames") { name.firstName } + df.groupBy { city }.countDistinct("uniqueFirstNames") { name.firstName } // SampleEnd .defaultHeaderFormatting { "uniqueFirstNames"() } .saveDfHtmlSample() @@ -119,13 +115,13 @@ class CountDistinctSamples : DataFrameSampleHelper("countDistinct", "api") { @Test fun countDistinctColumnsCustomNameOnGroupBy_strings() { // SampleStart - df.groupBy("isHappy").countDistinct("uniqueFirstNames") { "name"["firstName"] } + df.groupBy("city").countDistinct("uniqueFirstNames") { "name"["firstName"] } // SampleEnd } @Test fun countDistinctColumnsGroupBy() { - df.groupBy { isHappy } + df.groupBy { city } .toDataFrame() .convert { group }.with { group -> val firstNameCol = group["name"]["firstName"] diff --git a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/GroupBySamples.kt b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/GroupBySamples.kt index bde4d02289..59f4f0372e 100644 --- a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/GroupBySamples.kt +++ b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/GroupBySamples.kt @@ -12,6 +12,7 @@ import org.jetbrains.kotlinx.dataframe.api.concat import org.jetbrains.kotlinx.dataframe.api.concatWithKeys import org.jetbrains.kotlinx.dataframe.api.convert import org.jetbrains.kotlinx.dataframe.api.count +import org.jetbrains.kotlinx.dataframe.api.countDistinct import org.jetbrains.kotlinx.dataframe.api.dataFrameOf import 
org.jetbrains.kotlinx.dataframe.api.div import org.jetbrains.kotlinx.dataframe.api.expr @@ -661,6 +662,22 @@ class GroupBySamples : DataFrameSampleHelper("groupBy", "api") { // SampleEnd } + @Test + fun countDistinctOnGroupBy_properties() { + // SampleStart + df.groupBy { city }.countDistinct { name.firstName } + // SampleEnd + .defaultHeaderFormatting { "countDistinct"() } + .saveDfHtmlSample() + } + + @Test + fun countDistinctOnGroupBy_strings() { + // SampleStart + df.groupBy("city").countDistinct { "name"["firstName"] } + // SampleEnd + } + @Test fun aggregateOnGroupBy_properties() { // SampleStart From b1b683df910fa0757e00bd7d1b468e56754756d2 Mon Sep 17 00:00:00 2001 From: Aleksandr Nikolaev Date: Wed, 3 Jun 2026 14:22:39 +0200 Subject: [PATCH 07/10] Add a script to expand nested frames in HTML samples. Expand nested frames in `countDistinct` website docs --- .../countDistinctColumnsGroupBy.html | 30 ++++++++++++ .../countDistinct/countDistinctGroupBy.html | 30 ++++++++++++ .../samples/DataFrameSampleHelper.kt | 49 +++++++++++++++++++ .../samples/api/CountDistinctSamples.kt | 2 + 4 files changed, 111 insertions(+) diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html index 486586343f..ba884df876 100644 --- a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html @@ -497,6 +497,36 @@ call_DataFrame(function() { DataFrame.renderTable(0) }); +(function () { + function expandColumnGroups(df) { + for (let col of df.cols) { + if (col.parent === undefined && col.children.length > 0) col.expanded = true; + } + } + + function expandNestedFrames(df, rootDf) { + for (let col of df.cols) { + for (let value of col.values) { + if (value && value.frameId !== undefined) { + rootDf.expandedFrames.add(value.frameId); + let child = 
rootDf.childFrames[value.frameId]; + if (child) { + expandColumnGroups(child); + expandNestedFrames(child, rootDf); + } + } + } + } + } + + document.querySelectorAll("table.dataframe").forEach(function (table) { + if (table.df && table.df.id === table.df.rootId) { + let rootDf = table.df; + expandNestedFrames(rootDf, rootDf); + DataFrame.renderTable(rootDf.id); + } + }); +})(); function sendHeight() { const table = document.querySelector('table.dataframe'); if (!table) return; diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html index d2e393ac8a..f26cbd0bc9 100644 --- a/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html @@ -497,6 +497,36 @@ call_DataFrame(function() { DataFrame.renderTable(0) }); +(function () { + function expandColumnGroups(df) { + for (let col of df.cols) { + if (col.parent === undefined && col.children.length > 0) col.expanded = true; + } + } + + function expandNestedFrames(df, rootDf) { + for (let col of df.cols) { + for (let value of col.values) { + if (value && value.frameId !== undefined) { + rootDf.expandedFrames.add(value.frameId); + let child = rootDf.childFrames[value.frameId]; + if (child) { + expandColumnGroups(child); + expandNestedFrames(child, rootDf); + } + } + } + } + } + + document.querySelectorAll("table.dataframe").forEach(function (table) { + if (table.df && table.df.id === table.df.rootId) { + let rootDf = table.df; + expandNestedFrames(rootDf, rootDf); + DataFrame.renderTable(rootDf.id); + } + }); +})(); function sendHeight() { const table = document.querySelector('table.dataframe'); if (!table) return; diff --git a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt index 8b63791f9a..dfa74e96c3 
100644 --- a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt +++ b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt @@ -1,8 +1,13 @@ package org.jetbrains.kotlinx.dataframe.samples import org.jetbrains.kotlinx.dataframe.DataColumn +import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.api.CodeString +import org.jetbrains.kotlinx.dataframe.api.GroupBy import org.jetbrains.kotlinx.dataframe.api.toDataFrame +import org.jetbrains.kotlinx.dataframe.io.DataFrameHtmlData +import org.jetbrains.kotlinx.dataframe.io.DisplayConfiguration +import org.jetbrains.kotlinx.dataframe.io.toStandaloneHtml import org.jetbrains.kotlinx.dataframe.samples.api.TestBase import org.jetbrains.kotlinx.kandy.letsplot.samples.SampleHelper import java.io.File @@ -44,4 +49,48 @@ abstract class DataFrameSampleHelper(sampleName: String, subFolder: String = "sa fun DataColumn<*>.saveDfHtmlSample() { toDataFrame().saveDfHtmlSample() } + + // TODO: might be changed as #1887 is fixed + private val expandNestedFramesScript = DataFrameHtmlData( + script = + """ + (function () { + function expandColumnGroups(df) { + for (let col of df.cols) { + if (col.parent === undefined && col.children.length > 0) col.expanded = true; + } + } + + function expandNestedFrames(df, rootDf) { + for (let col of df.cols) { + for (let value of col.values) { + if (value && value.frameId !== undefined) { + rootDf.expandedFrames.add(value.frameId); + let child = rootDf.childFrames[value.frameId]; + if (child) { + expandColumnGroups(child); + expandNestedFrames(child, rootDf); + } + } + } + } + } + + document.querySelectorAll("table.dataframe").forEach(function (table) { + if (table.df && table.df.id === table.df.rootId) { + let rootDf = table.df; + expandNestedFrames(rootDf, rootDf); + DataFrame.renderTable(rootDf.id); + } + }); + })(); + """.trimIndent(), + ) + + fun DataFrame<*>.toExpandedHtml() = toStandaloneHtml( 
+ configuration = DisplayConfiguration(enableFallbackStaticTables = false), + getFooter = { "" }, + ) + expandNestedFramesScript + + fun GroupBy<*, *>.toExpandedHtml() = toDataFrame().toExpandedHtml() } diff --git a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt index c778f9bae5..c310ec2fa1 100644 --- a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt +++ b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt @@ -68,6 +68,7 @@ class CountDistinctSamples : DataFrameSampleHelper("countDistinct", "api") { // SampleStart df.groupBy { city } // SampleEnd + .toExpandedHtml() .saveDfHtmlSample() } @@ -130,6 +131,7 @@ class CountDistinctSamples : DataFrameSampleHelper("countDistinct", "api") { background(firstNameToColor(firstName)) and textColor(black) } } + .toExpandedHtml() .saveDfHtmlSample() } } From f5691e7e526563c2b320661106323874a2a4867d Mon Sep 17 00:00:00 2001 From: Aleksandr Nikolaev Date: Wed, 3 Jun 2026 15:08:26 +0200 Subject: [PATCH 08/10] Apply korro --- .../resources/api/pivot/pivotInward_properties.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/StardustDocs/resources/api/pivot/pivotInward_properties.html b/docs/StardustDocs/resources/api/pivot/pivotInward_properties.html index 49beb58089..8ff674999f 100644 --- a/docs/StardustDocs/resources/api/pivot/pivotInward_properties.html +++ b/docs/StardustDocs/resources/api/pivot/pivotInward_properties.html @@ -459,7 +459,7 @@ /**/ From ac9cd96d1a29237cee87133e083bc37bb1aba153 Mon Sep 17 00:00:00 2001 From: Aleksandr Nikolaev Date: Wed, 3 Jun 2026 15:10:48 +0200 Subject: [PATCH 09/10] Apply ktLintFormat --- .../samples/DataFrameSampleHelper.kt | 57 ++++++++++--------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git 
a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt index dfa74e96c3..13b95638fd 100644 --- a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt +++ b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt @@ -54,43 +54,44 @@ abstract class DataFrameSampleHelper(sampleName: String, subFolder: String = "sa private val expandNestedFramesScript = DataFrameHtmlData( script = """ - (function () { - function expandColumnGroups(df) { - for (let col of df.cols) { - if (col.parent === undefined && col.children.length > 0) col.expanded = true; + (function () { + function expandColumnGroups(df) { + for (let col of df.cols) { + if (col.parent === undefined && col.children.length > 0) col.expanded = true; + } } - } - function expandNestedFrames(df, rootDf) { - for (let col of df.cols) { - for (let value of col.values) { - if (value && value.frameId !== undefined) { - rootDf.expandedFrames.add(value.frameId); - let child = rootDf.childFrames[value.frameId]; - if (child) { - expandColumnGroups(child); - expandNestedFrames(child, rootDf); + function expandNestedFrames(df, rootDf) { + for (let col of df.cols) { + for (let value of col.values) { + if (value && value.frameId !== undefined) { + rootDf.expandedFrames.add(value.frameId); + let child = rootDf.childFrames[value.frameId]; + if (child) { + expandColumnGroups(child); + expandNestedFrames(child, rootDf); + } } } } } - } - document.querySelectorAll("table.dataframe").forEach(function (table) { - if (table.df && table.df.id === table.df.rootId) { - let rootDf = table.df; - expandNestedFrames(rootDf, rootDf); - DataFrame.renderTable(rootDf.id); - } - }); - })(); - """.trimIndent(), + document.querySelectorAll("table.dataframe").forEach(function (table) { + if (table.df && table.df.id === table.df.rootId) { + let rootDf = table.df; 
+ expandNestedFrames(rootDf, rootDf); + DataFrame.renderTable(rootDf.id); + } + }); + })(); + """.trimIndent(), ) - fun DataFrame<*>.toExpandedHtml() = toStandaloneHtml( - configuration = DisplayConfiguration(enableFallbackStaticTables = false), - getFooter = { "" }, - ) + expandNestedFramesScript + fun DataFrame<*>.toExpandedHtml() = + toStandaloneHtml( + configuration = DisplayConfiguration(enableFallbackStaticTables = false), + getFooter = { "" }, + ) + expandNestedFramesScript fun GroupBy<*, *>.toExpandedHtml() = toDataFrame().toExpandedHtml() } From 400ad0ae5639a2d354f6824de9c749bb9a2f76f9 Mon Sep 17 00:00:00 2001 From: Aleksandr Nikolaev Date: Tue, 9 Jun 2026 17:14:29 +0200 Subject: [PATCH 10/10] Rename `toExpandedHtml` to `toHtmlWithOpenedNestedDfs` --- .../kotlinx/dataframe/samples/DataFrameSampleHelper.kt | 4 ++-- .../kotlinx/dataframe/samples/api/CountDistinctSamples.kt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt index 13b95638fd..620d9a1a6d 100644 --- a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt +++ b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt @@ -87,11 +87,11 @@ abstract class DataFrameSampleHelper(sampleName: String, subFolder: String = "sa """.trimIndent(), ) - fun DataFrame<*>.toExpandedHtml() = + fun DataFrame<*>.toHtmlWithOpenedNestedDfs() = toStandaloneHtml( configuration = DisplayConfiguration(enableFallbackStaticTables = false), getFooter = { "" }, ) + expandNestedFramesScript - fun GroupBy<*, *>.toExpandedHtml() = toDataFrame().toExpandedHtml() + fun GroupBy<*, *>.toHtmlWithOpenedNestedDfs() = toDataFrame().toHtmlWithOpenedNestedDfs() } diff --git a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt 
b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt index c310ec2fa1..377984cb36 100644 --- a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt +++ b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt @@ -68,7 +68,7 @@ class CountDistinctSamples : DataFrameSampleHelper("countDistinct", "api") { // SampleStart df.groupBy { city } // SampleEnd - .toExpandedHtml() + .toHtmlWithOpenedNestedDfs() .saveDfHtmlSample() } @@ -131,7 +131,7 @@ class CountDistinctSamples : DataFrameSampleHelper("countDistinct", "api") { background(firstNameToColor(firstName)) and textColor(black) } } - .toExpandedHtml() + .toHtmlWithOpenedNestedDfs() .saveDfHtmlSample() } }