diff --git a/core/api/core.api b/core/api/core.api index bf5b7adf46..dcbf911126 100644 --- a/core/api/core.api +++ b/core/api/core.api @@ -1714,6 +1714,10 @@ public final class org/jetbrains/kotlinx/dataframe/api/CountDistinctKt { public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Ljava/lang/String;)I public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Lkotlin/reflect/KProperty;)I public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Lorg/jetbrains/kotlinx/dataframe/columns/ColumnReference;)I + public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;Lkotlin/jvm/functions/Function2;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static synthetic fun countDistinct$default (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static synthetic fun countDistinct$default (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;Lkotlin/jvm/functions/Function2;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; } public final class org/jetbrains/kotlinx/dataframe/api/CountKt { diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt index f3c407c3ee..99dcdd7f90 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt @@ -4,10 +4,18 @@ import org.jetbrains.kotlinx.dataframe.AnyColumnReference import org.jetbrains.kotlinx.dataframe.ColumnsSelector import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload +import 
org.jetbrains.kotlinx.dataframe.annotations.Interpretable +import org.jetbrains.kotlinx.dataframe.annotations.Refine +import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COLUMNS_PARAM +import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COLUMN_SELECTION_DSL +import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COMPARISON_OBJECT +import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.EXAMPLE +import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.SCOPE import org.jetbrains.kotlinx.dataframe.columns.toColumnSet import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns +import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateValue import org.jetbrains.kotlinx.dataframe.indices import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API import kotlin.reflect.KProperty @@ -93,3 +101,81 @@ public fun DataFrame.countDistinct(vararg columns: AnyColumnReference): I countDistinct { columns.toColumnSet() } // endregion + +// region GroupBy + +/** + * Aggregates this [GroupBy] by counting the number of distinct {@get [COMPARISON_OBJECT] rows} in each group. + * + * Compares rows in each group based on the values in {@get [SCOPE] all} columns. + * Returns a new [DataFrame] where each row corresponds to a group. + * The resulting [DataFrame] contains: + * - the original group key columns, + * - a new column (named [resultName\], default is `"countDistinct"`) + * that contains the number of distinct {@get [COMPARISON_OBJECT] rows} in each group. + * + * See also: + * - [aggregate][Grouped.aggregate], which aggregates a [GroupBy] using the provided statistics. + * - [count][Grouped.count], which counts the number of rows in each group. 
+ * - [distinct][DataFrame.distinct], which removes duplicate rows and returns a new [DataFrame]. + * - [groupBy][DataFrame.groupBy], which groups the rows of a [DataFrame] + * based on the values in one or more specified cols. + * + * For more information: {@include [DocumentationUrls.CountDistinct]} + * + * {@get [COLUMN_SELECTION_DSL]} + * + * ### Example + * ```kotlin + * {@get [EXAMPLE]} + * ``` + * + * @param [resultName\] The name of the result column that will store the number + * of distinct {@get [COMPARISON_OBJECT] rows} in each group. Defaults to `"countDistinct"`. + * @get [COLUMNS_PARAM] + * @return A new [DataFrame] with group keys and corresponding numbers of distinct {@get [COMPARISON_OBJECT] rows}. + */ +@ExcludeFromSources +private interface CountDistinctOnGroupByDocs { + typealias COMPARISON_OBJECT = Nothing + typealias SCOPE = Nothing + typealias EXAMPLE = Nothing + typealias COLUMN_SELECTION_DSL = Nothing + typealias COLUMNS_PARAM = Nothing +} + +/** + * @include [CountDistinctOnGroupByDocs] + * @set [EXAMPLE] + * // Counts the number of distinct rows for each city, returning + * // a new DataFrame with columns "city" and "countDistinct" + * df.groupBy { city }.countDistinct() + */ +@Refine +@Interpretable("GroupByCountDistinct0") +public fun Grouped.countDistinct(resultName: String = "countDistinct"): DataFrame = + countDistinct(resultName) { all() } + +/** + * @include [CountDistinctOnGroupByDocs] + * @set [COMPARISON_OBJECT] combinations of values in the selected [columns] + * @set [SCOPE] the selected + * @set [COLUMN_SELECTION_DSL] {@include [SelectingColumns.ColumnsSelectionDsl]} + * @set [EXAMPLE] + * // Counts unique combinations of values in the "year" and "title" columns + * // for each city, returning a new DataFrame with columns "city" and "countDistinct" + * df.groupBy { city }.countDistinct { year and title } + * @set [COLUMNS_PARAM] @param [columns\] The [ColumnsSelector] used to select columns + * that will be considered for 
evaluating whether the rows are distinct. + */ +@Refine +@Interpretable("GroupByCountDistinct0") +public fun Grouped.countDistinct( + resultName: String = "countDistinct", + columns: ColumnsSelector, +): DataFrame = + aggregateValue(resultName) { + countDistinct(columns) default 0 + } + +// endregion diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt index 32c1c5bf81..0b1a9a2da2 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt @@ -156,6 +156,9 @@ internal interface GroupByDocs { * `| `__`.`__[**`count`**][Grouped.count]**`() `** * * {@include [Indent]} + * `| `__`.`__[**`countDistinct`**][Grouped.countDistinct]**`() `** + * + * {@include [Indent]} * `| `__`.`__[**`aggregate`**][Grouped.aggregate]**` { `**`aggregations: `[`AggregateDsl`][AggregateDsl]**` }`** * * {@include [Indent]} @@ -196,6 +199,8 @@ internal interface GroupByDocs { * * * [count][Grouped.count] — calculate the number of rows in each group * (optionally counting only rows that satisfy the given predicate); + * * [`countDistinct`][Grouped.countDistinct] — calculate the number of distinct rows in each group + * (or distinct combinations of values in selected columns); * * [max][Grouped.max] / [maxOf][Grouped.maxOf] / [maxFor][Grouped.maxFor] — * calculate the maximum of all values on the selected columns / by a row expression / * for each of the selected columns within each group; @@ -301,6 +306,8 @@ internal interface GroupByDocs { * from all rows of each group for the selected columns. 
* * [count][Grouped.count] — creates a [DataFrame] containing the grouping key columns and an additional column * with the number of rows in each corresponding group; + * * [countDistinct][Grouped.countDistinct] — creates a [DataFrame] containing the grouping key columns + * and an additional column with the number of distinct rows in each corresponding group; * * [aggregate][Grouped.aggregate] — performs a set of custom aggregations using [AggregateDsl], * allowing you to compute one or more derived values per group; * * [Various aggregation statistics][AggregationStatistics] — predefined shortcuts diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt new file mode 100644 index 0000000000..0e0810c9ee --- /dev/null +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt @@ -0,0 +1,146 @@ +package org.jetbrains.kotlinx.dataframe.api + +import io.kotest.matchers.shouldBe +import org.jetbrains.kotlinx.dataframe.nrow +import org.junit.Test + +class CountDistinctTests { + + private val df = dataFrameOf( + "name" to columnOf("Alice", "Alice", "Bob", "Charlie"), + "age" to columnOf(15, 15, 20, 25), + "group" to columnOf(1, 1, 1, 2), + ) + + @Test + fun `countDistinct on GroupBy`() { + val result = df.groupBy("group").countDistinct() + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "countDistinct" to columnOf(2, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on GroupBy with custom result name`() { + val result = df.groupBy("group").countDistinct("unique") + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "unique" to columnOf(2, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on GroupBy with one unique row`() { + val df = dataFrameOf( + "name" to columnOf("Alice", "Alice", "Alice"), + "age" to columnOf(15, 15, 15), + "group" to columnOf(1, 1, 1), + ) + val result = 
df.groupBy("group").countDistinct() + val expected = dataFrameOf( + "group" to columnOf(1), + "countDistinct" to columnOf(1), + ) + result shouldBe expected + } + + // TODO: check columns as well when #1531 is fixed + @Test + fun `countDistinct on empty GroupBy`() { + df + .drop(df.nrow) + .groupBy("group").countDistinct() + .count() shouldBe 0 + } + + @Test + fun `countDistinct on GroupBy with nulls`() { + val result = df + .append(null, null, 1) + .groupBy("group").countDistinct() + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "countDistinct" to columnOf(3, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on GroupBy with null group key`() { + val result = df + .append("Dave", 30, null) + .groupBy("group").countDistinct() + val expected = dataFrameOf( + "group" to columnOf(1, 2, null), + "countDistinct" to columnOf(2, 1, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on GroupBy with columns selector`() { + val result = df.groupBy("group").countDistinct { "name"() } + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "countDistinct" to columnOf(2, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on GroupBy with columns selector (not distinct only by selected column)`() { + val df = dataFrameOf( + "name" to columnOf("Alice", "Bob", "Charlie"), + "age" to columnOf(15, 15, 20), + "group" to columnOf(1, 1, 2), + ) + val result = df.groupBy("group").countDistinct { "age"() } + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "countDistinct" to columnOf(1, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on GroupBy with multiple columns selector`() { + val df = dataFrameOf( + "name" to columnOf("Alice", "Alice", "Bob", "Charlie"), + "age" to columnOf(15, 15, 20, 25), + "group" to columnOf(1, 1, 1, 2), + "city" to columnOf("London", "Moscow", "London", "Paris"), + ) + val result = df.groupBy("group").countDistinct { "name"() and "age"() } + val expected = 
dataFrameOf( + "group" to columnOf(1, 2), + "countDistinct" to columnOf(2, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on grouped DataFrame with columns selector and custom result name`() { + val result = df.groupBy("group").countDistinct(resultName = "unique") { "name"() } + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "unique" to columnOf(2, 1), + ) + result shouldBe expected + } + + @Test + fun `countDistinct on grouped DataFrame with multiple columns selector with nulls`() { + val result = df + .append(null, null, 1) + .groupBy("group") + .countDistinct { "name"() and "age"() } + val expected = dataFrameOf( + "group" to columnOf(1, 2), + "countDistinct" to columnOf(3, 1), + ) + result shouldBe expected + } +} diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt index 9b4ecf5bfc..e8afdc8450 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt @@ -6,7 +6,6 @@ import org.jetbrains.kotlinx.dataframe.api.add import org.jetbrains.kotlinx.dataframe.api.after import org.jetbrains.kotlinx.dataframe.api.chunked import org.jetbrains.kotlinx.dataframe.api.colsOf -import org.jetbrains.kotlinx.dataframe.api.countDistinct import org.jetbrains.kotlinx.dataframe.api.distinct import org.jetbrains.kotlinx.dataframe.api.distinctBy import org.jetbrains.kotlinx.dataframe.api.drop @@ -431,30 +430,6 @@ class Access : TestBase() { // SampleEnd } - @Test - @TransformDataFrameExpressions - fun countDistinct() { - // SampleStart - df.countDistinct() - // SampleEnd - } - - @Test - @TransformDataFrameExpressions - fun countDistinctColumns_properties() { - // SampleStart - df.countDistinct { age and name } - // SampleEnd - } - - @Test - @TransformDataFrameExpressions - fun countDistinctColumns_strings() { - // SampleStart - 
df.countDistinct("age", "name") - // SampleEnd - } - @Test @TransformDataFrameExpressions fun distinctColumns_strings() { diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsCustomNameOnGroupBy_properties.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsCustomNameOnGroupBy_properties.html new file mode 100644 index 0000000000..22abe47ddc --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsCustomNameOnGroupBy_properties.html @@ -0,0 +1,511 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsDf.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsDf.html new file mode 100644 index 0000000000..8a2594d162 --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsDf.html @@ -0,0 +1,516 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html new file mode 100644 index 0000000000..ba884df876 --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsGroupBy.html @@ -0,0 +1,574 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsOnGroupBy_properties.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsOnGroupBy_properties.html new file mode 100644 index 0000000000..33f67fa573 --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctColumnsOnGroupBy_properties.html @@ -0,0 +1,511 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctDf.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctDf.html new file mode 100644 index 0000000000..402c557208 --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctDf.html @@ -0,0 +1,516 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html new file mode 100644 index 0000000000..f26cbd0bc9 --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctGroupBy.html @@ -0,0 +1,574 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBySmallTable_properties.html b/docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBySmallTable_properties.html new file mode 100644 index 0000000000..f84d1e8140 --- /dev/null +++ b/docs/StardustDocs/resources/api/countDistinct/countDistinctOnGroupBySmallTable_properties.html @@ -0,0 +1,511 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/groupBy/countDistinctOnGroupBy_properties.html b/docs/StardustDocs/resources/api/groupBy/countDistinctOnGroupBy_properties.html new file mode 100644 index 0000000000..d8e535cdb1 --- /dev/null +++ b/docs/StardustDocs/resources/api/groupBy/countDistinctOnGroupBy_properties.html @@ -0,0 +1,511 @@ + + + + + +
+ +

+ + + diff --git a/docs/StardustDocs/resources/api/pivot/pivotInward_properties.html b/docs/StardustDocs/resources/api/pivot/pivotInward_properties.html index 49beb58089..8ff674999f 100644 --- a/docs/StardustDocs/resources/api/pivot/pivotInward_properties.html +++ b/docs/StardustDocs/resources/api/pivot/pivotInward_properties.html @@ -459,7 +459,7 @@ /**/ diff --git a/docs/StardustDocs/topics/_shadow_resources.md b/docs/StardustDocs/topics/_shadow_resources.md index c9a1d3b506..e000fc5e0e 100644 --- a/docs/StardustDocs/topics/_shadow_resources.md +++ b/docs/StardustDocs/topics/_shadow_resources.md @@ -202,6 +202,7 @@ + @@ -318,3 +319,10 @@ + + + + + + + diff --git a/docs/StardustDocs/topics/countDistinct.md b/docs/StardustDocs/topics/countDistinct.md index 5c2a406761..b1b01039c6 100644 --- a/docs/StardustDocs/topics/countDistinct.md +++ b/docs/StardustDocs/topics/countDistinct.md @@ -1,33 +1,129 @@ [//]: # (title: countDistinct) - + -Returns number of distinct combinations of values in selected columns of [`DataFrame`](DataFrame.md). + +Counts distinct rows or distinct combinations of values in selected columns. + +When `countDistinct` is used on a [`DataFrame`](DataFrame.md), +it returns the number of distinct rows in this [`DataFrame`](DataFrame.md). + + + +```kotlin +df +``` + + + + + + +```kotlin +df.countDistinct() // the result is 4 +``` + + + +You can also specify which columns to use when counting distinct combinations of values. + + ```kotlin -df.countDistinct { age and name } +df.countDistinct { name.firstName and city } // the result is 3 ``` ```kotlin -df.countDistinct("age", "name") +df.countDistinct { "name"["firstName"] and "city" } // the result is 3 ``` -When `columns` are not specified, returns number of distinct rows in [`DataFrame`](DataFrame.md). +When `countDistinct` is used on a `GroupBy`, it counts distinct rows within each group. 
+That is, this function returns a [`DataFrame`](DataFrame.md) where each row corresponds to a group +from the original `GroupBy`. The result contains the original group key columns +and a new column with the number of distinct rows (or combinations of values in selected columns) in each group. - +Let's take this `GroupBy` as an example: + + + +```kotlin +df.groupBy { city } +``` + + + + +Applying `countDistinct` to this `GroupBy` yields the following result: + + + + + +```kotlin +df.groupBy { city }.countDistinct() +``` + + + ```kotlin -df.countDistinct() +df.groupBy("city").countDistinct() ``` + + + + +You can also specify which columns in the groups should be used to determine distinctness. + + + + + + + +```kotlin +df.groupBy { city }.countDistinct { name.firstName } +``` + + + + +```kotlin +df.groupBy("city").countDistinct { "name"["firstName"] } +``` + + + + + +The default name of the new column is `countDistinct`, but you can choose a different one. + + + + + +```kotlin +df.groupBy { city }.countDistinct("uniqueFirstNames") { name.firstName } +``` + + + + +```kotlin +df.groupBy("city").countDistinct("uniqueFirstNames") { "name"["firstName"] } +``` + + + diff --git a/docs/StardustDocs/topics/groupBy.md b/docs/StardustDocs/topics/groupBy.md index b710b53e81..122fa8ace0 100644 --- a/docs/StardustDocs/topics/groupBy.md +++ b/docs/StardustDocs/topics/groupBy.md @@ -23,7 +23,7 @@ transformations = [ .sortByGroup { expression } | .sortByGroupDesc { expression reducer = .minBy { column } | .maxBy { column } | .medianBy { rowExpression } | .percentileBy(percentile) { rowExpression } | .first [ { rowCondition } ] | .last [ { rowCondition } ] .concat() | .into([column]) [{ rowExpression }] | .values { valueColumns } -aggregator = .count() | .concat() | .concatWithKeys() | .toDataFrame() | .into([column]) [{ rowExpression }] | .values { valueColumns } | .aggregate { aggregations } | . 
[ { columns } ] +aggregator = .count() | .countDistinct() | .concat() | .concatWithKeys() | .toDataFrame() | .into([column]) [{ rowExpression }] | .values { valueColumns } | .aggregate { aggregations } | . [ { columns } ] pivot = .pivot { columns } [ .default(defaultValue) ] @@ -602,22 +602,35 @@ A `GroupBy` can be directly transformed into a new [`DataFrame`](DataFrame.md) b Aggregation is a generalization of [`reducing`](groupBy.md#reducing). The following aggregation methods are available: -* [`concat`](concat.md) — concatenates all [`rows`](DataRow.md) from all groups into a single [`DataFrame`](DataFrame.md), without preserving grouping keys. -* [`toDataFrame`](createDataFrame.md#todataframe) — returns this `GroupBy` as a [`DataFrame`](DataFrame.md) with the grouping keys and corresponding groups in [FrameColumn](DataColumn.md#framecolumn). -* `concatWithKeys` — a variant of [`concat`](concat.md) that also includes grouping keys that were not present in the original [`DataFrame`](DataFrame.md). -* `into` — creates a new [`column`](DataColumn.md) containing a list of values computed with a `RowExpression` for each group, or a new [FrameColumn](DataColumn.md#framecolumn) containing the groups themselves. -* [`values`](values.md) — collects all column values for every group without aggregation. For a [ValueColumn](DataColumn.md#valuecolumn) of type `T` it will gather group values into lists of type `List`. -For a [ColumnGroup](DataColumn.md#columngroup) it will gather group values into a [`DataFrame`](DataFrame.md) and convert that [ColumnGroup](DataColumn.md#columngroup) into a [FrameColumn](DataColumn.md#framecolumn). -* [`count`](count.md) — creates a [`DataFrame`](DataFrame.md) containing the grouping key columns and an additional [`column`](DataColumn.md) with the number of rows in each corresponding group. 
-* `aggregate` — performs a set of custom aggregations using `AggregateDsl`, allowing you to compute one or more [statistics](summaryStatistics.md) per every group of `GroupBy`. -The body if this function will be executed for every data group and has a receiver of type [`DataFrame`](DataFrame.md) that represents the current data group being aggregated. +* [`concat`](concat.md) — concatenates all [`rows`](DataRow.md) from all groups into a single [`DataFrame`](DataFrame.md), +without preserving grouping keys. +* [`toDataFrame`](createDataFrame.md#todataframe) — returns this `GroupBy` as a [`DataFrame`](DataFrame.md) +with the grouping keys and corresponding groups in [FrameColumn](DataColumn.md#framecolumn). +* `concatWithKeys` — a variant of [`concat`](concat.md) +that also includes grouping keys that were not present in the original [`DataFrame`](DataFrame.md). +* `into` — creates a new [`column`](DataColumn.md) containing a list of values computed with a `RowExpression` for each group, +or a new [FrameColumn](DataColumn.md#framecolumn) containing the groups themselves. +* [`values`](values.md) — collects all column values for every group without aggregation. +For a [ValueColumn](DataColumn.md#valuecolumn) of type `T` it will gather group values into lists of type `List`. +For a [ColumnGroup](DataColumn.md#columngroup) it will gather group values into a [`DataFrame`](DataFrame.md) +and convert that [ColumnGroup](DataColumn.md#columngroup) into a [FrameColumn](DataColumn.md#framecolumn). +* [`count`](count.md) — creates a [`DataFrame`](DataFrame.md) containing the grouping key columns +and an additional [`column`](DataColumn.md) with the number of rows in each corresponding group. +* [`countDistinct`](countDistinct.md) — creates a [`DataFrame`](DataFrame.md) containing the grouping key columns +and an additional [`column`](DataColumn.md) with the number of distinct rows in each corresponding group. 
+* `aggregate` — performs a set of custom aggregations using `AggregateDsl`, +allowing you to compute one or more [statistics](summaryStatistics.md) per every group of `GroupBy`. +The body of this function will be executed for every data group and has a receiver of type [`DataFrame`](DataFrame.md) +that represents the current data group being aggregated. To add a new column to the resulting [`DataFrame`](DataFrame.md), pass the name of the new column to infix function `into`. -Each of these methods returns a new DataFrame that includes the grouping key columns (except for [`concat`](concat.md)) along with the columns of values aggregated from the corresponding groups. +Each of these methods returns a new DataFrame that includes the grouping key columns (except for [`concat`](concat.md)) +along with the columns of values aggregated from the corresponding groups. ### Examples of aggregation #### concat on GroupBy {collapsible="true"} -[`concat`](concat.md) can be used to union all data groups of `GroupBy` into the original [`DataFrame`](DataFrame.md) preserving the new order of rows produced by grouping: +[`concat`](concat.md) can be used to union all data groups of `GroupBy` into the original [`DataFrame`](DataFrame.md) +preserving the new order of rows produced by grouping: @@ -781,6 +794,26 @@ df.groupBy("city").count() +#### countDistinct on GroupBy {collapsible="true"} + + + + +```kotlin +df.groupBy { city }.countDistinct { name.firstName } +``` + + + + +```kotlin +df.groupBy("city").countDistinct { "name"["firstName"] } +``` + + + + + #### aggregate on GroupBy {collapsible="true"} @@ -849,6 +882,7 @@ Each function computes a statistic across the [`rows`](DataRow.md) of a group an The following aggregation statistics are available: * [`count`](count.md); +* [`countDistinct`](countDistinct.md); * [`max / maxOf / maxFor`](minmax.md); * [`min / minOf / minFor`](minmax.md); * [`sum / sumOf / sumFor`](sum.md); diff --git a/samples/build.gradle.kts 
b/samples/build.gradle.kts index 5e3051ad85..7269a5e63b 100644 --- a/samples/build.gradle.kts +++ b/samples/build.gradle.kts @@ -114,6 +114,7 @@ korro { include("columnArithmetics.md") include("groupBy.md") include("pivot.md") + include("countDistinct.md") }, ) baseDir = rootProject.file("docs/StardustDocs/topics") diff --git a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt index 8b63791f9a..620d9a1a6d 100644 --- a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt +++ b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/DataFrameSampleHelper.kt @@ -1,8 +1,13 @@ package org.jetbrains.kotlinx.dataframe.samples import org.jetbrains.kotlinx.dataframe.DataColumn +import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.api.CodeString +import org.jetbrains.kotlinx.dataframe.api.GroupBy import org.jetbrains.kotlinx.dataframe.api.toDataFrame +import org.jetbrains.kotlinx.dataframe.io.DataFrameHtmlData +import org.jetbrains.kotlinx.dataframe.io.DisplayConfiguration +import org.jetbrains.kotlinx.dataframe.io.toStandaloneHtml import org.jetbrains.kotlinx.dataframe.samples.api.TestBase import org.jetbrains.kotlinx.kandy.letsplot.samples.SampleHelper import java.io.File @@ -44,4 +49,49 @@ abstract class DataFrameSampleHelper(sampleName: String, subFolder: String = "sa fun DataColumn<*>.saveDfHtmlSample() { toDataFrame().saveDfHtmlSample() } + + // TODO: might be changed as #1887 is fixed + private val expandNestedFramesScript = DataFrameHtmlData( + script = + """ + (function () { + function expandColumnGroups(df) { + for (let col of df.cols) { + if (col.parent === undefined && col.children.length > 0) col.expanded = true; + } + } + + function expandNestedFrames(df, rootDf) { + for (let col of df.cols) { + for (let value of col.values) { + if (value && value.frameId 
!== undefined) { + rootDf.expandedFrames.add(value.frameId); + let child = rootDf.childFrames[value.frameId]; + if (child) { + expandColumnGroups(child); + expandNestedFrames(child, rootDf); + } + } + } + } + } + + document.querySelectorAll("table.dataframe").forEach(function (table) { + if (table.df && table.df.id === table.df.rootId) { + let rootDf = table.df; + expandNestedFrames(rootDf, rootDf); + DataFrame.renderTable(rootDf.id); + } + }); + })(); + """.trimIndent(), + ) + + fun DataFrame<*>.toHtmlWithOpenedNestedDfs() = + toStandaloneHtml( + configuration = DisplayConfiguration(enableFallbackStaticTables = false), + getFooter = { "" }, + ) + expandNestedFramesScript + + fun GroupBy<*, *>.toHtmlWithOpenedNestedDfs() = toDataFrame().toHtmlWithOpenedNestedDfs() } diff --git a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt new file mode 100644 index 0000000000..377984cb36 --- /dev/null +++ b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/CountDistinctSamples.kt @@ -0,0 +1,137 @@ +package org.jetbrains.kotlinx.dataframe.samples.api + +import org.jetbrains.kotlinx.dataframe.api.RgbColor +import org.jetbrains.kotlinx.dataframe.api.and +import org.jetbrains.kotlinx.dataframe.api.convert +import org.jetbrains.kotlinx.dataframe.api.countDistinct +import org.jetbrains.kotlinx.dataframe.api.filter +import org.jetbrains.kotlinx.dataframe.api.format +import org.jetbrains.kotlinx.dataframe.api.groupBy +import org.jetbrains.kotlinx.dataframe.api.perRowCol +import org.jetbrains.kotlinx.dataframe.api.take +import org.jetbrains.kotlinx.dataframe.api.with +import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper +import org.jetbrains.kotlinx.dataframe.util.defaultHeaderFormatting +import org.junit.Test + +class CountDistinctSamples : DataFrameSampleHelper("countDistinct", "api") { + val df = 
peopleDf.take(7).filter { index() in setOf(0, 1, 2, 6) } + + private fun firstNameToColor(name: String): RgbColor = + when (name) { + "Alice" -> RgbColor(189, 206, 233) + "Bob" -> RgbColor(198, 224, 198) + "Charlie" -> RgbColor(219, 198, 230) + else -> RgbColor(255, 255, 255) + } + + @Test + fun countDistinctDf() { + // SampleStart + df + // SampleEnd + .saveDfHtmlSample() + } + + @Test + fun countDistinct() { + // SampleStart + df.countDistinct() // the result is 4 + // SampleEnd + } + + @Test + fun countDistinctColumns_properties() { + // SampleStart + df.countDistinct { name.firstName and city } // the result is 3 + // SampleEnd + } + + @Test + fun countDistinctColumns_strings() { + // SampleStart + df.countDistinct { "name"["firstName"] and "city" } // the result is 3 + // SampleEnd + } + + @Test + fun countDistinctColumnsDf() { + df.format().perRowCol { row, _ -> + val firstName = df[row.index()].name.firstName + background(firstNameToColor(firstName)) and textColor(black) + } + .saveDfHtmlSample() + } + + @Test + fun countDistinctGroupBy() { + // SampleStart + df.groupBy { city } + // SampleEnd + .toHtmlWithOpenedNestedDfs() + .saveDfHtmlSample() + } + + @Test + fun countDistinctOnGroupBySmallTable_properties() { + // SampleStart + df.groupBy { city }.countDistinct() + // SampleEnd + .defaultHeaderFormatting { "countDistinct"() } + .saveDfHtmlSample() + } + + @Test + fun countDistinctOnGroupBySmallTable_strings() { + // SampleStart + df.groupBy("city").countDistinct() + // SampleEnd + } + + @Test + fun countDistinctColumnsOnGroupBy_properties() { + // SampleStart + df.groupBy { city }.countDistinct { name.firstName } + // SampleEnd + .defaultHeaderFormatting { "countDistinct"() } + .saveDfHtmlSample() + } + + @Test + fun countDistinctColumnsOnGroupBy_strings() { + // SampleStart + df.groupBy("city").countDistinct { "name"["firstName"] } + // SampleEnd + } + + @Test + fun countDistinctColumnsCustomNameOnGroupBy_properties() { + // SampleStart + df.groupBy { 
city }.countDistinct("uniqueFirstNames") { name.firstName } + // SampleEnd + .defaultHeaderFormatting { "uniqueFirstNames"() } + .saveDfHtmlSample() + } + + @Test + fun countDistinctColumnsCustomNameOnGroupBy_strings() { + // SampleStart + df.groupBy("city").countDistinct("uniqueFirstNames") { "name"["firstName"] } + // SampleEnd + } + + @Test + fun countDistinctColumnsGroupBy() { + df.groupBy { city } + .toDataFrame() + .convert { group }.with { group -> + val firstNameCol = group["name"]["firstName"] + group.format().perRowCol { row, _ -> + val firstName = firstNameCol[row.index()] as String + background(firstNameToColor(firstName)) and textColor(black) + } + } + .toHtmlWithOpenedNestedDfs() + .saveDfHtmlSample() + } +} diff --git a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/GroupBySamples.kt b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/GroupBySamples.kt index bde4d02289..59f4f0372e 100644 --- a/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/GroupBySamples.kt +++ b/samples/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/GroupBySamples.kt @@ -12,6 +12,7 @@ import org.jetbrains.kotlinx.dataframe.api.concat import org.jetbrains.kotlinx.dataframe.api.concatWithKeys import org.jetbrains.kotlinx.dataframe.api.convert import org.jetbrains.kotlinx.dataframe.api.count +import org.jetbrains.kotlinx.dataframe.api.countDistinct import org.jetbrains.kotlinx.dataframe.api.dataFrameOf import org.jetbrains.kotlinx.dataframe.api.div import org.jetbrains.kotlinx.dataframe.api.expr @@ -661,6 +662,22 @@ class GroupBySamples : DataFrameSampleHelper("groupBy", "api") { // SampleEnd } + @Test + fun countDistinctOnGroupBy_properties() { + // SampleStart + df.groupBy { city }.countDistinct { name.firstName } + // SampleEnd + .defaultHeaderFormatting { "countDistinct"() } + .saveDfHtmlSample() + } + + @Test + fun countDistinctOnGroupBy_strings() { + // SampleStart + df.groupBy("city").countDistinct { 
"name"["firstName"] } + // SampleEnd + } + @Test fun aggregateOnGroupBy_properties() { // SampleStart