Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions core/api/core.api
Original file line number Diff line number Diff line change
Expand Up @@ -1714,6 +1714,10 @@ public final class org/jetbrains/kotlinx/dataframe/api/CountDistinctKt {
public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Ljava/lang/String;)I
public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Lkotlin/reflect/KProperty;)I
public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Lorg/jetbrains/kotlinx/dataframe/columns/ColumnReference;)I
public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;Lkotlin/jvm/functions/Function2;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun countDistinct$default (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun countDistinct$default (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;Lkotlin/jvm/functions/Function2;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
}

public final class org/jetbrains/kotlinx/dataframe/api/CountKt {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,18 @@ import org.jetbrains.kotlinx.dataframe.AnyColumnReference
import org.jetbrains.kotlinx.dataframe.ColumnsSelector
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload
import org.jetbrains.kotlinx.dataframe.annotations.Interpretable
import org.jetbrains.kotlinx.dataframe.annotations.Refine
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COLUMNS_PARAM
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COLUMN_SELECTION_DSL
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COMPARISON_OBJECT
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.EXAMPLE
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.SCOPE
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls
import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources
import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns
import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateValue
import org.jetbrains.kotlinx.dataframe.indices
import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API
import kotlin.reflect.KProperty
Expand Down Expand Up @@ -93,3 +101,81 @@ public fun <T> DataFrame<T>.countDistinct(vararg columns: AnyColumnReference): I
countDistinct { columns.toColumnSet() }

// endregion

// region GroupBy

/**
* Aggregates this [GroupBy] by counting the number of distinct {@get [COMPARISON_OBJECT] rows} in each group.
*
* Compares rows in each group based on the values in {@get [SCOPE] all} columns.
* Returns a new [DataFrame] where each row corresponds to a group.
* The resulting [DataFrame] contains:
* - the original group key columns,
* - a new column (named [resultName\], default is `"countDistinct"`)
* that contains the number of distinct {@get [COMPARISON_OBJECT] rows} in each group.
*
* See also:
* - [aggregate][Grouped.aggregate], which aggregates a [GroupBy] using the provided statistics.
* - [count][Grouped.count], which counts the number of rows in each group.
* - [distinct][DataFrame.distinct], which removes duplicate rows and returns a new [DataFrame].
* - [groupBy][DataFrame.groupBy], which groups the rows of a [DataFrame]
* based on the values in one or more specified columns.
*
* For more information: {@include [DocumentationUrls.CountDistinct]}
*
* {@get [COLUMN_SELECTION_DSL]}
*
* ### Example
* ```kotlin
* {@get [EXAMPLE]}
* ```
*
* @param [resultName\] The name of the result column that will store the number
* of distinct {@get [COMPARISON_OBJECT] rows} in each group. Defaults to `"countDistinct"`.
* @get [COLUMNS_PARAM]
* @return A new [DataFrame] with group keys and corresponding numbers of distinct {@get [COMPARISON_OBJECT] rows}.
*/
@ExcludeFromSources
private interface CountDistinctOnGroupByDocs {
// The aliases below carry no type information; they only serve as named keys that the
// KDoc template above reads with `{@get [KEY] fallback}` and that the KDocs including
// this template fill in with `@set [KEY] ...`.

// What is being counted as distinct; the template falls back to "rows" when unset.
typealias COMPARISON_OBJECT = Nothing
// Which columns take part in the comparison; the template falls back to "all" when unset.
typealias SCOPE = Nothing
// Code sample placed inside the "Example" section of the template.
typealias EXAMPLE = Nothing
// Optional section describing the column selection DSL; the template supplies no fallback text.
typealias COLUMN_SELECTION_DSL = Nothing
// Optional `@param` entry for the columns selector; the template supplies no fallback text.
typealias COLUMNS_PARAM = Nothing
}

/**
 * @include [CountDistinctOnGroupByDocs]
 * @set [EXAMPLE]
 * // Counts the number of distinct rows for each city, returning
 * // a new DataFrame with columns "city" and "countDistinct"
 * df.groupBy { city }.countDistinct()
 */
@Refine
@Interpretable("GroupByCountDistinct0")
public fun <T> Grouped<T>.countDistinct(resultName: String = "countDistinct"): DataFrame<T> {
    // Whole-row distinctness is the selector-based overload applied to every column.
    return countDistinct(resultName) { all() }
}

/**
 * @include [CountDistinctOnGroupByDocs]
 * @set [COMPARISON_OBJECT] combinations of values in the selected [columns]
 * @set [SCOPE] the selected
 * @set [COLUMN_SELECTION_DSL] {@include [SelectingColumns.ColumnsSelectionDsl]}
 * @set [EXAMPLE]
 * // Counts unique combinations of values in the "year" and "title" columns
 * // for each city, returning a new DataFrame with columns "city" and "countDistinct"
 * df.groupBy { city }.countDistinct { year and title }
 * @set [COLUMNS_PARAM] @param [columns\] The [ColumnsSelector] used to select columns
 * that will be considered for evaluating whether the rows are distinct.
 */
@Refine
@Interpretable("GroupByCountDistinct0")
public fun <T, C> Grouped<T>.countDistinct(
    resultName: String = "countDistinct",
    columns: ColumnsSelector<T, C>,
): DataFrame<T> =
    aggregateValue(resultName) {
        // Counts distinct value combinations of the selected columns; the result is stored
        // under `resultName`. Presumably the lambda is evaluated once per group and `default 0`
        // is the value used when a group yields no result — confirm against `aggregateValue`.
        countDistinct(columns) default 0
    }

// endregion
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ internal interface GroupByDocs {
* `| `__`.`__[**`count`**][Grouped.count]**`() `**
*
* {@include [Indent]}
* `| `__`.`__[**`countDistinct`**][Grouped.countDistinct]**`() `**
*
* {@include [Indent]}
* `| `__`.`__[**`aggregate`**][Grouped.aggregate]**` { `**`aggregations: `[`AggregateDsl`][AggregateDsl]**` }`**
*
* {@include [Indent]}
Expand Down Expand Up @@ -190,6 +193,8 @@ internal interface GroupByDocs {
*
* * [count][Grouped.count] — calculate the number of rows in each group
* (optionally counting only rows that satisfy the given predicate);
* * [countDistinct][Grouped.countDistinct] — calculate the number of distinct rows in each group
* (or distinct combinations of values in selected columns);
* * [max][Grouped.max] / [maxOf][Grouped.maxOf] / [maxFor][Grouped.maxFor] —
* calculate the maximum of all values on the selected columns / by a row expression /
* for each of the selected columns within each group;
Expand Down Expand Up @@ -295,6 +300,8 @@ internal interface GroupByDocs {
* from all rows of each group for the selected columns.
* * [count][Grouped.count] — creates a [DataFrame] containing the grouping key columns and an additional column
* with the number of rows in each corresponding group;
* * [countDistinct][Grouped.countDistinct] — creates a [DataFrame] containing the grouping key columns
* and an additional column with the number of distinct rows in each corresponding group;
* * [aggregate][Grouped.aggregate] — performs a set of custom aggregations using [AggregateDsl],
* allowing you to compute one or more derived values per group;
* * [Various aggregation statistics][AggregationStatistics] — predefined shortcuts
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
package org.jetbrains.kotlinx.dataframe.api

import io.kotest.matchers.shouldBe
import org.jetbrains.kotlinx.dataframe.nrow
import org.junit.Test

/**
 * Tests for `countDistinct` applied to the result of `groupBy`:
 * the overload that compares whole rows and the overload that compares only selected columns,
 * each with the default and a custom result column name.
 */
class CountDistinctTests {

    // Shared fixture: group 1 holds a duplicated ("Alice", 15) row plus ("Bob", 20);
    // group 2 holds a single row.
    private val df = dataFrameOf(
        "name" to columnOf("Alice", "Alice", "Bob", "Charlie"),
        "age" to columnOf(15, 15, 20, 25),
        "group" to columnOf(1, 1, 1, 2),
    )

    @Test
    fun `countDistinct on GroupBy`() {
        val result = df.groupBy("group").countDistinct()
        val expected = dataFrameOf(
            "group" to columnOf(1, 2),
            "countDistinct" to columnOf(2, 1),
        )
        result shouldBe expected
    }

    @Test
    fun `countDistinct on GroupBy with custom result name`() {
        val result = df.groupBy("group").countDistinct("unique")
        val expected = dataFrameOf(
            "group" to columnOf(1, 2),
            "unique" to columnOf(2, 1),
        )
        result shouldBe expected
    }

    @Test
    fun `countDistinct on GroupBy with one unique row`() {
        val df = dataFrameOf(
            "name" to columnOf("Alice", "Alice", "Alice"),
            "age" to columnOf(15, 15, 15),
            "group" to columnOf(1, 1, 1),
        )
        val result = df.groupBy("group").countDistinct()
        val expected = dataFrameOf(
            "group" to columnOf(1),
            "countDistinct" to columnOf(1),
        )
        result shouldBe expected
    }

    // TODO: check columns as well when #1531 is fixed
    @Test
    fun `countDistinct on empty GroupBy`() {
        // Dropping every row leaves nothing to group, so only the row count is asserted here.
        df
            .drop(df.nrow)
            .groupBy("group").countDistinct()
            .count() shouldBe 0
    }

    @Test
    fun `countDistinct on GroupBy with nulls`() {
        // The appended (null, null) row is expected to count as one more distinct row in group 1.
        val result = df
            .append(null, null, 1)
            .groupBy("group").countDistinct()
        val expected = dataFrameOf(
            "group" to columnOf(1, 2),
            "countDistinct" to columnOf(3, 1),
        )
        result shouldBe expected
    }

    @Test
    fun `countDistinct on GroupBy with null group key`() {
        // A null key is expected to form its own group.
        val result = df
            .append("Dave", 30, null)
            .groupBy("group").countDistinct()
        val expected = dataFrameOf(
            "group" to columnOf(1, 2, null),
            "countDistinct" to columnOf(2, 1, 1),
        )
        result shouldBe expected
    }

    @Test
    fun `countDistinct on GroupBy with columns selector`() {
        val result = df.groupBy("group").countDistinct { "name"<String>() }
        val expected = dataFrameOf(
            "group" to columnOf(1, 2),
            "countDistinct" to columnOf(2, 1),
        )
        result shouldBe expected
    }

    @Test
    fun `countDistinct on GroupBy with columns selector (not distinct only by selected column)`() {
        // Rows in group 1 differ by "name" but share the same "age",
        // so selecting only "age" must collapse them into one distinct value.
        val df = dataFrameOf(
            "name" to columnOf("Alice", "Bob", "Charlie"),
            "age" to columnOf(15, 15, 20),
            "group" to columnOf(1, 1, 2),
        )
        val result = df.groupBy("group").countDistinct { "age"<Int>() }
        val expected = dataFrameOf(
            "group" to columnOf(1, 2),
            "countDistinct" to columnOf(1, 1),
        )
        result shouldBe expected
    }

    @Test
    fun `countDistinct on GroupBy with multiple columns selector`() {
        // "city" differs between the two Alice rows but is not selected, so they still count once.
        val df = dataFrameOf(
            "name" to columnOf("Alice", "Alice", "Bob", "Charlie"),
            "age" to columnOf(15, 15, 20, 25),
            "group" to columnOf(1, 1, 1, 2),
            "city" to columnOf("London", "Moscow", "London", "Paris"),
        )
        val result = df.groupBy("group").countDistinct { "name"<String>() and "age"<Int>() }
        val expected = dataFrameOf(
            "group" to columnOf(1, 2),
            "countDistinct" to columnOf(2, 1),
        )
        result shouldBe expected
    }

    @Test
    fun `countDistinct on grouped DataFrame with columns selector and custom result name`() {
        val result = df.groupBy("group").countDistinct(resultName = "unique") { "name"<String>() }
        val expected = dataFrameOf(
            "group" to columnOf(1, 2),
            "unique" to columnOf(2, 1),
        )
        result shouldBe expected
    }

    @Test
    fun `countDistinct on grouped DataFrame with multiple columns selector with nulls`() {
        val result = df
            .append(null, null, 1)
            .groupBy("group")
            .countDistinct { "name"<String>() and "age"<Int>() }
        val expected = dataFrameOf(
            "group" to columnOf(1, 2),
            "countDistinct" to columnOf(3, 1),
        )
        result shouldBe expected
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import org.jetbrains.kotlinx.dataframe.api.add
import org.jetbrains.kotlinx.dataframe.api.after
import org.jetbrains.kotlinx.dataframe.api.chunked
import org.jetbrains.kotlinx.dataframe.api.colsOf
import org.jetbrains.kotlinx.dataframe.api.countDistinct
import org.jetbrains.kotlinx.dataframe.api.distinct
import org.jetbrains.kotlinx.dataframe.api.distinctBy
import org.jetbrains.kotlinx.dataframe.api.drop
Expand Down Expand Up @@ -431,30 +430,6 @@ class Access : TestBase() {
// SampleEnd
}

@Test
@TransformDataFrameExpressions
fun countDistinct() {
// SampleStart
df.countDistinct()
// SampleEnd
}

@Test
@TransformDataFrameExpressions
fun countDistinctColumns_properties() {
// SampleStart
df.countDistinct { age and name }
// SampleEnd
}

@Test
@TransformDataFrameExpressions
fun countDistinctColumns_strings() {
// SampleStart
df.countDistinct("age", "name")
// SampleEnd
}

@Test
@TransformDataFrameExpressions
fun distinctColumns_strings() {
Expand Down
Loading
Loading