From 8dee32ce87e51828faacb7ff907fe0efb78c77fb Mon Sep 17 00:00:00 2001 From: Liwei Jiang Date: Wed, 25 Feb 2026 19:27:29 +0800 Subject: [PATCH 1/3] doc for aggregate function entropy --- .../aggregate-functions/entropy.md | 101 +++++++++++++++++ .../aggregate-functions/entropy.md | 103 ++++++++++++++++++ sidebars.ts | 1 + 3 files changed, 205 insertions(+) create mode 100644 docs/sql-manual/sql-functions/aggregate-functions/entropy.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/entropy.md diff --git a/docs/sql-manual/sql-functions/aggregate-functions/entropy.md b/docs/sql-manual/sql-functions/aggregate-functions/entropy.md new file mode 100644 index 0000000000000..04f4d297dc3bc --- /dev/null +++ b/docs/sql-manual/sql-functions/aggregate-functions/entropy.md @@ -0,0 +1,101 @@ +--- +{ + "title": "ENTROPY", + "language": "en", + "description": "Calculate the Shannon entropy of all non-null values in the specified column or expression." +} +--- + +## Description + +Computes the Shannon entropy of all non-null values in the specified column or expression. + +Entropy measures the uncertainty or randomness of a distribution. This function builds an empirical frequency map of the input values and computes entropy in bits using the base‑2 logarithm. + +The Shannon entropy is defined as: + +$ +Entropy(X) = -\sum_{i=1}^{k} p_i \log_2(p_i) +$ + +Where: + +- $k$ is the number of distinct non-null values +- $p_i = \frac{\text{count}(x_i)}{\text{total non-null count}}$ + +## Syntax + +```sql +ENTROPY( [, , ... , ]) +``` + +## Parameters + +| Parameter | Description | +|----------|-------------| +| ` [, , ...]` | One or more expressions or columns. Supported types: TinyInt, SmallInt, Integer, BigInt, LargeInt, Float, Double, Decimal, String, IPv4/IPv6, Array, Map,Struct. 
When multiple expressions are provided, their values are serialized together to form a single composite key, and entropy is computed over the frequency distribution of these composite keys. | + +## Return Value + +Returns a DOUBLE representing the Shannon entropy in bits. + +- Returns NULL if all values are NULL or the input is empty. +- Ignores NULL values during computation. When multiple expressions are provided, a row is ignored if any of its expressions is NULL. + +## Examples + +```sql +CREATE TABLE t1 ( + id INT, + v INT +) DISTRIBUTED BY HASH(id) BUCKETS 1 +PROPERTIES ("replication_num"="1"); + +INSERT INTO t1 VALUES + (1, 1), + (2, 2), + (3, 2), + (4, NULL); +``` + +```sql +SELECT entropy(v) FROM t1; +``` + +Distribution: `{1:1, 2:2}` $H = -\left(\frac{1}{3}\log_2\frac{1}{3} + \frac{2}{3}\log_2\frac{2}{3}\right)=0.9183$ + +```text ++--------------------+ +| entropy(x) | ++--------------------+ +| 0.9182958340544896 | ++--------------------+ +``` + +```sql +SELECT entropy(1); +``` + +Only one distinct value → entropy = 0 + +```text ++------------+ +| entropy(1) | ++------------+ +| 0 | ++------------+ +``` + +```sql +SELECT entropy(NULL) FROM t1; +``` + +Returns NULL if all values are NULL or the input is empty. 
+ +```sql ++---------------+ +| entropy(NULL) | ++---------------+ +| NULL | ++---------------+ +``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/entropy.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/entropy.md new file mode 100644 index 0000000000000..0e952764f3698 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/entropy.md @@ -0,0 +1,103 @@ +--- +{ + "title": "ENTROPY", + "language": "zh-CN", + "description": "计算指定列或表达式中所有非 NULL 值的香农熵(Shannon Entropy)。" +} +--- + +## 描述 + +计算指定列或表达式中所有非 NULL 值的香农熵(Shannon Entropy)。 + +熵用于衡量分布的不确定性或随机性。该函数会基于输入值构建经验频率分布,并使用以 2 为底的对数计算熵,单位为 比特(bits)。 + +香农熵的定义如下: + +$ +Entropy(X) = -\sum_{i=1}^{k} p_i \log_2(p_i) +$ + +其中: + +- $k$ 为非 NULL 的不同值的数量 +- $p_i = \frac{x_i的数量}{\text{所有非null值数量}}$ + +## 语法 + +```sql +ENTROPY( [, , ... , ]) +``` + +## 参数 + +| 参数 | 说明 | +|------|------| +| ` [, , ...]` | 一个或多个表达式或列。支持的类型包括:TinyInt、SmallInt、Integer、BigInt、LargeInt、Float、Double、Decimal、String、IPv4/IPv6、Array、Map、Struct 等。当提供多列时,每行的多个值会被序列化为一个复合键,并基于复合键的频率分布计算熵。 | + +## 返回值 + +返回一个 DOUBLE,表示以比特为单位的香农熵。 + +- 如果所有值均为 NULL 或输入为空,则返回 NULL。 +- 计算过程中会忽略 NULL 值。 + +## 举例 + +```sql +CREATE TABLE t1 ( + id INT, + v INT +) DISTRIBUTED BY HASH(id) BUCKETS 1 +PROPERTIES ("replication_num"="1"); + +INSERT INTO t1 VALUES + (1, 1), + (2, 2), + (3, 2), + (4, NULL); +``` + +```sql +SELECT entropy(v) FROM t1; +``` + +频率分布:`{1:1, 2:2}` + +熵的计算: $H = -\left(\frac{1}{3}\log_2\frac{1}{3} + \frac{2}{3}\log_2\frac{2}{3}\right)=0.9183$ + +```text ++--------------------+ +| entropy(x) | ++--------------------+ +| 0.9182958340544896 | ++--------------------+ +``` + +```sql +SELECT entropy(1); +``` + +只有一个唯一值 → 熵 = 0 + +```text ++------------+ +| entropy(1) | ++------------+ +| 0 | ++------------+ +``` + +```sql +SELECT entropy(NULL) FROM t1; +``` + +当所有值均为 NULL 或输入为空时返回 NULL。 + +```text 
++---------------+ +| entropy(NULL) | ++---------------+ +| NULL | ++---------------+ +``` diff --git a/sidebars.ts b/sidebars.ts index 718e735d560fa..16e1bda661392 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -1895,6 +1895,7 @@ const sidebars: SidebarsConfig = { 'sql-manual/sql-functions/aggregate-functions/count-by-enum', 'sql-manual/sql-functions/aggregate-functions/covar', 'sql-manual/sql-functions/aggregate-functions/covar-samp', + 'sql-manual/sql-functions/aggregate-functions/entropy', 'sql-manual/sql-functions/aggregate-functions/group-array-intersect', 'sql-manual/sql-functions/aggregate-functions/group-array-union', 'sql-manual/sql-functions/aggregate-functions/group-bit-and', From 5a0792f3517a3d9a4466f4241b6d06914606fb8a Mon Sep 17 00:00:00 2001 From: Liwei Jiang Date: Sat, 14 Mar 2026 03:11:13 +0800 Subject: [PATCH 2/3] fix and add multi column example --- .../aggregate-functions/entropy.md | 42 ++++++++++++++----- .../aggregate-functions/entropy.md | 38 ++++++++++++----- 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/docs/sql-manual/sql-functions/aggregate-functions/entropy.md b/docs/sql-manual/sql-functions/aggregate-functions/entropy.md index 04f4d297dc3bc..d06b732f593b0 100644 --- a/docs/sql-manual/sql-functions/aggregate-functions/entropy.md +++ b/docs/sql-manual/sql-functions/aggregate-functions/entropy.md @@ -33,7 +33,7 @@ ENTROPY( [, , ... , ]) | Parameter | Description | |----------|-------------| -| ` [, , ...]` | One or more expressions or columns. Supported types: TinyInt, SmallInt, Integer, BigInt, LargeInt, Float, Double, Decimal, String, IPv4/IPv6, Array, Map,Struct. When multiple expressions are provided, their values are serialized together to form a single composite key, and entropy is computed over the frequency distribution of these composite keys. | +| ` [, , ...]` | One or more expressions or columns. 
Supported types: TinyInt, SmallInt, Integer, BigInt, LargeInt, Float, Double, Decimal, String, IPv4/IPv6, Array, Map, Struct. When multiple expressions are provided, their values are serialized together to form a single composite key, and entropy is computed over the frequency distribution of these composite keys. | ## Return Value @@ -47,31 +47,51 @@ Returns a DOUBLE representing the Shannon entropy in bits. ```sql CREATE TABLE t1 ( id INT, - v INT + c1 INT, + c2 STRING ) DISTRIBUTED BY HASH(id) BUCKETS 1 PROPERTIES ("replication_num"="1"); INSERT INTO t1 VALUES - (1, 1), - (2, 2), - (3, 2), - (4, NULL); + (1, 1, "a"), + (2, 1, "a"), + (3, 1, "b"), + (4, 2, "a"), + (5, NULL, "a"); ``` ```sql -SELECT entropy(v) FROM t1; +SELECT entropy(c1) FROM t1; ``` -Distribution: `{1:1, 2:2}` $H = -\left(\frac{1}{3}\log_2\frac{1}{3} + \frac{2}{3}\log_2\frac{2}{3}\right)=0.9183$ +Distribution: 1 → 3, 2 → 1 (row 5 is ignored because `c1` is NULL, so 4 rows are counted) + +$H = -\left(\frac{1}{4}\log_2\frac{1}{4} + \frac{3}{4}\log_2\frac{3}{4}\right)\approx 0.8113$ ```text +--------------------+ -| entropy(x) | +| entropy(c1) | +--------------------+ -| 0.9182958340544896 | +| 0.8112781244591328 | +--------------------+ ``` +```sql +SELECT entropy(c1, c2) FROM t1; +``` + +Distribution: (1, "a") → 2, (1, "b") → 1, (2, "a") → 1 (row 5 is ignored because `c1` is NULL) + +$H = -\left(\frac{1}{4}\log_2\frac{1}{4} + \frac{2}{4}\log_2\frac{2}{4}+ \frac{1}{4}\log_2\frac{1}{4}\right)=1.5$ + +```text ++-----------------+ +| entropy(c1, c2) | ++-----------------+ +| 1.5 | ++-----------------+ +``` + ```sql SELECT entropy(1); ``` @@ -92,7 +112,7 @@ SELECT entropy(NULL) FROM t1; Returns NULL if all values are NULL or the input is empty. 
-```sql +```text +---------------+ | entropy(NULL) | +---------------+ diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/entropy.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/entropy.md index 0e952764f3698..2b58b697d4d35 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/entropy.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/entropy.md @@ -47,33 +47,51 @@ ENTROPY( [, , ... , ]) ```sql CREATE TABLE t1 ( id INT, - v INT + c1 INT, + c2 STRING ) DISTRIBUTED BY HASH(id) BUCKETS 1 PROPERTIES ("replication_num"="1"); INSERT INTO t1 VALUES - (1, 1), - (2, 2), - (3, 2), - (4, NULL); + (1, 1, "a"), + (2, 1, "a"), + (3, 1, "b"), + (4, 2, "a"), + (5, NULL, "a"); ``` ```sql -SELECT entropy(v) FROM t1; +SELECT entropy(c1) FROM t1; ``` -频率分布:`{1:1, 2:2}` +频率分布:1 → 3, 2 → 1 -熵的计算: $H = -\left(\frac{1}{3}\log_2\frac{1}{3} + \frac{2}{3}\log_2\frac{2}{3}\right)=0.9183$ +熵的计算:$H = -\left(\frac{1}{4}\log_2\frac{1}{4} + \frac{3}{4}\log_2\frac{3}{4}\right)=0.811$ ```text +--------------------+ -| entropy(x) | +| entropy(c1) | +--------------------+ -| 0.9182958340544896 | +| 0.8112781244591328 | +--------------------+ ``` +```sql +SELECT entropy(c1, c2) FROM t1; +``` + +频率分布:(1, "a") → 2, (1, "b") → 1, (2, "a") → 1 + +熵的计算:$H = -\left(\frac{1}{4}\log_2\frac{1}{4} + \frac{2}{4}\log_2\frac{2}{4}+ \frac{1}{4}\log_2\frac{1}{4}\right)=1.5$ + +```text ++-----------------+ +| entropy(c1, c2) | ++-----------------+ +| 1.5 | ++-----------------+ +``` + ```sql SELECT entropy(1); ``` From e69f2261e2132d356494438ea8c5da4e81c50d71 Mon Sep 17 00:00:00 2001 From: Liwei Jiang Date: Sun, 15 Mar 2026 06:42:13 +0800 Subject: [PATCH 3/3] add doc for version 4.x --- .../aggregate-functions/entropy.md | 125 ++++++++++++++++++ .../aggregate-functions/entropy.md | 125 ++++++++++++++++++ 
versioned_sidebars/version-4.x-sidebars.json | 1 + 3 files changed, 251 insertions(+) create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-functions/aggregate-functions/entropy.md create mode 100644 versioned_docs/version-4.x/sql-manual/sql-functions/aggregate-functions/entropy.md diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-functions/aggregate-functions/entropy.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-functions/aggregate-functions/entropy.md new file mode 100644 index 0000000000000..ce54acc363f98 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-functions/aggregate-functions/entropy.md @@ -0,0 +1,125 @@ +--- +{ + "title": "ENTROPY", + "language": "zh-CN", + "description": "计算指定列或表达式中所有非 NULL 值的香农熵(Shannon Entropy)。" +} +--- + +## 描述 + +计算指定列或表达式中所有非 NULL 值的香农熵(Shannon Entropy)。 + +熵用于衡量分布的不确定性或随机性。该函数会基于输入值构建经验频率分布,并使用以 2 为底的对数计算熵,单位为 比特(bits)。 + +香农熵的定义如下: + +$ +Entropy(X) = -\sum_{i=1}^{k} p_i \log_2(p_i) +$ + +其中: + +- $k$ 为非 NULL 的不同值的数量 +- $p_i = \frac{x_i的数量}{\text{所有非null值数量}}$ + +:::info 备注 +从 Apache Doris 4.1.0 开始支持该函数 +::: + +## 语法 + +```sql +ENTROPY( [, , ... 
, ]) +``` + +## 参数 + +| 参数 | 说明 | +|------|------| +| ` [, , ...]` | 一个或多个表达式或列。支持的类型包括:TinyInt、SmallInt、Integer、BigInt、LargeInt、Float、Double、Decimal、String、IPv4/IPv6、Array、Map、Struct 等。当提供多列时,每行的多个值会被序列化为一个复合键,并基于复合键的频率分布计算熵。 | + +## 返回值 + +返回一个 DOUBLE,表示以比特为单位的香农熵。 + +- 如果所有值均为 NULL 或输入为空,则返回 NULL。 +- 计算过程中会忽略 NULL 值。 + +## 举例 + +```sql +CREATE TABLE t1 ( + id INT, + c1 INT, + c2 STRING +) DISTRIBUTED BY HASH(id) BUCKETS 1 +PROPERTIES ("replication_num"="1"); + +INSERT INTO t1 VALUES + (1, 1, "a"), + (2, 1, "a"), + (3, 1, "b"), + (4, 2, "a"), + (5, NULL, "a"); +``` + +```sql +SELECT entropy(c1) FROM t1; +``` + +频率分布:1 → 3, 2 → 1 + +熵的计算:$H = -\left(\frac{1}{4}\log_2\frac{1}{4} + \frac{3}{4}\log_2\frac{3}{4}\right)=0.811$ + +```text ++--------------------+ +| entropy(c1) | ++--------------------+ +| 0.8112781244591328 | ++--------------------+ +``` + +```sql +SELECT entropy(c1, c2) FROM t1; +``` + +频率分布:(1, "a") → 2, (1, "b") → 1, (2, "a") → 1 + +熵的计算:$H = -\left(\frac{1}{4}\log_2\frac{1}{4} + \frac{2}{4}\log_2\frac{2}{4}+ \frac{1}{4}\log_2\frac{1}{4}\right)=1.5$ + +```text ++-----------------+ +| entropy(c1, c2) | ++-----------------+ +| 1.5 | ++-----------------+ +``` + +```sql +SELECT entropy(1); +``` + +只有一个唯一值 → 熵 = 0 + +```text ++------------+ +| entropy(1) | ++------------+ +| 0 | ++------------+ +``` + +```sql +SELECT entropy(NULL) FROM t1; +``` + +当所有值均为 NULL 或输入为空时返回 NULL。 + +```text ++---------------+ +| entropy(NULL) | ++---------------+ +| NULL | ++---------------+ +``` diff --git a/versioned_docs/version-4.x/sql-manual/sql-functions/aggregate-functions/entropy.md b/versioned_docs/version-4.x/sql-manual/sql-functions/aggregate-functions/entropy.md new file mode 100644 index 0000000000000..85e707f20ec9b --- /dev/null +++ b/versioned_docs/version-4.x/sql-manual/sql-functions/aggregate-functions/entropy.md @@ -0,0 +1,125 @@ +--- +{ + "title": "ENTROPY", + "language": "en", + "description": "Calculate the Shannon entropy of all non-null values in the 
specified column or expression." +} +--- + +## Description + +Computes the Shannon entropy of all non-null values in the specified column or expression. + +Entropy measures the uncertainty or randomness of a distribution. This function builds an empirical frequency map of the input values and computes entropy in bits using the base‑2 logarithm. + +The Shannon entropy is defined as: + +$ +Entropy(X) = -\sum_{i=1}^{k} p_i \log_2(p_i) +$ + +Where: + +- $k$ is the number of distinct non-null values +- $p_i = \frac{\text{count}(x_i)}{\text{total non-null count}}$ + +:::info Note +Supported since Apache Doris 4.1.0 +::: + +## Syntax + +```sql +ENTROPY(<expr1> [, <expr2>, ... , <exprN>]) +``` + +## Parameters + +| Parameter | Description | +|----------|-------------| +| `<expr1> [, <expr2>, ...]` | One or more expressions or columns. Supported types: TinyInt, SmallInt, Integer, BigInt, LargeInt, Float, Double, Decimal, String, IPv4/IPv6, Array, Map, Struct. When multiple expressions are provided, their values are serialized together to form a single composite key, and entropy is computed over the frequency distribution of these composite keys. | + +## Return Value + +Returns a DOUBLE representing the Shannon entropy in bits. + +- Returns NULL if all values are NULL or the input is empty. +- Ignores NULL values during computation. When multiple expressions are provided, a row is ignored if any of its expressions is NULL. 
+ +## Examples + +```sql +CREATE TABLE t1 ( + id INT, + c1 INT, + c2 STRING +) DISTRIBUTED BY HASH(id) BUCKETS 1 +PROPERTIES ("replication_num"="1"); + +INSERT INTO t1 VALUES + (1, 1, "a"), + (2, 1, "a"), + (3, 1, "b"), + (4, 2, "a"), + (5, NULL, "a"); +``` + +```sql +SELECT entropy(c1) FROM t1; +``` + +Distribution: 1 → 3, 2 → 1 (row 5 is ignored because `c1` is NULL, so 4 rows are counted) + +$H = -\left(\frac{1}{4}\log_2\frac{1}{4} + \frac{3}{4}\log_2\frac{3}{4}\right)\approx 0.8113$ + +```text ++--------------------+ +| entropy(c1) | ++--------------------+ +| 0.8112781244591328 | ++--------------------+ +``` + +```sql +SELECT entropy(c1, c2) FROM t1; +``` + +Distribution: (1, "a") → 2, (1, "b") → 1, (2, "a") → 1 (row 5 is ignored because `c1` is NULL) + +$H = -\left(\frac{1}{4}\log_2\frac{1}{4} + \frac{2}{4}\log_2\frac{2}{4}+ \frac{1}{4}\log_2\frac{1}{4}\right)=1.5$ + +```text ++-----------------+ +| entropy(c1, c2) | ++-----------------+ +| 1.5 | ++-----------------+ +``` + +```sql +SELECT entropy(1); +``` + +Only one distinct value → entropy = 0 + +```text ++------------+ +| entropy(1) | ++------------+ +| 0 | ++------------+ +``` + +```sql +SELECT entropy(NULL) FROM t1; +``` + +Returns NULL if all values are NULL or the input is empty. + +```text ++---------------+ +| entropy(NULL) | ++---------------+ +| NULL | ++---------------+ +``` diff --git a/versioned_sidebars/version-4.x-sidebars.json b/versioned_sidebars/version-4.x-sidebars.json index a61946af0f0f6..0fd7743e0c7d1 100644 --- a/versioned_sidebars/version-4.x-sidebars.json +++ b/versioned_sidebars/version-4.x-sidebars.json @@ -1933,6 +1933,7 @@ "sql-manual/sql-functions/aggregate-functions/count-by-enum", "sql-manual/sql-functions/aggregate-functions/covar", "sql-manual/sql-functions/aggregate-functions/covar-samp", + "sql-manual/sql-functions/aggregate-functions/entropy", "sql-manual/sql-functions/aggregate-functions/group-array-intersect", "sql-manual/sql-functions/aggregate-functions/group-array-union", "sql-manual/sql-functions/aggregate-functions/group-bit-and",