Skip to content

Latest commit

 

History

History
376 lines (290 loc) · 6.95 KB

File metadata and controls

376 lines (290 loc) · 6.95 KB

3.6 ClickHouse 列式数据库

📍 导航返回目录 | 上一节:etcd | 下一节:Elasticsearch


核心特性

  • 列式存储:按列存储,压缩率高
  • 向量化执行:SIMD 指令加速
  • 稀疏索引:快速定位数据块
  • 支持 SQL:兼容标准 SQL
  • 水平扩展:分布式表

适用场景

  • 实时数据分析
  • OLAP 查询
  • 日志分析
  • 用户行为分析

表引擎

MergeTree(最常用)

CREATE TABLE events (
    date Date,
    user_id UInt32,
    event_type String,
    value Float32
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(date)
ORDER BY (date, user_id)
SETTINGS index_granularity = 8192;

特点

  • 按 ORDER BY 字段排序
  • 支持主键索引
  • 支持分区(PARTITION BY)

ReplacingMergeTree(去重)

CREATE TABLE user_profiles (
    user_id UInt32,
    name String,
    age UInt8,
    updated_at DateTime
) ENGINE = ReplacingMergeTree(updated_at)
ORDER BY user_id;

特点:相同 ORDER BY 的数据,保留最新的一条

SummingMergeTree(聚合)

CREATE TABLE metrics (
    date Date,
    metric String,
    value Float64
) ENGINE = SummingMergeTree()
PARTITION BY toYYYYMM(date)
ORDER BY (date, metric);

特点:自动对数值列求和

Distributed(分布式表)

CREATE TABLE events_distributed AS events
ENGINE = Distributed(cluster_name, database, events, rand());

特点:聚合多个分片的数据


数据类型

-- 整数
UInt8, UInt16, UInt32, UInt64
Int8, Int16, Int32, Int64

-- 浮点数
Float32, Float64

-- 字符串
String
FixedString(N)

-- 日期时间
Date          -- 2024-01-01
DateTime      -- 2024-01-01 12:00:00
DateTime64    -- 微秒精度

-- 数组
Array(T)

-- 枚举
Enum8('male' = 1, 'female' = 2)

-- Nullable
Nullable(Int32)  -- 允许 NULL

SQL 查询

基本查询

-- 查询
SELECT user_id, count() AS cnt
FROM events
WHERE date >= '2024-01-01'
GROUP BY user_id
ORDER BY cnt DESC
LIMIT 10;

-- 去重计数
SELECT uniq(user_id) FROM events;

-- 近似去重(更快)
SELECT uniqHLL12(user_id) FROM events;

聚合函数

-- 基础聚合
SELECT 
    count(),
    sum(value),
    avg(value),
    max(value),
    min(value)
FROM metrics;

-- 分位数
SELECT 
    quantile(0.5)(value) AS median,
    quantile(0.95)(value) AS p95,
    quantile(0.99)(value) AS p99
FROM metrics;

-- TopK
SELECT 
    topK(10)(event_type)
FROM events;

时间函数

-- 时间转换
SELECT 
    toDate(timestamp),
    toStartOfHour(timestamp),
    toStartOfDay(timestamp),
    toYYYYMM(timestamp)
FROM events;

-- 时间差
SELECT 
    dateDiff('day', start_date, end_date),
    dateDiff('hour', start_time, end_time)
FROM logs;

性能优化

分区剪枝

-- 按月分区
PARTITION BY toYYYYMM(date)

-- 查询时自动剪枝
SELECT * FROM events WHERE date = '2024-01-15';  -- 只扫描 202401 分区

物化视图

-- 创建物化视图(实时聚合)
CREATE MATERIALIZED VIEW daily_stats
ENGINE = SummingMergeTree()
PARTITION BY date
ORDER BY (date, event_type)
AS SELECT
    toDate(timestamp) AS date,
    event_type,
    count() AS cnt
FROM events
GROUP BY date, event_type;

-- 查询物化视图(速度快)
SELECT * FROM daily_stats WHERE date = today();

批量插入

-- ❌ 单条插入(慢)
INSERT INTO events VALUES (...);
INSERT INTO events VALUES (...);

-- ✅ 批量插入(快)
INSERT INTO events VALUES
    (...),
    (...),
    (...);

建议:每批次 10000-100000 条

采样查询

-- 1% 采样
SELECT count() * 100
FROM events SAMPLE 0.01;

-- 指定采样行数
SELECT count() * (total_rows / sample_rows)
FROM events SAMPLE 10000;

Go 客户端

import (
    "database/sql"
    _ "github.com/ClickHouse/clickhouse-go/v2"
)

func main() {
    // 连接
    dsn := "clickhouse://localhost:9000/default"
    db, err := sql.Open("clickhouse", dsn)
    if err != nil {
        panic(err)
    }
    defer db.Close()
    
    // 创建表
    _, err = db.Exec(`
        CREATE TABLE IF NOT EXISTS events (
            timestamp DateTime,
            user_id UInt32,
            event_type String
        ) ENGINE = MergeTree()
        ORDER BY (timestamp, user_id)
    `)
    
    // 批量插入
    tx, _ := db.Begin()
    stmt, _ := tx.Prepare("INSERT INTO events (timestamp, user_id, event_type) VALUES (?, ?, ?)")
    
    for i := 0; i < 10000; i++ {
        stmt.Exec(time.Now(), i, "click")
    }
    
    tx.Commit()
    
    // 查询
    rows, _ := db.Query("SELECT event_type, count() AS cnt FROM events GROUP BY event_type")
    defer rows.Close()
    
    for rows.Next() {
        var eventType string
        var cnt uint64
        rows.Scan(&eventType, &cnt)
        fmt.Printf("%s: %d\n", eventType, cnt)
    }
}

集群部署

配置文件

<!-- /etc/clickhouse-server/config.d/cluster.xml -->
<clickhouse>
    <remote_servers>
        <my_cluster>
            <shard>
                <replica>
                    <host>node1</host>
                    <port>9000</port>
                </replica>
            </shard>
            <shard>
                <replica>
                    <host>node2</host>
                    <port>9000</port>
                </replica>
            </shard>
            <shard>
                <replica>
                    <host>node3</host>
                    <port>9000</port>
                </replica>
            </shard>
        </my_cluster>
    </remote_servers>
</clickhouse>

分布式表

-- 本地表
CREATE TABLE events_local (
    date Date,
    user_id UInt32,
    event_type String
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(date)
ORDER BY (date, user_id);

-- 分布式表
CREATE TABLE events_all AS events_local
ENGINE = Distributed(my_cluster, default, events_local, rand());

-- 查询分布式表(自动聚合所有分片数据)
SELECT event_type, count()
FROM events_all
GROUP BY event_type;

本章小结

ClickHouse 是高性能 OLAP 数据库,适合大规模数据分析场景,查询速度可达传统数据库的 100-1000 倍。

关键要点

  • ✅ 列式存储 + 压缩,节省存储空间
  • ✅ 稀疏索引 + 分区剪枝,加速查询
  • ✅ 物化视图实现实时聚合
  • ✅ 批量插入是性能关键

扩展阅读


💡 思考题

  1. 为什么 ClickHouse 查询这么快?
  2. 列式存储和行式存储的区别?
  3. 什么时候用物化视图?

⏮️ 上一节:etcd | ⏭️ 下一节:Elasticsearch