📍 导航:返回目录 | 上一节:etcd | 下一节:Elasticsearch
- 列式存储:按列存储,压缩率高
- 向量化执行:SIMD 指令加速
- 稀疏索引:快速定位数据块
- 支持 SQL:兼容标准 SQL
- 水平扩展:分布式表
适用场景:
- 实时数据分析
- OLAP 查询
- 日志分析
- 用户行为分析
CREATE TABLE events (
date Date,
user_id UInt32,
event_type String,
value Float32
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(date)
ORDER BY (date, user_id)
SETTINGS index_granularity = 8192;特点:
- 按 ORDER BY 字段排序
- 支持主键索引
- 支持分区(PARTITION BY)
CREATE TABLE user_profiles (
user_id UInt32,
name String,
age UInt8,
updated_at DateTime
) ENGINE = ReplacingMergeTree(updated_at)
ORDER BY user_id;特点:相同 ORDER BY 的数据,保留最新的一条
CREATE TABLE metrics (
date Date,
metric String,
value Float64
) ENGINE = SummingMergeTree()
PARTITION BY toYYYYMM(date)
ORDER BY (date, metric);特点:自动对数值列求和
CREATE TABLE events_distributed AS events
ENGINE = Distributed(cluster_name, database, events, rand());特点:聚合多个分片的数据
-- 整数
UInt8, UInt16, UInt32, UInt64
Int8, Int16, Int32, Int64
-- 浮点数
Float32, Float64
-- 字符串
String
FixedString(N)
-- 日期时间
Date -- 2024-01-01
DateTime -- 2024-01-01 12:00:00
DateTime64 -- 微秒精度
-- 数组
Array(T)
-- 枚举
Enum8('male' = 1, 'female' = 2)
-- Nullable
Nullable(Int32) -- 允许 NULL-- 查询
SELECT user_id, count() AS cnt
FROM events
WHERE date >= '2024-01-01'
GROUP BY user_id
ORDER BY cnt DESC
LIMIT 10;
-- 去重计数
SELECT uniq(user_id) FROM events;
-- 近似去重(更快)
SELECT uniqHLL12(user_id) FROM events;-- 基础聚合
SELECT
count(),
sum(value),
avg(value),
max(value),
min(value)
FROM metrics;
-- 分位数
SELECT
quantile(0.5)(value) AS median,
quantile(0.95)(value) AS p95,
quantile(0.99)(value) AS p99
FROM metrics;
-- TopK
SELECT
topK(10)(event_type)
FROM events;-- 时间转换
SELECT
toDate(timestamp),
toStartOfHour(timestamp),
toStartOfDay(timestamp),
toYYYYMM(timestamp)
FROM events;
-- 时间差
SELECT
dateDiff('day', start_date, end_date),
dateDiff('hour', start_time, end_time)
FROM logs;-- 按月分区
PARTITION BY toYYYYMM(date)
-- 查询时自动剪枝
SELECT * FROM events WHERE date = '2024-01-15'; -- 只扫描 202401 分区-- 创建物化视图(实时聚合)
CREATE MATERIALIZED VIEW daily_stats
ENGINE = SummingMergeTree()
PARTITION BY date
ORDER BY (date, event_type)
AS SELECT
toDate(timestamp) AS date,
event_type,
count() AS cnt
FROM events
GROUP BY date, event_type;
-- 查询物化视图(速度快)
SELECT * FROM daily_stats WHERE date = today();-- ❌ 单条插入(慢)
INSERT INTO events VALUES (...);
INSERT INTO events VALUES (...);
-- ✅ 批量插入(快)
INSERT INTO events VALUES
(...),
(...),
(...);建议:每批次 10000-100000 条
-- 1% 采样
SELECT count() * 100
FROM events SAMPLE 0.01;
-- 指定采样行数
SELECT count() * (total_rows / sample_rows)
FROM events SAMPLE 10000;import (
"database/sql"
_ "github.com/ClickHouse/clickhouse-go/v2"
)
func main() {
// 连接
dsn := "clickhouse://localhost:9000/default"
db, err := sql.Open("clickhouse", dsn)
if err != nil {
panic(err)
}
defer db.Close()
// 创建表
_, err = db.Exec(`
CREATE TABLE IF NOT EXISTS events (
timestamp DateTime,
user_id UInt32,
event_type String
) ENGINE = MergeTree()
ORDER BY (timestamp, user_id)
`)
// 批量插入
tx, _ := db.Begin()
stmt, _ := tx.Prepare("INSERT INTO events (timestamp, user_id, event_type) VALUES (?, ?, ?)")
for i := 0; i < 10000; i++ {
stmt.Exec(time.Now(), i, "click")
}
tx.Commit()
// 查询
rows, _ := db.Query("SELECT event_type, count() AS cnt FROM events GROUP BY event_type")
defer rows.Close()
for rows.Next() {
var eventType string
var cnt uint64
rows.Scan(&eventType, &cnt)
fmt.Printf("%s: %d\n", eventType, cnt)
}
}<!-- /etc/clickhouse-server/config.d/cluster.xml -->
<clickhouse>
<remote_servers>
<my_cluster>
<shard>
<replica>
<host>node1</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>node2</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>node3</host>
<port>9000</port>
</replica>
</shard>
</my_cluster>
</remote_servers>
</clickhouse>-- 本地表
CREATE TABLE events_local (
date Date,
user_id UInt32,
event_type String
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(date)
ORDER BY (date, user_id);
-- 分布式表
CREATE TABLE events_all AS events_local
ENGINE = Distributed(my_cluster, default, events_local, rand());
-- 查询分布式表(自动聚合所有分片数据)
SELECT event_type, count()
FROM events_all
GROUP BY event_type;ClickHouse 是高性能 OLAP 数据库,适合大规模数据分析场景,查询速度可达传统数据库的 100-1000 倍。
关键要点:
- ✅ 列式存储 + 压缩,节省存储空间
- ✅ 稀疏索引 + 分区剪枝,加速查询
- ✅ 物化视图实现实时聚合
- ✅ 批量插入是性能关键
- ClickHouse 官方文档
- 《ClickHouse 原理解析与应用实践》
- ClickHouse 性能优化指南
💡 思考题:
- 为什么 ClickHouse 查询这么快?
- 列式存储和行式存储的区别?
- 什么时候用物化视图?
⏮️ 上一节:etcd | ⏭️ 下一节:Elasticsearch