# 实时查看 CPU 热点
perf top
# 记录 CPU profile(30秒)
perf record -F 99 -g ./my_app
perf record -F 99 -g -p <pid> -- sleep 30
# 查看报告
perf report
# 以文本形式查看报告(perf report 本身不生成火焰图,火焰图需借助 FlameGraph 脚本,见后文)
perf report --stdio
# 查看 CPU 缓存未命中
perf stat -e cache-misses,cache-references ./my_app
# 查看分支预测失败
perf stat -e branch-misses,branch-instructions ./my_app
# 查看页缺失
perf stat -e page-faults ./my_app
# 完整统计
perf stat ./my_app
# 输出示例:
# Performance counter stats for './my_app':
# 1,234.56 msec task-clock # 0.999 CPUs utilized
# 123 context-switches # 0.100 K/sec
# 12 cpu-migrations # 0.010 K/sec
# 1,234 page-faults # 1.000 K/sec
# 5,000,000 cycles # 4.050 GHz
# 3,000,000 instructions # 0.60 insn per cycle
# 500,000 branches # 404.858 M/sec
# 10,000 branch-misses # 2.00% of all branches
# 追踪程序的所有系统调用
strace ./my_app
# 追踪运行中的进程
strace -p <pid>
# 统计系统调用耗时
strace -c ./my_app
# 输出示例:
# % time seconds usecs/call calls errors syscall
# ------ ----------- ----------- --------- --------- ----------------
# 45.23 0.123456 1234 100 read
# 30.12 0.082345 823 100 write
# 15.34 0.041234 412 100 open
# 9.31 0.025432 254 100 close
# 只追踪特定系统调用
strace -e trace=open,read,write ./my_app
# 追踪文件操作
strace -e trace=file ./my_app
# 追踪网络操作
strace -e trace=network ./my_app
# 追踪 TCP 连接
bpftrace -e 'tracepoint:syscalls:sys_enter_connect { printf("%s\n", comm); }'
# 统计系统调用耗时
bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @start[tid] = nsecs; }
tracepoint:raw_syscalls:sys_exit /@start[tid]/ {
@latency_ms = hist((nsecs - @start[tid]) / 1000000);
delete(@start[tid]);
}'
# 追踪 MySQL 查询耗时
bpftrace -e 'usdt:/usr/sbin/mysqld:mysql:query__start {
@start[arg0] = nsecs;
}
usdt:/usr/sbin/mysqld:mysql:query__done {
@query_ms = hist((nsecs - @start[arg0]) / 1000000);
delete(@start[arg0]);
}'
# 追踪慢系统调用:统计 do_sys_open 的延迟分布(-m 表示以毫秒为单位输出直方图,不接受数值参数)
/usr/share/bcc/tools/funclatency -m do_sys_open
# 追踪 TCP 重传
/usr/share/bcc/tools/tcpretrans
# 追踪内存分配
/usr/share/bcc/tools/memleak -p <pid>
# 追踪文件 IO
/usr/share/bcc/tools/filetop
# 追踪磁盘 IO
/usr/share/bcc/tools/biolatency
# 查看磁盘 IO
iostat -x 1
# 输出解读:
# Device r/s w/s rkB/s wkB/s await %util
# sda 100 200 1024 2048 5.2 85.5
#
# r/s, w/s: 读写每秒次数
# rkB/s, wkB/s: 读写每秒KB数
# await: 平均响应时间(ms)
# %util: 磁盘利用率(接近100%说明瓶颈)
# 实时网络流量
iftop -i eth0
# 按协议查看
iftop -i eth0 -f "port 80"
# 查看网络统计
netstat -s
ss -s
import (
	"log"
	"net/http"
	_ "net/http/pprof"
)

// main starts the pprof HTTP endpoint in the background and then runs the
// application. The blank import of net/http/pprof is what makes the
// /debug/pprof/ handlers available on the default mux, which is why nil is
// passed as the handler below.
func main() {
	// Serve pprof on a loopback-only port so profiles are not exposed externally.
	go func() {
		// ListenAndServe only returns on failure; log the reason.
		log.Println(http.ListenAndServe("localhost:6060", nil))
	}()

	// Application logic goes here.
	// ...
}
# 采集 30 秒 CPU profile
go tool pprof "http://localhost:6060/debug/pprof/profile?seconds=30"
# 进入交互模式
(pprof) top
# 显示 CPU 占用最多的函数
(pprof) list <function_name>
# 显示函数的源码和 CPU 占用
(pprof) web
# 生成调用图(需要安装 graphviz)
# 生成火焰图
go tool pprof -http=:8080 cpu.prof
# 查看汇编
(pprof) disasm <function_name>
# 对比两次 profile
go tool pprof -base cpu1.prof cpu2.prof
# 采集堆内存
go tool pprof http://localhost:6060/debug/pprof/heap
# 查看内存分配
(pprof) top
# 显示内存分配最多的函数
(pprof) list <function_name>
# 显示具体分配位置
# 按累计值排序
(pprof) top -cum
# 查看对象数量(先把采样指标切换为 inuse_objects)
(pprof) sample_index=inuse_objects
(pprof) top
# 内存增长分析(对比两次快照)
curl http://localhost:6060/debug/pprof/heap > heap1.prof
# 运行一段时间后
curl http://localhost:6060/debug/pprof/heap > heap2.prof
# 对比分析
go tool pprof -base heap1.prof heap2.prof
# 查看 goroutine 数量和堆栈
go tool pprof http://localhost:6060/debug/pprof/goroutine
(pprof) top
# 显示 goroutine 数量最多的函数
(pprof) traces
# 显示 goroutine 的调用栈
// 启用阻塞分析
import "runtime"
// init enables the runtime block profiler so that the /debug/pprof/block
// endpoint has data to report.
func init() {
	// Rate 1 records every blocking event. Larger values sample on average
	// one event per that many nanoseconds spent blocked, which is cheaper.
	runtime.SetBlockProfileRate(1)
}
# 采集阻塞 profile
go tool pprof http://localhost:6060/debug/pprof/block
(pprof) top
# 显示阻塞最多的函数
// 启用锁竞争分析
import "runtime"
// init enables mutex contention profiling so that the /debug/pprof/mutex
// endpoint has data to report.
func init() {
	// On average 1/rate of contention events are reported, so a fraction of 1
	// reports every event.
	runtime.SetMutexProfileFraction(1)
}
# 采集锁竞争 profile
go tool pprof http://localhost:6060/debug/pprof/mutex
(pprof) top
# 显示锁竞争最多的函数
import (
"os"
"runtime/trace"
)
func main() {
// 记录 trace
f, _ := os.Create("trace.out")
defer f.Close()
trace.Start(f)
defer trace.Stop()
// 业务代码
// ...
}# 查看 trace
go tool trace trace.out
# 浏览器打开 http://localhost:port
# 可以看到:
# - Goroutine 执行情况
# - GC 暂停时间
# - 系统调用
# - 网络 IO
# - 调度延迟
import cProfile
import pstats
# 方法 1:装饰器
def profile_func(func):
    """Decorator that profiles every call to ``func`` with cProfile.

    After each call the 20 most expensive entries, sorted by cumulative
    time, are printed to stdout.

    Args:
        func: The callable to profile.

    Returns:
        A wrapper with the same name, docstring and call signature as
        ``func`` that returns whatever ``func`` returns and re-raises
        whatever ``func`` raises.
    """
    import functools

    @functools.wraps(func)  # keep __name__/__doc__ of the profiled function
    def wrapper(*args, **kwargs):
        profiler = cProfile.Profile()
        profiler.enable()
        try:
            return func(*args, **kwargs)
        finally:
            # Always stop profiling, even when func raises; otherwise the
            # profiler stays active and later profiled calls misbehave.
            profiler.disable()
            stats = pstats.Stats(profiler)
            stats.sort_stats('cumtime')
            stats.print_stats(20)  # print the top 20 entries
    return wrapper
@profile_func
def my_function():
    """Placeholder for the business logic that should be profiled."""
    return None
# 方法 2:命令行
# python -m cProfile -s cumtime script.py
# 方法 3:代码中使用
profiler = cProfile.Profile()
profiler.enable()
# 业务代码
my_function()
profiler.disable()
stats = pstats.Stats(profiler)
stats.sort_stats('cumtime')
stats.print_stats()
from memory_profiler import profile
@profile
def my_function():
a = [1] * (10 ** 6)
b = [2] * (2 * 10 ** 7)
del b
return a
# 运行
# python -m memory_profiler script.py
# 输出示例:
# Line # Mem usage Increment Line Contents
# ================================================
# 3 38.816 MiB 38.816 MiB @profile
# 4 46.492 MiB 7.676 MiB a = [1] * (10 ** 6)
# 5 198.969 MiB 152.477 MiB b = [2] * (2 * 10 ** 7)
# 6 46.492 MiB -152.477 MiB del b
# 7 46.492 MiB 0.000 MiB return a
# 实时查看 Python 程序的调用栈
py-spy top --pid <pid>
# 记录火焰图
py-spy record -o profile.svg --pid <pid>
# 记录一段时间
py-spy record -o profile.svg --duration 60 --pid <pid>
from line_profiler import LineProfiler
def my_function():
    """Return the sum of the integers from 0 through 999999.

    The explicit loop is intentional: it gives a line-level profiler
    separate lines to attribute time to.
    """
    acc = 0
    for n in range(10 ** 6):
        acc = acc + n
    return acc
profiler = LineProfiler()
profiler.add_function(my_function)
profiler.enable()
my_function()
profiler.disable()
profiler.print_stats()
# 1. 采集数据
perf record -F 99 -g -p <pid> -- sleep 30
# 2. 生成火焰图
perf script > out.perf
./stackcollapse-perf.pl out.perf > out.folded
./flamegraph.pl out.folded > flame.svg
# Go 程序生成火焰图
go tool pprof -http=:8080 cpu.prof

火焰图解读:
- X 轴:按字母排序(非时间)
- Y 轴:调用栈深度
- 颜色:随机,无特殊含义
- 宽度:CPU 占用时间
- 顶部宽的函数:CPU 热点
# 追踪 off-CPU 时间(IO 等待、锁等待等)
perf record -e sched:sched_stat_sleep -e sched:sched_switch \
-e sched:sched_process_exit -g -p <pid> -- sleep 30
perf script > out.perf
./stackcollapse-perf.pl out.perf > out.folded
./flamegraph.pl --color=io --title="Off-CPU Time" out.folded > offcpu.svg
# 基本压测(--latency 用于输出下方的 Latency Distribution)
wrk -t4 -c100 -d30s --latency http://localhost:8080/api
# 输出:
# Running 30s test @ http://localhost:8080/api
# 4 threads and 100 connections
# Thread Stats Avg Stdev Max +/- Stdev
# Latency 52.31ms 15.22ms 250.00ms 89.23%
# Req/Sec 2.15k 234.12 2.89k 78.45%
# Latency Distribution
# 50% 48.00ms
# 75% 62.00ms
# 90% 78.00ms
# 99% 120.00ms
# 257266 requests in 30.00s, 62.67MB read
# Requests/sec: 8575.53
# Transfer/sec: 2.09MB
# 使用 Lua 脚本
wrk -t4 -c100 -d30s -s script.lua http://localhost:8080/api
# script.lua 示例:
# wrk.method = "POST"
# wrk.body = '{"user":"test"}'
# wrk.headers["Content-Type"] = "application/json"
# 基本压测
ab -n 10000 -c 100 http://localhost:8080/api
# POST 请求
ab -n 1000 -c 10 -p data.json -T application/json http://localhost:8080/api
from locust import HttpUser, task, between
class WebsiteUser(HttpUser):
    """Simulated visitor that mixes user lookups with user creation."""

    # Wait-time policy handed to locust: between(1, 5).
    wait_time = between(1, 5)

    @task
    def get_user(self):
        """Issue a GET for a fixed user record."""
        self.client.get("/api/user/123")

    @task(3)  # weight 3
    def create_user(self):
        """POST a JSON body describing a test user."""
        payload = {
            "name": "test",
            "age": 25
        }
        self.client.post("/api/user", json=payload)
# 运行
# locust -f locustfile.py
# 浏览器打开 http://localhost:8089
<!-- JMeter 测试计划示例 -->
<jmeterTestPlan version="1.2">
<hashTree>
<TestPlan>
<ThreadGroup>
<stringProp name="ThreadGroup.num_threads">100</stringProp>
<stringProp name="ThreadGroup.ramp_time">10</stringProp>
<stringProp name="ThreadGroup.duration">60</stringProp>
</ThreadGroup>
<HTTPSampler>
<stringProp name="HTTPSampler.domain">localhost</stringProp>
<stringProp name="HTTPSampler.port">8080</stringProp>
<stringProp name="HTTPSampler.path">/api</stringProp>
</HTTPSampler>
</TestPlan>
</hashTree>
</jmeterTestPlan>
import (
	"log"
	"net/http"
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
// 定义指标
var (
httpRequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "http_requests_total",
Help: "Total number of HTTP requests",
},
[]string{"method", "path", "status"},
)
httpRequestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "HTTP request latency",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "path"},
)
)
// init registers both HTTP collectors with the default Prometheus registry.
// MustRegister panics if a collector cannot be registered.
func init() {
	prometheus.MustRegister(
		httpRequestsTotal,
		httpRequestDuration,
	)
}
// HTTP 中间件
// statusRecorder wraps an http.ResponseWriter and remembers the first status
// code the handler writes so it can be used as a metric label.
type statusRecorder struct {
	http.ResponseWriter
	status      int
	wroteHeader bool
}

// WriteHeader records the status code (first call only, matching net/http,
// which ignores later calls) and forwards it to the wrapped writer.
func (sr *statusRecorder) WriteHeader(code int) {
	if !sr.wroteHeader {
		sr.status = code
		sr.wroteHeader = true
	}
	sr.ResponseWriter.WriteHeader(code)
}

// Unwrap exposes the wrapped writer so http.ResponseController can still
// reach optional interfaces such as http.Flusher.
func (sr *statusRecorder) Unwrap() http.ResponseWriter {
	return sr.ResponseWriter
}

// prometheusMiddleware wraps next so that every request updates
// httpRequestDuration (latency in seconds) and httpRequestsTotal (labelled
// with the status code the handler actually sent).
//
// NOTE(review): r.URL.Path is used verbatim as a label value; if paths embed
// IDs this creates one time series per distinct URL. Consider labelling by
// route pattern instead.
func prometheusMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		start := time.Now()

		// A handler that never calls WriteHeader implicitly sends 200.
		rec := &statusRecorder{ResponseWriter: w, status: http.StatusOK}
		next.ServeHTTP(rec, r)

		duration := time.Since(start).Seconds()
		httpRequestDuration.WithLabelValues(r.Method, r.URL.Path).Observe(duration)
		httpRequestsTotal.WithLabelValues(r.Method, r.URL.Path, strconv.Itoa(rec.status)).Inc()
	})
}
func main() {
// 暴露 metrics 端点
http.Handle("/metrics", promhttp.Handler())
// 启动服务
http.ListenAndServe(":8080", nil)
}import (
"github.com/opentracing/opentracing-go"
"github.com/uber/jaeger-client-go"
)
// 初始化 Jaeger
// initJaeger builds a Jaeger tracer for the given service name and returns
// it together with the io.Closer that must be closed on shutdown.
//
// The configuration uses a "const" sampler with Param 1 and reports spans to
// the agent at localhost:6831, logging each reported span. It panics if the
// tracer cannot be created, because the nil tracer and closer that would
// otherwise be returned fail later with a far less useful nil dereference.
//
// NOTE(review): the tracer is only returned, never installed; callers that
// use the package-level opentracing.StartSpan presumably need to call
// opentracing.SetGlobalTracer themselves — confirm.
// NOTE(review): upstream jaeger-client-go appears to declare Configuration,
// SamplerConfig and ReporterConfig in its config subpackage — verify the import.
func initJaeger(service string) (opentracing.Tracer, io.Closer) {
	cfg := &jaeger.Configuration{
		ServiceName: service,
		Sampler: &jaeger.SamplerConfig{
			Type:  "const",
			Param: 1,
		},
		Reporter: &jaeger.ReporterConfig{
			LogSpans:           true,
			LocalAgentHostPort: "localhost:6831",
		},
	}

	tracer, closer, err := cfg.NewTracer()
	if err != nil {
		panic("initJaeger: " + err.Error())
	}
	return tracer, closer
}
// 使用示例
func handleRequest(w http.ResponseWriter, r *http.Request) {
// 开始 span
span := opentracing.StartSpan("handleRequest")
defer span.Finish()
// 调用数据库
dbSpan := opentracing.StartSpan("queryDB", opentracing.ChildOf(span.Context()))
// 查询数据库...
dbSpan.Finish()
// 调用 Redis
redisSpan := opentracing.StartSpan("queryRedis", opentracing.ChildOf(span.Context()))
// 查询 Redis...
redisSpan.Finish()
}| 性能问题 | 工具 | 用途 |
|---|---|---|
| CPU 热点 | perf, pprof | CPU 火焰图 |
| 内存泄漏 | pprof, valgrind | 内存分析 |
| 锁竞争 | pprof (mutex) | 锁分析 |
| IO 瓶颈 | iostat, iotop | 磁盘 IO |
| 网络瓶颈 | iftop, tcpdump | 网络流量 |
| 系统调用 | strace | 系统调用追踪 |
| 全链路追踪 | Jaeger, Zipkin | 分布式追踪 |
| 压测 | wrk, Locust | 性能测试 |
核心要点:
- ✅ Linux 工具:perf(CPU)、strace(系统调用)、eBPF(内核追踪)
- ✅ Go 工具:pprof(CPU/内存)、trace(执行追踪)
- ✅ 火焰图:可视化性能瓶颈
- ✅ 压测工具:wrk、Locust 模拟真实流量
- ✅ 监控工具:Prometheus + Grafana 实时监控
使用流程:
压测发现问题 → pprof/perf定位瓶颈 → 优化代码 → 验证效果