性能分析是内核开发和优化的关键技能。本手册详细介绍perf、SystemTap、火焰图等性能分析工具的高级用法，帮助你定位性能瓶颈、分析系统行为。
# Basic perf usage. Each command below is an independent example.
# Live overview of where the system is spending time
perf top
# Record performance data (-a: all CPUs, -g: call graphs) while `sleep 10` runs
perf record -a -g -- sleep 10
# Browse the recorded data
perf report
# Count performance events (-a: all CPUs) while `sleep 5` runs
perf stat -a sleep 5
# List the events that can be used
perf list
#!/bin/bash
# perf_advanced.sh - perf高级用法
#######################################
# CPU sampling analysis: four perf-record examples at 99 Hz.
# Globals:   PID (read) - process ID to profile; must be set and non-empty.
# Returns:   1 (before recording anything) if PID is unset or empty,
#            otherwise the status of the last perf command.
#######################################
perf_cpu_sample() {
  # Without this guard an empty PID makes `-p` swallow the following `-g`.
  if [[ -z "${PID:-}" ]]; then
    printf 'perf_cpu_sample: PID must be set to the target process ID\n' >&2
    return 1
  fi

  # Sample CPU cycles on all CPUs, kernel and user space, for 30 s
  perf record -F 99 -a -g -- sleep 30
  # Sample one specific process
  perf record -F 99 -p "$PID" -g -- sleep 10
  # Sample specific CPUs only
  perf record -F 99 -C 0,1 -g -- sleep 10
  # Sample with DWARF-based call-stack unwinding
  perf record -F 99 -a -g --call-graph dwarf -- sleep 10
}
# Cache analysis: sample and count cache misses while running ./app.
perf_cache_analysis() {
  local workload=./app
  local counters=L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses

  # L1 data-cache load misses, one sample every 10000 events
  perf record -e L1-dcache-load-misses -c 10000 -g -- "$workload"
  # LLC (last-level cache) load misses, one sample every 1000 events
  perf record -e LLC-load-misses -c 1000 -g -- "$workload"
  # Load and miss counts for both cache levels
  perf stat -e "$counters" -- "$workload"
}
# Branch-prediction analysis of ./app.
perf_branch_analysis() {
  local -a target=(-- ./app)

  # Sample on mispredicted branches, with call graphs
  perf record -e branch-misses -g "${target[@]}"
  # Count branches and mispredictions
  perf stat -e branches,branch-misses "${target[@]}"
  # Record with branch-stack sampling (-b)
  perf record -b -g "${target[@]}"
}
# Memory-access analysis of ./app.
perf_memory_bandwidth() {
  local app=./app
  local numa_events=node-loads,node-load-misses

  # Sample memory accesses
  perf mem record -- "$app"
  # Report on the sampled accesses
  perf mem report
  # Count NUMA node loads and load misses
  perf stat -e "$numa_events" -- "$app"
}
# System-call analysis with `perf trace`.
perf_syscall_analysis() {
  local -a workload=(-- ./app)
  local wanted=open,read,write,close

  # Trace system calls system-wide
  perf trace -a
  # Trace only the selected system calls made by ./app
  perf trace -e "$wanted" "${workload[@]}"
  # Per-syscall summary (-s) for ./app
  perf trace -s "${workload[@]}"
}
# kernel_perf.sh - 内核函数性能分析
#######################################
# Kernel function call statistics using dynamic probes.
# Adds probes on kmalloc and tcp_sendmsg, records each for 10 s,
# lists the probes, then removes both so none are left behind.
# Returns: status of the last `perf probe -d`.
#######################################
kernel_function_stats() {
  # Count calls to a kernel function
  perf probe -a 'kmalloc'
  perf record -e probe:kmalloc -aR sleep 10
  perf report

  # Add a dynamic probe that also captures an argument.
  # NOTE(review): %dx is an x86 register name; confirm for the target arch.
  perf probe -a 'tcp_sendmsg bytes=%dx'
  perf record -e probe:tcp_sendmsg -aR sleep 10

  # List the active probes
  perf probe -l

  # Delete every probe this function created
  perf probe -d probe:kmalloc
  perf probe -d probe:tcp_sendmsg
}
#######################################
# Kernel lock analysis.
# Returns: status of the final perf record.
#######################################
kernel_lock_analysis() {
  # Lock contention: record for 10 s, then report
  perf lock record -- sleep 10
  perf lock report
  # Lock info dump
  perf lock info
  # Record all lock tracepoints with call graphs.
  # Quoted so the shell cannot expand lock:* against files in the
  # current directory.
  perf record -e 'lock:*' -g -- sleep 10
}
# Scheduler analysis.
scheduler_analysis() {
  local view

  # Record scheduling events for 10 s
  perf sched record -- sleep 10
  # Show the latency summary, then the per-event timeline
  for view in latency timehist; do
    perf sched "$view"
  done
  # Count task migrations system-wide for 10 s
  perf stat -e sched:sched_migrate_task -a sleep 10
}
# Interrupt analysis.
interrupt_analysis() {
  local kind

  # Record entry/exit tracepoints: soft interrupts first, then hard interrupts
  for kind in softirq irq_handler; do
    perf record -e "irq:${kind}_entry,irq:${kind}_exit" -g -a sleep 10
  done
  # Trace all irq tracepoints system-wide with a --duration threshold of 100
  perf trace -e 'irq:*' --duration 100 -a
}
# perf_script.py - 处理perf数据的Python脚本
#!/usr/bin/env python3
import sys
sys.path.append('/usr/lib/python3/dist-packages/')
from perf_trace_util import *
# 事件处理函数
def trace_begin():
    """Print a banner announcing that trace analysis is starting."""
    print("Starting trace analysis...")
def trace_end():
    """Print a completion banner, then the collected statistics."""
    print("Trace analysis complete")
    print_summary()
# Aggregated statistics
cpu_usage = {}      # cpu -> {comm -> number of events}
syscall_count = {}  # syscall -> number of events


def process_event(event):
    """Fold one event into the module-level counters.

    Args:
        event: mapping describing the event. 'comm' defaults to 'unknown'
            and 'cpu' to 0 when missing; 'syscall' is optional.
            NOTE(review): confirm these keys match what the caller passes.
    """
    comm = event.get('comm', 'unknown')
    cpu = event.get('cpu', 0)

    # Per-CPU, per-command event count
    per_cpu = cpu_usage.setdefault(cpu, {})
    per_cpu[comm] = per_cpu.get(comm, 0) + 1

    # Per-syscall event count, only when the event carries one
    if 'syscall' in event:
        syscall = event['syscall']
        syscall_count[syscall] = syscall_count.get(syscall, 0) + 1
def print_summary():
    """Print the top 5 commands per CPU and the top 10 system calls.

    Reads the module-level cpu_usage and syscall_count dictionaries;
    entries are ordered by descending event count.
    """
    print("\n=== CPU Usage ===")
    for cpu, procs in cpu_usage.items():
        print(f"CPU {cpu}:")
        for comm, count in sorted(procs.items(), key=lambda x: x[1], reverse=True)[:5]:
            print(f" {comm}: {count}")
    print("\n=== System Calls ===")
    for syscall, count in sorted(syscall_count.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f" {syscall}: {count}")
// basic_stap.stp - SystemTap基础脚本
#!/usr/bin/stap
// Global state
global start_time          // script start, microseconds
global io_count            // I/O operations seen in the current interval
global process_io[1000]    // per-PID statistics aggregate of requested bytes

// Script start: print a banner and remember the start time
probe begin
{
    printf("Starting SystemTap script...\n")
    start_time = gettimeofday_us()
}

// VFS read: add the requested size to the calling PID's aggregate
probe vfs.read
{
    io_count++
    process_io[pid()] <<< bytes_to_read
}

// VFS write: add the requested size to the calling PID's aggregate
probe vfs.write
{
    io_count++
    process_io[pid()] <<< bytes_to_write
}

// Every 5 seconds: print up to 10 PIDs in descending order, then reset
probe timer.sec(5)
{
    printf("\n=== I/O Statistics ===\n")
    printf("Total I/O operations: %d\n", io_count)
    foreach (proc_id in process_io- limit 10) {
        printf("PID %d: %d bytes (avg: %d)\n",
               proc_id, @sum(process_io[proc_id]), @avg(process_io[proc_id]))
    }
    io_count = 0
    delete process_io
}

// Script end: report the total run time in seconds
probe end
{
    elapsed_s = (gettimeofday_us() - start_time) / 1000000
    printf("\nScript ran for %d seconds\n", elapsed_s)
}
// kernel_trace.stp - 内核函数跟踪
#!/usr/bin/stap
// Trace memory allocations, with a kernel backtrace for each one.
// NOTE(review): kmalloc is commonly an inline wrapper; confirm this probe
// point and $size/$flags resolve on the target kernel.
probe kernel.function("kmalloc") {
    printf("%s(%d): kmalloc size=%lu flags=0x%x\n",
           execname(), pid(), $size, $flags)
    print_backtrace()
}

// Trace process creation.
// NOTE(review): do_fork was renamed in newer kernels (_do_fork, later
// kernel_clone); confirm the symbol for the target kernel.
probe kernel.function("do_fork") {
    printf("Fork by %s(%d) flags=0x%lx\n",
           execname(), pid(), $clone_flags)
}

// Trace page faults.
// The faulting address is only available in vm.pagefault and the fault
// type only in vm.pagefault.return, so the address is carried across
// per thread.
global fault_addr[10000]

probe vm.pagefault {
    fault_addr[tid()] = address
}

probe vm.pagefault.return {
    if (!(tid() in fault_addr))
        next
    addr = fault_addr[tid()]
    delete fault_addr[tid()]

    // fault_type must be tested with vm_fault_contains(), not ==
    if (vm_fault_contains(fault_type, VM_FAULT_MINOR)) {
        printf("Minor fault: %s(%d) addr=%p\n",
               execname(), pid(), addr)
    } else if (vm_fault_contains(fault_type, VM_FAULT_MAJOR)) {
        printf("Major fault: %s(%d) addr=%p\n",
               execname(), pid(), addr)
    }
}

// Function execution time for tcp_sendmsg
global func_time[1000]     // execname -> aggregate of durations (ns)
global func_start[1000]    // tid -> entry timestamp (ns)

probe kernel.function("tcp_sendmsg") {
    func_start[tid()] = gettimeofday_ns()
}

probe kernel.function("tcp_sendmsg").return {
    // Ignore returns whose entry was not seen
    if (tid() in func_start) {
        duration = gettimeofday_ns() - func_start[tid()]
        func_time[execname()] <<< duration
        delete func_start[tid()]
    }
}

// Every 10 seconds: print up to 5 executables (ns converted to us).
// func_time is not reset, so the figures are cumulative.
probe timer.sec(10) {
    printf("\n=== Function Execution Time ===\n")
    foreach (name in func_time- limit 5) {
        printf("%s: avg=%dus max=%dus count=%d\n",
               name,
               @avg(func_time[name])/1000,
               @max(func_time[name])/1000,
               @count(func_time[name]))
    }
}
#!/bin/bash
# flamegraph_cpu.sh - 生成CPU火焰图
#######################################
# Install the FlameGraph scripts into ./FlameGraph and put them on PATH.
# Safe to call repeatedly: an existing checkout is reused and PATH is
# not extended twice.
# Globals: PATH (written, exported)
# Returns: non-zero if the clone fails.
#######################################
install_flamegraph() {
  local fg_dir="$PWD/FlameGraph"

  if [[ ! -d "$fg_dir" ]]; then
    git clone https://github.com/brendangregg/FlameGraph || return
  fi

  case ":$PATH:" in
    *":$fg_dir:"*) ;;
    *) PATH="$PATH:$fg_dir" ;;
  esac
  export PATH
}
#######################################
# Generate an on-CPU flame graph.
# Outputs: out.folded and oncpu.svg in the current directory; a
#          confirmation line on stdout.
# Returns: non-zero as soon as a step fails, without printing success.
#######################################
generate_oncpu_flamegraph() {
  # Sample all CPUs at 99 Hz for 30 seconds
  sudo perf record -F 99 -a -g -- sleep 30 || return
  # Fold the stacks, then render the SVG
  sudo perf script | stackcollapse-perf.pl > out.folded || return
  flamegraph.pl out.folded > oncpu.svg || return
  echo "On-CPU flamegraph saved to oncpu.svg"
}
#######################################
# Generate an off-CPU flame graph from sched_switch events.
# Outputs: offcpu.svg in the current directory; a confirmation line on
#          stdout.
# Returns: non-zero if recording or rendering fails.
#######################################
generate_offcpu_flamegraph() {
  # Record scheduler switch events for 30 seconds
  sudo perf record -e sched:sched_switch -a -g -- sleep 30 || return
  # Field selection is -F (--fields); in current perf -f means --force.
  # The awk program turns each event header plus its stack lines into
  # "frame;comm ms" records for stackcollapse.pl.
  sudo perf script -F comm,pid,tid,cpu,time,period,event,ip,sym,dso,trace \
    | awk '
        NF > 4 { exec = $1; period_ms = int($5 / 1000000) }
        NF > 1 && NF <= 4 && period_ms > 0 { print $2 }
        NF == 1 && period_ms > 0 { print $1 ";" exec " " period_ms }
      ' \
    | stackcollapse.pl \
    | flamegraph.pl --countname=ms --title="Off-CPU Time" > offcpu.svg \
    || return
  echo "Off-CPU flamegraph saved to offcpu.svg"
}
#######################################
# Generate a flame graph of kernel memory allocations (kmem:kmalloc).
# Outputs: memory.svg in the current directory; a confirmation line on
#          stdout.
# Returns: non-zero if recording or rendering fails.
#######################################
generate_memory_flamegraph() {
  # Record kmalloc tracepoint hits for 30 seconds
  sudo perf record -e kmem:kmalloc -a -g -- sleep 30 || return
  # Fold and render with the memory palette (--colors is the documented
  # flamegraph.pl option name)
  sudo perf script \
    | stackcollapse-perf.pl \
    | flamegraph.pl --colors=mem --title="Memory Allocations" > memory.svg \
    || return
  echo "Memory flamegraph saved to memory.svg"
}
#######################################
# Generate a differential flame graph comparing two 30-second samples.
# Outputs: perf1.data, perf2.data, out1.folded, out2.folded and diff.svg
#          in the current directory; a confirmation line on stdout.
# Returns: non-zero if any step fails.
#######################################
generate_diff_flamegraph() {
  local run

  # Take the two samples back to back and fold each one
  for run in 1 2; do
    sudo perf record -F 99 -a -g -o "perf${run}.data" -- sleep 30 || return
    sudo perf script -i "perf${run}.data" \
      | stackcollapse-perf.pl > "out${run}.folded" || return
  done

  # Render the difference between the two folded profiles
  difffolded.pl out1.folded out2.folded \
    | flamegraph.pl --title="Differential" > diff.svg || return
  echo "Differential flamegraph saved to diff.svg"
}
#!/usr/bin/env python3
# bcc_flamegraph.py - 使用BCC生成火焰图
import signal
import sys

from bcc import BPF, PerfHWConfig, PerfType
# BPF program: count samples per (pid, user-stack id).
# The pid is part of the key because user-space symbolization needs it.
bpf_text = """
#include <uapi/linux/ptrace.h>

struct key_t {
    u32 pid;
    int stack_id;
};

BPF_STACK_TRACE(stack_traces, 10240);
BPF_HASH(counts, struct key_t, u64);

int do_trace(struct pt_regs *ctx) {
    struct key_t key = {};
    key.pid = bpf_get_current_pid_tgid() >> 32;
    // Capture the user stack id; negative means the capture failed
    key.stack_id = stack_traces.get_stackid(ctx, BPF_F_USER_STACK);
    if (key.stack_id < 0)
        return 0;
    // Bump the sample count for this (pid, stack)
    counts.increment(key);
    return 0;
}
"""

# Compile and load the BPF program
b = BPF(text=bpf_text)

# Attach to the hardware CPU-cycles event, sampling at 99 Hz.
# NOTE(review): hardware counters may be unavailable in some VMs — confirm.
b.attach_perf_event(
    ev_type=PerfType.HARDWARE,
    ev_config=PerfHWConfig.CPU_CYCLES,
    fn_name="do_trace",
    sample_freq=99
)


def signal_handler(sig, frame):
    """On SIGINT, print the stacks in folded format and exit.

    Each output line is "frame1;frame2;... count", root frame first,
    in ascending order of sample count.
    """
    print("\nGenerating flamegraph...")
    counts = b.get_table("counts")
    stack_traces = b.get_table("stack_traces")
    for k, v in sorted(counts.items(), key=lambda kv: kv[1].value):
        stack = []
        for addr in stack_traces.walk(k.stack_id):
            # b.sym returns bytes; decode so the frames can be joined as text
            sym = b.sym(addr, k.pid, show_module=True, show_offset=False)
            stack.append(sym.decode('utf-8', 'replace'))
        if stack:
            print(";".join(reversed(stack)) + " " + str(v.value))
    sys.exit(0)


signal.signal(signal.SIGINT, signal_handler)
print("Sampling... Press Ctrl-C to generate flamegraph")
signal.pause()
#!/bin/bash
# stap_flamegraph.sh - SystemTap生成火焰图
# Write the SystemTap sampling script to sample.stp in the current directory.
# The quoted 'EOF' delimiter stops the shell from expanding anything in it.
# The script counts samples per backtrace on every timer.profile tick and
# prints each stack followed by its count when it ends.
# NOTE(review): user-mode stacks come from ubacktrace() but are printed with
# print_stack(); confirm that user frames symbolize as intended.
cat > sample.stp << 'EOF'
global samples[65536]
probe timer.profile {
if (user_mode()) {
samples[ubacktrace()] <<< 1
} else {
samples[backtrace()] <<< 1
}
}
probe end {
foreach (stack in samples) {
print_stack(stack)
printf(" %d\n", @count(samples[stack]))
}
}
EOF
# Run SystemTap on the script; its stdout is captured in stap.out.
# The script has no exit() of its own — presumably stopped with Ctrl-C.
sudo stap -v sample.stp -d /usr/bin/myapp > stap.out
# Fold the captured stacks and render the flame graph
stackcollapse-stap.pl < stap.out | flamegraph.pl > stap.svg
// ebpf_perf.c - eBPF性能分析程序
#include<linux/bpf.h>
#include<bpf/bpf_helpers.h>
#include<bpf/bpf_tracing.h>
// Perf event output map.
// NOTE(review): not referenced by the programs visible in this file.
struct {
    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
    __uint(key_size, sizeof(int));
    __uint(value_size, sizeof(__u32));
} perf_map SEC(".maps");

// Latency histogram: 64 counters indexed by log2 bucket
struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(max_entries, 64);
    __type(key, __u32);
    __type(value, __u64);
} latency_hist SEC(".maps");

// Entry timestamps (ns) keyed by pid_tgid
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 10240);
    __type(key, __u64);
    __type(value, __u64);
} start_ts SEC(".maps");
// Function entry: store the current timestamp for this thread.
// Always returns 0.
SEC("kprobe/tcp_sendmsg")
int trace_enter(struct pt_regs *ctx)
{
    __u64 pid_tgid = bpf_get_current_pid_tgid();
    __u64 ts = bpf_ktime_get_ns();

    // BPF_ANY: create the entry or overwrite a stale one
    bpf_map_update_elem(&start_ts, &pid_tgid, &ts, BPF_ANY);
    return 0;
}
// floor(log2(v)) for v > 0, and 0 for v == 0.
// Local replacement for BCC's bpf_log2l(), which the libbpf headers
// included by this file do not provide.
static __always_inline __u32 log2_u64(__u64 v)
{
    __u32 r = 0;

    if (v >> 32) { v >>= 32; r += 32; }
    if (v >> 16) { v >>= 16; r += 16; }
    if (v >> 8)  { v >>= 8;  r += 8; }
    if (v >> 4)  { v >>= 4;  r += 4; }
    if (v >> 2)  { v >>= 2;  r += 2; }
    if (v >> 1)  { r += 1; }
    return r;
}

// Function return: compute the call latency and add it to the histogram.
// Bucket i counts latencies of [2^i, 2^(i+1)) microseconds; bucket 0 also
// holds sub-microsecond calls. Always returns 0.
SEC("kretprobe/tcp_sendmsg")
int trace_return(struct pt_regs *ctx)
{
    __u64 pid_tgid = bpf_get_current_pid_tgid();
    __u64 *tsp, delta;

    tsp = bpf_map_lookup_elem(&start_ts, &pid_tgid);
    if (!tsp)
        return 0;   // entry was not seen for this thread

    // Latency in nanoseconds
    delta = bpf_ktime_get_ns() - *tsp;
    bpf_map_delete_elem(&start_ts, &pid_tgid);

    // Update the histogram (delta converted to microseconds)
    __u32 bucket = log2_u64(delta / 1000);
    if (bucket >= 64)
        bucket = 63;   // keep the index provably inside the array

    __u64 *val = bpf_map_lookup_elem(&latency_hist, &bucket);
    if (val)
        __atomic_add_fetch(val, 1, __ATOMIC_RELAXED);
    return 0;
}
// License string, placed in the "license" ELF section
char LICENSE[] SEC("license") = "GPL";
// memleak_detect.stp - 内存泄漏检测
#!/usr/bin/stap
global allocs[100000]      // pointer -> requested bytes of a live allocation
global alloc_site[100000]  // pointer -> backtrace captured at allocation time
global leaks               // backtrace -> outstanding bytes (rebuilt per report)

// Record every successful kmalloc. The vm.kmalloc tapset probe supplies
// ptr and bytes_req, and the backtrace taken here is the allocation site.
probe vm.kmalloc {
    if (ptr != 0) {
        allocs[ptr] = bytes_req
        alloc_site[ptr] = backtrace()
    }
}

// Forget an allocation once it is freed
probe vm.kfree {
    if (ptr in allocs) {
        delete allocs[ptr]
        delete alloc_site[ptr]
    } else if (ptr != 0) {
        // Typically memory allocated before the script started
        printf("WARNING: freeing untracked memory %p\n", ptr)
    }
}

// Every 60 seconds: group live allocations by allocation site and print
// the 10 sites holding the most bytes
probe timer.sec(60) {
    printf("\n=== Potential Memory Leaks ===\n")
    // Rebuild from scratch so bytes are not double-counted across reports
    delete leaks
    foreach (p in allocs) {
        leaks[alloc_site[p]] += allocs[p]
    }
    foreach (bt in leaks- limit 10) {
        printf("Leaked %d bytes from:\n", leaks[bt])
        print_stack(bt)
        printf("\n")
    }
}
#!/bin/bash
# io_latency.sh - I/O延迟分析
#######################################
# Analyze block-device I/O latency with perf.
# Records all block tracepoints for 30 s, then pairs each block_rq_issue
# with the block_rq_complete for the same device and sector.
# Outputs: one "<latency> ms" line per completed request on stdout.
# Returns: non-zero if recording fails, else the status of the pipeline.
#######################################
analyze_block_io() {
  # Quoted so the shell cannot glob block:* against local files
  sudo perf record -e 'block:*' -a -g -- sleep 30 || return

  # Requests are keyed by device + sector: the last field differs between
  # issue and complete lines. perf timestamps are in seconds, hence * 1000.
  sudo perf script | awk '
    {
      ev = 0
      for (i = 1; i <= NF; i++) {
        if ($i ~ /^block:block_rq_(issue|complete):$/) { ev = i; break }
      }
      if (ev < 2) next

      ts = $(ev - 1) + 0
      sector = ""
      for (j = ev + 1; j < NF; j++) {
        if ($(j + 1) == "+") { sector = $j; break }
      }
      if (sector == "") next

      key = $(ev + 1) SUBSEP sector
      if ($ev ~ /issue/) {
        start[key] = ts
      } else if (key in start) {
        printf "%.3f ms\n", (ts - start[key]) * 1000
        delete start[key]
      }
    }'
}
#######################################
# Show a block I/O latency histogram with the BCC biolatency tool.
# Returns: status of biolatency-bpfcc.
#######################################
use_biolatency() {
  # The Debian/Ubuntu package that ships biolatency-bpfcc is bpfcc-tools
  sudo apt install bpfcc-tools
  # -m: millisecond histogram; interval 10 s, count 1, so exactly one
  # histogram covering 10 seconds is printed
  sudo biolatency-bpfcc -m 10 1
}
// cpu_affinity.c - CPU亲和性设置
#include<sched.h>
#include<pthread.h>
/*
 * Pin the calling thread to a single CPU.
 *
 * cpu: zero-based CPU index; values outside [0, CPU_SETSIZE) are ignored.
 *
 * The result of pthread_setaffinity_np() is not reported because the
 * function returns void.
 * NOTE(review): cpu_set_t, the CPU_* macros and pthread_setaffinity_np()
 * need _GNU_SOURCE defined before the first #include — confirm the build
 * does this.
 */
void set_cpu_affinity(int cpu)
{
    cpu_set_t cpuset;

    if (cpu < 0 || cpu >= CPU_SETSIZE)
        return;

    CPU_ZERO(&cpuset);
    CPU_SET(cpu, &cpuset);
    pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
}
// 中断亲和性设置脚本
// irq_affinity.sh
#!/bin/bash
#######################################
# Bind an IRQ to a list of CPUs via /proc/irq/<IRQ>/smp_affinity_list.
# Arguments: $1 - IRQ number
#            $2 - CPU list, e.g. "0-3" or "0,2"
# Returns:   2 if an argument is missing or empty, otherwise the status
#            of the write.
#######################################
set_irq_affinity() {
  if (( $# < 2 )) || [[ -z "$1" || -z "$2" ]]; then
    printf 'usage: set_irq_affinity IRQ CPU_LIST\n' >&2
    return 2
  fi
  local irq=$1
  local cpu_list=$2

  printf '%s\n' "$cpu_list" > "/proc/irq/${irq}/smp_affinity_list"
}
# Bind every eth0 interrupt listed in /proc/interrupts to CPUs 0-3
while read -r irq; do
  set_irq_affinity "$irq" "0-3"
done < <(grep eth0 /proc/interrupts | cut -d: -f1)
#!/bin/bash
# numa_optimize.sh - NUMA优化
# Show the NUMA topology
numactl --hardware
# Run ./app bound to the CPUs and memory of NUMA node 0
numactl --cpunodebind=0 --membind=0 ./app
# Per-process NUMA statistics. PID must be set; the expansion is quoted
# and aborts with a message instead of running `numastat -p` with no value.
numastat -p "${PID:?set PID to the target process ID}"
# Enable automatic kernel NUMA balancing
echo 1 > /proc/sys/kernel/numa_balancing
#!/bin/bash
# network_optimize.sh - 网络栈优化
# Interface to tune; override with IFACE=<name>, defaults to eth0
IFACE=${IFACE:-eth0}

# Raise the socket buffer limits (128 MiB maximum)
sysctl -w net.core.rmem_max=134217728
sysctl -w net.core.wmem_max=134217728
sysctl -w net.ipv4.tcp_rmem="4096 87380 134217728"
sysctl -w net.ipv4.tcp_wmem="4096 65536 134217728"
# Enable RPS on rx queue 0 (hex CPU mask f = CPUs 0-3)
echo f > "/sys/class/net/${IFACE}/queues/rx-0/rps_cpus"
# Enable XPS on tx queue 0 (hex CPU mask 1 = CPU 0)
echo 1 > "/sys/class/net/${IFACE}/queues/tx-0/xps_cpus"
# Turn off GRO and TSO to reduce latency
ethtool -K "$IFACE" gro off
ethtool -K "$IFACE" tso off
# None of these targets produce a file of the same name
.PHONY: install-deps perf-cpu perf-mem stap-io flamegraph clean

# Install dependencies (bpfcc-tools is the Debian/Ubuntu name of the BCC
# tools package); the FlameGraph clone is skipped when it already exists
install-deps:
	sudo apt install linux-tools-generic
	sudo apt install systemtap systemtap-sdt-dev
	sudo apt install bpftrace bpfcc-tools
	[ -d FlameGraph ] || git clone https://github.com/brendangregg/FlameGraph

# Run the performance analyses
perf-cpu:
	sudo perf record -F 99 -a -g -- sleep 30
	sudo perf report

perf-mem:
	sudo perf mem record -- ./app
	sudo perf mem report

# NOTE(review): io_monitor.stp is not defined in this document — confirm
# the intended script name.
stap-io:
	sudo stap -v io_monitor.stp

flamegraph:
	./flamegraph_cpu.sh

# Remove recorded data and generated graphs
clean:
	rm -f perf.data* *.svg *.folded
掌握性能分析工具后，可以：