Linux调度器负责决定哪个任务在哪个CPU上运行多长时间。从最初的O(1)调度器到现在的CFS/EEVDF,Linux调度器经历了持续演进。理解调度器对于性能调优、实时系统开发和内核研究都至关重要。
一、调度器架构总览
1.1 调度类(Scheduling Classes)
/*
* Linux调度类层次(优先级从高到低):
*
* stop_sched_class - 最高优先级,用于CPU迁移等停机操作
* dl_sched_class - Deadline调度(EDF算法)
* rt_sched_class - 实时调度(SCHED_FIFO/SCHED_RR)
* fair_sched_class - CFS公平调度(SCHED_NORMAL/BATCH/IDLE)
* idle_sched_class - CPU idle时运行
*
* 选择逻辑:按优先级顺序,第一个有可运行任务的调度类胜出
*/
# 查看进程调度策略
chrt -p $$ # 查看当前shell的调度策略
ps -eo pid,cls,pri,ni,cmd | head -20
# CLS: TS=正常, FF=FIFO, RR=RR, B=BATCH, IDL=IDLE, DL=Deadline
# 修改调度策略
sudo chrt -f -p 50 <PID> # 设置为SCHED_FIFO优先级50
sudo chrt -r -p 50 <PID> # 设置为SCHED_RR优先级50
sudo chrt -o -p 0 <PID> # 恢复为普通调度
# 查看实时任务
ps -eo pid,cls,pri,cmd | grep -E "FF|RR"
1.2 核心数据结构
// sched_structs.c - 调度核心数据结构分析
#include<linux/module.h>
#include<linux/sched.h>
#include<linux/sched/task.h>
MODULE_LICENSE("GPL");
/*
* 关键结构体位置:
* struct task_struct - include/linux/sched.h
* struct sched_entity - include/linux/sched.h
* struct cfs_rq - kernel/sched/sched.h
* struct rq - kernel/sched/sched.h
*/
/*
 * Dump one task's scheduling state to the kernel log: policy, the three
 * priority fields, CFS entity statistics and context-switch counters.
 */
static void dump_task_sched_info(struct task_struct *task)
{
    pr_info("=== Task Sched Info: %s (PID=%d) ===\n",
            task->comm, task->pid);

    /* Scheduling policy: 0=NORMAL, 1=FIFO, 2=RR, 3=BATCH, 5=IDLE, 6=DEADLINE */
    pr_info("Policy: %u\n", task->policy);

    /* Kernel priority scale: 0 (highest RT) .. 99 (lowest RT),
     * 100 (highest normal) .. 139 (lowest normal) */
    pr_info("static_prio: %d\n", task->static_prio); /* mapped from nice */
    pr_info("normal_prio: %d\n", task->normal_prio); /* normal priority */
    pr_info("prio: %d\n", task->prio);               /* dynamic priority */

    /* CFS scheduling entity */
    pr_info("se.vruntime: %llu ns\n", task->se.vruntime);
    pr_info("se.sum_exec: %llu ns\n", task->se.sum_exec_runtime);
    pr_info("se.on_rq: %d\n", task->se.on_rq);

    /* Context-switch statistics */
    pr_info("nvcsw: %lu (voluntary)\n", task->nvcsw);
    pr_info("nivcsw: %lu (involuntary)\n", task->nivcsw);
}
/* Module init: walk the process list under RCU and dump low-PID tasks. */
static int __init sched_info_init(void)
{
    struct task_struct *task;

    rcu_read_lock();
    for_each_process(task) {
        if (task->pid <= 10) /* only show the first few processes */
            dump_task_sched_info(task);
    }
    rcu_read_unlock();
    return 0;
}
/* Nothing to tear down: init only printed to the log. */
static void __exit sched_info_exit(void) {}
module_init(sched_info_init);
module_exit(sched_info_exit);
二、CFS调度器深度解析
2.1 CFS核心思想
CFS(Completely Fair Scheduler)目标:
- 每个进程获得相同比例的CPU时间
- 使用"虚拟运行时间"(vruntime)衡量公平性
- vruntime增长越慢,优先级越高(nice越低)
vruntime计算公式:
vruntime += delta_exec * NICE_0_LOAD / weight
其中 weight 由nice值决定(nice -20时weight最大,nice 19时最小)
运行队列使用红黑树(RB-tree)组织:
- 按vruntime排序
- 最左节点(vruntime最小)= 下一个被调度的任务
- 插入/删除: O(log n)
2.2 nice值与权重
# nice值范围:-20(最高优先级)到 19(最低优先级)
# 默认nice值:0
# 以特定nice值启动程序
nice -n 10 ./myprogram
nice -n -5 ./myprogram # 需要root(负nice值)
# 修改运行中进程的nice值
renice -n 5 -p <PID>
renice -n -5 -p <PID> # 需要root
# 查看nice值
ps -eo pid,ni,cmd | head -20
# nice值对应的内核权重(prio_to_weight[]数组)
# nice -20: weight = 88761
# nice -5: weight = 3121
# nice 0: weight = 1024 (NICE_0_LOAD)
# nice 5: weight = 335
# nice 19: weight = 15
# 相邻nice值之间约相差10%的CPU时间
2.3 CFS调度器源码分析
/*
* kernel/sched/fair.c 关键函数解析
*
* 核心调度函数调用链:
* schedule()
* → __schedule()
* → pick_next_task()
* → fair_sched_class.pick_next_task()
* → pick_next_task_fair()
* → pick_next_entity()
* → __pick_first_entity() ← 选择rb-tree最左节点
*/
/* 虚拟运行时间更新 */
staticvoidupdate_curr(struct cfs_rq *cfs_rq)
{
structsched_entity *curr = cfs_rq->curr;
u64 now = rq_clock_task(rq_of(cfs_rq));
u64 delta_exec;
delta_exec = now - curr->exec_start;
curr->exec_start = now;
/* 累计实际运行时间 */
curr->sum_exec_runtime += delta_exec;
/* 更新vruntime(按权重缩放)*/
curr->vruntime += calc_delta_fair(delta_exec, curr);
/* 更新CFS队列的min_vruntime */
update_min_vruntime(cfs_rq);
}
/* 选择下一个实体(红黑树最左节点)*/
staticstructsched_entity *
__pick_first_entity(structcfs_rq *cfs_rq)
{
structrb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
if (!left)
returnNULL;
return rb_entry(left, struct sched_entity, run_node);
}
/*
 * Decide at tick time whether the current task should be preempted
 * (simplified from kernel/sched/fair.c; the kernel only calls this when
 * nr_running > 1, so the leftmost entity lookup below is non-NULL there).
 */
static void check_preempt_tick(struct cfs_rq *cfs_rq,
                               struct sched_entity *curr)
{
    unsigned long ideal_runtime, delta_exec;
    struct sched_entity *se;
    s64 delta;

    /* ideal slice = scheduling latency / nr_running (weight-scaled) */
    ideal_runtime = sched_slice(cfs_rq, curr);
    delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;

    /* ran past its ideal slice: flag the runqueue for rescheduling */
    if (delta_exec > ideal_runtime) {
        resched_curr(rq_of(cfs_rq));
        return;
    }

    /* also preempt when the leftmost entity's vruntime lags far behind */
    se = __pick_first_entity(cfs_rq);
    delta = curr->vruntime - se->vruntime;
    if (delta > (s64)ideal_runtime)
        resched_curr(rq_of(cfs_rq));
}
2.4 CFS实验:观察vruntime
// vruntime_monitor.c - 通过proc监控vruntime
#include<linux/module.h>
#include<linux/proc_fs.h>
#include<linux/seq_file.h>
#include<linux/sched.h>
#include<linux/sched/task.h>
MODULE_LICENSE("GPL");
/*
 * seq_file show handler for /proc/vruntime: prints PID, comm, nice,
 * vruntime and cumulative execution time of every CFS-scheduled task.
 */
static int vruntime_show(struct seq_file *m, void *v)
{
    struct task_struct *task;
    struct sched_entity *se;

    seq_printf(m, "%-8s %-16s %5s %20s %20s\n",
               "PID", "COMM", "NI", "VRUNTIME(ns)", "SUM_EXEC(ns)");
    seq_printf(m, "%s\n", "-----------------------------------------------------------");

    rcu_read_lock();
    for_each_process(task) {
        /* only tasks handled by CFS */
        if (task->policy == SCHED_NORMAL ||
            task->policy == SCHED_BATCH) {
            se = &task->se;
            seq_printf(m, "%-8d %-16s %5d %20llu %20llu\n",
                       task->pid,
                       task->comm,
                       task_nice(task),
                       se->vruntime,
                       se->sum_exec_runtime);
        }
    }
    rcu_read_unlock();
    return 0;
}
staticintvruntime_open(struct inode *inode, struct file *file)
{
return single_open(file, vruntime_show, NULL);
}
staticconststructproc_opsvruntime_fops = {
.proc_open = vruntime_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_release = single_release,
};
staticstructproc_dir_entry *proc_entry;
/* Create read-only /proc/vruntime. */
static int __init vruntime_init(void)
{
    proc_entry = proc_create("vruntime", 0444, NULL, &vruntime_fops);
    if (!proc_entry)
        return -ENOMEM;
    pr_info("vruntime monitor: cat /proc/vruntime\n");
    return 0;
}
/* Remove /proc/vruntime on unload. */
static void __exit vruntime_exit(void)
{
    proc_remove(proc_entry);
}
module_init(vruntime_init);
module_exit(vruntime_exit);
三、调度延迟与抢占
3.1 调度延迟参数
# CFS调度延迟参数(所有任务至少在此时间内运行一次)
cat /proc/sys/kernel/sched_latency_ns # 默认6ms(4+核时自动调整)
cat /proc/sys/kernel/sched_min_granularity_ns # 最小运行粒度 0.75ms
cat /proc/sys/kernel/sched_wakeup_granularity_ns # 唤醒粒度
# 实时调度统计
cat /proc/sys/kernel/sched_rt_period_us # RT调度周期 1秒
cat /proc/sys/kernel/sched_rt_runtime_us # RT任务每周期最大运行时间 950ms
# 上述限制防止RT任务饿死普通任务
# 设置-1禁用此限制(危险)
# 查看调度统计
cat /proc/schedstat
# 使用perf观察调度事件
sudo perf sched record -- sleep 5
sudo perf sched latency | head -30
sudo perf sched timehist | head -30
3.2 抢占模式
# 查看内核抢占配置
grep -E "PREEMPT" /boot/config-$(uname -r)
# CONFIG_PREEMPT_NONE - 无抢占(服务器,吞吐量优先)
# CONFIG_PREEMPT_VOLUNTARY - 自愿抢占(桌面,平衡)
# CONFIG_PREEMPT - 完全抢占(嵌入式,响应优先)
# CONFIG_PREEMPT_RT - 实时内核(RT-Linux,最低延迟)
// preempt_demo.c - 抢占计数与禁止
#include<linux/module.h>
#include<linux/preempt.h>
#include<linux/delay.h>
#include<linux/smp.h>
MODULE_LICENSE("GPL");
/*
 * Demonstrate preempt_count(), preemption disabling, bottom-half
 * disabling and the common context-query helpers.
 */
static int __init preempt_demo_init(void)
{
    pr_info("=== Preemption Demo ===\n");
    pr_info("preempt_count: %d\n", preempt_count());

    /* Disable preemption: this task cannot be scheduled away,
     * so smp_processor_id() is stable inside the window. */
    preempt_disable();
    pr_info("After preempt_disable: count=%d, CPU=%d\n",
            preempt_count(), smp_processor_id());
    udelay(100);
    preempt_enable();
    pr_info("After preempt_enable: count=%d\n", preempt_count());

    /* local_bh_disable(): block softirqs/tasklets on this CPU;
     * per-CPU data can be accessed safely in between. */
    local_bh_disable();
    local_bh_enable();

    /* context queries: interrupt / atomic / IRQ-off state */
    pr_info("in_interrupt: %d\n", (int)in_interrupt());
    pr_info("in_atomic: %d\n", (int)in_atomic());
    pr_info("irqs_disabled: %d\n", (int)irqs_disabled());
    return 0;
}

static void __exit preempt_demo_exit(void) {}
module_init(preempt_demo_init);
module_exit(preempt_demo_exit);
四、实时调度(SCHED_FIFO/SCHED_RR)
4.1 实时调度策略
/*
* SCHED_FIFO(先进先出实时):
* - 固定优先级(1-99,99最高)
* - 同优先级:先来先服务,不主动让出CPU
* - 只有更高优先级任务或自愿让出才切换
*
* SCHED_RR(轮转实时):
* - 同SCHED_FIFO,但同优先级任务分时片轮转
* - 时间片到期放到同优先级队列尾部
*
* 实时任务优先于所有普通任务
* 注意:不要让RT任务死循环,会饿死系统!
*/
// rt_task_demo.c - 实时任务示例
#include<stdio.h>
#include<stdlib.h>
#include<pthread.h>
#include<sched.h>
#include<sys/mman.h>
#include<time.h>
#include<string.h>
#define ITERATIONS 1000
#define PERIOD_US 1000 /* 1ms周期 */
/* Wakeup-latency statistics collected by the periodic RT thread. */
struct rt_stats {
    long min_lat_us;  /* smallest observed wakeup latency */
    long max_lat_us;  /* largest observed wakeup latency */
    long avg_lat_us;  /* running sum during the loop, mean afterwards */
    int overruns;     /* wakeups later than half a period */
};
/* Advance *ts by 'us' microseconds, normalizing tv_nsec into [0, 1e9). */
static void timespec_add_us(struct timespec *ts, long us)
{
    ts->tv_nsec += us * 1000;
    while (ts->tv_nsec >= 1000000000L) {
        ts->tv_nsec -= 1000000000L;
        ts->tv_sec++;
    }
}
/* Return (a - b) in microseconds; both must come from the same clock. */
static long timespec_diff_us(struct timespec *a, struct timespec *b)
{
    return (a->tv_sec - b->tv_sec) * 1000000L +
           (a->tv_nsec - b->tv_nsec) / 1000L;
}
void *rt_periodic_task(void *arg)
{
structrt_statsstats = {.min_lat_us = LONG_MAX};
structtimespecnext, now;
int i;
/* 获取初始时间 */
clock_gettime(CLOCK_MONOTONIC, &next);
for (i = 0; i < ITERATIONS; i++) {
/* 计算下一个周期 */
timespec_add_us(&next, PERIOD_US);
/* 等待到下一个周期 */
clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &next, NULL);
/* 记录实际唤醒时间,计算延迟 */
clock_gettime(CLOCK_MONOTONIC, &now);
long latency = timespec_diff_us(&now, &next);
if (latency < stats.min_lat_us) stats.min_lat_us = latency;
if (latency > stats.max_lat_us) stats.max_lat_us = latency;
stats.avg_lat_us += latency;
if (latency > PERIOD_US / 2) stats.overruns++;
/* 实际工作(模拟100us处理时间)*/
structtimespecwork_end;
clock_gettime(CLOCK_MONOTONIC, &work_end);
timespec_add_us(&work_end, 100);
while (1) {
structtimespect;
clock_gettime(CLOCK_MONOTONIC, &t);
if (t.tv_sec > work_end.tv_sec ||
(t.tv_sec == work_end.tv_sec &&
t.tv_nsec >= work_end.tv_nsec))
break;
}
}
stats.avg_lat_us /= ITERATIONS;
printf("RT Task Stats (%d iterations, %dus period):\n", ITERATIONS, PERIOD_US);
printf(" Min latency: %ld us\n", stats.min_lat_us);
printf(" Max latency: %ld us\n", stats.max_lat_us);
printf(" Avg latency: %ld us\n", stats.avg_lat_us);
printf(" Overruns: %d\n", stats.overruns);
returnNULL;
}
/*
 * Launch rt_periodic_task as a SCHED_FIFO priority-80 thread.  Memory is
 * locked and the stack pre-faulted so page faults cannot add latency.
 * Falls back to an ordinary SCHED_OTHER thread when RT privileges are
 * unavailable.
 */
int main(void)
{
    pthread_t thread;
    pthread_attr_t attr;
    struct sched_param param;

    /* lock current and future memory to avoid page-fault latency */
    if (mlockall(MCL_CURRENT | MCL_FUTURE) < 0) {
        perror("mlockall (need root or CAP_IPC_LOCK)");
    }

    /* pre-fault the stack so growth cannot fault during the RT loop */
    char stack_prefault[64 * 1024];
    memset(stack_prefault, 0, sizeof(stack_prefault));

    /* create the real-time thread with explicit scheduling attributes */
    pthread_attr_init(&attr);
    pthread_attr_setschedpolicy(&attr, SCHED_FIFO);
    param.sched_priority = 80;
    pthread_attr_setschedparam(&attr, &param);
    pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);

    if (pthread_create(&thread, &attr, rt_periodic_task, NULL) != 0) {
        perror("pthread_create (need root or RLIMIT_RTPRIO)");
        /*
         * Fall back to a normal thread.  SCHED_OTHER only accepts
         * priority 0, so the sched_param must be reset as well —
         * otherwise this second create would also fail with EINVAL.
         */
        pthread_attr_setschedpolicy(&attr, SCHED_OTHER);
        param.sched_priority = 0;
        pthread_attr_setschedparam(&attr, &param);
        pthread_create(&thread, &attr, rt_periodic_task, NULL);
    }

    pthread_join(thread, NULL);
    pthread_attr_destroy(&attr);
    return 0;
}
4.2 实时任务内核模块
// rt_kthread.c - 实时内核线程
#include<linux/module.h>
#include<linux/kthread.h>
#include<linux/sched.h>
#include<linux/sched/rt.h>
#include<linux/hrtimer.h>
#include<linux/delay.h>
MODULE_LICENSE("GPL");
staticstructtask_struct *rt_thread;
staticstructhrtimerperiodic_timer;
staticktime_t period;
staticint stop_thread = 0;
staticenum hrtimer_restart timer_fn(struct hrtimer *timer)
{
/* 唤醒RT线程 */
if (rt_thread)
wake_up_process(rt_thread);
hrtimer_forward_now(timer, period);
return HRTIMER_RESTART;
}
staticintrt_task_fn(void *data)
{
structsched_paramparam = { .sched_priority = 80 };
int count = 0;
/* 设置为SCHED_FIFO实时调度 */
sched_setscheduler(current, SCHED_FIFO, ¶m);
pr_info("RT kthread started: policy=%d, prio=%d\n",
current->policy, current->rt_priority);
while (!kthread_should_stop() && !stop_thread) {
/* 等待定时器唤醒 */
set_current_state(TASK_INTERRUPTIBLE);
schedule();
if (kthread_should_stop()) break;
/* 执行实时任务 */
count++;
if (count % 1000 == 0)
pr_info("RT task: %d iterations\n", count);
}
pr_info("RT kthread stopping after %d iterations\n", count);
return0;
}
/* Create the RT worker thread and start the 1 ms periodic hrtimer. */
static int __init rt_kthread_init(void)
{
    period = ktime_set(0, 1000000); /* 1 ms */

    /* create and start the RT kernel thread */
    rt_thread = kthread_create(rt_task_fn, NULL, "rt_demo");
    if (IS_ERR(rt_thread))
        return PTR_ERR(rt_thread);
    wake_up_process(rt_thread);

    /* start the high-resolution periodic timer */
    hrtimer_init(&periodic_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    periodic_timer.function = timer_fn;
    hrtimer_start(&periodic_timer, period, HRTIMER_MODE_REL);

    pr_info("RT demo loaded\n");
    return 0;
}
/* Cancel the timer first so it cannot wake the thread being stopped. */
static void __exit rt_kthread_exit(void)
{
    stop_thread = 1;
    hrtimer_cancel(&periodic_timer);
    if (rt_thread)
        kthread_stop(rt_thread); /* wakes the thread if it is sleeping */
    pr_info("RT demo unloaded\n");
}
module_init(rt_kthread_init);
module_exit(rt_kthread_exit);
五、Deadline调度(SCHED_DEADLINE)
5.1 EDF算法
/*
* SCHED_DEADLINE 基于 EDF(Earliest Deadline First)算法:
* - 参数:runtime, deadline, period
* - 含义:每period时间内,任务需要deadline前完成runtime的工作
* - 示例:runtime=5ms, deadline=10ms, period=20ms
* 每20ms内,任务要在10ms截止时间内运行5ms
*
* 准入控制:
* - 内核检查是否会导致调度不可行
* - sum(runtime/period) <= 1 才允许
*/
// deadline_task.c - Deadline调度用户空间示例
#define _GNU_SOURCE
#include<errno.h>
#include<stdint.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<time.h>
#include<unistd.h>
#include<sys/syscall.h>
#include<linux/sched.h>
/* Userspace mirror of the kernel's struct sched_attr (SCHED_DEADLINE). */
struct sched_attr {
    uint32_t size;           /* sizeof(struct sched_attr) */
    uint32_t sched_policy;   /* e.g. SCHED_DEADLINE */
    uint64_t sched_flags;
    int32_t  sched_nice;     /* for SCHED_NORMAL/BATCH */
    uint32_t sched_priority; /* for SCHED_FIFO/RR */
    uint64_t sched_runtime;  /* CPU time needed per period (ns) */
    uint64_t sched_deadline; /* deadline relative to period start (ns) */
    uint64_t sched_period;   /* period length (ns) */
};
/* glibc exports no sched_setattr(); invoke the raw syscall. */
static int sched_setattr(pid_t pid, const struct sched_attr *attr,
                         unsigned int flags)
{
    return syscall(SYS_sched_setattr, pid, attr, flags);
}
/* glibc exports no sched_getattr(); invoke the raw syscall. */
static int sched_getattr(pid_t pid, struct sched_attr *attr,
                         unsigned int size, unsigned int flags)
{
    return syscall(SYS_sched_getattr, pid, attr, size, flags);
}
intmain()
{
structsched_attrattr;
memset(&attr, 0, sizeof(attr));
attr.size = sizeof(attr);
attr.sched_policy = SCHED_DEADLINE;
attr.sched_runtime = 5000000; /* 5ms runtime */
attr.sched_deadline = 10000000; /* 10ms deadline */
attr.sched_period = 20000000; /* 20ms period */
printf("Setting SCHED_DEADLINE: runtime=5ms, deadline=10ms, period=20ms\n");
if (sched_setattr(0, &attr, 0) < 0) {
perror("sched_setattr");
printf("Note: SCHED_DEADLINE requires root or CAP_SYS_NICE\n");
return1;
}
printf("SCHED_DEADLINE set successfully!\n");
/* 执行周期性工作 */
for (int i = 0; i < 100; i++) {
/* 模拟5ms的计算工作 */
structtimespecend;
clock_gettime(CLOCK_MONOTONIC, &end);
end.tv_nsec += 4000000; /* 4ms */
if (end.tv_nsec >= 1000000000) {
end.tv_nsec -= 1000000000;
end.tv_sec++;
}
while (1) {
structtimespecnow;
clock_gettime(CLOCK_MONOTONIC, &now);
if (now.tv_sec > end.tv_sec ||
(now.tv_sec == end.tv_sec && now.tv_nsec >= end.tv_nsec))
break;
}
/* 主动让出CPU,等待下一个周期 */
sched_yield();
}
printf("Done.\n");
return0;
}
六、EEVDF调度器(Linux 6.6+)
6.1 EEVDF概述
/*
* EEVDF(Earliest Eligible Virtual Deadline First)
* Linux 6.6正式引入,逐步替代CFS
*
* 改进点:
* - 解决CFS的"抢占过于激进"问题
* - 引入"lag"概念:任务欠的CPU时间(负lag=欠执行,正lag=超额执行)
* - 引入"slice":每次调度的时间片(可通过sched_attr设置)
* - eligible:只有lag <= 0的任务才是eligible(避免超额执行的任务抢占)
* - 在eligible任务中选择虚拟截止时间最早的
*
* 关键公式:
* virtual_deadline = vruntime + slice / weight
* lag = entity_key - min_vruntime
*
* 源码位置:kernel/sched/fair.c
* 关键函数:pick_eevdf()
*/
# 查看EEVDF相关参数(Linux 6.6+)
cat /proc/sys/kernel/sched_min_granularity_ns
cat /proc/sys/kernel/sched_base_slice_ns # EEVDF基础时间片
# 通过sched_attr设置自定义时间片(EEVDF特性)
# sched_attr.sched_flags |= SCHED_FLAG_UTIL_CLAMP
# sched_attr.sched_runtime = 2000000 # 2ms时间片
七、负载均衡
7.1 调度域(Scheduling Domains)
调度域层次反映CPU拓扑:
SMT Domain(超线程) - 同物理核的逻辑核
MC Domain(多核) - 同物理处理器的核
NUMA Domain - 同NUMA节点
System Domain - 跨NUMA节点
# 查看调度域信息
ls /proc/sys/kernel/sched_domain/
cat /sys/kernel/debug/sched/domains | head -50
# 查看CPU负载
cat /proc/schedstat | head -10
# 每行:CPU id, yield_count, ...
# 使用mpstat监控CPU负载均衡
mpstat -P ALL 1 5
# 设置CPU亲和性(避免迁移)
taskset -c 0,1 ./myprogram # 限制在CPU0和1上运行
taskset -p 0x3 <PID> # 通过CPU掩码设置
// affinity_demo.c - CPU亲和性设置
#include<linux/module.h>
#include<linux/sched.h>
#include<linux/cpumask.h>
#include<linux/smp.h>
MODULE_LICENSE("GPL");
/*
 * Demonstrate CPU affinity from kernel space: print the allowed mask,
 * pin the current task to CPU0, restore all CPUs, run a function on
 * CPU0, and dump per-CPU runqueue lengths.
 */
static int __init affinity_demo_init(void)
{
    struct cpumask mask;
    int cpu;

    pr_info("=== CPU Affinity Demo ===\n");
    pr_info("Current CPU: %d\n", smp_processor_id());
    pr_info("Allowed CPUs: %*pbl\n",
            cpumask_pr_args(current->cpus_ptr));

    /* restrict the current task to CPU 0 only */
    cpumask_clear(&mask);
    cpumask_set_cpu(0, &mask);
    if (set_cpus_allowed_ptr(current, &mask) == 0) {
        pr_info("Pinned to CPU 0, now on CPU: %d\n",
                smp_processor_id());
    }

    /* restore: allow all online CPUs again */
    cpumask_copy(&mask, cpu_online_mask);
    set_cpus_allowed_ptr(current, &mask);

    /*
     * Run a function on a specific CPU.  NOTE(review): casting the
     * variadic pr_info to void(*)(void*) is not a valid
     * smp_call_function callback signature; a dedicated helper
     * function would be the correct approach.
     */
    pr_info("Calling function on CPU 0...\n");
    smp_call_function_single(0, (void (*)(void *))pr_info,
                             "Hello from CPU 0\n", 1);

    /* per-CPU runqueue lengths (cpu_rq() needs kernel-internal headers) */
    for_each_online_cpu(cpu) {
        pr_info("CPU %d: nr_running=%u\n",
                cpu, cpu_rq(cpu)->nr_running);
    }
    return 0;
}

static void __exit affinity_demo_exit(void) {}
module_init(affinity_demo_init);
module_exit(affinity_demo_exit);
八、调度器调试与分析
8.1 ftrace追踪调度事件
# 追踪进程调度切换
cd /sys/kernel/debug/tracing
echo 0 > tracing_on
# 方法1:使用sched_switch追踪点
echo 1 > events/sched/sched_switch/enable
echo 1 > events/sched/sched_wakeup/enable
echo 1 > tracing_on
sleep 2
echo 0 > tracing_on
cat trace | head -50
# 方法2:function_graph追踪调度函数
echo function_graph > current_tracer
echo schedule > set_graph_function
echo 1 > tracing_on
sleep 0.1
echo 0 > tracing_on
cat trace | head -30
# 清理
echo nop > current_tracer
echo > set_graph_function
echo > trace
8.2 perf调度分析
# 记录调度数据
sudo perf sched record -a -- sleep 5
# 分析调度延迟
sudo perf sched latency | head -40
# 时间线视图
sudo perf sched timehist | head -30
# 任务切换映射视图(perf sched没有summary子命令,用timehist的汇总模式)
sudo perf sched timehist --summary
# 分析特定进程的调度
sudo perf stat -e context-switches,cpu-migrations,\
sched:sched_switch,sched:sched_wakeup \
-p <PID> sleep 5
8.3 调度延迟分析模块
// sched_latency.c - 测量调度延迟的内核模块
#include<linux/module.h>
#include<linux/ktime.h>
#include<linux/tracepoint.h>
#include<linux/sched.h>
MODULE_LICENSE("GPL");
static u64 wakeup_time;
static u64 max_latency_ns = 0;
staticatomic64_t total_latency;
staticatomic_t wakeup_count;
/* sched_wakeup tracepoint probe: timestamp wakeups of "latency_test". */
static void probe_sched_wakeup(void *data, struct task_struct *p)
{
    if (strcmp(p->comm, "latency_test") == 0) {
        wakeup_time = ktime_get_ns();
    }
}
/*
 * sched_switch tracepoint probe: when "latency_test" is switched in,
 * record the time since its wakeup as the scheduling latency.
 * NOTE(review): wakeup_time and max_latency_ns are updated from
 * tracepoint context without synchronization — fine for a demo,
 * racy in general.
 */
static void probe_sched_switch(void *data, bool preempt,
                               struct task_struct *prev,
                               struct task_struct *next,
                               unsigned int prev_state)
{
    if (strcmp(next->comm, "latency_test") == 0 && wakeup_time) {
        u64 latency = ktime_get_ns() - wakeup_time;

        if (latency > max_latency_ns)
            max_latency_ns = latency;
        atomic64_add(latency, &total_latency);
        atomic_inc(&wakeup_count);
        wakeup_time = 0;
    }
}

static struct tracepoint *tp_wakeup, *tp_switch; /* currently unused */
/* Attach the probes to the sched_wakeup / sched_switch tracepoints. */
static int __init sched_latency_init(void)
{
    register_trace_sched_wakeup(probe_sched_wakeup, NULL);
    register_trace_sched_switch(probe_sched_switch, NULL);
    pr_info("Sched latency monitor loaded\n");
    pr_info("Run a process named 'latency_test' to measure its scheduling latency\n");
    return 0;
}
/* Detach the probes, wait for in-flight probes, then report the stats. */
static void __exit sched_latency_exit(void)
{
    int count;

    unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
    unregister_trace_sched_switch(probe_sched_switch, NULL);

    /* ensure no probe is still executing before reading the counters */
    tracepoint_synchronize_unregister();

    count = atomic_read(&wakeup_count);
    if (count > 0) {
        u64 avg = atomic64_read(&total_latency) / count;

        pr_info("Scheduling latency stats:\n");
        pr_info(" Samples: %d\n", count);
        pr_info(" Max: %llu ns (%llu us)\n",
                max_latency_ns, max_latency_ns / 1000);
        pr_info(" Avg: %llu ns (%llu us)\n",
                avg, avg / 1000);
    }
}
module_init(sched_latency_init);
module_exit(sched_latency_exit);
九、cgroup与调度
9.1 CPU cgroup控制
# CPU时间配额控制(cpu.cfs_quota_us/cpu.cfs_period_us)
sudo mkdir /sys/fs/cgroup/myapp
# 允许子控制器
echo "+cpu" | sudo tee /sys/fs/cgroup/myapp/cgroup.subtree_control
sudo mkdir /sys/fs/cgroup/myapp/worker
# 限制CPU使用率为50%(100ms周期内使用50ms)
echo "50000 100000" | sudo tee /sys/fs/cgroup/myapp/worker/cpu.max
# 格式:quota period(单位:微秒)
# 设置CPU权重(相对于其他cgroup的优先级,默认100)
echo 200 | sudo tee /sys/fs/cgroup/myapp/worker/cpu.weight
# 将进程加入cgroup
echo $$ | sudo tee /sys/fs/cgroup/myapp/worker/cgroup.procs
# 查看CPU统计
cat /sys/fs/cgroup/myapp/worker/cpu.stat
# 清理
echo $$ | sudo tee /sys/fs/cgroup/cgroup.procs
sudo rmdir /sys/fs/cgroup/myapp/worker
sudo rmdir /sys/fs/cgroup/myapp
实践检查清单
CFS基础
- [ ] 理解vruntime是什么,为什么能实现"公平调度"
- [ ] 理解nice值与权重(weight)的关系
- [ ] 能描述 schedule() 到 pick_next_task_fair() 的完整调用链
- [ ] 编写 /proc/vruntime 模块观察各进程的vruntime差异
实时调度
- [ ] 理解 SCHED_FIFO 和 SCHED_RR 的区别
- [ ] 理解 sched_rt_runtime_us 的保护作用
- [ ] 理解为什么RT任务需要 mlockall()
调度域与负载均衡
- [ ] 能用 taskset/numactl 设置CPU亲和性
- [ ] 理解空闲负载均衡(idle load balance)的触发条件
调试工具
- [ ] 用 perf sched latency 分析系统调度延迟分布
- [ ] 用 ftrace sched_switch 追踪进程切换事件
- [ ] 用 CPU cgroup 限制进程的CPU使用率
- [ ] 理解 context-switches 和 cpu-migrations 的含义