概述
Linux内存管理是内核最复杂的子系统之一,涵盖从物理内存的伙伴系统到用户空间虚拟内存的完整体系。本专题深入剖析各层次内存分配机制、页表结构、内存回收和NUMA架构。
一、物理内存模型
1.1 内存区域(Zone)
/*
* Linux将物理内存划分为多个Zone:
*
* ZONE_DMA (0 - 16MB) - ISA DMA设备使用(x86历史遗留)
* ZONE_DMA32 (0 - 4GB) - 32位DMA设备使用
* ZONE_NORMAL (16MB - ...) - 内核直接映射区域
* ZONE_HIGHMEM (>896MB,32位) - 32位内核无法直接映射的高端内存
* ZONE_MOVABLE - 可迁移页面(支持内存热插拔)
* ZONE_DEVICE - 设备内存(pmem等)
*/
# 查看内存Zone信息
cat /proc/zoneinfo | head -60
# 查看Zone中空闲页面分布
cat /proc/buddyinfo
# Node 0, zone Normal 312 156 78 45 23 12 6 3 1 1 0
# 每列代表 2^0, 2^1, ..., 2^10 页的空闲块数量
# 查看内存总体信息
cat /proc/meminfo
# 查看NUMA节点信息
numactl --hardware
cat /sys/devices/system/node/node*/meminfo
1.2 页描述符(struct page)
// page_info.c - 理解struct page
#include<linux/module.h>
#include<linux/mm.h>
#include<linux/mmzone.h>
#include<linux/page-flags.h>
MODULE_LICENSE("GPL");
/*
 * Dump diagnostic information for one struct page: PFN, physical and
 * kernel virtual address, flag bits, zone/NUMA placement and the two
 * reference counters (_refcount and _mapcount).
 */
static void analyze_page(struct page *page)
{
	pr_info("=== Page Analysis ===\n");
	pr_info("PFN: %lu\n", page_to_pfn(page));
	pr_info("Physical addr: 0x%llx\n", (u64)page_to_phys(page));
	pr_info("Virtual addr: %p\n", page_address(page));
	pr_info("Page flags: 0x%lx\n", page->flags);
	/* Common page-flag predicates */
	pr_info("PageLocked: %d\n", PageLocked(page));
	pr_info("PageDirty: %d\n", PageDirty(page));
	pr_info("PageUptodate: %d\n", PageUptodate(page));
	pr_info("PageSlab: %d\n", PageSlab(page));
	pr_info("PageCompound: %d\n", PageCompound(page));
	pr_info("PageAnon: %d\n", PageAnon(page));
	/* Zone / NUMA placement */
	pr_info("Zone: %s\n", page_zone(page)->name);
	pr_info("NUMA node: %d\n", page_to_nid(page));
	/* Reference counters */
	pr_info("_refcount: %d\n", page_ref_count(page));
	pr_info("_mapcount: %d\n", page_mapcount(page));
}
/*
 * Module init: allocate a single page from the buddy allocator, dump
 * its metadata, write a test pattern and read one byte back, then free
 * the page. Returns 0 on success, -ENOMEM if allocation fails.
 */
static int __init page_info_init(void)
{
	struct page *page;
	void *vaddr;

	/* Allocate one page (order 0) */
	page = alloc_page(GFP_KERNEL);
	if (!page) {
		pr_err("Failed to allocate page\n");
		return -ENOMEM;
	}
	vaddr = page_address(page);
	analyze_page(page);
	/* Fill the page with a pattern and verify the first byte */
	memset(vaddr, 0xAB, PAGE_SIZE);
	pr_info("Page content[0]: 0x%02x\n", *(unsigned char *)vaddr);
	__free_page(page);
	return 0;
}
/* Nothing to clean up: the page is freed in init */
static void __exit page_info_exit(void) {}
module_init(page_info_init);
module_exit(page_info_exit);
二、伙伴系统(Buddy System)
2.1 伙伴系统原理
内存被划分为2^order个连续页面的块(order从0到MAX_ORDER-1;MAX_ORDER通常为11,即最大块为2^10=1024页,合4MB)
分配2^2=4页的请求:
Order: 0 1 2 3 4
Free: 2 1 0 1 2
步骤:
1. 查找order=2的空闲块,找到直接返回
2. 如果没有,查找order=3的块,分裂为两个order=2,返回一个
释放时:
1. 检查伙伴(相邻的同大小块)是否空闲
2. 如果是,合并成更大的块
3. 重复直到无法合并
2.2 伙伴系统操作
// buddy_demo.c - 伙伴系统使用
#include<linux/module.h>
#include<linux/mm.h>
#include<linux/gfp.h>
MODULE_LICENSE("GPL");
/*
 * Allocate blocks of 2^0 .. 2^4 contiguous pages from the buddy
 * allocator, print their PFN/physical address, then free them all.
 */
static void buddy_alloc_demo(void)
{
	struct page *pages[5];
	int orders[] = {0, 1, 2, 3, 4}; /* block size is 2^order pages */
	int i;

	pr_info("=== Buddy System Demo ===\n");
	for (i = 0; i < ARRAY_SIZE(orders); i++) {
		int order = orders[i];

		pages[i] = alloc_pages(GFP_KERNEL, order);
		if (pages[i]) {
			pr_info("order=%d: allocated %lu pages at PFN %lu, phys=0x%llx\n",
				order,
				(unsigned long)(1 << order),
				page_to_pfn(pages[i]),
				(u64)page_to_phys(pages[i]));
		} else {
			pr_err("order=%d: allocation failed!\n", order);
		}
	}
	/* Free every block that was successfully allocated */
	for (i = 0; i < ARRAY_SIZE(orders); i++) {
		if (pages[i])
			__free_pages(pages[i], orders[i]);
	}
	/* Observe the free-list state from userspace: cat /proc/buddyinfo */
}
/*
 * Demonstrate the most common GFP flag combinations by allocating and
 * immediately freeing one page with each.
 */
static void gfp_flags_demo(void)
{
	struct page *page;

	pr_info("=== GFP Flags Demo ===\n");
	/* GFP_KERNEL: may sleep; for process context */
	page = alloc_page(GFP_KERNEL);
	if (page) {
		pr_info("GFP_KERNEL: OK\n");
		__free_page(page);
	}
	/* GFP_ATOMIC: never sleeps; for interrupt/atomic context */
	page = alloc_page(GFP_ATOMIC);
	if (page) {
		pr_info("GFP_ATOMIC: OK\n");
		__free_page(page);
	}
	/* GFP_DMA: allocate from ZONE_DMA for legacy DMA devices */
	page = alloc_page(GFP_DMA);
	if (page) {
		pr_info("GFP_DMA: OK, phys=0x%llx\n", (u64)page_to_phys(page));
		__free_page(page);
	}
	/* GFP_HIGHUSER_MOVABLE: migratable page for user space */
	page = alloc_page(GFP_HIGHUSER_MOVABLE);
	if (page) {
		pr_info("GFP_HIGHUSER_MOVABLE: OK\n");
		__free_page(page);
	}
}
/* Module init: run both buddy-allocator demos */
static int __init buddy_demo_init(void)
{
	buddy_alloc_demo();
	gfp_flags_demo();
	return 0;
}
/* Nothing to clean up: all pages are freed in init */
static void __exit buddy_demo_exit(void) {}
module_init(buddy_demo_init);
module_exit(buddy_demo_exit);
三、SLUB分配器
3.1 Slab分配器演进
SLAB → 最早(1994),复杂,有bug
SLOB → 嵌入式系统,极简,碎片多
SLUB → 现代默认(2007),简化设计,更好性能
3.2 SLUB工作原理
/*
* SLUB核心概念:
*
* kmem_cache - 对象缓存(每种固定大小对象一个)
* slab - 一个或多个连续页组成的对象池
* cpu_slab - 每CPU的本地缓存(减少锁竞争)
* node - NUMA节点的部分满slab列表
*
* 分配流程:
* 1. 检查per-CPU缓存(无锁)
* 2. 从本CPU的partial slab获取
* 3. 从节点的partial slab获取
* 4. 向伙伴系统申请新页
*/
// slub_demo.c - SLUB分配器使用
#include<linux/module.h>
#include<linux/slab.h>
MODULE_LICENSE("GPL");
/* 自定义对象 */
structmy_object {
int id;
char data[60];
structlist_headlist;
};
staticstructkmem_cache *my_cache;
staticvoidkmem_cache_demo(void)
{
structmy_object *objs[10];
int i;
pr_info("=== kmem_cache Demo ===\n");
/* 创建专用缓存 */
my_cache = kmem_cache_create(
"my_object_cache", /* 名称 */
sizeof(struct my_object), /* 对象大小 */
0, /* 对齐(0=默认)*/
SLAB_HWCACHE_ALIGN | /* Cache line对齐 */
SLAB_POISON | /* 内存污染检测 */
SLAB_RED_ZONE, /* 缓冲区溢出检测 */
NULL/* 构造函数 */
);
if (!my_cache) {
pr_err("Failed to create kmem_cache\n");
return;
}
pr_info("Cache created: object_size=%u, size=%u\n",
my_cache->object_size, my_cache->size);
/* 分配对象 */
for (i = 0; i < ARRAY_SIZE(objs); i++) {
objs[i] = kmem_cache_alloc(my_cache, GFP_KERNEL);
if (objs[i]) {
objs[i]->id = i;
snprintf(objs[i]->data, sizeof(objs[i]->data),
"object_%d", i);
}
}
pr_info("Allocated %d objects\n", i);
/* 释放对象 */
for (i = 0; i < ARRAY_SIZE(objs); i++) {
if (objs[i])
kmem_cache_free(my_cache, objs[i]);
}
kmem_cache_destroy(my_cache);
my_cache = NULL;
}
/*
 * Show kmalloc size classes: for each requested size print the actual
 * usable size reported by ksize() (rounded up to the size class).
 */
static void kmalloc_size_demo(void)
{
	void *ptrs[10];
	size_t sizes[] = {8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
	int i;

	pr_info("=== kmalloc Size Classes ===\n");
	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		ptrs[i] = kmalloc(sizes[i], GFP_KERNEL);
		if (ptrs[i]) {
			pr_info("kmalloc(%zu) -> actual=%zu, addr=%p\n",
				sizes[i], ksize(ptrs[i]), ptrs[i]);
		}
	}
	/* kfree(NULL) is a no-op, so failed slots are safe to pass */
	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		kfree(ptrs[i]);
	}
}
/* Module init: run both SLUB demos */
static int __init slub_demo_init(void)
{
	kmem_cache_demo();
	kmalloc_size_demo();
	return 0;
}
/* Nothing to clean up: cache and buffers are released in init */
static void __exit slub_demo_exit(void) {}
module_init(slub_demo_init);
module_exit(slub_demo_exit);
# 查看SLUB缓存信息
cat /proc/slabinfo | head -20
# 或使用更友好的工具
sudo slabtop
# 查看特定缓存详细信息
sudo cat /sys/kernel/slab/task_struct/object_size
sudo cat /sys/kernel/slab/task_struct/slabs
sudo cat /sys/kernel/slab/task_struct/objects
# SLUB调试(内核参数)
# slub_debug=FPZU 启用所有调试选项
# F: 一致性检查
# P: 内存毒化
# Z: 红区
# U: 使用追踪
四、vmalloc与高端内存
4.1 kmalloc vs vmalloc
/*
* kmalloc:
* - 物理上连续
* - 适合小对象(< 4MB)
* - 速度快
* - 可用于DMA
*
* vmalloc:
* - 虚拟上连续,物理上可以不连续
* - 适合大块内存(> 4MB)
* - 速度较慢(需要建立页表)
* - 不可直接用于DMA
*
* get_free_pages:
* - 物理连续
* - 按页分配(2^order页)
* - 无额外元数据开销
*/
// vmalloc_demo.c - vmalloc使用示例
#include<linux/module.h>
#include<linux/vmalloc.h>
#include<linux/mm.h>
MODULE_LICENSE("GPL");
/*
 * Allocate a 2 MB vmalloc region and show that virtually-consecutive
 * pages are generally NOT physically contiguous, by resolving the
 * physical address of the first three pages via vmalloc_to_page().
 */
static void vmalloc_test(void)
{
	void *vptr;
	unsigned long vaddr;
	unsigned long phys0, phys1, phys2;
	size_t size = 2 * 1024 * 1024; /* 2 MB */

	pr_info("=== vmalloc Demo ===\n");
	vptr = vmalloc(size);
	if (!vptr) {
		pr_err("vmalloc failed\n");
		return;
	}
	pr_info("vmalloc(%zu) -> vaddr=%p\n", size, vptr);
	/* Resolve the physical address behind the first three pages */
	vaddr = (unsigned long)vptr;
	phys0 = page_to_phys(vmalloc_to_page((void *)vaddr));
	phys1 = page_to_phys(vmalloc_to_page((void *)(vaddr + PAGE_SIZE)));
	phys2 = page_to_phys(vmalloc_to_page((void *)(vaddr + 2 * PAGE_SIZE)));
	pr_info("Physical addresses of first 3 pages:\n");
	pr_info("  page[0]: 0x%lx\n", phys0);
	pr_info("  page[1]: 0x%lx\n", phys1);
	pr_info("  page[2]: 0x%lx\n", phys2);
	if (phys1 != phys0 + PAGE_SIZE)
		pr_info("  -> Pages are NOT physically contiguous (as expected)\n");
	/* Touch the whole region to prove it is writable */
	memset(vptr, 0, size);
	vfree(vptr);
	pr_info("vmalloc freed\n");
}
/*
 * Print the ioremap API usage pattern for MMIO. This is documentation
 * only: mapping requires a real device physical address.
 */
static void ioremap_demo(void)
{
	pr_info("=== ioremap API (demonstration) ===\n");
	pr_info("ioremap maps device physical addr to kernel virtual addr\n");
	pr_info("Usage: void __iomem *base = ioremap(phys_addr, size);\n");
	pr_info("       ioread32(base + offset);\n");
	pr_info("       iowrite32(val, base + offset);\n");
	pr_info("       iounmap(base);\n");
}
/* Module init: run the vmalloc and ioremap demos */
static int __init vmalloc_demo_init(void)
{
	vmalloc_test();
	ioremap_demo();
	return 0;
}
/* Nothing to clean up: the vmalloc region is freed in init */
static void __exit vmalloc_demo_exit(void) {}
module_init(vmalloc_demo_init);
module_exit(vmalloc_demo_exit);
五、页表结构
5.1 x86-64四级页表
虚拟地址(48位):
+--------+--------+--------+--------+------------+
| PML4 | PDP | PD | PT | Offset |
| 9 bits | 9 bits | 9 bits | 9 bits | 12 bits |
+--------+--------+--------+--------+------------+
47-39 38-30 29-21 20-12 11-0
地址翻译:
CR3 → PML4 Entry → PDP Entry → PD Entry → PT Entry → 物理页
每级表项64位,9位索引 → 512个表项/页
# 查看进程的页表信息
sudo cat /proc/$(pgrep bash)/maps
# 查看页表统计
cat /proc/vmstat | grep -E "pgfault|pgmajfault|pswpin|pswpout"
# x86-64页表条目格式
# Bit 0: P (Present)
# Bit 1: R/W (Read/Write)
# Bit 2: U/S (User/Supervisor)
# Bit 3: PWT (Page Write Through)
# Bit 4: PCD (Page Cache Disable)
# Bit 5: A (Accessed)
# Bit 6: D (Dirty)
# Bit 7: PS (Page Size, 2MB/1GB huge pages)
# Bits 12-51: Physical address
# Bit 63: NX (No Execute)
// pagetable_demo.c - 页表遍历
#include<linux/module.h>
#include<linux/mm.h>
#include<linux/sched.h>
#include<linux/sched/mm.h>
#include<asm/pgtable.h>
MODULE_LICENSE("GPL");
staticintwalk_page_table(unsignedlong addr, struct mm_struct *mm)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pr_info("Walking page table for vaddr=0x%lx\n", addr);
/* Level 1: PGD (Page Global Directory = PML4) */
pgd = pgd_offset(mm, addr);
if (pgd_none(*pgd) || pgd_bad(*pgd)) {
pr_info("PGD: not present\n");
return -EFAULT;
}
pr_info("PGD: val=0x%lx\n", pgd_val(*pgd));
/* Level 2: P4D */
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
pr_info("P4D: not present\n");
return -EFAULT;
}
/* Level 3: PUD */
pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
pr_info("PUD: not present\n");
return -EFAULT;
}
if (pud_large(*pud)) {
pr_info("PUD: 1GB huge page, phys=0x%llx\n",
(u64)(pud_pfn(*pud) << PAGE_SHIFT));
return0;
}
/* Level 4: PMD */
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
pr_info("PMD: not present\n");
return -EFAULT;
}
if (pmd_large(*pmd)) {
pr_info("PMD: 2MB huge page, phys=0x%llx\n",
(u64)(pmd_pfn(*pmd) << PAGE_SHIFT));
return0;
}
/* Level 5: PTE */
pte = pte_offset_map(pmd, addr);
if (!pte || pte_none(*pte)) {
pr_info("PTE: not present\n");
if (pte) pte_unmap(pte);
return -EFAULT;
}
pr_info("PTE: val=0x%lx\n", pte_val(*pte));
pr_info(" Physical: 0x%llx\n", (u64)(pte_pfn(*pte) << PAGE_SHIFT));
pr_info(" Present: %d\n", pte_present(*pte));
pr_info(" Write: %d\n", pte_write(*pte));
pr_info(" User: %d\n", pte_user(*pte));
pr_info(" Dirty: %d\n", pte_dirty(*pte));
pr_info(" Young: %d\n", pte_young(*pte));
pte_unmap(pte);
return0;
}
/*
 * Module init: walk the page-table entry mapping the start of the
 * stack of the current (insmod) process. Guards against a NULL mm
 * (kernel threads have no mm).
 */
static int __init pagetable_demo_init(void)
{
	struct mm_struct *mm = current->mm;
	unsigned long stack_addr;

	if (!mm)
		return -EINVAL;
	stack_addr = mm->start_stack;
	mmget(mm);
	mmap_read_lock(mm);
	walk_page_table(stack_addr, mm);
	mmap_read_unlock(mm);
	mmput(mm);
	return 0;
}
/* Nothing to clean up: mm references are dropped in init */
static void __exit pagetable_demo_exit(void) {}
module_init(pagetable_demo_init);
module_exit(pagetable_demo_exit);
六、内存回收机制
6.1 页面回收概述
内存压力触发回收:
kswapd(后台,水位线)
direct reclaim(直接回收,内存紧张时)
回收目标页面类型:
Clean file pages → 直接丢弃(可从磁盘重读)
Dirty file pages → 写回磁盘后丢弃
Anonymous pages → 写入swap分区/文件
Slab caches → 调用shrinker回调
LRU链表(两组):
active_anon inactive_anon 匿名页
active_file inactive_file 文件页
页面老化:
访问时放入active链表头
定期移动到inactive链表
inactive链表尾部的页被回收
# 查看内存水位线
cat /proc/zoneinfo | grep -A5 "Node 0"
# 查看内存压力统计
cat /proc/vmstat | grep -E "pgsteal|pgscand|pgactivate|pgdeactivate"
# 调整swappiness(0=不换出匿名页,100=积极换出)
cat /proc/sys/vm/swappiness
sudo sysctl -w vm.swappiness=10
# 触发手动内存回收(不建议生产使用)
# 1: PageCache, 2: dentries/inodes, 3: 两者都清
echo 1 | sudo tee /proc/sys/vm/drop_caches
# 查看swap使用
swapon --show
cat /proc/swaps
vmstat 1 5
6.2 OOM Killer
# 查看OOM分数
cat /proc/$(pgrep firefox)/oom_score
cat /proc/$(pgrep firefox)/oom_score_adj # -1000到1000
# 设置OOM调整值(oom_score_adj取值范围-1000到1000, -1000=永不杀死;旧接口oom_adj为-17到15,已废弃)
echo -1000 | sudo tee /proc/1/oom_score_adj # 保护init
# 查看OOM killer日志
dmesg | grep -E "Out of memory|oom_kill"
journalctl -k | grep OOM
// oom_demo.c - OOM相关内核操作
#include<linux/module.h>
#include<linux/oom.h>
#include<linux/sched.h>
MODULE_LICENSE("GPL");
/*
 * Module init: walk all processes under RCU and print the OOM badness
 * score of every mm-owning task whose score exceeds 100.
 *
 * NOTE(review): oom_badness() is an internal MM function and is not
 * exported to modules on all kernel versions — verify before building.
 */
static int __init oom_demo_init(void)
{
	struct task_struct *task;

	pr_info("=== OOM Score Demo ===\n");
	rcu_read_lock();
	for_each_process(task) {
		if (task->mm) { /* skip kernel threads (no mm) */
			long score = oom_badness(task, ULONG_MAX);

			if (score > 100) { /* only print high scorers */
				pr_info("PID=%d (%s): oom_score=%ld, adj=%d\n",
					task->pid, task->comm,
					score,
					task->signal->oom_score_adj);
			}
		}
	}
	rcu_read_unlock();
	return 0;
}
/* Nothing to clean up */
static void __exit oom_demo_exit(void) {}
module_init(oom_demo_init);
module_exit(oom_demo_exit);
七、NUMA架构内存管理
7.1 NUMA基础
/*
* NUMA (Non-Uniform Memory Access):
* - 多个内存节点(Node),每个与特定CPU组关联
* - 访问本地节点内存快,访问远程节点慢
* - 内核尽量在任务所在节点分配内存
*/
# 查看NUMA拓扑
numactl --hardware
# 示例输出:
# available: 2 nodes (0-1)
# node 0 cpus: 0 1 2 3
# node 0 size: 8192 MB
# node 1 cpus: 4 5 6 7
# node 1 size: 8192 MB
# node distances: 0 1
# 0: 10 20
# 1: 20 10
# NUMA内存统计
numastat
numastat -p $(pgrep java)
# 绑定进程到特定NUMA节点
numactl --cpunodebind=0 --membind=0 ./myprogram
# 查看NUMA迁移统计
cat /proc/vmstat | grep numa
// numa_demo.c - NUMA感知的内存分配
#include<linux/module.h>
#include<linux/mm.h>
#include<linux/numa.h>
#include<linux/topology.h>
#include<linux/nodemask.h>
MODULE_LICENSE("GPL");
/*
 * Module init: print basic NUMA topology, allocate (and free) one page
 * on every online node, and demonstrate a node-targeted kmalloc.
 */
static int __init numa_demo_init(void)
{
	int node;
	struct page *page;
	void *buf;

	pr_info("=== NUMA Info ===\n");
	pr_info("num_online_nodes: %d\n", num_online_nodes());
	pr_info("Current CPU node: %d\n", numa_node_id());
	/* Allocate a page on each online node */
	for_each_online_node(node) {
		pr_info("Node %d: present=%lu pages\n",
			node,
			node_present_pages(node));
		page = alloc_pages_node(node, GFP_KERNEL, 0);
		if (page) {
			pr_info("  Allocated page on node %d, PFN=%lu\n",
				node, page_to_pfn(page));
			__free_page(page);
		}
	}
	/* kmalloc_node: allocate from a specific NUMA node (node 0) */
	buf = kmalloc_node(4096, GFP_KERNEL, 0);
	if (buf) {
		pr_info("kmalloc_node(0): %p, node=%d\n",
			buf, page_to_nid(virt_to_page(buf)));
		kfree(buf);
	}
	return 0;
}
/* Nothing to clean up: all allocations are freed in init */
static void __exit numa_demo_exit(void) {}
module_init(numa_demo_init);
module_exit(numa_demo_exit);
八、大页(HugePage)
8.1 标准大页(Huge Pages)
# 查看大页配置
cat /proc/meminfo | grep -i huge
# HugePages_Total: 0 大页总数
# HugePages_Free: 0 空闲大页
# Hugepagesize: 2048 kB 大页大小(2MB on x86-64)
# 分配大页
echo 128 | sudo tee /proc/sys/vm/nr_hugepages
# 挂载hugetlbfs
sudo mkdir /mnt/hugepages
sudo mount -t hugetlbfs nodev /mnt/hugepages
# 查看1GB大页(需要CPU支持pdpe1gb)
grep pdpe1gb /proc/cpuinfo
echo 2 | sudo tee /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
// hugepage_demo.c - 用户空间大页使用
#include<stdio.h>
#include<stdlib.h>
#include<sys/mman.h>
#include<sys/stat.h>
#include<fcntl.h>
#include<unistd.h>
#include<string.h>
#include<errno.h>
#define HUGE_PAGE_SIZE (2 * 1024 * 1024) /* 2MB */
/*
 * Demonstrate two ways to obtain 2 MB huge pages from user space:
 *   1. mmap a file on a mounted hugetlbfs
 *   2. anonymous mmap with MAP_HUGETLB (needs a reserved hugepage pool)
 * Both methods fail gracefully when huge pages are unavailable.
 */
int main(void)
{
	void *ptr;
	int fd;

	/* Method 1: map a file on hugetlbfs */
	fd = open("/mnt/hugepages/test", O_CREAT | O_RDWR, 0755);
	if (fd < 0) {
		perror("open hugetlbfs");
		goto try_mmap;
	}
	/* Size the backing file to one huge page; bail out on failure */
	if (ftruncate(fd, HUGE_PAGE_SIZE) < 0) {
		perror("ftruncate");
		close(fd);
		goto try_mmap;
	}
	ptr = mmap(NULL, HUGE_PAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, 0);
	if (ptr != MAP_FAILED) {
		printf("Hugepage via hugetlbfs: %p\n", ptr);
		memset(ptr, 0, HUGE_PAGE_SIZE);
		munmap(ptr, HUGE_PAGE_SIZE);
	} else {
		perror("hugetlbfs mmap");
	}
	close(fd);
	unlink("/mnt/hugepages/test");
try_mmap:
	/* Method 2: anonymous MAP_HUGETLB (requires nr_hugepages > 0) */
	ptr = mmap(NULL, HUGE_PAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (ptr != MAP_FAILED) {
		printf("Hugepage via MAP_HUGETLB: %p\n", ptr);
		memset(ptr, 0xAB, HUGE_PAGE_SIZE);
		munmap(ptr, HUGE_PAGE_SIZE);
	} else {
		perror("MAP_HUGETLB mmap");
	}
	return 0;
}
8.2 透明大页(THP)
# 查看THP状态
cat /sys/kernel/mm/transparent_hugepage/enabled
# [always] madvise never
# 设置THP策略
echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
# 查看THP统计
cat /proc/vmstat | grep thp
# 在程序中使用madvise提示THP
# madvise(addr, length, MADV_HUGEPAGE); # 建议使用THP
# madvise(addr, length, MADV_NOHUGEPAGE); # 禁止使用THP
九、内存映射与VMA
9.1 虚拟内存区域(VMA)
// vma_demo.c - VMA操作
#include<linux/module.h>
#include<linux/mm.h>
#include<linux/sched.h>
#include<linux/sched/mm.h>
MODULE_LICENSE("GPL");
/*
 * Print every VMA of @task in /proc/pid/maps style: address range,
 * size, rwx flags, and backing file name (or "[anonymous]").
 * Takes and drops its own mm reference; no-op for tasks without an mm.
 */
static void dump_process_vmas(struct task_struct *task)
{
	struct mm_struct *mm;
	struct vm_area_struct *vma;

	mm = get_task_mm(task);
	if (!mm)
		return;
	pr_info("VMAs for PID=%d (%s):\n", task->pid, task->comm);
	mmap_read_lock(mm);
	/* Iterate all VMAs with the maple-tree VMA iterator */
	VMA_ITERATOR(vmi, mm, 0);
	for_each_vma(vmi, vma) {
		const char *name = vma->vm_file ?
			vma->vm_file->f_path.dentry->d_name.name : "[anonymous]";

		pr_info("  [0x%lx - 0x%lx] size=%lukB flags=%c%c%c %s\n",
			vma->vm_start,
			vma->vm_end,
			(vma->vm_end - vma->vm_start) / 1024,
			(vma->vm_flags & VM_READ) ? 'r' : '-',
			(vma->vm_flags & VM_WRITE) ? 'w' : '-',
			(vma->vm_flags & VM_EXEC) ? 'x' : '-',
			name);
	}
	mmap_read_unlock(mm);
	mmput(mm);
}
/* Module init: dump the VMAs of the current (insmod) process */
static int __init vma_demo_init(void)
{
	dump_process_vmas(current);
	return 0;
}
/* Nothing to clean up */
static void __exit vma_demo_exit(void) {}
module_init(vma_demo_init);
module_exit(vma_demo_exit);
十、内存压缩与KSM
10.1 内存压缩(zswap/zram)
# zswap:压缩swap缓存(拦截页面写入swap时先压缩)
echo 1 | sudo tee /sys/module/zswap/parameters/enabled
cat /sys/kernel/debug/zswap/pool_total_size
# zram:创建压缩内存块设备用作swap
sudo modprobe zram
echo lz4 | sudo tee /sys/block/zram0/comp_algorithm
echo 2G | sudo tee /sys/block/zram0/disksize
sudo mkswap /dev/zram0
sudo swapon -p 100 /dev/zram0
# 查看zram统计
cat /sys/block/zram0/mm_stat
10.2 KSM(内核同页合并)
# KSM将相同内容的页面合并为一个(节省内存)
# 典型场景:虚拟机中多个相同OS实例
# 启用KSM
echo 1 | sudo tee /sys/kernel/mm/ksm/run
# 查看KSM统计
cat /sys/kernel/mm/ksm/pages_shared # 共享的页数
cat /sys/kernel/mm/ksm/pages_sharing # 指向共享页的映射(页表项)数,反映节省的页数
cat /sys/kernel/mm/ksm/pages_unshared # 扫描但未共享的页数
# 进程中标记可合并区域
# madvise(addr, len, MADV_MERGEABLE);
# madvise(addr, len, MADV_UNMERGEABLE);
实践检查清单
伙伴系统
- [ ] 能用
alloc_pages() 分配不同order的页 - [ ] 读懂
/proc/buddyinfo 输出
SLUB分配器
- [ ] 理解
kmalloc 的内部大小类(size classes) - [ ] 理解SLAB_POISON和SLAB_RED_ZONE的作用
页表
内存回收
- [ ] 理解kswapd的触发条件(min/low/high水位线)
- [ ] 能通过
/proc/vmstat监控内存压力指标
高级特性