从内核可编程到极致性能 - 探索 eBPF 和 XDP 的强大能力
系列:Linux 网络子系统源码剖析 | 篇号:第 14 篇 | 内核版本:Linux 5.10 LTS | 重点模块:eBPF、XDP、BPF 虚拟机、网络加速
约 100-120 分钟
eBPF(extended Berkeley Packet Filter)是 Linux 内核中的一个革命性技术,允许在内核空间运行沙箱程序,无需修改内核代码或加载内核模块。
eBPF 特点:
eBPF 架构:

XDP(eXpress Data Path)是 eBPF 在网络数据包处理中的应用,在网卡驱动层提供可编程的数据包处理。
XDP 特点:
XDP 处理位置:

性能对比:
方案          PPS (百万)     CPU 使用率   延迟
--------------------------------------------------------
传统内核栈    1-2 Mpps       100%         100 μs
iptables      0.5-1 Mpps     100%         150 μs
DPDK          10-20 Mpps     100%         10 μs
XDP           10-24 Mpps     50-70%       10-20 μs

XDP 优势:
- 性能接近 DPDK
- 无需绕过内核
- 可以与内核栈协同工作
- 开发和部署更简单

架构对比:

DDoS 防护:
负载均衡:
防火墙:
监控和采样:
eBPF 使用 64 位 RISC 指令集。
寄存器:
/* eBPF 寄存器 */
R0:     返回值寄存器
R1-R5:  函数参数寄存器
R6-R9:  被调用者保存寄存器
R10:    只读栈帧指针

/* 寄存器大小 */
64 位寄存器,支持 32 位操作

指令格式:
structbpf_insn { __u8 code; /* 操作码 */ __u8 dst_reg:4; /* 目标寄存器 */ __u8 src_reg:4; /* 源寄存器 */ __s16 off; /* 偏移 */ __s32 imm; /* 立即数 */};基本指令:
/* 算术指令 */
BPF_ADD | BPF_X | BPF_ALU64    // R0 = R0 + R1
BPF_SUB | BPF_K | BPF_ALU64    // R0 = R0 - imm
BPF_MUL | BPF_X | BPF_ALU64    // R0 = R0 * R1
BPF_DIV | BPF_X | BPF_ALU64    // R0 = R0 / R1

/* 位操作 */
BPF_AND | BPF_X | BPF_ALU64    // R0 = R0 & R1
BPF_OR  | BPF_X | BPF_ALU64    // R0 = R0 | R1
BPF_XOR | BPF_X | BPF_ALU64    // R0 = R0 ^ R1
BPF_LSH | BPF_K | BPF_ALU64    // R0 = R0 << imm

/* 内存访问 */
BPF_LDX | BPF_MEM | BPF_W      // R0 = *(u32 *)(R1 + off)
BPF_STX | BPF_MEM | BPF_W      // *(u32 *)(R1 + off) = R0

/* 跳转指令 */
BPF_JEQ | BPF_X | BPF_JMP      // if (R0 == R1) goto +off
BPF_JNE | BPF_X | BPF_JMP      // if (R0 != R1) goto +off
BPF_JGT | BPF_X | BPF_JMP      // if (R0 > R1) goto +off

/* 函数调用 */
BPF_JMP | BPF_CALL             // call helper function
BPF_JMP | BPF_EXIT             // return

验证器确保 eBPF 程序的安全性。
源码位置:kernel/bpf/verifier.c
/** * bpf_check - 验证 eBPF 程序 * @prog: eBPF 程序 * @attr: 程序属性 */intbpf_check(struct bpf_prog **prog, union bpf_attr *attr,union bpf_attr __user *uattr){structbpf_verifier_env *env;structbpf_verifier_log *log;int ret = -EINVAL;/* ========== 1. 创建验证器环境 ========== */ env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);if (!env)return -ENOMEM; env->prog = *prog;/* ========== 2. 初始化日志 ========== */log = &env->log;log->level = attr->log_level;log->ubuf = (char __user *)attr->log_buf;log->len_total = attr->log_size;/* ========== 3. 检查程序大小 ========== */if ((*prog)->len > BPF_MAXINSNS) { verbose(env, "program too large (%u insns)\n", (*prog)->len);goto err_free; }/* ========== 4. 检查指令合法性 ========== */ ret = check_cfg(env);if (ret < 0)goto err_free;/* ========== 5. 数据流分析 ========== */ ret = do_check(env);if (ret < 0)goto err_free;/* ========== 6. 检查内存访问 ========== */ ret = check_mem_access(env);if (ret < 0)goto err_free;/* ========== 7. 优化程序 ========== */ ret = opt_remove_dead_code(env);if (ret < 0)goto err_free;/* ========== 8. 
转换程序 ========== */ ret = convert_ctx_accesses(env);if (ret < 0)goto err_free;return0;err_free: kfree(env);return ret;}/** * do_check - 执行数据流分析 * @env: 验证器环境 */staticintdo_check(struct bpf_verifier_env *env){structbpf_verifier_state *state = env->cur_state;structbpf_insn *insns = env->prog->insnsi;int insn_cnt = env->prog->len;int insn_idx, prev_insn_idx = 0;int ret;/* ========== 遍历所有指令 ========== */for (insn_idx = 0; insn_idx < insn_cnt; insn_idx++) {structbpf_insn *insn = &insns[insn_idx]; u8 class = BPF_CLASS(insn->code);/* ========== 检查指令类型 ========== */if (class == BPF_ALU || class == BPF_ALU64) {/* 算术指令 */ ret = check_alu_op(env, insn); } elseif (class == BPF_LDX) {/* 加载指令 */ ret = check_mem_access(env, insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, false); } elseif (class == BPF_STX) {/* 存储指令 */ ret = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, false); } elseif (class == BPF_JMP || class == BPF_JMP32) {/* 跳转指令 */ ret = check_cond_jmp_op(env, insn, &insn_idx); }if (ret < 0)return ret; prev_insn_idx = insn_idx; }return0;}验证规则:
/* 1. 禁止无限循环 */
- 必须有明确的退出条件
- 循环次数有上限

/* 2. 内存访问检查 */
- 所有指针必须先检查再使用
- 不能访问未初始化的内存
- 不能越界访问

/* 3. 寄存器状态跟踪 */
- 跟踪每个寄存器的类型和值范围
- 确保类型安全

/* 4. 指令数量限制 */
- 非特权程序最多 4096 条指令(BPF_MAXINSNS);自 5.2 起,特权程序的上限为验证器最多处理 100 万条指令
- 防止过于复杂的程序

/* 5. 栈大小限制 */
- 最多 512 字节栈空间

JIT 将 eBPF 字节码编译为本地机器码。
源码位置:arch/x86/net/bpf_jit_comp.c
/** * bpf_int_jit_compile - JIT 编译 eBPF 程序 * @prog: eBPF 程序 */struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog){structbpf_binary_header *header =NULL;structbpf_prog *tmp, *orig_prog = prog;structjit_contextctx = {}; u8 *image = NULL;int *addrs;int pass;/* ========== 1. 分配地址数组 ========== */ addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);if (!addrs)return orig_prog;/* ========== 2. 多遍编译 ========== */for (pass = 0; pass < 10; pass++) {/* 第一遍:计算大小 */if (pass == 0) { ctx.cleanup_addr = 0; }/* 编译所有指令 */ proglen = do_jit(prog, addrs, image, &ctx);if (proglen <= 0) { image = NULL;goto out_addrs; }/* 第二遍:分配内存并生成代码 */if (pass == 1) { header = bpf_jit_binary_alloc(proglen, &image,1, jit_fill_hole);if (!header) { prog = orig_prog;goto out_addrs; } }/* 检查是否收敛 */if (proglen == old_proglen)break; old_proglen = proglen; }/* ========== 3. 设置程序 ========== */if (image) { bpf_jit_binary_lock_ro(header); prog->bpf_func = (void *)image; prog->jited = 1; prog->jited_len = proglen; }out_addrs: kfree(addrs);return prog;}/** * do_jit - 执行 JIT 编译 * @prog: eBPF 程序 * @addrs: 指令地址数组 * @image: 目标代码缓冲区 * @ctx: JIT 上下文 */staticintdo_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,struct jit_context *ctx){structbpf_insn *insn = bpf_prog->insnsi;int insn_cnt = bpf_prog->len; u8 *prog = image;int i;/* ========== 遍历所有 eBPF 指令 ========== */for (i = 0; i < insn_cnt; i++, insn++) {const s32 imm32 = insn->imm; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; u8 b1 = 0, b2 = 0, b3 = 0; s64 jmp_offset; u8 *func;/* 记录指令地址 */ addrs[i] = prog - image;/* ========== 根据指令类型生成机器码 ========== */switch (insn->code) {/* ALU 操作 */case BPF_ALU64 | BPF_ADD | BPF_X:/* add dst, src */ EMIT3(0x48, 0x01, add_2reg(0xC0, src_reg, dst_reg));break;case BPF_ALU64 | BPF_SUB | BPF_X:/* sub dst, src */ EMIT3(0x48, 0x29, add_2reg(0xC0, src_reg, dst_reg));break;/* 内存访问 */case BPF_LDX | BPF_MEM | BPF_W:/* mov dst, dword ptr [src + off] */ EMIT3(0x8B, add_2reg(0x40, src_reg, dst_reg), 
insn->off);break;/* 跳转 */case BPF_JMP | BPF_JEQ | BPF_X:/* cmp dst, src */ EMIT3(0x48, 0x39, add_2reg(0xC0, src_reg, dst_reg));/* je offset */ jmp_offset = addrs[i + insn->off + 1] - (addrs[i] + 6); EMIT2_off32(0x0F, 0x84, jmp_offset);break;/* 函数调用 */case BPF_JMP | BPF_CALL: func = (u8 *)__bpf_call_base + imm32; jmp_offset = func - (image + addrs[i] + 5); EMIT1_off32(0xE8, jmp_offset);break;/* 返回 */case BPF_JMP | BPF_EXIT:/* ret */ EMIT1(0xC3);break; } }return prog - image;}Helper 函数提供内核功能给 eBPF 程序。
常用 Helper 函数:
/* 映射操作 */void *bpf_map_lookup_elem(struct bpf_map *map, constvoid *key);intbpf_map_update_elem(struct bpf_map *map, constvoid *key,constvoid *value, u64 flags);intbpf_map_delete_elem(struct bpf_map *map, constvoid *key);/* 数据包操作 */intbpf_skb_load_bytes(conststruct sk_buff *skb, u32 offset,void *to, u32 len);intbpf_skb_store_bytes(struct sk_buff *skb, u32 offset,constvoid *from, u32 len, u64 flags);intbpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags);/* 时间和随机数 */u64 bpf_ktime_get_ns(void);u32 bpf_get_prandom_u32(void);/* 调试 */intbpf_trace_printk(constchar *fmt, u32 fmt_size, ...);/* 校验和 */__wsum bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed);Helper 函数实现示例:
/** * bpf_map_lookup_elem - 查找映射元素 * @map: BPF 映射 * @key: 键 */BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key){void *value;/* ========== 调用映射的查找函数 ========== */ value = map->ops->map_lookup_elem(map, key);return (unsignedlong)value;}conststructbpf_func_protobpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY,};XDP 钩子点:

xdp_buff:
源码位置:include/net/xdp.h
/** * struct xdp_buff - XDP 数据包缓冲区 */structxdp_buff {void *data; /* 数据包起始位置 */void *data_end; /* 数据包结束位置 */void *data_meta; /* 元数据起始位置 */void *data_hard_start; /* 缓冲区起始位置 */structxdp_rxq_info *rxq;/* 接收队列信息 */structxdp_txq_info *txq;/* 发送队列信息 */ u32 frame_sz; /* 帧大小 */};/** * struct xdp_md - XDP 元数据(用户空间视图) */structxdp_md { __u32 data; /* 数据包起始 */ __u32 data_end; /* 数据包结束 */ __u32 data_meta; /* 元数据起始 */ __u32 ingress_ifindex; /* 入接口索引 */ __u32 rx_queue_index; /* RX 队列索引 */ __u32 egress_ifindex; /* 出接口索引 */};XDP 返回值:
/* XDP action codes */
enum xdp_action {
	XDP_ABORTED = 0,	/* abnormal termination, packet dropped */
	XDP_DROP,		/* drop the packet */
	XDP_PASS,		/* hand the packet to the network stack */
	XDP_TX,			/* send back out of the same NIC */
	XDP_REDIRECT,		/* redirect to another NIC */
};

动作处理:
源码位置:net/core/filter.c
/** * xdp_do_redirect - 执行 XDP 重定向 * @dev: 网络设备 * @xdp: XDP 缓冲区 * @prog: XDP 程序 */intxdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,struct bpf_prog *prog){structbpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);structnet_device *fwd; u32 index = ri->tgt_index;int err;/* ========== 1. 获取目标设备 ========== */ fwd = dev_get_by_index_rcu(dev_net(dev), index);if (unlikely(!fwd)) { err = -EINVAL;goto err; }/* ========== 2. 检查设备状态 ========== */if (unlikely(!(fwd->flags & IFF_UP))) { err = -ENETDOWN;goto err; }/* ========== 3. 执行重定向 ========== */ err = __bpf_tx_xdp(fwd, NULL, xdp, 0);if (err)goto err;return0;err: trace_xdp_exception(dev, prog, XDP_REDIRECT);return err;}驱动中的 XDP 支持:
源码位置:drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
/** * stmmac_xdp_setup - 设置 XDP 程序 * @dev: 网络设备 * @prog: XDP 程序 */staticintstmmac_xdp_setup(struct net_device *dev, struct bpf_prog *prog){structstmmac_priv *priv = netdev_priv(dev);structbpf_prog *old_prog;bool need_update;/* ========== 1. 检查是否需要更新 ========== */ old_prog = xchg(&priv->xdp_prog, prog); need_update = !!prog != !!old_prog;/* ========== 2. 如果设备正在运行,需要重新配置 ========== */if (netif_running(dev) && need_update) {/* 停止设备 */ stmmac_release(dev);/* 重新启动设备 */ stmmac_open(dev); }/* ========== 3. 释放旧程序 ========== */if (old_prog) bpf_prog_put(old_prog);return0;}/** * stmmac_run_xdp - 运行 XDP 程序 * @priv: 驱动私有数据 * @xdp: XDP 缓冲区 */staticintstmmac_run_xdp(struct stmmac_priv *priv, struct xdp_buff *xdp){structbpf_prog *xdp_prog;int result;/* ========== 1. 获取 XDP 程序 ========== */ rcu_read_lock(); xdp_prog = READ_ONCE(priv->xdp_prog);if (!xdp_prog) { result = XDP_PASS;goto out; }/* ========== 2. 执行 XDP 程序 ========== */ result = bpf_prog_run_xdp(xdp_prog, xdp);/* ========== 3. 处理返回值 ========== */switch (result) {case XDP_PASS:break;case XDP_TX: result = stmmac_xdp_xmit_back(priv, xdp);break;case XDP_REDIRECT:if (xdp_do_redirect(priv->dev, xdp, xdp_prog) < 0) result = XDP_ABORTED;break;default: bpf_warn_invalid_xdp_action(result);/* fall through */case XDP_ABORTED: trace_xdp_exception(priv->dev, xdp_prog, result);/* fall through */case XDP_DROP:break; }out: rcu_read_unlock();return result;}/** * stmmac_rx_refill - 接收路径(集成 XDP) * @priv: 驱动私有数据 * @queue: 队列索引 */staticintstmmac_rx_refill(struct stmmac_priv *priv, u32 queue){structstmmac_rx_queue *rx_q = &priv->rx_queue[queue];int dirty = stmmac_rx_dirty(priv, queue);unsignedint entry = rx_q->dirty_rx;while (dirty-- > 0) {structdma_desc *p;structsk_buff *skb;dma_addr_t dma_addr;/* ========== 1. 获取描述符 ========== */ entry = STMMAC_GET_ENTRY(entry, DMA_RX_SIZE); p = rx_q->dma_rx + entry;/* ========== 2. 
分配缓冲区 ========== */if (priv->xdp_prog) {/* XDP 模式:分配页面 */structpage *page; page = dev_alloc_page();if (!page)break; rx_q->rx_skbuff[entry] = NULL; rx_q->rx_page[entry] = page; dma_addr = dma_map_page(priv->device, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); } else {/* 普通模式:分配 sk_buff */ skb = __netdev_alloc_skb_ip_align(priv->dev, priv->dma_buf_sz, GFP_ATOMIC);if (!skb)break; rx_q->rx_skbuff[entry] = skb; dma_addr = dma_map_single(priv->device, skb->data, priv->dma_buf_sz, DMA_FROM_DEVICE); }/* ========== 3. 设置描述符 ========== */ stmmac_set_desc_addr(priv, p, dma_addr); stmmac_set_rx_owner(priv, p, priv->use_rxtoe); entry = STMMAC_GET_ENTRY(entry + 1, DMA_RX_SIZE); } rx_q->dirty_rx = entry;return0;}三种 XDP 模式:
1. Native XDP(原生模式)
   - 驱动直接支持
   - 性能最高
   - 需要驱动修改

2. Offloaded XDP(卸载模式)
   - 网卡硬件执行
   - 性能最高
   - 需要硬件支持

3. Generic XDP(通用模式)
   - 内核通用实现
   - 所有网卡都支持
   - 性能较低

Generic XDP 实现:
源码位置:net/core/dev.c
/** * do_xdp_generic - 通用 XDP 处理 * @skb: sk_buff * @xdp_prog: XDP 程序 */static u32 do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb){structxdp_buffxdp; u32 act = XDP_DROP;/* ========== 1. 准备 XDP 缓冲区 ========== */ xdp.data_hard_start = skb->data - skb_headroom(skb); xdp.data = skb->data; xdp.data_end = skb->data + skb_headlen(skb); xdp.data_meta = xdp.data; xdp.rxq = &skb->dev->xdp_rxq;/* ========== 2. 执行 XDP 程序 ========== */ act = bpf_prog_run_xdp(xdp_prog, &xdp);/* ========== 3. 处理数据包修改 ========== */switch (act) {case XDP_REDIRECT:case XDP_TX:case XDP_PASS:/* 调整 skb 指针 */ __skb_pull(skb, xdp.data - xdp.data_hard_start); skb->len = xdp.data_end - xdp.data;break; }return act;}安装依赖:
# Ubuntu/Debian
apt-get install -y \
    clang llvm \
    libbpf-dev \
    linux-headers-$(uname -r) \
    linux-tools-$(uname -r)

# CentOS/RHEL
yum install -y \
    clang llvm \
    libbpf-devel \
    kernel-devel \
    kernel-headers

项目结构:
xdp-project/
├── src/
│   ├── xdp_prog.c       # XDP 程序
│   └── xdp_loader.c     # 加载器
├── include/
│   └── bpf_helpers.h    # Helper 函数声明
├── Makefile
└── README.md

示例 1:丢弃所有 ICMP 包:
/* xdp_drop_icmp.c */#include<linux/bpf.h>#include<linux/if_ether.h>#include<linux/ip.h>#include<linux/icmp.h>#include<bpf/bpf_helpers.h>SEC("xdp")intxdp_drop_icmp_prog(struct xdp_md *ctx){/* ========== 1. 获取数据包指针 ========== */void *data_end = (void *)(long)ctx->data_end;void *data = (void *)(long)ctx->data;/* ========== 2. 解析以太网头 ========== */structethhdr *eth = data;/* 边界检查 */if ((void *)(eth + 1) > data_end)return XDP_PASS;/* 检查是否是 IP 包 */if (eth->h_proto != __constant_htons(ETH_P_IP))return XDP_PASS;/* ========== 3. 解析 IP 头 ========== */structiphdr *iph = (void *)(eth + 1);/* 边界检查 */if ((void *)(iph + 1) > data_end)return XDP_PASS;/* ========== 4. 检查是否是 ICMP ========== */if (iph->protocol == IPPROTO_ICMP) {/* 丢弃 ICMP 包 */return XDP_DROP; }/* 其他包放行 */return XDP_PASS;}char _license[] SEC("license") = "GPL";编译:
# Compile to eBPF bytecode
clang -O2 -target bpf -c xdp_drop_icmp.c -o xdp_drop_icmp.o

# Inspect the generated eBPF instructions
llvm-objdump -S xdp_drop_icmp.o

示例 2:IP 黑名单过滤:
/* xdp_blacklist.c */#include<linux/bpf.h>#include<linux/if_ether.h>#include<linux/ip.h>#include<bpf/bpf_helpers.h>/* ========== 定义 BPF Map ========== */struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 10000); __type(key, __u32); /* IP 地址 */ __type(value, __u64); /* 丢弃计数 */} blacklist SEC(".maps");SEC("xdp")intxdp_blacklist_prog(struct xdp_md *ctx){void *data_end = (void *)(long)ctx->data_end;void *data = (void *)(long)ctx->data;/* ========== 1. 解析以太网头 ========== */structethhdr *eth = data;if ((void *)(eth + 1) > data_end)return XDP_PASS;if (eth->h_proto != __constant_htons(ETH_P_IP))return XDP_PASS;/* ========== 2. 解析 IP 头 ========== */structiphdr *iph = (void *)(eth + 1);if ((void *)(iph + 1) > data_end)return XDP_PASS;/* ========== 3. 查找黑名单 ========== */ __u32 src_ip = iph->saddr; __u64 *count = bpf_map_lookup_elem(&blacklist, &src_ip);if (count) {/* IP 在黑名单中,丢弃并更新计数 */ __sync_fetch_and_add(count, 1);return XDP_DROP; }/* IP 不在黑名单中,放行 */return XDP_PASS;}char _license[] SEC("license") = "GPL";用户空间加载器:
/* xdp_loader.c */#include<stdio.h>#include<stdlib.h>#include<string.h>#include<errno.h>#include<unistd.h>#include<net/if.h>#include<linux/if_link.h>#include<bpf/libbpf.h>#include<bpf/bpf.h>intmain(int argc, char **argv){structbpf_object *obj;structbpf_program *prog;int prog_fd, map_fd;int ifindex;int err;if (argc != 3) {fprintf(stderr, "Usage: %s <ifname> <prog.o>\n", argv[0]);return1; }/* ========== 1. 获取网卡索引 ========== */ ifindex = if_nametoindex(argv[1]);if (!ifindex) {fprintf(stderr, "Invalid interface: %s\n", argv[1]);return1; }/* ========== 2. 加载 BPF 对象 ========== */ obj = bpf_object__open_file(argv[2], NULL);if (libbpf_get_error(obj)) {fprintf(stderr, "Failed to open BPF object\n");return1; }/* ========== 3. 加载到内核 ========== */ err = bpf_object__load(obj);if (err) {fprintf(stderr, "Failed to load BPF object: %d\n", err);goto cleanup; }/* ========== 4. 获取程序 FD ========== */ prog = bpf_object__find_program_by_name(obj, "xdp_blacklist_prog");if (!prog) {fprintf(stderr, "Failed to find XDP program\n");goto cleanup; } prog_fd = bpf_program__fd(prog);if (prog_fd < 0) {fprintf(stderr, "Failed to get program FD\n");goto cleanup; }/* ========== 5. 附加到网卡 ========== */ err = bpf_set_link_xdp_fd(ifindex, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST);if (err) {fprintf(stderr, "Failed to attach XDP program: %d\n", err);goto cleanup; }printf("XDP program attached to %s\n", argv[1]);/* ========== 6. 获取 Map FD ========== */ map_fd = bpf_object__find_map_fd_by_name(obj, "blacklist");if (map_fd < 0) {fprintf(stderr, "Failed to find blacklist map\n");goto cleanup; }/* ========== 7. 添加黑名单 IP ========== */ __u32 ip = 0x0100007f; /* 127.0.0.1 */ __u64 count = 0; err = bpf_map_update_elem(map_fd, &ip, &count, BPF_ANY);if (err) {fprintf(stderr, "Failed to update map: %d\n", err);goto cleanup; }printf("Added 127.0.0.1 to blacklist\n");/* ========== 8. 保持运行 ========== */printf("Press Ctrl+C to exit\n"); pause();cleanup: bpf_object__close(obj);return err;}编译和运行:
# Compile the XDP program (-g emits the BTF that libbpf needs for
# maps declared in the ".maps" section)
clang -O2 -g -target bpf -c xdp_blacklist.c -o xdp_blacklist.o

# Compile the loader
gcc -o xdp_loader xdp_loader.c -lbpf

# Run (requires root)
sudo ./xdp_loader eth0 xdp_blacklist.o

# Test. Note: traffic to 127.0.0.1 goes through "lo", not eth0, so to see
# drops either attach to lo or blacklist the address of a remote peer.
ping 127.0.0.1

# Detach
sudo ip link set dev eth0 xdp off

示例 3:修改 TTL:
/* xdp_modify_ttl.c */#include<linux/bpf.h>#include<linux/if_ether.h>#include<linux/ip.h>#include<bpf/bpf_helpers.h>#include<bpf/bpf_endian.h>/* 重新计算 IP 校验和 */static __always_inline __u16 csum_fold_helper(__u32 csum){ __u32 sum; sum = (csum >> 16) + (csum & 0xffff); sum += (sum >> 16);return ~sum;}static __always_inline voidipv4_csum(struct iphdr *iph){ iph->check = 0; __u32 csum = bpf_csum_diff(0, 0, (__be32 *)iph, sizeof(*iph), 0); iph->check = csum_fold_helper(csum);}SEC("xdp")intxdp_modify_ttl_prog(struct xdp_md *ctx){void *data_end = (void *)(long)ctx->data_end;void *data = (void *)(long)ctx->data;/* ========== 1. 解析以太网头 ========== */structethhdr *eth = data;if ((void *)(eth + 1) > data_end)return XDP_PASS;if (eth->h_proto != bpf_htons(ETH_P_IP))return XDP_PASS;/* ========== 2. 解析 IP 头 ========== */structiphdr *iph = (void *)(eth + 1);if ((void *)(iph + 1) > data_end)return XDP_PASS;/* ========== 3. 修改 TTL ========== */if (iph->ttl > 1) { iph->ttl--;/* 重新计算校验和 */ ipv4_csum(iph); }return XDP_PASS;}char _license[] SEC("license") = "GPL";示例 4:负载均衡:
/* xdp_lb.c */#include<linux/bpf.h>#include<linux/if_ether.h>#include<linux/ip.h>#include<linux/tcp.h>#include<bpf/bpf_helpers.h>#include<bpf/bpf_endian.h>/* 后端服务器列表 */struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 4); __type(key, __u32); __type(value, __u32); /* 后端 ifindex */} backends SEC(".maps");/* 统计信息 */struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __uint(max_entries, 4); __type(key, __u32); __type(value, __u64); /* 包计数 */} stats SEC(".maps");SEC("xdp")intxdp_lb_prog(struct xdp_md *ctx){void *data_end = (void *)(long)ctx->data_end;void *data = (void *)(long)ctx->data; __u32 key, *ifindex; __u64 *count;/* ========== 1. 解析以太网头 ========== */structethhdr *eth = data;if ((void *)(eth + 1) > data_end)return XDP_PASS;if (eth->h_proto != bpf_htons(ETH_P_IP))return XDP_PASS;/* ========== 2. 解析 IP 头 ========== */structiphdr *iph = (void *)(eth + 1);if ((void *)(iph + 1) > data_end)return XDP_PASS;/* ========== 3. 计算哈希选择后端 ========== */ __u32 hash = iph->saddr ^ iph->daddr;if (iph->protocol == IPPROTO_TCP) {structtcphdr *tcph = (void *)(iph + 1);if ((void *)(tcph + 1) > data_end)return XDP_PASS; hash ^= (tcph->source << 16) | tcph->dest; } key = hash % 4; /* 4 个后端 *//* ========== 4. 查找后端 ========== */ ifindex = bpf_map_lookup_elem(&backends, &key);if (!ifindex)return XDP_PASS;/* ========== 5. 更新统计 ========== */ count = bpf_map_lookup_elem(&stats, &key);if (count) __sync_fetch_and_add(count, 1);/* ========== 6. 重定向到后端 ========== */return bpf_redirect(*ifindex, 0);}char _license[] SEC("license") = "GPL";TC(Traffic Control)eBPF 在流量控制层提供可编程能力。
TC eBPF 示例:
/* tc_rate_limit.c - 速率限制 */#include<linux/bpf.h>#include<linux/pkt_cls.h>#include<bpf/bpf_helpers.h>/* 令牌桶 */struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 1024); __type(key, __u32); /* 流 ID */ __type(value, __u64); /* 令牌数 */} token_bucket SEC(".maps");SEC("tc")inttc_rate_limit_prog(struct __sk_buff *skb){ __u32 flow_id = skb->hash; __u64 *tokens, now;/* 查找令牌桶 */ tokens = bpf_map_lookup_elem(&token_bucket, &flow_id);if (!tokens) {/* 新流,初始化令牌桶 */ __u64 init_tokens = 1000; bpf_map_update_elem(&token_bucket, &flow_id, &init_tokens, BPF_ANY);return TC_ACT_OK; }/* 检查令牌 */if (*tokens > 0) { __sync_fetch_and_sub(tokens, 1);return TC_ACT_OK; /* 放行 */ }return TC_ACT_SHOT; /* 丢弃 */}char _license[] SEC("license") = "GPL";加载 TC eBPF:
# Compile (-g emits the BTF needed for maps declared in ".maps")
clang -O2 -g -target bpf -c tc_rate_limit.c -o tc_rate_limit.o

# Create the clsact qdisc
tc qdisc add dev eth0 clsact

# Attach the eBPF program to ingress
tc filter add dev eth0 ingress bpf da obj tc_rate_limit.o sec tc

# Inspect
tc filter show dev eth0 ingress

# Remove
tc filter del dev eth0 ingress
tc qdisc del dev eth0 clsact

Socket 层的 eBPF 过滤器。
/* socket_filter.c */#include<linux/bpf.h>#include<linux/if_ether.h>#include<linux/ip.h>#include<bpf/bpf_helpers.h>SEC("socket")intsocket_filter_prog(struct __sk_buff *skb){void *data_end = (void *)(long)skb->data_end;void *data = (void *)(long)skb->data;structethhdr *eth = data;if ((void *)(eth + 1) > data_end)return0; /* 丢弃 *//* 只接受 IP 包 */if (eth->h_proto == __constant_htons(ETH_P_IP))return-1; /* 接受 */return0; /* 丢弃 */}char _license[] SEC("license") = "GPL";测试环境:
测试结果:
方案            PPS (Mpps)   CPU 使用率   延迟 (μs)
------------------------------------------------------------
传统内核栈      1.5          100%         120
iptables        0.8          100%         180
TC + eBPF       8.0          60%          25
XDP (Generic)   3.0          80%          50
XDP (Native)    14.0         45%          15
XDP (Offload)   24.0         10%          8
DPDK            20.0         100%         10

结论:
- XDP Native 性能接近 DPDK
- XDP 的 CPU 效率更高
- XDP 可以与内核栈协同工作

1. 使用 Per-CPU Maps:
/* Avoid lock contention by using a per-CPU array map */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} stats SEC(".maps");

2. 内联函数:
/* 使用 __always_inline 减少函数调用开销 */static __always_inline intparse_ipv4(void *data, void *data_end){/* ... */}3. 批量处理:
/* Use XDP_REDIRECT through a map for batched redirection */
bpf_redirect_map(&tx_port, 0, 0);

4. 避免复杂计算:
/* Replace computation with a table lookup */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 256);
	__type(key, __u32);
	__type(value, __u32);
} lookup_table SEC(".maps");

场景:防御 SYN Flood 攻击。
/* xdp_ddos_protection.c */#include<linux/bpf.h>#include<linux/if_ether.h>#include<linux/ip.h>#include<linux/tcp.h>#include<bpf/bpf_helpers.h>/* SYN 速率限制 */struct { __uint(type, BPF_MAP_TYPE_LRU_HASH); __uint(max_entries, 100000); __type(key, __u32); /* 源 IP */ __type(value, __u64); /* 时间戳 */} syn_tracker SEC(".maps");/* 黑名单 */struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 10000); __type(key, __u32); __type(value, __u8);} blacklist SEC(".maps");#define SYN_THRESHOLD 100 /* 每秒 SYN 数量阈值 */#define TIME_WINDOW 1000000000ULL /* 1 秒(纳秒)*/SEC("xdp")intxdp_ddos_protection_prog(struct xdp_md *ctx){void *data_end = (void *)(long)ctx->data_end;void *data = (void *)(long)ctx->data;/* 解析以太网头 */structethhdr *eth = data;if ((void *)(eth + 1) > data_end)return XDP_PASS;if (eth->h_proto != bpf_htons(ETH_P_IP))return XDP_PASS;/* 解析 IP 头 */structiphdr *iph = (void *)(eth + 1);if ((void *)(iph + 1) > data_end)return XDP_PASS; __u32 src_ip = iph->saddr;/* 检查黑名单 */if (bpf_map_lookup_elem(&blacklist, &src_ip))return XDP_DROP;/* 只处理 TCP */if (iph->protocol != IPPROTO_TCP)return XDP_PASS;/* 解析 TCP 头 */structtcphdr *tcph = (void *)(iph + 1);if ((void *)(tcph + 1) > data_end)return XDP_PASS;/* 检查是否是 SYN 包 */if (tcph->syn && !tcph->ack) { __u64 now = bpf_ktime_get_ns(); __u64 *last_time = bpf_map_lookup_elem(&syn_tracker, &src_ip);if (last_time) {/* 检查速率 */if (now - *last_time < TIME_WINDOW / SYN_THRESHOLD) {/* 超过阈值,加入黑名单 */ __u8 val = 1; bpf_map_update_elem(&blacklist, &src_ip, &val, BPF_ANY);return XDP_DROP; } }/* 更新时间戳 */ bpf_map_update_elem(&syn_tracker, &src_ip, &now, BPF_ANY); }return XDP_PASS;}char _license[] SEC("license") = "GPL";完整的负载均衡实现:
/* xdp_lb_full.c */#include<linux/bpf.h>#include<linux/if_ether.h>#include<linux/ip.h>#include<linux/tcp.h>#include<linux/udp.h>#include<bpf/bpf_helpers.h>/* 后端服务器 */structbackend { __be32 ip; __be16 port; __u8 mac[ETH_ALEN]; __u32 weight;};/* 后端列表 */struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 16); __type(key, __u32); __type(value, struct backend);} backends SEC(".maps");/* 连接跟踪 */structconn_key { __be32 src_ip; __be32 dst_ip; __be16 src_port; __be16 dst_port; __u8 proto;};struct { __uint(type, BPF_MAP_TYPE_LRU_HASH); __uint(max_entries, 100000); __type(key, struct conn_key); __type(value, __u32); /* 后端索引 */} conn_track SEC(".maps");/* 统计信息 */struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __uint(max_entries, 16); __type(key, __u32); __type(value, __u64);} backend_stats SEC(".maps");static __always_inline __u32 hash_conn(struct conn_key *key){return key->src_ip ^ key->dst_ip ^ ((__u32)key->src_port << 16 | key->dst_port);}SEC("xdp")intxdp_lb_full_prog(struct xdp_md *ctx){void *data_end = (void *)(long)ctx->data_end;void *data = (void *)(long)ctx->data;structconn_keykey = {}; __u32 *backend_idx;structbackend *backend; __u64 *stats;/* 解析以太网头 */structethhdr *eth = data;if ((void *)(eth + 1) > data_end)return XDP_PASS;if (eth->h_proto != bpf_htons(ETH_P_IP))return XDP_PASS;/* 解析 IP 头 */structiphdr *iph = (void *)(eth + 1);if ((void *)(iph + 1) > data_end)return XDP_PASS; key.src_ip = iph->saddr; key.dst_ip = iph->daddr; key.proto = iph->protocol;/* 解析传输层 */if (iph->protocol == IPPROTO_TCP) {structtcphdr *tcph = (void *)(iph + 1);if ((void *)(tcph + 1) > data_end)return XDP_PASS; key.src_port = tcph->source; key.dst_port = tcph->dest; } elseif (iph->protocol == IPPROTO_UDP) {struct udphdr *udph = (void *)(iph + 1);if ((void *)(udph + 1) > data_end)return XDP_PASS; key.src_port = udph->source; key.dst_port = udph->dest; } else {return XDP_PASS; }/* 查找连接跟踪 */ backend_idx = bpf_map_lookup_elem(&conn_track, &key);if (!backend_idx) {/* 新连接,选择后端 */ __u32 
hash = hash_conn(&key); __u32 idx = hash % 16; /* 假设 16 个后端 *//* 保存连接跟踪 */ bpf_map_update_elem(&conn_track, &key, &idx, BPF_ANY); backend_idx = &idx; }/* 获取后端信息 */ backend = bpf_map_lookup_elem(&backends, backend_idx);if (!backend)return XDP_PASS;/* 修改目标 IP 和 MAC */ iph->daddr = backend->ip; __builtin_memcpy(eth->h_dest, backend->mac, ETH_ALEN);/* 重新计算校验和 */ iph->check = 0; __u32 csum = bpf_csum_diff(0, 0, (__be32 *)iph, sizeof(*iph), 0); iph->check = csum_fold_helper(csum);/* 更新统计 */ stats = bpf_map_lookup_elem(&backend_stats, backend_idx);if (stats) __sync_fetch_and_add(stats, 1);return XDP_TX; /* 从同一网卡发送回去 */}char _license[] SEC("license") = "GPL";eBPF 特性:
XDP 优势:
应用场景:
性能对比:
XDP Native: 14 Mpps @ 45% CPU
DPDK:       20 Mpps @ 100% CPU
传统内核:   1.5 Mpps @ 100% CPU

XDP 优势:
- 性能接近 DPDK
- CPU 效率更高
- 无需绕过内核
- 开发部署更简单

开发建议:
部署建议:
下一篇《网络子系统调试技术》将深入分析:
选择 XDP 的场景:
选择 DPDK 的场景:
调试方法:
/* bpf_trace_printk() takes (fmt, fmt_size, ...); the bpf_printk() macro
 * from <bpf/bpf_helpers.h> supplies the format buffer and its size. */
bpf_printk("src_ip: %x\n", iph->saddr);

查看输出:
cat /sys/kernel/debug/tracing/trace_pipe

# List loaded programs
bpftool prog show

# Dump the contents of a map
bpftool map dump id <map_id>

# Show details of one program as JSON
bpftool prog show id <prog_id> --json

# Record and inspect XDP tracepoints (quoted so the shell does not
# expand the glob)
perf record -e 'xdp:*' -a
perf script

限制:
解决方法:
监控指标:
# 1. Interface statistics
ip -s link show dev eth0

# 2. XDP-related driver counters
ethtool -S eth0 | grep xdp

# 3. Map overview
bpftool map show

# 4. Count XDP tracepoint hits with bpftrace
bpftrace -e 'tracepoint:xdp:* { @[probe] = count(); }'

Native XDP 支持的网卡:
检查支持:
# Show which driver the interface uses
ethtool -i eth0

# Try to attach an XDP program ("sec xdp" selects the ELF section the
# examples use; iproute2 otherwise looks for a section named "prog")
ip link set dev eth0 xdp obj prog.o sec xdp

eBPF 核心:
kernel/bpf/verifier.c          # 验证器
kernel/bpf/core.c              # eBPF 核心
arch/x86/net/bpf_jit_comp.c    # JIT 编译器

XDP 实现:
net/core/dev.c       # XDP 核心
net/core/filter.c    # XDP 程序执行
include/net/xdp.h    # XDP 数据结构

libbpf:
bpftool:
bcc:
bpftrace:
书籍:
在线教程:
示例代码:
作者:肇中
内核版本:Linux 5.10 LTS
系列文章:
勘误和建议:欢迎提出勘误和改进建议。