Overview
The Linux networking subsystem is one of the most complex parts of the kernel, covering the complete protocol stack from NIC drivers up to the application-layer interface. This article takes a close look at the packet processing path, the Netfilter framework, and the high-performance XDP technology.
1. Network Protocol Stack Architecture
1.1 Network Layering
/*
 * Linux network stack layers:
 *
 *   Application layer    socket interface
 *         ↕
 *   Transport layer      TCP/UDP
 *         ↕
 *   Network layer        IP/ICMP
 *         ↕
 *   Link layer           Ethernet
 *         ↕
 *   Physical layer       NIC driver
 */
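The socket interface at the top of this stack is what applications actually touch. A minimal user-space sketch (destination address and port are arbitrary examples) that hands one UDP datagram to the kernel, which then builds the transport, network and link headers on the way down:
// udp_send.c - push one datagram into the stack through the socket API
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    struct sockaddr_in dst = {
        .sin_family = AF_INET,
        .sin_port = htons(9999),        /* example port */
    };
    int fd;

    fd = socket(AF_INET, SOCK_DGRAM, 0);    /* transport-layer entry point (UDP) */
    if (fd < 0)
        return 1;
    inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);    /* example destination */

    /* The kernel adds the UDP, IP and Ethernet headers on the way down */
    sendto(fd, "hello", 5, 0, (struct sockaddr *)&dst, sizeof(dst));
    close(fd);
    return 0;
}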
1.2 Core Data Structures
// sk_buff.c - the network packet structure
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/ip.h>
#include <linux/tcp.h>

// sk_buff is the single most important data structure in the networking
// subsystem; the excerpt below shows only its key fields.
struct sk_buff {
    // List management
    struct sk_buff *next;
    struct sk_buff *prev;
    // Timestamp
    ktime_t tstamp;
    // Owning socket
    struct sock *sk;
    // Network device
    struct net_device *dev;
    // Protocol bookkeeping
    __be16 protocol;
    __u16 transport_header;
    __u16 network_header;
    __u16 mac_header;
    // Data pointers
    unsigned char *head;
    unsigned char *data;
    unsigned char *tail;
    unsigned char *end;
    // Length information
    unsigned int len;
    unsigned int data_len;
    // Control information
    __u8 ip_summed;
    __u8 cloned;
    __u8 pkt_type;
    // Many more fields...
};
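The four data pointers partition the linear buffer into headroom, packet data and tailroom; a sketch of the usual layout:
/*
 * head                data             tail              end
 *  |                   |                |                 |
 *  v                   v                v                 v
 *  +-------------------+----------------+-----------------+
 *  |     headroom      |  packet data   |    tailroom     |
 *  +-------------------+----------------+-----------------+
 *
 * skb_reserve() grows the headroom, skb_put() extends the data
 * toward tail, and skb_push() pulls data back into the headroom
 * to prepend protocol headers.
 */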
// Example sk_buff operations
static void skb_operations_demo(void)
{
    struct sk_buff *skb;
    struct sk_buff *clone;
    unsigned char *payload;

    // Allocate an skb with room for headers plus payload
    skb = alloc_skb(1500, GFP_KERNEL);
    if (!skb)
        return;

    // Reserve headroom for the headers we will push later
    skb_reserve(skb, NET_IP_ALIGN + sizeof(struct ethhdr) + sizeof(struct iphdr));

    // Append 100 bytes of payload
    payload = skb_put(skb, 100);
    memset(payload, 0, 100);

    // Prepend protocol headers into the reserved headroom
    skb_push(skb, sizeof(struct iphdr));
    skb_push(skb, sizeof(struct ethhdr));

    // Record where the MAC and network headers start
    skb_reset_mac_header(skb);
    skb_set_network_header(skb, sizeof(struct ethhdr));

    // Clone the skb (the clone shares the data buffer)
    clone = skb_clone(skb, GFP_ATOMIC);

    // Release both
    kfree_skb(skb);
    if (clone)
        kfree_skb(clone);
}
1.3 The Network Device Structure
// netdev_demo.c - network device operations
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>

// Private data for a simple network device driver
struct simple_priv {
    struct net_device_stats stats;
    spinlock_t lock;
};

// Transmit a packet
static netdev_tx_t simple_xmit(struct sk_buff *skb, struct net_device *dev)
{
    struct simple_priv *priv = netdev_priv(dev);

    // Update statistics
    priv->stats.tx_packets++;
    priv->stats.tx_bytes += skb->len;

    // Pretend we sent it and free the skb
    dev_kfree_skb(skb);
    return NETDEV_TX_OK;
}

// Set the MAC address
static int simple_set_mac(struct net_device *dev, void *addr)
{
    struct sockaddr *sa = addr;

    if (!is_valid_ether_addr(sa->sa_data))
        return -EADDRNOTAVAIL;
    // dev->dev_addr is const on recent kernels, so use the helper
    eth_hw_addr_set(dev, sa->sa_data);
    return 0;
}
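The ops table below also references open, stop and statistics callbacks that are not shown in the original listing; minimal versions (assumed here, named to match the table) might look like this:
// Bring the interface up: start the transmit queue
static int simple_open(struct net_device *dev)
{
    netif_start_queue(dev);
    return 0;
}

// Bring the interface down: stop the transmit queue
static int simple_stop(struct net_device *dev)
{
    netif_stop_queue(dev);
    return 0;
}

// Return the statistics updated in simple_xmit()
static struct net_device_stats *simple_get_stats(struct net_device *dev)
{
    struct simple_priv *priv = netdev_priv(dev);

    return &priv->stats;
}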
// Network device operations
static const struct net_device_ops simple_netdev_ops = {
    .ndo_open = simple_open,
    .ndo_stop = simple_stop,
    .ndo_start_xmit = simple_xmit,
    .ndo_set_mac_address = simple_set_mac,
    .ndo_get_stats = simple_get_stats,
};

// The registered device, kept around for cleanup at module exit
static struct net_device *simple_dev;

// Register the network device
static int simple_init(void)
{
    struct net_device *dev;
    struct simple_priv *priv;

    // Allocate an Ethernet device with our private data appended
    dev = alloc_etherdev(sizeof(struct simple_priv));
    if (!dev)
        return -ENOMEM;

    // Initialize private data
    priv = netdev_priv(dev);
    spin_lock_init(&priv->lock);

    // Set device properties
    dev->netdev_ops = &simple_netdev_ops;
    dev->features = NETIF_F_HW_CSUM;

    // Register the device
    if (register_netdev(dev)) {
        free_netdev(dev);
        return -ENODEV;
    }
    simple_dev = dev;
    return 0;
}
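Registration is only half the story; to be a loadable module the example also needs the matching teardown and the usual module macros. A minimal sketch, reusing the simple_dev pointer saved above:
// Unregister and free the device at module unload; the order matters
static void simple_exit(void)
{
    unregister_netdev(simple_dev);
    free_netdev(simple_dev);
}

module_init(simple_init);
module_exit(simple_exit);
MODULE_LICENSE("GPL");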
2. The Packet Receive Path
2.1 The NAPI Mechanism
// napi_demo.c - NAPI-based receive
#include <linux/netdevice.h>
#include <linux/interrupt.h>

struct napi_device {
    struct net_device *netdev;
    struct napi_struct napi;
    struct sk_buff_head rx_queue;
    int irq;
};

// Placeholder for the driver-specific routine that pulls the next frame
// from the hardware RX ring (returns NULL when the ring is empty)
static struct sk_buff *get_packet_from_hw(void)
{
    return NULL;    /* a real driver reads its RX descriptors here */
}

// NAPI poll function
static int napi_poll(struct napi_struct *napi, int budget)
{
    struct napi_device *dev = container_of(napi, struct napi_device, napi);
    int work_done = 0;

    while (work_done < budget) {
        struct sk_buff *skb;

        // Fetch a packet from the hardware
        skb = get_packet_from_hw();
        if (!skb)
            break;

        // Identify the protocol and strip the Ethernet header
        skb->protocol = eth_type_trans(skb, dev->netdev);

        // Hand the packet to the network stack
        netif_receive_skb(skb);
        work_done++;
    }

    // If we processed everything, leave polling mode
    if (work_done < budget) {
        napi_complete_done(napi, work_done);
        // Re-enable the device interrupt
        enable_irq(dev->irq);
    }
    return work_done;
}

// Interrupt handler
static irqreturn_t rx_interrupt(int irq, void *dev_id)
{
    struct napi_device *dev = dev_id;

    // Disable the interrupt and switch to polling
    disable_irq_nosync(irq);

    // Schedule NAPI
    napi_schedule(&dev->napi);
    return IRQ_HANDLED;
}

// Initialize NAPI
static void init_napi(struct napi_device *dev)
{
    // The 64 is the legacy NAPI weight; kernels >= 6.1 drop this argument
    netif_napi_add(dev->netdev, &dev->napi, napi_poll, 64);
    napi_enable(&dev->napi);
}
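The matching teardown, needed when the interface goes down or the driver unloads, is a short sketch:
// Disable polling before removing the NAPI context
static void fini_napi(struct napi_device *dev)
{
    napi_disable(&dev->napi);
    netif_napi_del(&dev->napi);
}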
2.2 GRO (Generic Receive Offload)
// gro_demo.c - GRO handling
#include <linux/netdevice.h>

// Hand a packet to GRO, which may merge it with other packets of the same flow
static gro_result_t gro_receive(struct napi_struct *napi,
                                struct sk_buff *skb)
{
    return napi_gro_receive(napi, skb);
}

// Flush packets still held for merging
static void gro_flush(struct napi_struct *napi)
{
    // Normally napi_complete_done() does this for the driver
    napi_gro_flush(napi, false);
}
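In a driver, GRO is used from the NAPI poll loop: frames go to napi_gro_receive() instead of netif_receive_skb(), and consecutive segments of the same flow are merged before climbing the stack. A variant of the napi_poll() example from 2.1 (same placeholder get_packet_from_hw()):
// napi_poll() variant that feeds packets through GRO
static int napi_poll_gro(struct napi_struct *napi, int budget)
{
    struct napi_device *dev = container_of(napi, struct napi_device, napi);
    int work_done = 0;
    struct sk_buff *skb;

    while (work_done < budget && (skb = get_packet_from_hw()) != NULL) {
        skb->protocol = eth_type_trans(skb, dev->netdev);
        napi_gro_receive(napi, skb);    /* merge with held GRO packets if possible */
        work_done++;
    }
    if (work_done < budget)
        napi_complete_done(napi, work_done);    /* also flushes held GRO packets */
    return work_done;
}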
3. The Netfilter Framework
3.1 Netfilter Hooks
// netfilter_hook.c - Netfilter hook example
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/ip.h>
#include <linux/tcp.h>

// Hook function
static unsigned int nf_hook_func(void *priv,
                                 struct sk_buff *skb,
                                 const struct nf_hook_state *state)
{
    struct iphdr *iph;
    struct tcphdr *tcph;

    // Get the IP header
    iph = ip_hdr(skb);
    if (!iph)
        return NF_ACCEPT;

    // Only look at TCP packets
    if (iph->protocol != IPPROTO_TCP)
        return NF_ACCEPT;

    // Get the TCP header
    tcph = tcp_hdr(skb);

    // Filtering example: drop packets to destination port 80
    if (ntohs(tcph->dest) == 80) {
        printk(KERN_INFO "Blocking HTTP traffic\n");
        return NF_DROP;
    }

    // Log the packet
    printk(KERN_INFO "Packet: %pI4:%d -> %pI4:%d\n",
           &iph->saddr, ntohs(tcph->source),
           &iph->daddr, ntohs(tcph->dest));
    return NF_ACCEPT;
}
// Netfilter hook registrations
static struct nf_hook_ops nf_ops[] = {
    {
        .hook = nf_hook_func,
        .pf = NFPROTO_IPV4,
        .hooknum = NF_INET_PRE_ROUTING,
        .priority = NF_IP_PRI_FIRST,
    },
    {
        .hook = nf_hook_func,
        .pf = NFPROTO_IPV4,
        .hooknum = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_FIRST,
    },
};

static int __init netfilter_init(void)
{
    int ret;

    // Register the hooks
    ret = nf_register_net_hooks(&init_net, nf_ops, ARRAY_SIZE(nf_ops));
    if (ret < 0) {
        printk(KERN_ERR "Failed to register hooks\n");
        return ret;
    }
    printk(KERN_INFO "Netfilter hooks registered\n");
    return 0;
}

static void __exit netfilter_exit(void)
{
    // Unregister the hooks
    nf_unregister_net_hooks(&init_net, nf_ops, ARRAY_SIZE(nf_ops));
    printk(KERN_INFO "Netfilter hooks unregistered\n");
}
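For the hook module to actually build and load, the usual registration macros are needed as well:
module_init(netfilter_init);
module_exit(netfilter_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Example Netfilter hook");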
3.2 Connection Tracking
// conntrack_demo.c - connection tracking example
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>

// Hook that inspects connection-tracking state
static unsigned int conntrack_hook(void *priv,
                                   struct sk_buff *skb,
                                   const struct nf_hook_state *state)
{
    struct nf_conn *ct;
    enum ip_conntrack_info ctinfo;

    // Fetch the conntrack entry attached to the skb
    ct = nf_ct_get(skb, &ctinfo);
    if (!ct)
        return NF_ACCEPT;

    // Inspect the connection state
    if (ctinfo == IP_CT_NEW) {
        printk(KERN_INFO "New connection\n");
    } else if (ctinfo == IP_CT_ESTABLISHED) {
        printk(KERN_INFO "Established connection\n");
    } else if (ctinfo == IP_CT_RELATED) {
        printk(KERN_INFO "Related connection\n");
    }

    // Print the original-direction tuple (assumes TCP)
    printk(KERN_INFO "Connection: %pI4:%u -> %pI4:%u\n",
           &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip,
           ntohs(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port),
           &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip,
           ntohs(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port));
    return NF_ACCEPT;
}
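For the hook above to see valid conntrack state, it must be registered at a priority that runs after connection tracking itself. A minimal registration sketch (hook point and priority are choices for this example, using the constants from linux/netfilter_ipv4.h):
static struct nf_hook_ops conntrack_ops = {
    .hook = conntrack_hook,
    .pf = NFPROTO_IPV4,
    .hooknum = NF_INET_LOCAL_IN,
    // Run after conntrack has classified the packet
    .priority = NF_IP_PRI_LAST,
};

static int __init conntrack_demo_init(void)
{
    return nf_register_net_hook(&init_net, &conntrack_ops);
}

static void __exit conntrack_demo_exit(void)
{
    nf_unregister_net_hook(&init_net, &conntrack_ops);
}

module_init(conntrack_demo_init);
module_exit(conntrack_demo_exit);
MODULE_LICENSE("GPL");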
3.3 A Custom iptables (xtables) Module
// xt_example.c - xtables extension module
#include <linux/module.h>
#include <linux/ip.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>

// Match function
static bool example_mt(const struct sk_buff *skb,
                       struct xt_action_param *par)
{
    const struct iphdr *iph = ip_hdr(skb);

    // Custom match logic: match packets whose TTL is greater than 64
    return iph->ttl > 64;
}

// Target function
static unsigned int example_tg(struct sk_buff *skb,
                               const struct xt_action_param *par)
{
    struct iphdr *iph = ip_hdr(skb);

    // Rewrite the TTL
    iph->ttl = 64;

    // Recompute the IP checksum
    ip_send_check(iph);
    return XT_CONTINUE;
}

// Match description
static struct xt_match example_match = {
    .name = "example",
    .family = NFPROTO_IPV4,
    .match = example_mt,
    .matchsize = 0,
    .me = THIS_MODULE,
};

// Target description
static struct xt_target example_target = {
    .name = "EXAMPLE",
    .family = NFPROTO_IPV4,
    .target = example_tg,
    .targetsize = 0,
    .me = THIS_MODULE,
};
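Registering the match and target with the xtables core (plus the corresponding user-space libxt_* plugins, not shown here) ties the module into iptables; a sketch of the kernel side:
static int __init xt_example_init(void)
{
    int ret;

    ret = xt_register_match(&example_match);
    if (ret)
        return ret;

    ret = xt_register_target(&example_target);
    if (ret)
        xt_unregister_match(&example_match);
    return ret;
}

static void __exit xt_example_exit(void)
{
    xt_unregister_target(&example_target);
    xt_unregister_match(&example_match);
}

module_init(xt_example_init);
module_exit(xt_example_exit);
MODULE_LICENSE("GPL");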
4. XDP (eXpress Data Path)
4.1 XDP Program Basics
// xdp_prog.c - example XDP program
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

// Per-program statistics, stored in a BPF array map
struct stats {
    __u64 packets;
    __u64 bytes;
};

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(max_entries, 1);
    __type(key, __u32);
    __type(value, struct stats);
} stats_map SEC(".maps");

// XDP program entry point
SEC("xdp")
int xdp_prog(struct xdp_md *ctx)
{
    void *data_end = (void *)(long)ctx->data_end;
    void *data = (void *)(long)ctx->data;

    // Bounds-check and parse the Ethernet header
    struct ethhdr *eth = data;
    if ((void *)(eth + 1) > data_end)
        return XDP_PASS;

    // Only handle IPv4
    if (eth->h_proto != bpf_htons(ETH_P_IP))
        return XDP_PASS;

    // Bounds-check and parse the IP header
    struct iphdr *ip = (void *)(eth + 1);
    if ((void *)(ip + 1) > data_end)
        return XDP_PASS;

    // Drop packets from a specific source address
    if (ip->saddr == bpf_htonl(0x0a000001))    /* 10.0.0.1 */
        return XDP_DROP;

    // Count packets and bytes
    __u32 key = 0;
    struct stats *s = bpf_map_lookup_elem(&stats_map, &key);
    if (s) {
        __sync_fetch_and_add(&s->packets, 1);
        __sync_fetch_and_add(&s->bytes, data_end - data);
    }
    return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
4.2 The XDP Loader
// xdp_loader.c - user-space loader for the XDP program
#include <stdio.h>
#include <bpf/libbpf.h>
#include <bpf/bpf.h>
#include <net/if.h>
#include <linux/if_link.h>

static int load_xdp_program(const char *ifname, const char *progname)
{
    struct bpf_object *obj;
    struct bpf_program *prog;
    int prog_fd;
    int ifindex;

    // Resolve the interface index
    ifindex = if_nametoindex(ifname);
    if (!ifindex) {
        fprintf(stderr, "Invalid interface: %s\n", ifname);
        return -1;
    }

    // Open the BPF object file
    obj = bpf_object__open_file(progname, NULL);
    if (!obj) {
        fprintf(stderr, "Failed to open BPF object\n");
        return -1;
    }

    // Load it into the kernel
    if (bpf_object__load(obj)) {
        fprintf(stderr, "Failed to load BPF object\n");
        bpf_object__close(obj);
        return -1;
    }

    // Find the XDP program by function name
    // (older libbpf releases offered bpf_object__find_program_by_title("xdp") instead)
    prog = bpf_object__find_program_by_name(obj, "xdp_prog");
    if (!prog) {
        fprintf(stderr, "Failed to find XDP program\n");
        bpf_object__close(obj);
        return -1;
    }
    prog_fd = bpf_program__fd(prog);

    // Attach it to the network interface
    // (older libbpf used bpf_set_link_xdp_fd() for this)
    if (bpf_xdp_attach(ifindex, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST, NULL) < 0) {
        fprintf(stderr, "Failed to attach XDP program\n");
        bpf_object__close(obj);
        return -1;
    }
    printf("XDP program attached to %s\n", ifname);
    return 0;
}

// Detach any XDP program from the interface
static int unload_xdp_program(const char *ifname)
{
    int ifindex = if_nametoindex(ifname);

    if (!ifindex)
        return -1;
    return bpf_xdp_detach(ifindex, 0, NULL);
}
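A small main() ties the two helpers to the command line used later in the Makefile's test target; the argument handling is a minimal assumption (interface only = detach, interface + object file = attach):
int main(int argc, char **argv)
{
    if (argc == 2)
        return unload_xdp_program(argv[1]);
    if (argc == 3)
        return load_xdp_program(argv[1], argv[2]);

    fprintf(stderr, "Usage: %s <ifname> [prog.o]\n", argv[0]);
    return 1;
}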
4.3 XDP Redirect
// xdp_redirect.c - XDP packet redirection (same includes as xdp_prog.c)
SEC("xdp")
int xdp_redirect_prog(struct xdp_md *ctx)
{
    void *data_end = (void *)(long)ctx->data_end;
    void *data = (void *)(long)ctx->data;
    struct ethhdr *eth = data;
    __u8 tmp[ETH_ALEN];

    if ((void *)(eth + 1) > data_end)
        return XDP_PASS;

    // Swap source and destination MAC addresses
    __builtin_memcpy(tmp, eth->h_source, ETH_ALEN);
    __builtin_memcpy(eth->h_source, eth->h_dest, ETH_ALEN);
    __builtin_memcpy(eth->h_dest, tmp, ETH_ALEN);

    // Send the frame back out the interface it arrived on
    // (XDP_TX does the same without the redirect helper)
    return bpf_redirect(ctx->ingress_ifindex, 0);
}
// CPU map for redirecting packet processing to other CPUs
struct {
    __uint(type, BPF_MAP_TYPE_CPUMAP);
    __uint(max_entries, 64);
    __type(key, __u32);
    __type(value, __u32);
} cpu_map SEC(".maps");

#define NR_CPUS 4    /* number of CPUs actually populated in cpu_map */

// XDP CPU redirect: spread flows across CPUs
// (there is no generic hash helper in XDP, so hash on the source address)
SEC("xdp")
int xdp_cpu_redirect(struct xdp_md *ctx)
{
    void *data_end = (void *)(long)ctx->data_end;
    void *data = (void *)(long)ctx->data;
    struct ethhdr *eth = data;
    struct iphdr *ip;
    __u32 cpu;

    if ((void *)(eth + 1) > data_end || eth->h_proto != bpf_htons(ETH_P_IP))
        return XDP_PASS;
    ip = (void *)(eth + 1);
    if ((void *)(ip + 1) > data_end)
        return XDP_PASS;

    // Keep each flow on one CPU by keying on the source address
    cpu = bpf_ntohl(ip->saddr) % NR_CPUS;
    return bpf_redirect_map(&cpu_map, cpu, 0);
}
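Redirection only works for CPUs that user space has populated in the map; with the plain __u32 value format used above, the value is the size of the per-CPU queue. A loader-side sketch (map lookup via libbpf; the queue size 192 is just an example):
// Populate cpu_map from user space: one entry per target CPU
static int populate_cpu_map(struct bpf_object *obj, __u32 ncpus)
{
    int map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map");
    __u32 qsize = 192;    /* example ring size for the per-CPU kthread */
    __u32 cpu;

    if (map_fd < 0)
        return -1;
    for (cpu = 0; cpu < ncpus; cpu++)
        if (bpf_map_update_elem(map_fd, &cpu, &qsize, 0))
            return -1;
    return 0;
}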
5. Advanced Networking Features
5.1 TC (Traffic Control)
// tc_bpf.c - TC BPF program (token-bucket rate limiting)
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

struct rate_limit {
    __u64 rate;         // bytes per second
    __u64 burst;        // burst size in bytes
    __u64 tokens;       // current tokens
    __u64 last_time;    // last update, in ns
};

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(max_entries, 1);
    __type(key, __u32);
    __type(value, struct rate_limit);
} rate_map SEC(".maps");

SEC("tc")
int tc_prog(struct __sk_buff *skb)
{
    __u32 key = 0;
    struct rate_limit *rl = bpf_map_lookup_elem(&rate_map, &key);

    if (!rl)
        return TC_ACT_OK;

    // Refill the token bucket based on the time since the last update
    // (a production version would protect this shared state with a spinlock or per-CPU map)
    __u64 now = bpf_ktime_get_ns();
    __u64 tokens = (now - rl->last_time) * rl->rate / 1000000000;

    rl->tokens += tokens;
    if (rl->tokens > rl->burst)
        rl->tokens = rl->burst;
    rl->last_time = now;

    if (rl->tokens >= skb->len) {
        rl->tokens -= skb->len;
        return TC_ACT_OK;
    }

    // Over the rate limit: drop the packet
    return TC_ACT_SHOT;
}

char _license[] SEC("license") = "GPL";
5.2 AF_XDP (Zero-Copy Sockets)
// af_xdp.c - AF_XDP socket example (uses the xsk helpers from libbpf/libxdp)
#include <stdlib.h>
#include <sys/mman.h>
#include <linux/if_xdp.h>
#include <linux/if_link.h>
#include <bpf/xsk.h>        /* with libxdp this header is <xdp/xsk.h> */

#define NUM_FRAMES 4096
#define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE

static struct xsk_ring_prod fill_queue, tx_queue;
static struct xsk_ring_cons comp_queue, rx_queue;

static int create_xsk_socket(const char *ifname, __u32 queue_id)
{
    struct xsk_socket_config cfg = {
        .rx_size = 2048,
        .tx_size = 2048,
        .libbpf_flags = 0,
        .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
        .bind_flags = XDP_ZEROCOPY,    /* zero-copy mode (fails if the driver lacks support) */
    };
    struct xsk_umem_config umem_cfg = {
        .fill_size = 2048,
        .comp_size = 2048,
        .frame_size = FRAME_SIZE,
        .frame_headroom = 0,
    };
    struct xsk_umem *umem;
    struct xsk_socket *xsk;
    void *buffer;

    // Allocate the UMEM area shared between the kernel and user space
    buffer = mmap(NULL, NUM_FRAMES * FRAME_SIZE,
                  PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buffer == MAP_FAILED)
        return -1;

    // Create the UMEM with its fill and completion rings
    if (xsk_umem__create(&umem, buffer, NUM_FRAMES * FRAME_SIZE,
                         &fill_queue, &comp_queue, &umem_cfg))
        return -1;

    // Create the XSK socket bound to one RX/TX queue of the interface
    if (xsk_socket__create(&xsk, ifname, queue_id, umem,
                           &rx_queue, &tx_queue, &cfg))
        return -1;
    return 0;
}
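Receiving then becomes a matter of consuming descriptors from the RX ring and recycling the frames through the fill ring. A minimal sketch built on the rings above; it assumes the fill ring was pre-populated with frame addresses and simply spins instead of using poll():
static void rx_loop(void *umem_area)
{
    __u32 idx_rx, idx_fq;
    unsigned int i, rcvd;

    for (;;) {
        // How many descriptors has the kernel placed on the RX ring?
        rcvd = xsk_ring_cons__peek(&rx_queue, 64, &idx_rx);
        if (!rcvd)
            continue;    /* a real loop would poll() instead of busy-waiting */

        // Fill ring is as large as the RX ring, so this reserve is assumed to succeed
        xsk_ring_prod__reserve(&fill_queue, rcvd, &idx_fq);

        for (i = 0; i < rcvd; i++) {
            const struct xdp_desc *desc =
                xsk_ring_cons__rx_desc(&rx_queue, idx_rx + i);
            void *pkt = xsk_umem__get_data(umem_area, desc->addr);

            /* ... process desc->len bytes at pkt ... */
            (void)pkt;

            // Recycle the frame back to the kernel
            *xsk_ring_prod__fill_addr(&fill_queue, idx_fq + i) = desc->addr;
        }
        xsk_ring_prod__submit(&fill_queue, rcvd);
        xsk_ring_cons__release(&rx_queue, rcvd);
    }
}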
6. Performance Optimization
6.1 RSS (Receive Side Scaling)
// rss_config.c - sketch of RSS configuration inside a driver
// RSS is exposed to user space through the driver's struct ethtool_ops
// (.get_rxfh/.set_rxfh) rather than a generic ndo callback. This sketch only
// shows how a driver builds the hash key and indirection table; programming
// them into the NIC is hardware specific.
static void configure_rss(struct net_device *dev)
{
    u8 rss_key[40];
    u32 reta[128];
    int i;

    // Random RSS hash key
    get_random_bytes(rss_key, sizeof(rss_key));

    // Spread the indirection table across the active RX queues
    for (i = 0; i < 128; i++)
        reta[i] = i % dev->real_num_rx_queues;

    // A real driver would now write rss_key and reta into its hardware
    // registers and report them back through its ethtool .get_rxfh callback.
}
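From user space the same knobs are usually driven through ethtool rather than driver code; a few example commands (the interface name eth0 and queue count 8 are assumptions):
# Show the current RSS key and indirection table
ethtool -x eth0
# Spread flows evenly across 8 RX queues
ethtool -X eth0 equal 8
# Change the number of hardware queue pairs
ethtool -L eth0 combined 8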
6.2 Network Stack Tuning
#!/bin/bash
# network_tuning.sh - network performance tuning script

# Enlarge the core and TCP socket buffers
echo 'net.core.rmem_max = 134217728' >> /etc/sysctl.conf
echo 'net.core.wmem_max = 134217728' >> /etc/sysctl.conf
echo 'net.ipv4.tcp_rmem = 4096 87380 134217728' >> /etc/sysctl.conf
echo 'net.ipv4.tcp_wmem = 4096 65536 134217728' >> /etc/sysctl.conf

# Enlarge the connection-tracking table
echo 'net.netfilter.nf_conntrack_max = 1000000' >> /etc/sysctl.conf

# Enable RPS/RFS
echo f > /sys/class/net/eth0/queues/rx-0/rps_cpus
echo 32768 > /proc/sys/net/core/rps_sock_flow_entries

# Enable XPS
echo 1 > /sys/class/net/eth0/queues/tx-0/xps_cpus

# Pin the NIC interrupt to a CPU
echo 2 > /proc/irq/24/smp_affinity

sysctl -p
Building and Testing
Makefile
# Kernel module
obj-m += netfilter_hook.o

# BPF programs (built with clang; -g emits the BTF needed for .maps definitions)
CLANG = clang

xdp_prog.o: xdp_prog.c
	$(CLANG) -O2 -g -target bpf -c $< -o $@

tc_prog.o: tc_bpf.c
	$(CLANG) -O2 -g -target bpf -c $< -o $@

# User-space XDP loader (links against libbpf)
xdp_loader: xdp_loader.c
	$(CC) -O2 -o $@ $< -lbpf

all: modules bpf xdp_loader

modules:
	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules

bpf: xdp_prog.o tc_prog.o

clean:
	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
	rm -f *.o xdp_loader

test:
	# Attach the XDP program
	sudo ./xdp_loader eth0 xdp_prog.o
	# Attach the TC program
	sudo tc qdisc add dev eth0 clsact
	sudo tc filter add dev eth0 ingress bpf obj tc_prog.o sec tc
Practice Checklist
Next Steps
Once you have mastered the networking subsystem, you can: