一、源码文件总览
| |
|---|
include/net/neighbour.h | |
include/uapi/linux/neighbour.h | |
net/core/neighbour.c | |
net/ipv4/arp.c | |
net/ipv6/ndisc.c | |
二、核心数据结构
2.1 struct neigh_hash_table
文件:include/net/neighbour.h,约第 104 行
#define NEIGH_NUM_HASH_RND 4structneigh_hash_table {structneighbour __rcu **hash_buckets;// 哈希桶数组unsignedint hash_shift; // 桶数 = 1 << hash_shift __u32 hash_rnd[NEIGH_NUM_HASH_RND]; // 4个随机种子structrcu_headrcu;// RCU 延迟释放};
要点:
- 哈希桶通过 rcu_dereference_bh() 读取,用 rcu_assign_pointer() 写入,读端完全无锁。
- hash_rnd[4] 在 neigh_hash_alloc() 时随机生成,每次扩容重新生成,防止碰撞攻击。
- 扩容时分配新 neigh_hash_table,通过 rcu_assign_pointer(tbl->nht, new_nht) 原子切换,旧表通过 call_rcu() 延迟释放。
2.2 struct neigh_table
文件:include/net/neighbour.h,约第 114 行
structneigh_table {int family;unsignedint entry_size;unsignedint key_len; __be16 protocol;/* 协议无关抽象:函数指针 */ __u32 (*hash)(constvoid *pkey,const struct net_device *dev, __u32 *hash_rnd);bool (*key_eq)(const struct neighbour *, constvoid *pkey);int (*constructor)(struct neighbour *);int (*pconstructor)(struct pneigh_entry *);void (*pdestructor)(struct pneigh_entry *);void (*proxy_redo)(struct sk_buff *skb);bool (*allow_add)(const struct net_device *dev, struct netlink_ext_ack *extack);char *id;/* 参数 — 注意:parms 是内嵌结构体,不是指针! */structneigh_parmsparms;// 默认参数structlist_headparms_list;/* GC 阈值 */int gc_interval;int gc_thresh1;int gc_thresh2;int gc_thresh3;unsignedlong last_flush;structdelayed_workgc_work;// 注意:是 delayed_work,不是 timer_liststructtimer_listproxy_timer;structsk_buff_headproxy_queue;atomic_t entries;atomic_t gc_entries; // GC 链表上的表项数structlist_headgc_list;// GC 链表头(FIFO)rwlock_t lock;structneigh_statistics __percpu *stats;structneigh_hash_table __rcu *nht;// RCU 保护的哈希表指针structpneigh_entry **phash_buckets;};
关键设计:
| |
|---|
parms | 默认参数直接内嵌,设备级参数通过 neigh_parms_clone() 克隆 |
gc_work | 不是 timer_list!在进程上下文执行,可以调用更多内核函数 |
nht | |
gc_list | GC 只扫描此链表,复杂度 O(gc_entries) 而非 O(entries) |
2.3 struct neigh_parms
文件:include/net/neighbour.h,约第 48 行
structneigh_parms {possible_net_t net;structnet_device *dev;structlist_headlist;int (*neigh_setup)(struct neighbour *);structneigh_table *tbl;void *sysctl_table;int dead;refcount_t refcnt;structrcu_headrcu_head;int reachable_time;int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX);};
参数通过 data[] 数组 + 枚举索引访问:
/*
 * Indices of the neighbour parameters. Values are assigned implicitly,
 * starting at 0 and increasing by one per enumerator.
 */
enum {
	NEIGH_VAR_MCAST_PROBES,
	NEIGH_VAR_UCAST_PROBES,
	NEIGH_VAR_APP_PROBES,
	NEIGH_VAR_MCAST_REPROBES,
	NEIGH_VAR_RETRANS_TIME,
	NEIGH_VAR_BASE_REACHABLE_TIME,
	NEIGH_VAR_DELAY_PROBE_TIME,
	NEIGH_VAR_GC_STALETIME,
	NEIGH_VAR_QUEUE_LEN_BYTES,
	NEIGH_VAR_PROXY_QLEN,
	NEIGH_VAR_ANYCAST_DELAY,
	NEIGH_VAR_PROXY_DELAY,
	NEIGH_VAR_LOCKTIME,

	/* the entries below exist for the "default" parameter set only */
	NEIGH_VAR_GC_INTERVAL,
	NEIGH_VAR_GC_THRESH1,
	NEIGH_VAR_GC_THRESH2,
	NEIGH_VAR_GC_THRESH3,

	NEIGH_VAR_MAX
};
用 NEIGH_VAR(p, attr) 宏访问,用 data_state bitmap 追踪哪些参数被用户修改过。
2.4 struct neighbour
文件:include/net/neighbour.h,约第 77 行
structneighbour {structneighbour __rcu *next;// 哈希链:RCU 单链表(不是 hlist!)structneigh_table *tbl;structneigh_parms *parms;unsignedlong confirmed; // 最后一次确认可达的时间unsignedlong updated; // 最后一次更新时间rwlock_t lock; // 保护 ha / nud_state / timer / arp_queuerefcount_t refcnt;unsignedint arp_queue_len_bytes;structsk_buff_headarp_queue;structtimer_listtimer;unsignedlong used;atomic_t probes; __u8 flags; __u8 nud_state; __u8 type; __u8 dead; // 1 = 已从哈希表摘除,等待释放 u8 protocol;seqlock_t ha_lock; // 顺序锁,保护 MAC 地址unsignedchar ha[ALIGN(MAX_ADDR_LEN, sizeof(unsignedlong))] __aligned(8); // MAC 地址structhh_cachehh;// 硬件头缓存int (*output)(struct neighbour *, struct sk_buff *);conststructneigh_ops *ops;structlist_headgc_list;// 挂在 tbl->gc_list 上structrcu_headrcu;structnet_device *dev; u8 primary_key[0]; // 变长数组:IP 地址} __randomize_layout;
内存布局:
分配大小 = tbl->entry_size + dev->neigh_priv_len
┌──────────────────────────────────────────────┐
│ struct neighbour (固定部分)                  │
├──────────────────────────────────────────────┤
│ primary_key[0] → key_len 字节的 IP           │
├──────────────────────────────────────────────┤
│ 协议私有数据 (entry_size - sizeof - key_len) │
├──────────────────────────────────────────────┤
│ 设备私有数据 (dev->neigh_priv_len)           │
└──────────────────────────────────────────────┘
关键字段说明:
| |
|---|
next | RCU 单链表,用 rcu_assign_pointer / rcu_dereference_bh 访问 |
ha_lock | seqlock:读 MAC 时无需写锁,只检查序列号是否变化 |
hh | 硬件头缓存:REACHABLE 状态下缓存以太网头,避免每次调用 hard_header |
output | 根据状态动态切换:REACHABLE→neigh_connected_output,INCOMPLETE→neigh_resolve_output,dead→neigh_blackhole |
dead | 创建时 = 1,插入哈希表后置 0;删除时置 1,防止 GC 重复清理 |
__randomize_layout | |
2.5 struct neigh_ops
文件:include/net/neighbour.h,约第 95 行
/*
 * Per-protocol operations attached to a neighbour entry. All callbacks
 * take the neighbour and an skb.
 */
struct neigh_ops {
	int	family;
	void	(*solicit)(struct neighbour *, struct sk_buff *);
	void	(*error_report)(struct neighbour *, struct sk_buff *);
	int	(*output)(struct neighbour *, struct sk_buff *);
	int	(*connected_output)(struct neighbour *, struct sk_buff *);
};
IPv4 ARP 注册的三个实例(net/ipv4/arp.c):
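| 实例 | output | connected_output |
|---|---|---|
| arp_generic_ops | neigh_resolve_output | neigh_connected_output |
| arp_hh_ops | neigh_resolve_output | neigh_resolve_output(hh 快速路径由 neigh_output() 直接调用 neigh_hh_output(),它接收 hh_cache 而非 neighbour,签名与 connected_output 不同) |
| arp_direct_ops | neigh_direct_output(内部调用 dev_queue_xmit) | neigh_direct_output(内部调用 dev_queue_xmit) |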
三、邻居状态机
3.1 NUD 状态定义
文件:include/uapi/linux/neighbour.h
/* Neighbour (NUD) states: one bit each, so they can be OR-ed into masks. */
#define NUD_INCOMPLETE	0x01
#define NUD_REACHABLE	0x02
#define NUD_STALE	0x04
#define NUD_DELAY	0x08
#define NUD_PROBE	0x10
#define NUD_FAILED	0x20
#define NUD_NOARP	0x40
#define NUD_PERMANENT	0x80
辅助宏(include/net/neighbour.h 第 23 行):
/* Composite masks built from the individual NUD_* state bits. */
#define NUD_IN_TIMER	(NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE)
#define NUD_VALID	(NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY)
#define NUD_CONNECTED	(NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE)
3.2 状态转换图
设计精髓(源码注释原文):
"It corresponds to default IPv6 settings and is not overridable, because it is really reasonable choice." — neigh_rand_reach_time(),随机化 reachable_time,防止同步
四、核心函数详解
4.1 neigh_event_send() — 内联入口
文件:include/net/neighbour.h,约第 297 行
/*
 * Inline fast-path entry point: refresh neigh->used, and only fall through
 * to __neigh_event_send() when the entry is not in a CONNECTED, DELAY or
 * PROBE state. Returns 0 on the fast path, otherwise whatever
 * __neigh_event_send() returns.
 */
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
	unsigned long now = jiffies;

	/* write only when the value actually changes */
	if (READ_ONCE(neigh->used) != now)
		WRITE_ONCE(neigh->used, now);
	if (!(neigh->nud_state & (NUD_CONNECTED|NUD_DELAY|NUD_PROBE)))
		return __neigh_event_send(neigh, skb);
	return 0;
}
逻辑: 快速路径检查。如果已经处于 CONNECTED / DELAY / PROBE 状态,直接返回 0(不需要做任何事)。否则调用完整的 __neigh_event_send()。
4.2 __neigh_event_send() — 触发解析
文件:net/core/neighbour.c,第 1105 行
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb){int rc;bool immediate_probe = false; write_lock_bh(&neigh->lock); rc = 0;if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))goto out_unlock_bh;if (neigh->dead)goto out_dead;if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {/* 从 NUD_NONE 进入,需要启动解析 */if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + NEIGH_VAR(neigh->parms, APP_PROBES)) {unsignedlong next, now = jiffies; atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); neigh_add_timer(neigh, next); immediate_probe = true; } else { neigh->nud_state = NUD_FAILED; neigh->updated = jiffies; write_unlock_bh(&neigh->lock); kfree_skb(skb);return1; } } elseif (neigh->nud_state & NUD_STALE) {/* STALE → DELAY:先等上层确认,不立即发 ARP */ neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh_del_timer(neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); }/* NUD_INCOMPLETE 状态下,报文入队 */if (neigh->nud_state == NUD_INCOMPLETE) {if (skb) {while (neigh->arp_queue_len_bytes + skb->truesize > NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {structsk_buff *buff; buff = __skb_dequeue(&neigh->arp_queue);if (!buff)break; neigh->arp_queue_len_bytes -= buff->truesize; kfree_skb(buff); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); neigh->arp_queue_len_bytes += skb->truesize; } rc = 1; }out_unlock_bh:if (immediate_probe) neigh_probe(neigh);else write_unlock(&neigh->lock); local_bh_enable(); trace_neigh_event_send_done(neigh, rc);return rc;out_dead:if (neigh->nud_state & NUD_STALE)goto out_unlock_bh; write_unlock_bh(&neigh->lock); kfree_skb(skb); trace_neigh_event_send_dead(neigh, 1);return1;}EXPORT_SYMBOL(__neigh_event_send);
要点:
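- immediate_probe:当从 NONE 进入 INCOMPLETE 时设置此标志,随后在 out_unlock_bh 处直接调用 neigh_probe() 发送第一个 ARP 探测报文。注意调用 neigh_probe() 时仍持有 neigh->lock(代码中只有 else 分支才执行 write_unlock()),锁由 neigh_probe() 内部在真正发送之前释放,从而避免持锁发送。
- write_lock_bh(&neigh->lock) 然后 write_unlock(&neigh->lock) + local_bh_enable() — 注意这里手动管理 BH 开关,是内核内部模式。
- QUEUE_LEN_BYTES 控制排队总字节数(而非报文个数),防止内存耗尽。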
4.3 neigh_timer_handler() — 定时器回调
文件:net/core/neighbour.c,第 1016 行
staticvoidneigh_timer_handler(struct timer_list *t){unsignedlong now, next;structneighbour *neigh = from_timer(neigh, t, timer);unsignedint state;int notify = 0; write_lock(&neigh->lock); state = neigh->nud_state; now = jiffies; next = now + HZ;if (!(state & NUD_IN_TIMER))goto out;if (state & NUD_REACHABLE) {if (time_before_eq(now, neigh->confirmed + neigh->parms->reachable_time)) {/* 还没超时,重新设置定时器 */ neigh_dbg(2, "neigh %p is still alive\n", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } elseif (time_before_eq(now, neigh->used + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {/* 仍在使用期内,进入 DELAY 等上层确认 */ neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_suspect(neigh); next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); } else {/* 已不使用,进入 STALE */ neigh_dbg(2, "neigh %p is suspected\n", neigh); neigh->nud_state = NUD_STALE; neigh->updated = jiffies; neigh_suspect(neigh); notify = 1; } } elseif (state & NUD_DELAY) {if (time_before_eq(now, neigh->confirmed + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {/* 上层已确认,回到 REACHABLE */ neigh_dbg(2, "neigh %p is now reachable\n", neigh); neigh->nud_state = NUD_REACHABLE; neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else {/* 超时未确认,进入 PROBE */ neigh_dbg(2, "neigh %p is probed\n", neigh); neigh->nud_state = NUD_PROBE; neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } } else {/* NUD_PROBE | NUD_INCOMPLETE */ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); }/* 探测次数用尽 → FAILED */if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { neigh->nud_state = NUD_FAILED; notify = 1; neigh_invalidate(neigh);goto out; }if (neigh->nud_state & NUD_IN_TIMER) {if (time_before(next, jiffies + HZ/100)) next = jiffies + HZ/100;if 
(!mod_timer(&neigh->timer, next)) neigh_hold(neigh); }if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { neigh_probe(neigh); } else {out: write_unlock(&neigh->lock); }if (notify) neigh_update_notify(neigh, 0); trace_neigh_timer_handler(neigh, 0); neigh_release(neigh);}
要点:
- from_timer(neigh, t, timer) — 从 timer_list 指针反查 neighbour 结构体地址(container_of 宏)。
- REACHABLE 超时后不是直接进 PROBE,而是先判断是否在 used + delay_probe_time 内:如果是,进 DELAY(等上层确认);否则直接进 STALE。
- neigh_max_probes(neigh) 返回 ucast_probes + app_probes + mcast_probes 之和(处于 NUD_PROBE 状态时以 mcast_reprobes 取代 mcast_probes)。
- 定时器在 neigh_add_timer() 时增加引用计数,neigh_timer_handler 结束时 neigh_release(),防止定时器运行期间邻居项被释放。
4.4 __neigh_update() — 更新邻居状态
文件:net/core/neighbour.c,第 1227 行
neigh_update() 是它的简化包装(第 1411 行),只多传一个 NULL 作为 extack 参数。
/*
 * Update the state and/or link-layer address of a neighbour entry.
 *
 * @neigh:     entry to update
 * @lladdr:    new link-layer address, or NULL
 * @new:       new NUD state
 * @flags:     NEIGH_UPDATE_F_* flags
 * @nlmsg_pid: passed through to the tracepoint
 * @extack:    netlink extended ack used for error messages
 *
 * The tail of the function is elided in this excerpt (marked "...").
 */
static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
			  u8 new, u32 flags, u32 nlmsg_pid,
			  struct netlink_ext_ack *extack)
{
	u8 old;
	int err;
	int notify = 0;
	struct net_device *dev;
	int update_isrouter = 0;

	trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid);

	write_lock_bh(&neigh->lock);

	dev = neigh->dev;
	old = neigh->nud_state;
	err = -EPERM;

	/* 1. Permission check: NOARP/PERMANENT entries need the ADMIN flag */
	if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
	    (old & (NUD_NOARP | NUD_PERMANENT)))
		goto out;
	if (neigh->dead) {
		NL_SET_ERR_MSG(extack, "Neighbor entry is now dead");
		goto out;
	}

	/* 2. New state is not VALID: drop the timer and the connected state */
	if (!(new & NUD_VALID)) {
		neigh_del_timer(neigh);
		if (old & NUD_CONNECTED)
			neigh_suspect(neigh);
		neigh->nud_state = new;
		err = 0;
		notify = old & NUD_VALID;
		if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
		    (new & NUD_FAILED)) {
			neigh_invalidate(neigh);
			notify = 1;
		}
		goto out;
	}

	/* 3. Work out which lladdr (MAC address) to use */
	if (!dev->addr_len) {
		lladdr = neigh->ha;
	} else if (lladdr) {
		/*
		 * Same content as the stored address: alias lladdr to
		 * neigh->ha so the pointer test in step 6 skips the copy.
		 */
		if ((old & NUD_VALID) &&
		    !memcmp(lladdr, neigh->ha, dev->addr_len))
			lladdr = neigh->ha;
	} else {
		err = -EINVAL;
		if (!(old & NUD_VALID)) {
			NL_SET_ERR_MSG(extack, "No link layer address given");
			goto out;
		}
		lladdr = neigh->ha;
	}

	/* 4. A CONNECTED new state refreshes the confirmed timestamp */
	if (new & NUD_CONNECTED)
		neigh->confirmed = jiffies;

	/* 5. State transition */
	if (new != old) {
		neigh_del_timer(neigh);
		if (new & NUD_PROBE)
			atomic_set(&neigh->probes, 0);
		if (new & NUD_IN_TIMER)
			neigh_add_timer(neigh, jiffies +
					((new & NUD_REACHABLE) ?
					 neigh->parms->reachable_time : 0));
		neigh->nud_state = new;
		notify = 1;
	}

	/* 6. Store the MAC address (under the seqlock) */
	if (lladdr != neigh->ha) {
		write_seqlock(&neigh->ha_lock);
		memcpy(&neigh->ha, lladdr, dev->addr_len);
		write_sequnlock(&neigh->ha_lock);
		neigh_update_hhs(neigh);
		if (!(new & NUD_CONNECTED))
			neigh->confirmed = jiffies -
				(NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
		notify = 1;
	}

	/* 7. Flush the packets queued while the entry was not VALID */
	if (!(old & NUD_VALID)) {
		struct sk_buff *skb;

		while (neigh->nud_state & NUD_VALID &&
		       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
			struct dst_entry *dst = skb_dst(skb);
			struct neighbour *n2, *n1 = neigh;

			/* output() runs without neigh->lock held */
			write_unlock_bh(&neigh->lock);

			rcu_read_lock();

			n2 = NULL;
			if (dst) {
				n2 = dst_neigh_lookup_skb(dst, skb);
				if (n2)
					n1 = n2;
			}
			n1->output(n1, skb);
			if (n2)
				neigh_release(n2);
			rcu_read_unlock();

			write_lock_bh(&neigh->lock);
		}
		__skb_queue_purge(&neigh->arp_queue);
		neigh->arp_queue_len_bytes = 0;
	}
	...
}
要点:
- neigh_update() vs __neigh_update():neigh_update() 是公开 API(供 neigh_event_ns() 等调用),__neigh_update() 多一个 extack 参数用于 netlink 错误报告。
- lladdr != neigh->ha 用指针比较(是否同一地址),不是内容比较。内容比较(memcmp)已经在前面第 3 步完成:若新旧 MAC 内容相同,lladdr 会被改为指向 neigh->ha,因此第 6 步只需比较指针,指针不同时才用 memcpy 写入。
- 排队报文发送时必须先解锁再调用 output(),因为 output() 可能会重新进入邻居子系统(避免死锁)。
五、哈希表实现
5.1 查找:___neigh_lookup_noref()
文件:include/net/neighbour.h,约第 175 行
/*
 * Look up a neighbour entry by key and device without taking a reference.
 * The table pointer, the bucket head and every chain link are read with
 * rcu_dereference_bh().
 *
 * Returns the matching entry, or NULL when none is found.
 */
static inline struct neighbour *___neigh_lookup_noref(
	struct neigh_table *tbl,
	bool (*key_eq)(const struct neighbour *n, const void *pkey),
	__u32 (*hash)(const void *pkey,
		      const struct net_device *dev,
		      __u32 *hash_rnd),
	const void *pkey,
	struct net_device *dev)
{
	struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht);
	struct neighbour *n;
	u32 hash_val;

	/* the top hash_shift bits of the 32-bit hash select the bucket */
	hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
	     n != NULL;
	     n = rcu_dereference_bh(n->next)) {
		if (n->dev == dev && key_eq(n, pkey))
			return n;
	}

	return NULL;
}
要点: 哈希函数 hash 是函数指针,IPv4 注册 arp_hashfn(jhash_2words),IPv6 注册 ndisc_hashfn。
5.2 创建:neigh_alloc() + ___neigh_create()
neigh_alloc()
文件:net/core/neighbour.c,约第 397 行
/*
 * Allocate and initialise a neighbour entry for @tbl / @dev.
 *
 * Unless @exempt_from_gc is set, the entry counts against tbl->gc_entries
 * and a forced GC may run before the allocation. The new entry starts in
 * NUD_NONE with dead = 1 and a reference count of 1.
 *
 * The error-path tail is elided in this excerpt (marked "...").
 */
static struct neighbour *neigh_alloc(struct neigh_table *tbl,
				     struct net_device *dev,
				     bool exempt_from_gc)
{
	struct neighbour *n = NULL;
	unsigned long now = jiffies;
	int entries;

	if (!exempt_from_gc) {
		entries = atomic_inc_return(&tbl->gc_entries) - 1;
		/*
		 * Force a GC at gc_thresh3, or at gc_thresh2 once more than
		 * 5 seconds have passed since the last flush.
		 */
		if (entries >= tbl->gc_thresh3 ||
		    (entries >= tbl->gc_thresh2 &&
		     time_after(now, tbl->last_flush + 5 * HZ))) {
			if (!neigh_forced_gc(tbl) &&
			    entries >= tbl->gc_thresh3) {
				net_info_ratelimited("%s: neighbor table overflow!\n",
						     tbl->id);
				NEIGH_CACHE_STAT_INC(tbl, table_fulls);
				goto out_entries;
			}
		}
	}

	n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
	if (!n)
		goto out_entries;

	__skb_queue_head_init(&n->arp_queue);
	rwlock_init(&n->lock);
	seqlock_init(&n->ha_lock);
	n->updated = n->used = now;
	n->nud_state = NUD_NONE;
	n->output = neigh_blackhole;
	seqlock_init(&n->hh.hh_lock);
	n->parms = neigh_parms_clone(&tbl->parms);
	timer_setup(&n->timer, neigh_timer_handler, 0);

	NEIGH_CACHE_STAT_INC(tbl, allocs);
	n->tbl = tbl;
	refcount_set(&n->refcnt, 1);
	n->dead = 1;	/* cleared by ___neigh_create() on insertion */
	INIT_LIST_HEAD(&n->gc_list);

	atomic_inc(&tbl->entries);
	return n;
	...
}
___neigh_create()
文件:net/core/neighbour.c,约第 577 行
staticstructneighbour *___neigh_create(structneigh_table *tbl,constvoid *pkey,structnet_device *dev,boolexempt_from_gc, boolwant_ref){structneighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc); ...memcpy(n->primary_key, pkey, key_len); n->dev = dev; dev_hold(dev);if (tbl->constructor && (error = tbl->constructor(n)) < 0)goto out_neigh_release;if (dev->netdev_ops->ndo_neigh_construct) { error = dev->netdev_ops->ndo_neigh_construct(dev, n);if (error < 0)goto out_neigh_release; } n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, ...);if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1);/* 检查是否已存在(并发创建)*/ hash_val = ...;for (n1 = ...; n1 != NULL; n1 = ...) {if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { rc = n1;goto out_tbl_unlock; } } n->dead = 0;if (!exempt_from_gc) list_add_tail(&n->gc_list, &n->tbl->gc_list); rcu_assign_pointer(n->next, ...); rcu_assign_pointer(nht->hash_buckets[hash_val], n); write_unlock_bh(&tbl->lock);return n; ...}
要点:confirmed 初始化为一个"过去"的时间,使得初次 REACHABLE 后定时器到期时能正确进入 STALE(而不是因为 confirmed 太新而误认为仍然存活)。
5.3 扩容:neigh_hash_grow()
文件:net/core/neighbour.c,约第 460 行
static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,unsignedlong new_shift){unsignedint i, hash;structneigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); old_nht = rcu_dereference_protected(tbl->nht, ...); new_nht = neigh_hash_alloc(new_shift);if (!new_nht)return old_nht;for (i = 0; i < (1 << old_nht->hash_shift); i++) {structneighbour *n, *next;for (n = ...; n != NULL; n = next) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); next = rcu_dereference_protected(n->next, ...); rcu_assign_pointer(n->next, rcu_dereference_protected( new_nht->hash_buckets[hash], ...)); rcu_assign_pointer(new_nht->hash_buckets[hash], n); } } rcu_assign_pointer(tbl->nht, new_nht); call_rcu(&old_nht->rcu, neigh_hash_free_rcu);return new_nht;}
要点: 扩容使用新的hash_rnd[],所有表项重新哈希,缓解哈希碰撞攻击。但扩容期间持有 tbl->lock 写锁,会阻塞所有写操作。
六、垃圾回收(GC)机制
6.1 neigh_update_gc_list() — 维护 GC 链表
文件:net/core/neighbour.c,约第 108 行
staticvoidneigh_update_gc_list(struct neighbour *n){bool on_gc_list, exempt_from_gc; write_lock_bh(&n->tbl->lock); write_lock(&n->lock); exempt_from_gc = n->nud_state & NUD_PERMANENT || n->flags & NTF_EXT_LEARNED; on_gc_list = !list_empty(&n->gc_list);if (exempt_from_gc && on_gc_list) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } elseif (!exempt_from_gc && !on_gc_list) { list_add_tail(&n->gc_list, &n->tbl->gc_list); atomic_inc(&n->tbl->gc_entries); } write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock);}
6.2 neigh_forced_gc() — 强制 GC
文件:net/core/neighbour.c,约第 219 行
staticintneigh_forced_gc(struct neigh_table *tbl){int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2;unsignedlong tref = jiffies - 5 * HZ;structneighbour *n, *tmp;int shrunk = 0; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) {if (refcount_read(&n->refcnt) == 1) {bool remove = false; write_lock(&n->lock);if ((n->nud_state == NUD_FAILED) || time_after(tref, n->updated)) remove = true; write_unlock(&n->lock);if (remove && neigh_remove_one(n, tbl)) shrunk++;if (shrunk >= max_clean)break; } } tbl->last_flush = jiffies; write_unlock_bh(&tbl->lock);return shrunk;}
要点:tref = jiffies - 5*HZ:5 秒内更新过的表项不被强制清理(给一个"宽限期")。
6.3 三级 GC 策略
| | |
|---|
gc_entries < gc_thresh1 | | |
gc_entries >= gc_thresh2 | delayed_work gc_work / neigh_alloc() | gc_work 异步扫描 gc_list,清理过期 STALE 项;此外,若距 last_flush 已超过 5 秒,neigh_alloc() 也会调用 neigh_forced_gc() 同步清理 |
gc_entries >= gc_thresh3 | neigh_alloc() | 调用 neigh_forced_gc(),同步清理 |
七、硬件头缓存(hh_cache)
7.1 neigh_output() — 发送路径选择
文件:include/net/neighbour.h,约第 327 行
/*
 * Choose the transmit path for @skb: use the cached hardware header when
 * the neighbour is CONNECTED, a header is cached and @skip_cache is false;
 * otherwise call the neighbour's output hook.
 */
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
			       bool skip_cache)
{
	const struct hh_cache *hh = &n->hh;

	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache)
		return neigh_hh_output(hh, skb);	/* fast path */
	else
		return n->output(n, skb);		/* slow path */
}
7.2 neigh_hh_output() — 快速路径
文件:include/net/neighbour.h,约第 355 行
用 seqlock 读 hh_data,直接拷贝到 skb 头部,然后 dev_queue_xmit()。
八、总结
8.1 架构亮点
- 协议无关抽象:neigh_table 的 hash/key_eq/constructor 函数指针让 IPv4 和 IPv6 共用同一框架
- 独立 GC 链表:gc_list 与哈希表分离,GC 复杂度 O(gc_entries) 而非 O(entries)
- RCU 无锁读:哈希查找全程 rcu_read_lock_bh(),无写锁竞争
- 延迟确认:STALE → DELAY → PROBE 三阶段设计,优先等上层确认
- hh_cache 快速路径:CONNECTED 状态下直接拷贝缓存头,避免 hard_header 开销
- seqlock 保护 MAC:读 MAC 无需写锁
8.2 关键锁层次
| | |
|---|
tbl->lock | | |
neigh->lock | ha、nud_state、timer、arp_queue | |
neigh->ha_lock | | |
rcu_read_lock_bh() | | |
参考资料