IP 层收发报文简要剖析3--ip输入报文分片重组
阅读原文时间:2023年07月12日阅读:2

在ip_local_deliver中,如果检测到是分片包,则需要将报文进行重组。其所有的分片被重新组合后才能提交到上层协议,每一个正在被重组的数据报文用一个ipq结构实例来表示。

struct ipq {
struct inet_frag_queue q;

u32        user;//分片来源  
\_\_be32        saddr;//原地址  
\_\_be32        daddr;//目的地址  
\_\_be16        id;//ip报文序列号  
u8        protocol;//上层协议号  

//这四个字段来自ip首部是为了确定来自哪个ip数据报文
u8 ecn; /* RFC3168 support */
u16 max_df_size; /* largest frag with DF set seen */
int iif;
int vif; /* L3 master device index */
unsigned int rid;//已收到的分片计数器
struct inet_peer *peer;//记录发送方信息
//通过rid peer 可以防止Dos攻击
};

网络空间分段管理结构

struct inet_frags {
struct inet_frag_bucket hash[INETFRAGS_HASHSZ];//哈希队列

struct work\_struct    frags\_work;//工作队列  
unsigned int next\_bucket;  
unsigned long last\_rebuild\_jiffies;  
bool rebuild;

/\* The first call to hashfn is responsible to initialize  
 \* rnd. This is best done with net\_get\_random\_once.  
 \*  
 \* rnd\_seqlock is used to let hash insertion detect  
 \* when it needs to re-lookup the hash chain to use.  
 \*/  
u32            rnd;//随机数  
seqlock\_t        rnd\_seqlock;//  
int            qsize;//队列长度

unsigned int        (\*hashfn)(const struct inet\_frag\_queue \*);  
bool            (\*match)(const struct inet\_frag\_queue \*q,  
                 const void \*arg);//分段队列匹配函数  
void            (\*constructor)(struct inet\_frag\_queue \*q,  
                       const void \*arg);  
void            (\*destructor)(struct inet\_frag\_queue \*);  
void            (\*frag\_expire)(unsigned long data);//队列过期处理函数  
struct kmem\_cache    \*frags\_cachep;  
const char        \*frags\_cache\_name;  

};

/* Per-network-namespace fragment memory accounting and tunables. */
struct netns_frags {
	/* The percpu_counter "mem" need to be cacheline aligned.
	 * mem.count must not share cacheline with other writers
	 */
	struct percpu_counter	mem ____cacheline_aligned_in_smp;

	/* sysctls */
	int			timeout;	/* reassembly timeout */
	int			high_thresh;	/* upper bound on fragment memory */
	int			low_thresh;	/* lower bound on fragment memory */
	int			max_dist;	/* non-zero makes ip4_frag_init() look up the sender's inet_peer */
};

/**
* struct inet_frag_queue - fragment queue
*
* @lock: spinlock protecting the queue
* @timer: queue expiration timer
* @list: hash bucket list
* @refcnt: reference count of the queue
* @fragments: received fragments head
* @fragments_tail: received fragments tail
* @stamp: timestamp of the last received fragment
* @len: total length of the original datagram
* @meat: length of received fragments so far
* @flags: fragment queue flags
* @max_size: maximum received fragment size
* @net: namespace that this frag belongs to
* @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
*/
struct inet_frag_queue {//inet分段队列头
spinlock_t lock;smp环境下 需要
struct timer_list timer;队列定时器,组装非常耗时,不能无休止的等待分片的到达
struct hlist_node list;哈希节点,链入inet分段管理结构的哈希队列
atomic_t refcnt;计数器
struct sk_buff *fragments;分段数据包队列
struct sk_buff *fragments_tail;
ktime_t stamp;时间戳
int len;数据包结束位置offset+len
int meat;与原数据长度的差距,如果和原数据包长度一样代表接收完成
__u8 flags;
u16 max_size;
struct netns_frags *net;指向网络空寂分段管理结构
struct hlist_node list_evictor;
};

1.1、 IP分组的初始化

/*
 * Boot-time setup of IPv4 fragment handling: register the control
 * interface and the per-namespace operations, fill in the IPv4
 * callbacks of ip4_frags, then initialise the generic fragment layer.
 * Panics if that initialisation fails.
 */
void __init ipfrag_init(void)
{
	struct inet_frags *frags = &ip4_frags;

	ip4_frags_ctl_register();
	register_pernet_subsys(&ip4_frags_ops);

	/* Object size and cache name used when the queue cache is created. */
	frags->qsize = sizeof(struct ipq);
	frags->frags_cache_name = ip_frag_cache_name;

	/* IPv4-specific callbacks invoked by the generic fragment code. */
	frags->hashfn = ip4_hashfn;
	frags->match = ip4_frag_match;
	frags->constructor = ip4_frag_init;
	frags->destructor = ip4_frag_free;
	frags->frag_expire = ip_expire;

	if (inet_frags_init(frags) != 0)
		panic("IP: failed to allocate ip4_frags cache\n");
}

int inet_frags_init(struct inet_frags *f)
{
int i;
//初始化工作队列
INIT_WORK(&f->frags_work, inet_frag_worker);

for (i = 0; i < INETFRAGS\_HASHSZ; i++) {  
    struct inet\_frag\_bucket \*hb = &f->hash\[i\];//初始化hash 队列头

    spin\_lock\_init(&hb->chain\_lock);  
    INIT\_HLIST\_HEAD(&hb->chain);  
}

seqlock\_init(&f->rnd\_seqlock);  
f->last\_rebuild\_jiffies = 0;  
f->frags\_cachep = kmem\_cache\_create(f->frags\_cache\_name, f->qsize, 0, 0,  
                    NULL);  
if (!f->frags\_cachep)  
    return -ENOMEM;

return 0;  

}
EXPORT_SYMBOL(inet_frags_init);

int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
*/
struct net *net = dev_net(skb->dev);

/\* 分片重组 \*/  
if (ip\_is\_fragment(ip\_hdr(skb))) {  
    if (ip\_defrag(net, skb, IP\_DEFRAG\_LOCAL\_DELIVER))  
        return 0;  
}

/\* 经过LOCAL\_IN钩子点 \*/  
return NF\_HOOK(NFPROTO\_IPV4, NF\_INET\_LOCAL\_IN,  
           net, NULL, skb, skb->dev, NULL,  
           ip\_local\_deliver\_finish);  

}

1.2、 ip分片报文重组的处理

/*
 * Process an incoming IP datagram fragment.
 *
 * Finds or creates the reassembly queue for the fragment and queues it
 * while holding the queue lock. Returns the result of ip_frag_queue(),
 * or -ENOMEM (after freeing the skb) when no queue could be obtained.
 */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
	int vif = l3mdev_master_ifindex_rcu(dev);
	struct ipq *qp;

	/* Count the reassembly request. */
	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
	skb_orphan(skb);

	/* Lookup (or create) queue header */
	qp = ip_find(net, ip_hdr(skb), user, vif);
	if (qp) {	/* a queue exists or was created */
		int ret;

		spin_lock(&qp->q.lock);

		/* Queue the fragment into the datagram being rebuilt. */
		ret = ip_frag_queue(qp, skb);

		spin_unlock(&qp->q.lock);
		ipq_put(qp);
		return ret;
	}

	/* No queue could be found or created: count the failure and drop. */
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
	kfree_skb(skb);
	return -ENOMEM;
}
EXPORT_SYMBOL(ip_defrag);

1.2.2 ip_find 根据ip首部以及user标志 在ipq散列表中查找对应的ipq。

/*
 * Locate the reassembly queue that belongs to this IP datagram in the
 * "incomplete datagrams" table, creating a new one when none matches.
 * Returns NULL when the lookup yields NULL or an error pointer.
 *
 * The "user" argument is one of:
 *
 * enum ip_defrag_users {
 *	IP_DEFRAG_LOCAL_DELIVER,
 *	IP_DEFRAG_CALL_RA_CHAIN,
 *	IP_DEFRAG_CONNTRACK_IN,
 *	__IP_DEFRAG_CONNTRACK_IN_END = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX,
 *	IP_DEFRAG_CONNTRACK_OUT,
 *	__IP_DEFRAG_CONNTRACK_OUT_END = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX,
 *	IP_DEFRAG_CONNTRACK_BRIDGE_IN,
 *	__IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
 *	IP_DEFRAG_VS_IN,
 *	IP_DEFRAG_VS_OUT,
 *	IP_DEFRAG_VS_FWD,
 *	IP_DEFRAG_AF_PACKET,
 *	IP_DEFRAG_MACVLAN,
 * };
 */
static struct ipq *ip_find(struct net *net, struct iphdr *iph,
			   u32 user, int vif)
{
	/* Hash over the fields that identify a datagram: id, addresses, protocol. */
	unsigned int hash = ipqhashfn(iph->id, iph->saddr, iph->daddr,
				      iph->protocol);
	struct ip4_create_arg key;
	struct inet_frag_queue *fq;

	/* Lookup key, also handed to the constructor if a queue is created. */
	key.vif = vif;
	key.user = user;
	key.iph = iph;

	fq = inet_frag_find(&net->ipv4.frags, &ip4_frags, &key, hash);
	if (!IS_ERR_OR_NULL(fq))
		return container_of(fq, struct ipq, q);

	inet_frag_maybe_warn_overflow(fq, pr_fmt());
	return NULL;
}

struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key,
unsigned int hash)
{
struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
int depth = 0;
/* 分片内存已经超过了低限 */
if (frag_mem_limit(nf) > nf->low_thresh)
/* 进行节点回收 */
inet_frag_schedule_worker(f); //工作队列回调函数为inet_frag_worker

hash &= (INETFRAGS\_HASHSZ - 1);  
hb = &f->hash\[hash\]; /\* 找到hash桶 \*/

spin\_lock(&hb->chain\_lock);  
hlist\_for\_each\_entry(q, &hb->chain, list) { /\* 遍历链表 \*/  
    if (q->net == nf && f->match(q, key)) {  
        atomic\_inc(&q->refcnt); /\* 增加引用计数 \*/  
        spin\_unlock(&hb->chain\_lock);  
        return q;  
    }  
    depth++;/\* 记录查找深度 \*/  
}  
spin\_unlock(&hb->chain\_lock);  

/* 未找到 */
/* 桶节点的链表深度不超过限定 */
if (depth <= INETFRAGS_MAXDEPTH)
return inet_frag_create(nf, f, key);/* 创建节点返回 */

if (inet\_frag\_may\_rebuild(f)) {  
    /\* 如果已经超过了重建间隔时间,则重建 \*/  
    if (!f->rebuild)  
        f->rebuild = true;  
    inet\_frag\_schedule\_worker(f);  
}

return ERR\_PTR(-ENOBUFS);  

}
EXPORT_SYMBOL(inet_frag_find);

如果查找不到,则会创建一个ipq,并将其插入对应的哈希链表中。

/*
 * Allocate a new fragment queue and insert it into the hash table.
 * Returns NULL if allocation fails, otherwise whatever
 * inet_frag_intern() returns.
 */
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
						struct inet_frags *f,
						void *arg)
{
	struct inet_frag_queue *q;

	q = inet_frag_alloc(nf, f, arg);	/* allocate the queue header */
	if (!q)
		return NULL;

	return inet_frag_intern(nf, q, f, arg);
}
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
struct inet_frags *f,
void *arg)
{
struct inet_frag_queue *q;

if (frag\_mem\_limit(nf) > nf->high\_thresh) {//内存超过警戒线 回收内存  
    inet\_frag\_schedule\_worker(f);  
    return NULL;  
}

q = kmem\_cache\_zalloc(f->frags\_cachep, GFP\_ATOMIC);  
if (!q)  
    return NULL;

q->net = nf;//记录下网络空间的分段管理结构指针  
f->constructor(q, arg);//之前初始化时,构造函数来初始化-ip4\_frag\_init  
add\_frag\_mem\_limit(nf, f->qsize);//sum 网络空间的分段内存

setup\_timer(&q->timer, f->frag\_expire, (unsigned long)q);//定时器initand run  
spin\_lock\_init(&q->lock);  
atomic\_set(&q->refcnt, 1);

return q;  

}
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);//获取分段队列指针
struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
frags);

struct net \*net = container\_of(ipv4, struct net, ipv4);

const struct ip4\_create\_arg \*arg = a;//ipv4的分段信息指针

qp->protocol = arg->iph->protocol;//IP层头部协议  
qp->id = arg->iph->id;//ip层id  
qp->ecn = ip4\_frag\_ecn(arg->iph->tos);  
qp->saddr = arg->iph->saddr;  
qp->daddr = arg->iph->daddr;  
qp->vif = arg->vif;  
qp->user = arg->user;  
//记录对方信息  
qp->peer = q->net->max\_dist ?  
    inet\_getpeer\_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :  
    NULL;  

}

static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
struct inet_frag_queue *qp_in,
struct inet_frags *f,
void *arg)
{
struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
struct inet_frag_queue *qp;

#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
* such entry could have been created on other cpu before
* we acquired hash bucket lock.
*/
hlist_for_each_entry(qp, &hb->chain, list) {
if (qp->net == nf && f->match(qp, arg)) {
atomic_inc(&qp->refcnt);
spin_unlock(&hb->chain_lock);
qp_in->flags |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in, f);
return qp;
}
}
#endif
qp = qp_in;
if (!mod_timer(&qp->timer, jiffies + nf->timeout))
atomic_inc(&qp->refcnt);

atomic\_inc(&qp->refcnt);//链入inet分段管理结构的hash队列  
hlist\_add\_head(&qp->list, &hb->chain);

spin\_unlock(&hb->chain\_lock);

return qp;  

}

1.2.3 将分片数据包加入重组队列

/* Add new segment to existing queue. */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct sk_buff *prev, *next;
struct net_device *dev;
unsigned int fragsize;
int flags, offset;
int ihl, end;
int err = -ENOENT;
u8 ecn;

if (qp->q.flags & INET\_FRAG\_COMPLETE) //分段队列接收完成 则释放此分片返回  
    goto err;  

/*数据包没有分段标志or 分段队列间隔过大
//重现调整分段队列是否出错
如果不是本地生成的分片,则调用ip_frag_too_far 检测
是否存在 dos攻击,存在攻击则调用邋ip_frag_reinit释放
所用分片
*/
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
unlikely(ip_frag_too_far(qp)) &&
unlikely(err = ip_frag_reinit(qp))) {
ipq_kill(qp);//将ipq从散列表中移除停止定时器 计数器减一
// 调用ipq_unlink 设置ipq为complete状态,只有complete状态才能释放
goto err;
}

ecn = ip4\_frag\_ecn(ip\_hdr(skb)->tos);  
offset = ntohs(ip\_hdr(skb)->frag\_off);  
flags = offset & ~IP\_OFFSET;  
offset &= IP\_OFFSET;  
offset <<= 3;        /\* offset is in 8-byte chunks \*/  
ihl = ip\_hdrlen(skb);  

/* 获取ip首部中的数据标志位 片的偏移 首部长度 */
/* Determine the position of this fragment. */
end = offset + skb->len - skb_network_offset(skb) - ihl;
err = -EINVAL;
/**/
/* Is this the final fragment?
如果是最后一个片则先对分片进行检测
*/
if ((flags & IP_MF) == 0) {
/* If we already have some bits beyond end
* or have different end, the segment is corrupted.
结束位置小于前一个位置,ipq已经有
last_in 标志且分片末尾不等于原始数据长度
*/
if (end < qp->q.len ||
((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
goto err;
qp->q.flags |= INET_FRAG_LAST_IN;
qp->q.len = end;
/*通过校验并设置为last_in标志,存储完整的数据长度*/
} else {
if (end&7) {//按8字节对其
end &= ~7;
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
skb->ip_summed = CHECKSUM_NONE;
}
if (end > qp->q.len) {
/* 结束地址大于前一个分段数据地址
Some bits beyond end -> corruption.
如果设置了最后一个分段数据标志
表示最后一个包,则错误*/
if (qp->q.flags & INET_FRAG_LAST_IN)
goto err;
qp->q.len = end;//记录当前分段数据块的结束位置
}
}
if (end == offset)//等于起始位置 即分片区数据长度为0
goto err;

err = -ENOMEM;//去掉ip首部  
if (!pskb\_pull(skb, skb\_network\_offset(skb) + ihl))  
    goto err;  

//skb 数据长度为end-offset ip 有效载荷长度
err = pskb_trim_rcsum(skb, end - offset);
if (err)
goto err;

/\* Find out which fragments are in front and at the back of us  
 \* in the chain of fragments so far.  We must know where to put  
 \* this fragment, right?  
 \*/  
prev = qp->q.fragments\_tail;  
if (!prev || FRAG\_CB(prev)->offset < offset) {  
    next = NULL;  
    goto found;  
}  
prev = NULL;  
for (next = qp->q.fragments; next != NULL; next = next->next) {  
    if (FRAG\_CB(next)->offset >= offset)  
        break;    /\* bingo! \*/  
    prev = next;  
}/\*确定分片在链表中的位置,分片到达的时间顺序不同  
ipq 上的分片按照分片偏移值大小排序  
\*/

found:
/* We found where to put this one. Check for overlap with
* preceding fragment, and, if needed, align things so that
* any overlaps are eliminated.
检验和和上一个分片数据是否有重叠
*/
if (prev) {
int i = (FRAG_CB(prev)->offset + prev->len) - offset;

    if (i > 0) {//有重叠 调用pskb\_pull 消除重叠  
        offset += i;  
        err = -EINVAL;  
        if (end <= offset)  
            goto err;  
        err = -ENOMEM;  
        if (!pskb\_pull(skb, i))  
            goto err;  
        if (skb->ip\_summed != CHECKSUM\_UNNECESSARY)  
            skb->ip\_summed = CHECKSUM\_NONE;  
    }  
}

err = -ENOMEM;  

/*如果和后面一个分片的数据有重叠,
部分重叠还是完全重叠;
重叠部分数据超过下一个分片的数据长度,咋释放
下发一个分片并在检查与后面第二个分片的数据是否
有重叠,如果没有超过下一个则调整下一个分片。
如此反复直到对所有分片都检测完。
调整片的偏移以及分片总长度
*/
while (next && FRAG_CB(next)->offset < end) { int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */

    if (i < next->len) {  
        /\* Eat head of the next overlapped fragment  
         \* and leave the loop. The next ones cannot overlap.  
         \*/  
        if (!pskb\_pull(next, i))  
            goto err;  
        FRAG\_CB(next)->offset += i;  
        qp->q.meat -= i;  
        if (next->ip\_summed != CHECKSUM\_UNNECESSARY)  
            next->ip\_summed = CHECKSUM\_NONE;  
        break;  
    } else {  
        struct sk\_buff \*free\_it = next;

        /\* Old fragment is completely overridden with  
         \* new one drop it.  
         \*/  
        next = next->next;

        if (prev)  
            prev->next = next;  
        else  
            qp->q.fragments = next;

        qp->q.meat -= free\_it->len;  
        sub\_frag\_mem\_limit(qp->q.net, free\_it->truesize);  
        kfree\_skb(free\_it);  
    }  
}

FRAG\_CB(skb)->offset = offset;//当前片的偏移

/\* Insert this fragment in the chain of fragments.  
当前的片插入到ipq队列中相应的位置\*/  
skb->next = next;  
if (!next)  
    qp->q.fragments\_tail = skb;  
if (prev)  
    prev->next = skb;  
else  
    qp->q.fragments = skb;

dev = skb->dev;  
if (dev) {  
    qp->iif = dev->ifindex;  
    skb->dev = NULL;  
}  
qp->q.stamp = skb->tstamp;//更新时间搓  
qp->q.meat += skb->len;//sum ipq已收到分片的总长度  
qp->ecn |= ecn;  
//分片组装模块的所占内存的总长度  
add\_frag\_mem\_limit(qp->q.net, skb->truesize);  
if (offset == 0)//为第一个片 设置标志  
    qp->q.flags |= INET\_FRAG\_FIRST\_IN;

fragsize = skb->len + ihl;

if (fragsize > qp->q.max\_size)  
    qp->q.max\_size = fragsize;

if (ip\_hdr(skb)->frag\_off & htons(IP\_DF) &&  
    fragsize > qp->max\_df\_size)  
    qp->max\_df\_size = fragsize;

if (qp->q.flags == (INET\_FRAG\_FIRST\_IN | INET\_FRAG\_LAST\_IN) &&  
    qp->q.meat == qp->q.len) {//所有报文都到齐则重组  
    unsigned long orefdst = skb->\_skb\_refdst;

    skb->\_skb\_refdst = 0UL;  
    err = ip\_frag\_reasm(qp, prev, dev);  
    skb->\_skb\_refdst = orefdst;  
    return err;  
}

skb\_dst\_drop(skb);  
return -EINPROGRESS;

err:
kfree_skb(skb);
return err;
}

ip_frag_reasm 重组报文;

* Build a new IP datagram from all its fragments. */
/*
*用于组装已到齐的所有分片,当原始
* 数据包的所有分片都已到齐时,会调用此函
* 数组装分片。
*/
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct net_device *dev)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
struct sk_buff *fp, *head = qp->q.fragments;
int len;
int ihlen;
int err;
u8 ecn;
/*
* 要开始组装了,因此调用ipq_kill()将此ipq结点从
* ipq散列表删除,并删除定时器。
*/
ipq_kill(qp);

ecn = ip\_frag\_ecn\_table\[qp->ecn\];  
if (unlikely(ecn == 0xff)) {  
    err = -EINVAL;  
    goto out\_fail;  
}  
/\* Make the one we just received the head. \*/  
if (prev) {  
    head = prev->next;  
    fp = skb\_clone(head, GFP\_ATOMIC);  
    if (!fp)  
        goto out\_nomem;

    fp->next = head->next;  
    if (!fp->next)  
        qp->q.fragments\_tail = fp;  
    prev->next = fp;

    skb\_morph(head, qp->q.fragments);  
    head->next = qp->q.fragments->next;

    consume\_skb(qp->q.fragments);  
    qp->q.fragments = head;  
}

WARN\_ON(!head);  
WARN\_ON(FRAG\_CB(head)->offset != 0);

/\* Allocate a new buffer for the datagram.  
计算原始报文的长度 超过64  KB\*/  
ihlen = ip\_hdrlen(head);  
len = ihlen + qp->q.len;

err = -E2BIG;  
if (len > 65535)  
    goto out\_oversize;

/\* Head of list must not be cloned.  
 \* 在组装分片时,所有的分片都会组装到第一个分片  
 \* 上,因此第一个分片是不能克隆的,如果是克隆的,  
 \* 则需为分片组装重新分配一个SKB。  
  \*/  
if (skb\_unclone(head, GFP\_ATOMIC))  
    goto out\_nomem;

/\* If the first fragment is fragmented itself, we split  
 \* it to two chunks: the first with data and paged part  
 \* and the second, holding only fragments. \*/  
 /\*  
 \* 分片队列的第一个SKB不能既带有数据,又带有分片,即其  
 \* frag\_list上不能有分片skb,如果有则重新分配一个SKB。最终的  
 \* 效果是,head自身不包括数据,其frag\_list上链接着所有分片的  
 \* SKB。这也是SKB的一种表现形式,不一定是一个连续的数据块,  
 \* 但最终会调用skb\_linearize()将这些数据都复制到一个连续的数据  
 \* 块中。  
 \*/  
if (skb\_has\_frag\_list(head)) {  
    struct sk\_buff \*clone;  
    int i, plen = 0;

    clone = alloc\_skb(0, GFP\_ATOMIC);  
    if (!clone)  
        goto out\_nomem;  
    clone->next = head->next;  
    head->next = clone;  
    skb\_shinfo(clone)->frag\_list = skb\_shinfo(head)->frag\_list;  
    skb\_frag\_list\_init(head);  
    for (i = 0; i < skb\_shinfo(head)->nr\_frags; i++)  
        plen += skb\_frag\_size(&skb\_shinfo(head)->frags\[i\]);  
    clone->len = clone->data\_len = head->data\_len - plen;  
    head->data\_len -= clone->len;  
    head->len -= clone->len;  
    clone->csum = 0;  
    clone->ip\_summed = head->ip\_summed;  
    add\_frag\_mem\_limit(qp->q.net, clone->truesize);  
}  

/*
* 把所有分片组装起来即将分片链接到第一个
* SKB的frag_list上,同时还需要遍历所有分片,
* 重新计算IP数据包长度以及校验和等。
*/
skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));

for (fp=head->next; fp; fp = fp->next) {  
    head->data\_len += fp->len;  
    head->len += fp->len;  
    if (head->ip\_summed != fp->ip\_summed)  
        head->ip\_summed = CHECKSUM\_NONE;  
    else if (head->ip\_summed == CHECKSUM\_COMPLETE)  
        head->csum = csum\_add(head->csum, fp->csum);  
    head->truesize += fp->truesize;  
}  
 /\*  
 \* 重置首部长度、片偏移、标志位和总长度。  
 \*/  
sub\_frag\_mem\_limit(qp->q.net, head->truesize);

head->next = NULL;  
head->dev = dev;  
head->tstamp = qp->q.stamp;  
IPCB(head)->frag\_max\_size = max(qp->max\_df\_size, qp->q.max\_size);

iph = ip\_hdr(head);  
iph->tot\_len = htons(len);  
iph->tos |= ecn;

/\* When we set IP\_DF on a refragmented skb we must also force a  
 \* call to ip\_fragment to avoid forwarding a DF-skb of size s while  
 \* original sender only sent fragments of size f (where f < s).  
 \*  
 \* We only set DF/IPSKB\_FRAG\_PMTU if such DF fragment was the largest  
 \* frag seen to avoid sending tiny DF-fragments in case skb was built  
 \* from one very small df-fragment and one large non-df frag.  
 \*/  
if (qp->max\_df\_size == qp->q.max\_size) {  
    IPCB(head)->flags |= IPSKB\_FRAG\_PMTU;  
    iph->frag\_off = htons(IP\_DF);  
} else {  
    iph->frag\_off = 0;  
}

ip\_send\_check(iph);

\_\_IP\_INC\_STATS(net, IPSTATS\_MIB\_REASMOKS);  
 /\*  
 \* 既然各分片都已处理完,释放ipq的分片队列。  
 \*/  
qp->q.fragments = NULL;  
qp->q.fragments\_tail = NULL;  
return 0;

out_nomem:
net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
err = -ENOMEM;
goto out_fail;
out_oversize:
net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
return err;
}

1.4.4 ipq散列表重建

static void inet_frag_secret_rebuild(struct inet_frags *f)
{
int i;

write\_seqlock\_bh(&f->rnd\_seqlock);//顺序锁

if (!inet\_frag\_may\_rebuild(f))  
    goto out;  

/* 获取新的用于计算hash的随机值 */
get_random_bytes(&f->rnd, sizeof(u32));

for (i = 0; i < INETFRAGS\_HASHSZ; i++) {  
    struct inet\_frag\_bucket \*hb;  
    struct inet\_frag\_queue \*q;  
    struct hlist\_node \*n;

    hb = &f->hash\[i\]; /\* 取的桶节点 \*/  
    spin\_lock(&hb->chain\_lock);

    hlist\_for\_each\_entry\_safe(q, n, &hb->chain, list) {  
        unsigned int hval = inet\_frag\_hashfn(f, q);

        if (hval != i) {/\* 节点不属于当前桶 \*/  
            struct inet\_frag\_bucket \*hb\_dest;

            hlist\_del(&q->list); /\* 从当前桶中删除该节点 \*/

            /\* Relink to new hash chain. \*/  
            hb\_dest = &f->hash\[hval\]; /\* 找到目标桶 \*/

            /\* This is the only place where we take  
             \* another chain\_lock while already holding  
             \* one.  As this will not run concurrently,  
             \* we cannot deadlock on hb\_dest lock below, if its  
             \* already locked it will be released soon since  
             \* other caller cannot be waiting for hb lock  
             \* that we've taken above.  
             \*/  
            spin\_lock\_nested(&hb\_dest->chain\_lock,  
                     SINGLE\_DEPTH\_NESTING);/\* 节点加入目标桶的链表中 \*/  
            hlist\_add\_head(&q->list, &hb\_dest->chain);  
            spin\_unlock(&hb\_dest->chain\_lock);  
        }  
    }  
    spin\_unlock(&hb->chain\_lock);  
}  

/* 设置重建标记和重建时间 */
f->rebuild = false;
f->last_rebuild_jiffies = jiffies;
out:
write_sequnlock_bh(&f->rnd_seqlock);
}

1.4.5 超时IP分片的清除

内核会通过定时器清除在规定时间内没有完成重组的ipq及其所有的分片。

/*
* Oops, a fragment queue timed out. Kill it and send an ICMP reply.
*/
static void ip_expire(unsigned long arg)
{
struct ipq *qp;
struct net *net;

qp = container\_of((struct inet\_frag\_queue \*) arg, struct ipq, q);  
net = container\_of(qp->q.net, struct net, ipv4.frags);

spin\_lock(&qp->q.lock);  

//ipq 已经是complete状态不处理 直接释放ipq以及其所有的分片
if (qp->q.flags & INET_FRAG_COMPLETE)
goto out;

ipq\_kill(qp);//将其从散列表移除  
\_\_IP\_INC\_STATS(net, IPSTATS\_MIB\_REASMFAILS);//数据统计

if (!inet\_frag\_evicting(&qp->q)) {//在回收队列中  
    struct sk\_buff \*head = qp->q.fragments;  
    const struct iphdr \*iph;  
    int err;

    \_\_IP\_INC\_STATS(net, IPSTATS\_MIB\_REASMTIMEOUT);

    if (!(qp->q.flags & INET\_FRAG\_FIRST\_IN) || !qp->q.fragments)  
        goto out;

    rcu\_read\_lock();  
    head->dev = dev\_get\_by\_index\_rcu(net, qp->iif);  
    if (!head->dev)  
        goto out\_rcu\_unlock;

    /\* skb has no dst, perform route lookup again \*/  
    iph = ip\_hdr(head);  
    err = ip\_route\_input\_noref(head, iph->daddr, iph->saddr,  
                   iph->tos, head->dev);  
    if (err)  
        goto out\_rcu\_unlock;

    /\* Only an end host needs to send an ICMP  
     \* "Fragment Reassembly Timeout" message, per RFC792.  
     \*/  
    if (frag\_expire\_skip\_icmp(qp->user) &&  
        (skb\_rtable(head)->rt\_type != RTN\_LOCAL))  
        goto out\_rcu\_unlock;

    /\* Send an ICMP "Fragment Reassembly Timeout" message. 发送ICMP 报文\*/  
    icmp\_send(head, ICMP\_TIME\_EXCEEDED, ICMP\_EXC\_FRAGTIME, 0);  

out_rcu_unlock:
rcu_read_unlock();
}
out:
spin_unlock(&qp->q.lock);
ipq_put(qp);
}

1.4.6 节点回收工作队列

为了控制ip重组所占用的内存,设置了两个阈值low_thresh、high_thresh;当前ipq散列表所占用的内存记录在mem变量中。这些变量保存在每个网络命名空间各自的如下结构中(netns_frags):

/* Per-network-namespace fragment memory accounting and tunables. */
struct netns_frags {
	/* The percpu_counter "mem" need to be cacheline aligned.
	 * mem.count must not share cacheline with other writers
	 */
	struct percpu_counter	mem ____cacheline_aligned_in_smp;

	/* sysctls */
	int			timeout;
	int			high_thresh;
	int			low_thresh;
	int			max_dist;
};

当mem大于high_thresh时,不再创建新的分片队列,并需要对散列表进行清理,直到mem值降低到low_thresh;从前面的代码可以看到,mem超过low_thresh时inet_frag_find就会调度回收工作队列。这两个值可以通过proc修改。

/*
 * Evict the queues of one hash bucket that qualify for eviction.
 *
 * Candidates whose timer could be deleted are collected on a private
 * list under the bucket lock. The expiry handler is then run on each of
 * them after the lock is dropped. Returns the number of queues collected.
 */
static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
	struct inet_frag_queue *fq;
	struct hlist_node *n;
	unsigned int evicted = 0;
	HLIST_HEAD(expired);

	spin_lock(&hb->chain_lock);

	/* Walk the bucket's chain. */
	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
		if (!inet_fragq_should_evict(fq))	/* no need to evict this one */
			continue;

		if (!del_timer(&fq->timer))	/* timer could not be deleted: skip */
			continue;

		/* Collect evictable queues on the temporary list. */
		hlist_add_head(&fq->list_evictor, &expired);
		++evicted;
	}

	spin_unlock(&hb->chain_lock);

	/* Run the expiry handler on every collected queue. */
	hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
		f->frag_expire((unsigned long) fq);

	return evicted;
}

static void inet_frag_worker(struct work_struct *work)
{
/* 本次回收的桶节点数 */
unsigned int budget = INETFRAGS_EVICT_BUCKETS;
unsigned int i, evicted = 0;
struct inet_frags *f;

f = container\_of(work, struct inet\_frags, frags\_work);

BUILD\_BUG\_ON(INETFRAGS\_EVICT\_BUCKETS >= INETFRAGS\_HASHSZ);

local\_bh\_disable();  

/* 从上次回收完的下一个节点开始,进行回收 */
for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
evicted += inet_evict_bucket(f, &f->hash[i]);
/* 回收并统计回收数量 */
i = (i + 1) & (INETFRAGS_HASHSZ - 1);
/* 回收节点数超过最大值,停止 */
if (evicted > INETFRAGS_EVICT_MAX)
break;
}

f->next\_bucket = i;  /\* 记录下次需要开始回收的桶节点 \*/

local\_bh\_enable();  

/* 如果需要重建,则重建 */
if (f->rebuild && inet_frag_may_rebuild(f))
inet_frag_secret_rebuild(f);
}

手机扫一扫

移动阅读更方便

阿里云服务器
腾讯云服务器
七牛云服务器

你可能感兴趣的文章