Netfilter 之 iptable_nat
阅读原文时间:2023年07月11日阅读:3
初始化

iptable_nat_table_init函数通过调用ipt_register_table完成NAT表注册和钩子函数注册的功能;该流程与iptable_filter的注册流程一致,此处不再重复分析,详情请参阅iptable_filter的相关分析;

/*
 * Per-netns initialisation of the IPv4 "nat" table: registers the table
 * together with its hook functions through ipt_register_table().
 *
 * Returns 0 if the table already exists for this netns, -ENOMEM if the
 * initial table cannot be allocated, otherwise whatever
 * ipt_register_table() returns.
 */
static int __net_init iptable_nat_table_init(struct net *net)
{
	struct ipt_replace *repl;
	int ret;

	/* The nat table has already been initialised: nothing to do. */
	if (net->ipv4.nat_table)
		return 0;

	/* Allocate the initial table used for the registration below. */
	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
	if (repl == NULL)
		return -ENOMEM;

	/* Register the table and its hook functions. */
	ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
				 nf_nat_ipv4_ops, &net->ipv4.nat_table);
	/* The template is freed regardless of the registration result. */
	kfree(repl);
	return ret;
}

钩子函数分析
钩子函数以及钩子点

nf_nat_ipv4_ops是NAT相关钩子函数的数组,其调用顺序和钩子点见下面注释;其中filter工作在DNAT和SNAT之间;

这几个钩子函数都会调用nf_nat_ipv4_fn来完成NAT转换,本部分最后统一分析该函数;

/* Array of NAT hook functions */
/* Ordering: DNAT -> filter -> SNAT */
/* Packets for the local host: PRE_ROUTING(DNAT) -> LOCAL_IN(SNAT) */
/* Forwarded packets: PRE_ROUTING(DNAT) -> POST_ROUTING(SNAT) */
/* Locally generated packets: LOCAL_OUT(DNAT) -> POST_ROUTING(SNAT) */
static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* PRE_ROUTING: before packet filtering, change destination */
{
.hook = iptable_nat_ipv4_in,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST, /* DNAT */
},
/* POST_ROUTING: after packet filtering, change source */
{
.hook = iptable_nat_ipv4_out,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC, /* SNAT */
},
/* LOCAL_OUT: before packet filtering, change destination */
{
.hook = iptable_nat_ipv4_local_fn,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST, /* DNAT */
},
/* LOCAL_IN: after packet filtering, change source */
{
.hook = iptable_nat_ipv4_fn,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC, /* SNAT */
},
};

iptable_nat_ipv4_in

函数工作在PRE_ROUTING钩子点,进行DNAT转换;

/*
 * PRE_ROUTING hook, DNAT.
 * Thin wrapper: forwards to nf_nat_ipv4_in(), supplying
 * iptable_nat_do_chain as the do_chain callback, and returns its verdict.
 */
static unsigned int iptable_nat_ipv4_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain);
}

nf_nat_ipv4_in函数在进行DNAT转换之前记录了目的地址,在进行转换之后,如果目的地址发生了改变,则需要释放skb中的路由缓存;NAT转换过程调用nf_nat_ipv4_fn完成,步骤见下面的该函数分析;

/* PRE_ROUTING, DNAT */
unsigned int
nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
{
unsigned int ret;
/* 获取目的地址 */
__be32 daddr = ip_hdr(skb)->daddr;

 /\* DNAT转换 \*/  
 ret = nf\_nat\_ipv4\_fn(priv, skb, state, do\_chain);

 /\* 转换之后,目的地址发生变化,释放路由缓存 \*/  
 if (ret != NF\_DROP && ret != NF\_STOLEN &&  
     daddr != ip\_hdr(skb)->daddr)  
     skb\_dst\_drop(skb);

 return ret;  

}

iptable_nat_ipv4_fn

函数工作在LOCAL_IN钩子点,进行SNAT转换;NAT转换过程调用nf_nat_ipv4_fn完成,步骤见下面的该函数分析;

/*
 * LOCAL_IN hook, SNAT.
 * Thin wrapper: forwards to nf_nat_ipv4_fn(), supplying
 * iptable_nat_do_chain as the do_chain callback, and returns its verdict.
 */
static unsigned int iptable_nat_ipv4_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain);
}

iptable_nat_ipv4_local_fn

函数工作在LOCAL_OUT钩子点,进行DNAT转换;

/*
 * LOCAL_OUT hook, DNAT.
 * Thin wrapper: forwards to nf_nat_ipv4_local_fn(), supplying
 * iptable_nat_do_chain as the do_chain callback, and returns its verdict.
 */
static unsigned int iptable_nat_ipv4_local_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain);
}

nf_nat_ipv4_local_fn函数在进行DNAT转换之后,如果地址发生变化,则需要重新进行路由查找;NAT转换过程调用nf_nat_ipv4_fn完成,步骤见下面的该函数分析;

unsigned int
nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
{
const struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
unsigned int ret;
int err;

 /\* root is playing with raw sockets. \*/  
 if (skb->len < sizeof(struct iphdr) ||  
     ip\_hdrlen(skb) < sizeof(struct iphdr))  
     return NF\_ACCEPT;

 /\* DNAT转换 \*/  
 ret = nf\_nat\_ipv4\_fn(priv, skb, state, do\_chain);

 /\* 转换成功 \*/  
 if (ret != NF\_DROP && ret != NF\_STOLEN &&  
     (ct = nf\_ct\_get(skb, &ctinfo)) != NULL) {  
     enum ip\_conntrack\_dir dir = CTINFO2DIR(ctinfo);

     /\* ip地址发生变化 \*/  
     if (ct->tuplehash\[dir\].tuple.dst.u3.ip !=  
         ct->tuplehash\[!dir\].tuple.src.u3.ip) {  
         /\* 重新查路由 \*/  
         err = ip\_route\_me\_harder(state->net, skb, RTN\_UNSPEC);  
         if (err < )  
             ret = NF\_DROP\_ERR(err);  
     }  

#ifdef CONFIG_XFRM
else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
ct->tuplehash[dir].tuple.dst.u.all !=
ct->tuplehash[!dir].tuple.src.u.all) {
err = nf_xfrm_me_harder(state->net, skb, AF_INET);
if (err < )
ret = NF_DROP_ERR(err);
}
#endif
}
return ret;
}

iptable_nat_ipv4_out

函数工作在POST_ROUTING钩子点,进行SNAT转换;

/*
 * POST_ROUTING hook, SNAT.
 * Thin wrapper: forwards to nf_nat_ipv4_out(), supplying
 * iptable_nat_do_chain as the do_chain callback, and returns its verdict.
 */
static unsigned int iptable_nat_ipv4_out(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain);
}

unsigned int
nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
{
#ifdef CONFIG_XFRM
const struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
int err;
#endif
unsigned int ret;

 /\* root is playing with raw sockets. \*/  
 if (skb->len < sizeof(struct iphdr) ||  
     ip\_hdrlen(skb) < sizeof(struct iphdr))  
     return NF\_ACCEPT;

 /\* SNAT转换 \*/  
 ret = nf\_nat\_ipv4\_fn(priv, skb, state, do\_chain);  

#ifdef CONFIG_XFRM
if (ret != NF_DROP && ret != NF_STOLEN &&
!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);

     if ((ct->tuplehash\[dir\].tuple.src.u3.ip !=  
          ct->tuplehash\[!dir\].tuple.dst.u3.ip) ||  
         (ct->tuplehash\[dir\].tuple.dst.protonum != IPPROTO\_ICMP &&  
          ct->tuplehash\[dir\].tuple.src.u.all !=  
          ct->tuplehash\[!dir\].tuple.dst.u.all)) {  
         err = nf\_xfrm\_me\_harder(state->net, skb, AF\_INET);  
         if (err < )  
             ret = NF\_DROP\_ERR(err);  
     }  
 }  

#endif
return ret;
}

公共函数nf_nat_ipv4_fn

nf_nat_ipv4_fn完成具体的SNAT或者DNAT的转换流程,上面的四个钩子函数都会调用该函数;

unsigned int
nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
struct nf_conn_nat *nat;
/* maniptype == SRC for postrouting. */
/* 获取是进行DNAT还是SNAT,其中PRE_ROUTING和LOCAL_OUT进行DNAT,LOCAL_IN和POST_ROUTING进行SNAT */
enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);

 /\* 获取skb关联的连接跟踪sf\_conn \*/  
 ct = nf\_ct\_get(skb, &ctinfo);  
 /\* Can't track?  It's not due to stress, or conntrack would  
  \* have dropped it.  Hence it's the user's responsibilty to  
  \* packet filter it out, or implement conntrack/NAT for that  
  \* protocol. 8) --RR  
  \*/  
 /\* 没有,返回accpet \*/  
 if (!ct)  
     return NF\_ACCEPT;

 /\* 获取NAT扩展 \*/  
 nat = nfct\_nat(ct);

 /\* 判断连接跟踪状态 \*/  
 switch (ctinfo) {  
 /\* 关联连接(或者icmp错误)或者关联连接的应答 \*/  
 case IP\_CT\_RELATED:  
 case IP\_CT\_RELATED\_REPLY:  
     /\* icmp协议的NAT操作 \*/  
     if (ip\_hdr(skb)->protocol == IPPROTO\_ICMP) {  
         if (!nf\_nat\_icmp\_reply\_translation(skb, ct, ctinfo,  
                            state->hook))  
             return NF\_DROP;  
         else  
             return NF\_ACCEPT;  
     }  
     /\* Fall thru... (Only ICMPs can be IP\_CT\_IS\_REPLY) \*/  
 case IP\_CT\_NEW:  
     /\* Seen it before?  This can happen for loopback, retrans,  
      \* or local packets.  
      \*/  
     /\* 尚未进行过NAT转换 \*/  
     if (!nf\_nat\_initialized(ct, maniptype)) {  
         unsigned int ret;

         /\* 进行规则匹配 \*/  
         ret = do\_chain(priv, skb, state, ct);  
         if (ret != NF\_ACCEPT)  
             return ret;

         /\* 打NAT转换标记 \*/  
         if (nf\_nat\_initialized(ct, HOOK2MANIP(state->hook)))  
             break;

         /\* 连接跟踪进行NAT \*/  
         ret = nf\_nat\_alloc\_null\_binding(ct, state->hook);  
         if (ret != NF\_ACCEPT)  
             return ret;  
     }  
     /\* 进行过NAT转换 \*/  
     else {  
         pr\_debug("Already setup manip %s for ct %p\\n",  
              maniptype == NF\_NAT\_MANIP\_SRC ? "SRC" : "DST",  
              ct);  
         /\* 出接口发生改变 \*/  
         if (nf\_nat\_oif\_changed(state->hook, ctinfo, nat,  
                        state->out))  
             goto oif\_changed;  
     }  
     break;

 default:  
     /\* ESTABLISHED \*/  
     NF\_CT\_ASSERT(ctinfo == IP\_CT\_ESTABLISHED ||  
              ctinfo == IP\_CT\_ESTABLISHED\_REPLY);  
     /\* 出接口发生改变 \*/  
     if (nf\_nat\_oif\_changed(state->hook, ctinfo, nat, state->out))  
         goto oif\_changed;  
 }

 /\* skb数据包进行NAT转换修改 \*/  
 return nf\_nat\_packet(ct, ctinfo, state->hook, skb);

oif_changed:
nf_ct_kill_acct(ct, ctinfo, skb);
return NF_DROP;
}

/*
 * Allocate a null NAT binding for the manip type that corresponds to
 * hooknum (via HOOK2MANIP). Returns the result of
 * __nf_nat_alloc_null_binding().
 */
unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}

static unsigned int
__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
{
/* Force range to this IP; let proto decide mapping for
* per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
* Use reply in case it's already been mangled (eg local packet).
*/
/* 使用应答方向的ip地址,LOCAL_OUT会先经过mangle,可能改变了 */
union nf_inet_addr ip =
(manip == NF_NAT_MANIP_SRC ?
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);

 /\* 设置range \*/  
 struct nf\_nat\_range range = {  
     .flags        = NF\_NAT\_RANGE\_MAP\_IPS,  
     .min\_addr    = ip,  
     .max\_addr    = ip,  
 };

 /\* 进行NAT转换 \*/  
 return nf\_nat\_setup\_info(ct, &range, manip);  

}

unsigned int
nf_nat_setup_info(struct nf_conn *ct,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype)
{
struct nf_conntrack_tuple curr_tuple, new_tuple;

 /\* Can't setup nat info for confirmed ct. \*/  
 /\* 已经确认的,返回accpet \*/  
 if (nf\_ct\_is\_confirmed(ct))  
     return NF\_ACCEPT;

 NF\_CT\_ASSERT(maniptype == NF\_NAT\_MANIP\_SRC ||  
          maniptype == NF\_NAT\_MANIP\_DST);  
 BUG\_ON(nf\_nat\_initialized(ct, maniptype));

 /\* What we've got will look like inverse of reply. Normally  
  \* this is what is in the conntrack, except for prior  
  \* manipulations (future optimization: if num\_manips == 0,  
  \* orig\_tp = ct->tuplehash\[IP\_CT\_DIR\_ORIGINAL\].tuple)  
  \*/  
 /\* 从应答tuple反向得到当前tuple \*/  
 nf\_ct\_invert\_tuplepr(&curr\_tuple,  
              &ct->tuplehash\[IP\_CT\_DIR\_REPLY\].tuple);

 /\* 根据当前tuple和range得到NAT转换之后的的tuple \*/  
 get\_unique\_tuple(&new\_tuple, &curr\_tuple, range, ct, maniptype);

 /\* NAT转换之后和之前的tuple不同 \*/  
 if (!nf\_ct\_tuple\_equal(&new\_tuple, &curr\_tuple)) {  
     struct nf\_conntrack\_tuple reply;

     /\* Alter conntrack table so will recognize replies. \*/  
     /\* 通过新tuple得到reply\_tuple \*/  
     nf\_ct\_invert\_tuplepr(&reply, &new\_tuple);  
     /\* 加入到reply hash \*/  
     nf\_conntrack\_alter\_reply(ct, &reply);

     /\* 此时tuple类似如下 \*/  
     /\*  
         //内网10.1通过100.1访问200.1,经过SNAT之后得到tuple  
         tuple SNAT(10.1->200.1, 200.1->100.1) 

         //外网300.1通过100.1访问20.1,经过DNAT之后,得到tuple  
         tuple DNAT(300.1->100.1, 20.1->300.1)  
     \*/

     /\* Non-atomic: we own this at the moment. \*/  
     /\* 更新状态需要做NAT \*/  
     if (maniptype == NF\_NAT\_MANIP\_SRC)  
         ct->status |= IPS\_SRC\_NAT;  
     else  
         ct->status |= IPS\_DST\_NAT;

     /\* 扩展项的调整 \*/  
     if (nfct\_help(ct))  
         if (!nfct\_seqadj\_ext\_add(ct))  
             return NF\_DROP;  
 }

 /\* SNAT \*/  
 if (maniptype == NF\_NAT\_MANIP\_SRC) {  
     struct nf\_nat\_conn\_key key = {  
         .net = nf\_ct\_net(ct),  
         .tuple = &ct->tuplehash\[IP\_CT\_DIR\_ORIGINAL\].tuple,  
         .zone = nf\_ct\_zone(ct),  
     };  
     int err;

     /\* 加入到nf\_nat\_bysource\_table \*/  
     err = rhltable\_insert\_key(&nf\_nat\_bysource\_table,  
                   &key,  
                   &ct->nat\_bysource,  
                   nf\_nat\_bysource\_params);  
     if (err)  
         return NF\_DROP;  
 }

 /\* It's done. \*/  
 /\* NAT转换完成 \*/  
 if (maniptype == NF\_NAT\_MANIP\_DST)  
     ct->status |= IPS\_DST\_NAT\_DONE;  
 else  
     ct->status |= IPS\_SRC\_NAT\_DONE;

 return NF\_ACCEPT;  

}

/* 根据orig_tuple和range得到NAT转换之后的tuple */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig_tuple,
const struct nf_nat_range *range,
struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
{
const struct nf_conntrack_zone *zone;
const struct nf_nat_l3proto *l3proto;
const struct nf_nat_l4proto *l4proto;
struct net *net = nf_ct_net(ct);

 zone = nf\_ct\_zone(ct);

 rcu\_read\_lock();

 /\* 查找l3proto和l4proto \*/  
 l3proto = \_\_nf\_nat\_l3proto\_find(orig\_tuple->src.l3num);  
 l4proto = \_\_nf\_nat\_l4proto\_find(orig\_tuple->src.l3num,  
                 orig\_tuple->dst.protonum);

 /\* 1) If this srcip/proto/src-proto-part is currently mapped,  
  \* and that same mapping gives a unique tuple within the given  
  \* range, use that.  
  \*  
  \* This is only required for source (ie. NAT/masq) mappings.  
  \* So far, we don't do local source mappings, so multiple  
  \* manips not an issue.  
  \*/  
 /\* SNAT && 没有打RANDOM\_ALL标记 \*/  
 if (maniptype == NF\_NAT\_MANIP\_SRC &&  
     !(range->flags & NF\_NAT\_RANGE\_PROTO\_RANDOM\_ALL)) {  
     /\* try the original tuple first \*/  
     /\* 查看orig\_tuple是否满足范围要求 \*/  
     if (in\_range(l3proto, l4proto, orig\_tuple, range)) {  
         /\* tuple尚未被使用 \*/  
         if (!nf\_nat\_used\_tuple(orig\_tuple, ct)) {  
             /\* 使用原tuple \*/  
             \*tuple = \*orig\_tuple;  
             goto out;  
         }  
     }  
     /\* ori\_range不满足要求,则从bysource\_table中查找一个满足范围的tuple \*/  
     else if (find\_appropriate\_src(net, zone, l3proto, l4proto,  
                     orig\_tuple, tuple, range)) {  
         pr\_debug("get\_unique\_tuple: Found current src map\\n");  
         /\* tuple尚未被使用 \*/  
         if (!nf\_nat\_used\_tuple(tuple, ct))  
             goto out;  
     }  
 }

 /\* 从给定range中选择一个最少使用的组合 \*/  
 /\* 2) Select the least-used IP/proto combination in the given range \*/  
 \*tuple = \*orig\_tuple;  
 find\_best\_ips\_proto(zone, tuple, range, ct, maniptype);

 /\* 3) The per-protocol part of the manip is made to map into  
  \* the range to make a unique tuple.  
  \*/

 /\* Only bother mapping if it's not already in range and unique \*/  
 /\* 没有打RANDOM\_ALL标记 \*/  
 if (!(range->flags & NF\_NAT\_RANGE\_PROTO\_RANDOM\_ALL)) {  
     /\* 有SPECIFIED标记,对端口号进行检查 \*/  
     if (range->flags & NF\_NAT\_RANGE\_PROTO\_SPECIFIED) {  
         /\* 端口号已经在范围之内&&(端口最小最大范围相等||tuple没有使用) \*/  
         if (l4proto->in\_range(tuple, maniptype,  
                       &range->min\_proto,  
                       &range->max\_proto) &&  
             (range->min\_proto.all == range->max\_proto.all ||  
              !nf\_nat\_used\_tuple(tuple, ct)))  
             goto out;  
     }  
     /\* 没有SPECIFIED标记,端口号不变,tuple没有被使用 \*/  
     else if (!nf\_nat\_used\_tuple(tuple, ct)) {  
         goto out;  
     }  
 }

 /\* Last change: get protocol to try to obtain unique tuple. \*/  
 /\* 随机选择端口号 \*/  
 l4proto->unique\_tuple(l3proto, tuple, range, maniptype, ct);  

out:
rcu_read_unlock();
}

unsigned int nf_nat_packet(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int hooknum,
struct sk_buff *skb)
{
const struct nf_nat_l3proto *l3proto;
const struct nf_nat_l4proto *l4proto;
/* 获取方向 */
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
unsigned long statusbit;
/* 获取进行SNAT还是DNAT */
enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);

 /\* 设置NAT标记 \*/  
 if (mtype == NF\_NAT\_MANIP\_SRC)  
     statusbit = IPS\_SRC\_NAT;  
 else  
     statusbit = IPS\_DST\_NAT;

 /\* Invert if this is reply dir. \*/  
 /\* 应答方向需要取反 \*/  
 if (dir == IP\_CT\_DIR\_REPLY)  
     statusbit ^= IPS\_NAT\_MASK;

 /\* Non-atomic: these bits don't change. \*/

 /\* 需要做NAT \*/  
 if (ct->status & statusbit) {  
     struct nf\_conntrack\_tuple target;

     /\* We are aiming to look like inverse of other direction. \*/  
     /\* 获取目标tuple \*/  
     /\*  
         //内网10.1通过100.1访问200.1,经过SNAT之后得到tuple  
         tuple SNAT(10.1->200.1, 200.1->100.1) 

         //外网300.1通过100.1访问20.1,经过DNAT之后,得到tuple  
         tuple DNAT(300.1->100.1, 20.1->300.1)  
     \*/  
     nf\_ct\_invert\_tuplepr(&target, &ct->tuplehash\[!dir\].tuple);

     /\* 获取l3proto,l4proto \*/  
     l3proto = \_\_nf\_nat\_l3proto\_find(target.src.l3num);  
     l4proto = \_\_nf\_nat\_l4proto\_find(target.src.l3num,  
                     target.dst.protonum);

     /\* 将ip地址和端口的NAT转换结果写入skb \*/  
     if (!l3proto->manip\_pkt(skb, , l4proto, &target, mtype))  
         return NF\_DROP;  
 }  
 return NF\_ACCEPT;  

}