netfilter之連結跟蹤做nat
上一節我們講了NAT是基於連結跟蹤實現的:當一條連結跟蹤建立後,要改變它reply方向的tuple才能做NAT,連結跟蹤的NAT由函式nf_nat_setup_info實現。
1、nf_nat_setup_info
nf_nat_setup_info對連結跟蹤做NAT,只會改變連結跟蹤reply方向的ip、埠,不會改變資料包的ip、埠。資料包的NAT在上一節已經介紹過,是在PRE_ROUTING、POST_ROUTING鏈的hook點根據連結跟蹤的reply方向對資料包做NAT。
nf_nat_setup_info主要做以下幾件事
(1)獲取連結跟蹤和nat關聯的結構體struct nf_conn_nat,如果是空就呼叫nf_ct_ext_add新增NAT擴充套件,新增失敗才直接返回NF_ACCEPT
(2)呼叫nf_nat_initialized判斷該方向(SNAT/DNAT)是否已經做了連結跟蹤的NAT,如果已經做過就會觸發BUG_ON
(3)nf_ct_invert_tuplepr獲取reply方向的tuple然後取反賦值給curr_tuple,也就是當前的orig tuple
(4)get_unique_tuple這個函式是關鍵,這個就是得到一個新的tuple,new_tuple,這個new_tuple是做了NAT的orig方向。
(5)呼叫nf_ct_tuple_equal比較curr_tuple和new_tuple是否相等,如果不相等就要做NAT改變連結跟蹤reply的tuple
(6)呼叫nf_ct_invert_tuplepr對new_tuple取反,得到新的reply方向的tuple
(7)nf_conntrack_alter_reply改變連結跟蹤tuple的reply方向完成連結跟蹤的NAT
(8)做了NAT的連結跟蹤如果沒有在nat_bysource連結串列中就要新增進去
(9)設定已經做NAT的標誌IPS_DST_NAT_DONE_BIT/IPS_SRC_NAT_DONE_BIT
unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); /* nat helper or nfctnetlink also setup binding */ nat = nfct_nat(ct); if (!nat) { nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); if (nat == NULL) { pr_debug("failed to add NAT extension\n"); return NF_ACCEPT; } } NF_CT_ASSERT(maniptype == IP_NAT_MANIP_SRC || maniptype == IP_NAT_MANIP_DST); BUG_ON(nf_nat_initialized(ct, maniptype)); /* What we've got will look like inverse of reply. Normally this is what is in the conntrack, except for prior manipulations (future optimization: if num_manips == 0, orig_tp = conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ /*獲取reply方向的tuple做反方向複製給curr_tuple*/ nf_ct_invert_tuplepr(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); /*根據原始的tuple獲取新的唯一tuple*/ get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); /*新的orig方向和原始orig方向不相等就要做連結跟蹤的NAT 也就是改變tuple的reply方向*/ if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { struct nf_conntrack_tuple reply; /* Alter conntrack table so will recognize replies. */ /*新的tuple取反*/ nf_ct_invert_tuplepr(&reply, &new_tuple); /*將取反後的tuple賦值給reply方向 也就是連結跟蹤做NAT*/ nf_conntrack_alter_reply(ct, &reply); /* Non-atomic: we own this at the moment. */ if (maniptype == IP_NAT_MANIP_SRC) ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; } /* Place in source hash if this is the first time. */ if (have_to_hash) { unsigned int srchash; srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_lock); /* nf_conntrack_alter_reply might re-allocate exntension aera */ nat = nfct_nat(ct); nat->ct = ct; hlist_add_head_rcu(&nat->bysource, &net->ipv4.nat_bysource[srchash]); spin_unlock_bh(&nf_nat_lock); } /* It's done. 
*/ /*設定已經做了SNAT/DNAT標誌*/ if (maniptype == IP_NAT_MANIP_DST) set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); else set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); return NF_ACCEPT; }
2、nf_ct_invert_tuplepr
呼叫__nf_ct_l3proto_find獲取三層連結跟蹤的操作函式結構體struct nf_conntrack_l3proto例項,呼叫__nf_ct_l4proto_find獲取四層連結跟蹤的操作函式結構體struct nf_conntrack_l4proto例項,然後呼叫nf_ct_invert_tuple根據orig方向取反方向的tuple。
/*
 * Invert @orig into @inverse.
 *
 * The L3 handler is looked up by orig->src.l3num and the L4 handler by
 * (orig->src.l3num, orig->dst.protonum); both lookups and the inversion
 * itself happen between rcu_read_lock() and rcu_read_unlock().
 *
 * Returns whatever nf_ct_invert_tuple() returns.
 */
bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
			  const struct nf_conntrack_tuple *orig)
{
	bool inverted;

	rcu_read_lock();
	inverted = nf_ct_invert_tuple(inverse, orig,
				      __nf_ct_l3proto_find(orig->src.l3num),
				      __nf_ct_l4proto_find(orig->src.l3num,
							   orig->dst.protonum));
	rcu_read_unlock();

	return inverted;
}
nf_ct_invert_tuple呼叫三層、四層的invert_tuple根據orig的nf_conntrack_tuple獲取反方向的nf_conntrack_tuple。
/*
 * Build in @inverse the opposite-direction tuple of @orig.
 *
 * @inverse is zeroed first, then filled in by the L3 handler's
 * invert_tuple() followed by the L4 handler's invert_tuple().
 *
 * Returns false if the L3 handler reports failure (returns 0); otherwise
 * returns the result of the L4 handler.
 */
bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig,
		   const struct nf_conntrack_l3proto *l3proto,
		   const struct nf_conntrack_l4proto *l4proto)
{
	memset(inverse, 0, sizeof *inverse);
	inverse->src.l3num = orig->src.l3num;

	/* Layer 3: let the L3 handler fill in its part of the inverse. */
	if (!l3proto->invert_tuple(inverse, orig))
		return false;

	/* Same protocol number, opposite direction flag. */
	inverse->dst.protonum = orig->dst.protonum;
	inverse->dst.dir = !orig->dst.dir;

	/* Layer 4: let the L4 handler fill in its part of the inverse. */
	return l4proto->invert_tuple(inverse, orig);
}
3、get_unique_tuple
這個函式主要是獲取唯一的做了nat的tuple。
(1)首先如果是SNAT而且沒有設定IP_NAT_RANGE_PROTO_RANDOM,就呼叫find_appropriate_src在nat_bysource連結串列中查詢已經做了NAT的tuple,如果找到了而且沒有被其他連結跟蹤使用就直接返回
(2)find_best_ips_proto做ip地址的nat
(3)四層協議做NAT:如果設定了IP_NAT_RANGE_PROTO_RANDOM標誌也就是隨機的,就直接呼叫四層協議的unique_tuple獲取唯一沒有被使用的埠做NAT;如果設定了IP_NAT_RANGE_PROTO_SPECIFIED也就是指定埠,就要呼叫in_range判斷此埠是否在合理範圍內。在範圍內(或沒有指定埠)而且tuple沒有被使用就直接採用,否則再呼叫unique_tuple選擇一個唯一的埠。
/*
 * get_unique_tuple - choose the tuple a connection will be NATed to.
 * @tuple:      output; the chosen tuple
 * @orig_tuple: tuple to start from
 * @range:      range and flags constraining the choice
 * @ct:         conntrack entry being set up
 * @maniptype:  IP_NAT_MANIP_SRC or IP_NAT_MANIP_DST
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	const struct nf_nat_protocol *proto;
	u16 zone = nf_ct_zone(ct);
	int remap;

	/*
	 * 1) If this srcip/proto/src-proto-part is currently mapped,
	 *    and that same mapping gives a unique tuple within the given
	 *    range, use that.
	 *
	 *    This is only required for source (ie. NAT/masq) mappings.
	 *    So far, we don't do local source mappings, so multiple
	 *    manips not an issue.
	 */
	if (maniptype == IP_NAT_MANIP_SRC &&
	    !(range->flags & IP_NAT_RANGE_PROTO_RANDOM) &&
	    find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
		pr_debug("get_unique_tuple: Found current src map\n");
		/* Reuse the existing mapping only if the tuple is still free. */
		if (!nf_nat_used_tuple(tuple, ct))
			return;
	}

	/*
	 * 2) Select the least-used IP/proto combination in the given
	 *    range.
	 */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/*
	 * 3) The per-protocol part of the manip is made to map into
	 *    the range to make a unique tuple.
	 */
	rcu_read_lock();
	proto = __nf_nat_proto_find(orig_tuple->dst.protonum);

	if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) {
		/* Change protocol info to have some randomization. */
		remap = 1;
	} else if ((range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) &&
		   !proto->in_range(tuple, maniptype,
				    &range->min, &range->max)) {
		/* A proto range was specified and the tuple is outside it. */
		remap = 1;
	} else {
		/* Only bother mapping if the tuple is not already unique. */
		remap = nf_nat_used_tuple(tuple, ct);
	}

	/* Last chance: get protocol to try to obtain unique tuple. */
	if (remap)
		proto->unique_tuple(tuple, range, maniptype, ct);

	rcu_read_unlock();
}
3.1 find_appropriate_src
對於SNAT就會呼叫這個函式,在已經做了NAT的表nat_bysource中查詢與當前tuple的same_src匹配而且zone相同的連結跟蹤,如果找到了就對它reply方向的tuple取反得到目標tuple(目的部分取自傳入的tuple),然後呼叫in_range判斷目標tuple是否在合理範圍內:在範圍內就返回1,否則繼續查詢,全部都不符合就返回0。
/*
 * Only called for SRC manip.
 *
 * Walk the by-source hash bucket of @tuple looking for a conntrack entry
 * in the same @zone for which same_src() matches. For each such entry,
 * @result is set to the inverse of that entry's reply tuple with its dst
 * part replaced by @tuple's dst, and is accepted if in_range() approves it.
 *
 * Returns 1 (with @result filled in) on the first accepted candidate,
 * 0 if none is accepted. @result may have been overwritten even when 0
 * is returned.
 */
static int
find_appropriate_src(struct net *net, u16 zone,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range *range)
{
	unsigned int bucket = hash_by_src(net, zone, tuple);
	const struct nf_conn_nat *nat;
	const struct nf_conn *ct;
	const struct hlist_node *n;
	int found = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[bucket],
				 bysource) {
		ct = nat->ct;

		/* Skip entries with a different source or zone. */
		if (!same_src(ct, tuple) || nf_ct_zone(ct) != zone)
			continue;

		/* Copy source part from reply tuple. */
		nf_ct_invert_tuplepr(result,
				     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
		result->dst = tuple->dst;

		/* Accept the candidate only if it fits the requested range. */
		if (in_range(result, range)) {
			found = 1;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}
3.2 find_best_ips_proto
這個函式根據range選擇一個在合理範圍內的IP地址做NAT
/*
 * find_best_ips_proto - pick the IP address to NAT to.
 * @zone:      conntrack zone, mixed into the hash
 * @tuple:     tuple to modify in place
 * @range:     address range and flags
 * @ct:        conntrack entry (not used in this function)
 * @maniptype: IP_NAT_MANIP_SRC rewrites tuple->src.u3.ip, any other value
 *             rewrites tuple->dst.u3.ip
 *
 * Does nothing unless IP_NAT_RANGE_MAP_IPS is set in range->flags.
 * When min_ip == max_ip that address is used directly; otherwise an
 * address in [min_ip, max_ip] is chosen by hashing the tuple's source IP
 * with (destination IP ^ zone), or with 0 when IP_NAT_RANGE_PERSISTENT
 * is set.
 */
static void
find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	__be32 *var_ipp;
	/* Host order */
	u_int32_t minip, maxip, j;
	u64 span;

	/* No IP mapping? Do nothing. */
	if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
		return;

	if (maniptype == IP_NAT_MANIP_SRC)
		var_ipp = &tuple->src.u3.ip;
	else
		var_ipp = &tuple->dst.u3.ip;

	/* Fast path: only one choice. */
	if (range->min_ip == range->max_ip) {
		*var_ipp = range->min_ip;
		return;
	}

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway). The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots. */
	minip = ntohl(range->min_ip);
	maxip = ntohl(range->max_ip);
	j = jhash_2words((__force u32)tuple->src.u3.ip,
			 range->flags & IP_NAT_RANGE_PERSISTENT ?
				0 : (__force u32)tuple->dst.u3.ip ^ zone, 0);

	/*
	 * Number of addresses in the range. The difference is taken in
	 * 32 bits, but the "+ 1" is done in 64 bits: when the range spans
	 * the whole 32-bit space a 32-bit "maxip - minip + 1" would wrap
	 * to 0 and every flow would be mapped to minip.
	 */
	span = (u64)(maxip - minip) + 1;

	/* Scale the 32-bit hash into [0, span). */
	j = ((u64)j * span) >> 32;
	*var_ipp = htonl(minip + j);
}