1. 程式人生 > >netfilter之連結跟蹤做nat

netfilter之連結跟蹤做nat

上一節我們講了NAT是基於連結跟蹤實現的:當一條連結跟蹤建立之後,要改變它的tuple的reply方向才能做NAT,而連結跟蹤的NAT是由函式nf_nat_setup_info實現的。

1、nf_nat_setup_info

nf_nat_setup_info只對連結跟蹤做NAT:它只會改變連結跟蹤reply方向的ip、埠,不會改變資料包的ip、埠。資料包的NAT在上一節已經介紹過,是在PRE_ROUTING、POST_ROUTING鏈的hook點根據連結跟蹤的reply方向對資料包做的。

nf_nat_setup_info主要做以下幾件事

(1)獲取連結跟蹤和nat關聯的結構體struct nf_conn_nat,如果是空就呼叫nf_ct_ext_add新增NAT擴充套件,新增失敗才直接返回

(2)呼叫nf_nat_initialized判斷是否已經做了連結跟蹤的NAT

(3)呼叫nf_ct_invert_tuplepr獲取reply方向的tuple,取反後賦值給curr_tuple,也就是orig方向的tuple

(4)get_unique_tuple這個函式是關鍵,這個就是得到一個新的tuple,new_tuple,這個new_tuple是做了NAT的orig方向。

(5)呼叫nf_ct_tuple_equal比較curr_tuple和new_tuple是否相等,如果不相等就要做NAT,改變連結跟蹤reply方向的tuple

(6)呼叫nf_ct_invert_tuplepr對new_tuple取反,得到新的reply方向的tuple

(7)nf_conntrack_alter_reply改變連結跟蹤tuple的reply方向,完成連結跟蹤的NAT

(8)做了NAT的連結跟蹤如果沒有在nat_bysource連結串列中就要新增進去

(9)設定已經做NAT的標誌IPS_DST_NAT_DONE_BIT/IPS_SRC_NAT_DONE_BIT

/*
 * nf_nat_setup_info - set up the NAT mapping of a conntrack entry for one
 * manipulation type.
 * @ct:        conntrack entry to set up
 * @range:     range handed to get_unique_tuple() when choosing the new tuple
 * @maniptype: IP_NAT_MANIP_SRC or IP_NAT_MANIP_DST (asserted below)
 *
 * This works on the conntrack entry only; no packet is passed in.
 *
 * Returns NF_ACCEPT on every path, including when the NAT extension cannot
 * be added.
 */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
		  const struct nf_nat_range *range,
		  enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;
	struct nf_conn_nat *nat;
	/* Sampled before any status bit is changed below: non-zero only when
	   no bit of IPS_NAT_DONE_MASK is set yet. */
	int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);

	/* nat helper or nfctnetlink also setup binding */
	nat = nfct_nat(ct);
	if (!nat) {
		/* No NAT extension on this conntrack yet: try to add one. */
		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
		if (nat == NULL) {
			pr_debug("failed to add NAT extension\n");
			return NF_ACCEPT;
		}
	}

	NF_CT_ASSERT(maniptype == IP_NAT_MANIP_SRC ||
		     maniptype == IP_NAT_MANIP_DST);
	/* Reaching this point when nf_nat_initialized() is true for this
	   manip type is treated as a bug. */
	BUG_ON(nf_nat_initialized(ct, maniptype));

	/* What we've got will look like inverse of reply. Normally
	   this is what is in the conntrack, except for prior
	   manipulations (future optimization: if num_manips == 0,
	   orig_tp =
	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
	   /* Invert the current reply tuple and store it in curr_tuple. */
	nf_ct_invert_tuplepr(&curr_tuple,
			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* Derive the new tuple from curr_tuple and the given range. */
	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	/* If the new tuple differs from the current one, the conntrack entry
	   itself has to be NATed, i.e. its reply tuple has to change. */
	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so will recognize replies. */
		/* Invert the new tuple ... */
		nf_ct_invert_tuplepr(&reply, &new_tuple);
		/* ... and install the result as the reply tuple of the
		   conntrack entry. */
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this at the moment. */
		if (maniptype == IP_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;
	}

	/* Place in source hash if this is the first time. */
	if (have_to_hash) {
		unsigned int srchash;

		/* The bucket is chosen from the ORIGINAL-direction tuple. */
		srchash = hash_by_src(net, nf_ct_zone(ct),
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		spin_lock_bh(&nf_nat_lock);
		/* nf_conntrack_alter_reply might re-allocate extension area */
		nat = nfct_nat(ct);
		nat->ct = ct;
		hlist_add_head_rcu(&nat->bysource,
				   &net->ipv4.nat_bysource[srchash]);
		spin_unlock_bh(&nf_nat_lock);
	}

	/* It's done. */
	/* Record that setup for this manip type (DNAT or SNAT) is done. */
	if (maniptype == IP_NAT_MANIP_DST)
		set_bit(IPS_DST_NAT_DONE_BIT, &ct->status);
	else
		set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);

	return NF_ACCEPT;
}

2、nf_ct_invert_tuplepr

呼叫__nf_ct_l3proto_find獲取三層連結跟蹤的操作函式結構體struct nf_conntrack_l3proto例項,呼叫__nf_ct_l4proto_find獲取四層連結跟蹤的操作函式結構體struct nf_conntrack_l4proto例項,然後呼叫nf_ct_invert_tuple根據orig方向取反方向的tuple。

/*
 * Invert @orig into @inverse, picking the layer-3 and layer-4 conntrack
 * protocol handlers from the fields of @orig itself (src.l3num and
 * dst.protonum).
 *
 * Both lookups and the inversion run inside one RCU read-side section.
 * Returns the result of nf_ct_invert_tuple().
 */
bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
			  const struct nf_conntrack_tuple *orig)
{
	const struct nf_conntrack_l3proto *l3;
	const struct nf_conntrack_l4proto *l4;
	bool inverted;

	rcu_read_lock();
	l3 = __nf_ct_l3proto_find(orig->src.l3num);
	l4 = __nf_ct_l4proto_find(orig->src.l3num, orig->dst.protonum);
	inverted = nf_ct_invert_tuple(inverse, orig, l3, l4);
	rcu_read_unlock();

	return inverted;
}

nf_ct_invert_tuple呼叫三層、四層的invert_tuple根據orig的nf_conntrack_tuple獲取反方向的nf_conntrack_tuple。

/*
 * Fill @inverse with the opposite-direction tuple of @orig.
 *
 * @inverse is zeroed first; the layer-3 handler then inverts its part,
 * followed by the layer-4 handler.  If the layer-3 handler reports
 * failure, false is returned right away and the direction and protocol
 * number of @inverse are left as zero.  Otherwise the return value is the
 * layer-4 handler's result.
 */
bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig,
		   const struct nf_conntrack_l3proto *l3proto,
		   const struct nf_conntrack_l4proto *l4proto)
{
	bool l4_ok;

	memset(inverse, 0, sizeof(*inverse));

	/* The layer-3 protocol number is the same in both directions. */
	inverse->src.l3num = orig->src.l3num;

	/* Layer 3: let the protocol handler invert its part of the tuple. */
	if (!l3proto->invert_tuple(inverse, orig))
		return false;

	/* Same layer-4 protocol, opposite direction. */
	inverse->dst.protonum = orig->dst.protonum;
	inverse->dst.dir = !orig->dst.dir;

	/* Layer 4: let the protocol handler invert its part of the tuple. */
	l4_ok = l4proto->invert_tuple(inverse, orig);
	return l4_ok;
}

3、get_unique_tuple

這個函式主要是獲取唯一的做了nat的tuple。

(1)首先如果是SNAT而且沒有設定IP_NAT_RANGE_PROTO_RANDOM,就呼叫find_appropriate_src在nat_bysource連結串列中查詢已經做了NAT的tuple,如果找到了而且沒有被其他連結使用就直接返回

(2)find_best_ips_proto做ip地址的nat

(3)四層協議做NAT:如果設定了IP_NAT_RANGE_PROTO_RANDOM標誌也就是隨機的,就呼叫四層協議的unique_tuple獲取唯一沒有被使用的埠做NAT;如果設定了IP_NAT_RANGE_PROTO_SPECIFIED也就是指定了埠範圍,就要呼叫in_range判斷此埠是否在合理範圍內。埠在範圍內而且tuple沒有被使用就直接採用,否則再呼叫unique_tuple選擇一個唯一的埠。

/*
 * Choose the tuple that @ct should use for the given manipulation type and
 * store it in @tuple.
 *
 * @tuple:      output tuple
 * @orig_tuple: tuple to start from
 * @range:      range whose flags and min/max bounds steer the choice
 * @ct:         conntrack entry the tuple is chosen for
 * @maniptype:  which side of the tuple is being manipulated
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	const struct nf_nat_protocol *proto;
	u16 zone = nf_ct_zone(ct);
	int need_unique = 1;

	/*
	 * Step 1: for source manipulation without port randomization, try
	 * to reuse a mapping that already exists for this source.  It is
	 * taken as-is only when the resulting tuple is not in use; otherwise
	 * fall through to the steps below.
	 */
	if (maniptype == IP_NAT_MANIP_SRC &&
	    !(range->flags & IP_NAT_RANGE_PROTO_RANDOM) &&
	    find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
		pr_debug("get_unique_tuple: Found current src map\n");
		if (!nf_nat_used_tuple(tuple, ct))
			return;
	}

	/*
	 * Step 2: start over from the original tuple and let
	 * find_best_ips_proto() pick the IP address from the range.
	 */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/*
	 * Step 3: the per-protocol part.  The protocol's NAT handler is
	 * looked up and used inside one RCU read-side section.
	 */
	rcu_read_lock();
	proto = __nf_nat_proto_find(orig_tuple->dst.protonum);

	if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
		int port_acceptable;

		/* With an explicit per-protocol range, the current value
		   has to lie inside it; without one, any value is fine. */
		if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
			port_acceptable = proto->in_range(tuple, maniptype,
							  &range->min,
							  &range->max);
		else
			port_acceptable = 1;

		/* Acceptable and not in use: keep the tuple as it is. */
		if (port_acceptable && !nf_nat_used_tuple(tuple, ct))
			need_unique = 0;
	}

	/* Randomization was requested, or the tuple is out of range or in
	   use: have the protocol handler choose a unique tuple. */
	if (need_unique)
		proto->unique_tuple(tuple, range, maniptype, ct);

	rcu_read_unlock();
}

3.1 find_appropriate_src

對於SNAT就會呼叫這個函式,在已經做了NAT的表nat_bysource中查詢源地址相同的連結跟蹤,如果找到了就對它reply方向的tuple取反,並把目的部分換成當前tuple的目的部分,得到目標tuple。然後呼叫in_range判斷目標tuple是否在合理範圍內,在範圍內就返回1,否則繼續查詢,全部找不到就返回0。目標tuple是否已經被使用則由呼叫者get_unique_tuple通過nf_nat_used_tuple判斷。

/*
 * Only called for SRC manip.
 *
 * Walk the nat_bysource bucket selected by hash_by_src() for @tuple and
 * look for an entry with the same source and the same zone.  For each such
 * entry, @result is set to the inverse of that entry's reply tuple with
 * its destination part replaced by @tuple's, and is accepted if it lies
 * within @range.
 *
 * Returns 1 with @result holding the accepted tuple, or 0 when no entry
 * qualifies (in which case @result may still have been overwritten by a
 * rejected candidate).
 */
static int
find_appropriate_src(struct net *net, u16 zone,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range *range)
{
	unsigned int bucket = hash_by_src(net, zone, tuple);
	const struct nf_conn_nat *entry;
	const struct nf_conn *owner;
	const struct hlist_node *pos;
	int found = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(entry, pos, &net->ipv4.nat_bysource[bucket],
				 bysource) {
		owner = entry->ct;

		/* Skip entries with a different source or zone. */
		if (!same_src(owner, tuple) || nf_ct_zone(owner) != zone)
			continue;

		/* Copy source part from reply tuple. */
		nf_ct_invert_tuplepr(result,
				     &owner->tuplehash[IP_CT_DIR_REPLY].tuple);
		result->dst = tuple->dst;

		/* Accept the first candidate that falls within the range. */
		if (in_range(result, range)) {
			found = 1;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}

3.2 find_best_ips_proto

這個函式根據range選擇一個在合理範圍內的IP地址做NAT

/*
 * Pick the IP address for the manipulated side of @tuple out of @range.
 *
 * Nothing happens unless IP_NAT_RANGE_MAP_IPS is set in the range flags.
 * For IP_NAT_MANIP_SRC the source address of @tuple is rewritten, for any
 * other manip type the destination address.  A single-address range is
 * applied directly; otherwise the address is derived from a hash, so the
 * same inputs always give the same address.
 *
 * @ct is not used by this function.
 */
static void
find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	__be32 *target;
	/* Host order */
	u_int32_t lowest, highest, offset;
	u32 second_word;

	/* No IP mapping?  Do nothing. */
	if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
		return;

	target = (maniptype == IP_NAT_MANIP_SRC) ? &tuple->src.u3.ip
						 : &tuple->dst.u3.ip;

	/* Fast path: only one choice. */
	if (range->min_ip == range->max_ip) {
		*target = range->min_ip;
		return;
	}

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway).  The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots.
	 *
	 * With IP_NAT_RANGE_PERSISTENT the second hash input is 0, so
	 * only the source address takes part in the hash; otherwise it is
	 * the destination address XORed with the zone. */
	if (range->flags & IP_NAT_RANGE_PERSISTENT)
		second_word = 0;
	else
		second_word = (__force u32)tuple->dst.u3.ip ^ zone;

	lowest = ntohl(range->min_ip);
	highest = ntohl(range->max_ip);

	offset = jhash_2words((__force u32)tuple->src.u3.ip, second_word, 0);
	/* Scale the 32-bit hash to an offset below the number of addresses
	   in the range. */
	offset = ((u64)offset * (highest - lowest + 1)) >> 32;

	*target = htonl(lowest + offset);
}