
The UDP Datagram Receive Path (Part 1)

Receiving a UDP datagram is best viewed in two parts:

  1. What UDP does after the network layer finishes with a packet and hands it up. At this stage UDP receives the packet, verifies its checksum, and, if verification succeeds, places it on the socket's receive queue to wait for a user-space reader.
  2. A user-space process calls read() or a similar system call to fetch the data already sitting on the receive queue.

This note covers the first part; for reference, a minimal sketch of the user-space side that the second part will cover is shown below.
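A minimal user-space receiver using a blocking recvfrom(); the port number and buffer size are arbitrary illustration values, not anything prescribed by the kernel code discussed here:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in addr;
	char buf[2048];
	ssize_t n;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(5000);	/* arbitrary example port */
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	/* Blocks until the kernel has queued a datagram on this socket's
	 * receive queue -- exactly the queueing path described in this note. */
	n = recvfrom(fd, buf, sizeof(buf), 0, NULL, NULL);
	printf("received %zd bytes\n", n);

	close(fd);
	return 0;
}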

1. Receiving a packet from the IP layer: udp_rcv()

udp_rcv() is the callback that UDP registers with the network layer when the AF_INET protocol family is initialized. After the IP layer finishes processing an input packet, if the packet is addressed to this host and its upper-layer protocol is UDP, this callback is invoked.
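For context, the registration in net/ipv4/af_inet.c looks roughly like the following in kernels of this generation (abbreviated and paraphrased here; the exact set of fields varies between versions):

static const struct net_protocol udp_protocol = {
	.handler     = udp_rcv,		/* invoked by the IP layer for IPPROTO_UDP packets */
	.err_handler = udp_err,		/* invoked for related ICMP errors */
	.no_policy   = 1,
	.netns_ok    = 1,
};

static int __init inet_init(void)
{
	...
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
	...
}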

int udp_rcv(struct sk_buff *skb)
{
	return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}

@skb: the input packet
@udptable: hash table of bound UDP sockets; it is searched to find which socket skb belongs to
@proto: the L4 protocol number, at this point either IPPROTO_UDP or IPPROTO_UDPLITE
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
		   int proto)
{
	struct sock *sk;
	struct udphdr *uh;
	unsigned short ulen;
	struct rtable *rt = skb_rtable(skb);
	__be32 saddr, daddr;
	struct net *net = dev_net(skb->dev);

	/*
	 *  Validate the packet.
	 */
	// Adjust the skb layout so that the linear area holds at least the UDP header
	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
		goto drop;		/* No space for header. */

	uh   = udp_hdr(skb);
	ulen = ntohs(uh->len);
	// The data carried by the skb must not be shorter than the length recorded
	// in the UDP header, i.e. the datagram must be complete
	if (ulen > skb->len)
		goto short_packet;

	if (proto == IPPROTO_UDP) {
		// 1. The UDP datagram must be at least as long as its header
		// 2. pskb_trim_rcsum() strips any padding (IP may pad a very small
		//    datagram) and updates the checksum state accordingly
		if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
			goto short_packet;
		uh = udp_hdr(skb);
	}

	// Initialize checksum handling (pseudo-header part)
	if (udp4_csum_init(skb, uh, proto))
		goto csum_error;

	// Source and destination IP addresses of the packet
	saddr = ip_hdr(skb)->saddr;
	daddr = ip_hdr(skb)->daddr;

	// Multicast and broadcast datagrams take a separate delivery path
	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
		return __udp4_lib_mcast_deliver(net, skb, uh,
				saddr, daddr, udptable);

	// Search udptable by source and destination port for the socket that
	// should receive this packet
	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);

	// A matching socket was found: let udp_queue_rcv_skb() receive the packet
	if (sk != NULL) {
		int ret = udp_queue_rcv_skb(sk, skb);
		sock_put(sk);

		/* a return value > 0 means to resubmit the input, but
		 * it wants the return to be -protocol, or 0
		 */
		if (ret > 0)
			return -ret;
		return 0;
	}

	// Getting here means no socket wants this packet: update statistics
	// and drop it
	// IPsec policy check
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset(skb);

	/* No socket. Drop packet silently, if checksum is wrong */
	if (udp_lib_checksum_complete(skb))
		goto csum_error;

	// Count the error and reply with an ICMP port-unreachable message
	UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	/*
	 * Hmm.  We got an UDP packet to a port to which we
	 * don't wanna listen.  Ignore it.
	 */
	kfree_skb(skb);
	return 0;

short_packet:
	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
		       &saddr, ntohs(uh->source),
		       ulen, skb->len,
		       &daddr, ntohs(uh->dest));
	goto drop;

csum_error:
	/*
	 * RFC1122: OK.  Discards the bad packet silently (as far as
	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
	 */
	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
		       &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
		       ulen);

drop:
	UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
	kfree_skb(skb);
	return 0;
}

Open question: why is checksum handling split between udp4_csum_init() and udp_lib_checksum_complete()?
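For reference, the UDP checksum covers not just the UDP header and payload but also an IPv4 pseudo-header made of the source address, destination address, protocol number and UDP length. (Roughly, udp4_csum_init() deals with that pseudo-header portion and with packets whose checksum was already verified in hardware, while full verification over the payload is deferred to udp_lib_checksum_complete() and only performed when it is actually needed.) A small user-space sketch of the computation, for illustration only and with hypothetical helper names:

#include <netinet/in.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Accumulate 16-bit big-endian words into a 32-bit sum (Internet checksum). */
static uint32_t sum_words(const void *data, size_t len, uint32_t sum)
{
	const uint8_t *p = data;

	while (len > 1) {
		sum += (uint32_t)((p[0] << 8) | p[1]);
		p   += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)(p[0] << 8);	/* pad a trailing odd byte with zero */
	return sum;
}

/* Checksum of an IPv4 UDP datagram: pseudo-header + UDP header + payload.
 * saddr/daddr are in network byte order, udp points at the UDP header, and
 * udp_len is the UDP length field in host byte order.  When computing for
 * transmission the checksum field inside udp must be zero.  The returned
 * 16-bit value is stored big-endian; 0 is transmitted as 0xffff because a
 * zero checksum means "no checksum". */
static uint16_t udp4_checksum(uint32_t saddr, uint32_t daddr,
			      const void *udp, uint16_t udp_len)
{
	uint8_t pseudo[12];
	uint32_t sum;

	/* Pseudo-header: saddr(4) | daddr(4) | zero(1) | proto(1) | length(2) */
	memcpy(pseudo, &saddr, 4);
	memcpy(pseudo + 4, &daddr, 4);
	pseudo[8]  = 0;
	pseudo[9]  = IPPROTO_UDP;
	pseudo[10] = udp_len >> 8;
	pseudo[11] = udp_len & 0xff;

	sum = sum_words(pseudo, sizeof(pseudo), 0);
	sum = sum_words(udp, udp_len, sum);

	while (sum >> 16)			/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);

	sum = ~sum & 0xffff;
	return sum ? (uint16_t)sum : 0xffff;
}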

1.1 Looking up the socket that owns the packet

As shown above, one crucial step is using the packet's address and port information to decide which socket should handle it.

static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
						 __be16 sport, __be16 dport,
						 struct udp_table *udptable)
{
	struct sock *sk;
	const struct iphdr *iph = ip_hdr(skb);

	// The network layer may already have looked up the socket for this packet;
	// if so, the result was recorded in skb->sk
	if (unlikely(sk = skb_steal_sock(skb)))
		return sk;
	else
		// Not looked up yet, do the lookup now
		return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
					 iph->daddr, dport, inet_iif(skb),
					 udptable);
}

@dif: the network device (ifindex) on which the packet arrived
static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
		__be16 sport, __be32 daddr, __be16 dport,
		int dif, struct udp_table *udptable)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	// The destination port number is the hash table key
	unsigned short hnum = ntohs(dport);
	unsigned int hash = udp_hashfn(net, hnum);
	struct udp_hslot *hslot = &udptable->hash[hash];
	int score, badness;

	rcu_read_lock();
begin:
	// Walk the collision chain and remember the highest-scoring socket in result
	result = NULL;
	badness = -1;
	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
		score = compute_score(sk, net, saddr, hnum, sport,
				      daddr, dport, dif);
		if (score > badness) {
			result = sk;
			badness = score;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash)
		goto begin;

	if (result) {
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		else if (unlikely(compute_score(result, net, saddr, hnum, sport,
				  daddr, dport, dif) < badness)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}

Open question: why is a simple table lookup this involved, and what is this score about?
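As for the score: several UDP sockets can legitimately share the same local port (wildcard-bound vs. bound to a specific local address, unconnected vs. connect()ed, with or without a bound device), so the lookup has to pick the most specific match on the chain rather than the first one. The following is a simplified, stand-alone illustration of the idea behind compute_score(), not the kernel function itself; the struct and field names are hypothetical:

#include <stdint.h>

/* Hypothetical plain-C stand-in for the fields compute_score() examines;
 * the real function works directly on struct sock / struct inet_sock. */
struct udp_candidate {
	uint16_t bound_port;		/* local port the socket is bound to */
	uint32_t bound_laddr;		/* local address, 0 = wildcard */
	uint32_t connected_raddr;	/* remote address if connect()ed, else 0 */
	uint16_t connected_rport;	/* remote port if connect()ed, else 0 */
	int      bound_ifindex;		/* bound device index, 0 = any */
};

static int score_candidate(const struct udp_candidate *c,
			   uint32_t saddr, uint16_t sport,
			   uint32_t daddr, uint16_t dport, int dif)
{
	int score = 0;

	if (c->bound_port != dport)
		return -1;		/* wrong port: not a candidate at all */
	if (c->bound_laddr) {
		if (c->bound_laddr != daddr)
			return -1;	/* bound to a different local address */
		score += 2;
	}
	if (c->connected_raddr) {
		if (c->connected_raddr != saddr || c->connected_rport != sport)
			return -1;	/* connected to a different peer */
		score += 2;
	}
	if (c->bound_ifindex) {
		if (c->bound_ifindex != dif)
			return -1;	/* bound to a different device */
		score += 2;
	}
	return score;			/* higher means a more specific match */
}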

2. udp_queue_rcv_skb()

Once the socket corresponding to the packet's destination port has been found, this function is called to receive the packet.

/* returns:
 *  -1: error
 *   0: success
 *  >0: "udp encap" protocol resubmission
 *
 * Note that in the success and error cases, the skb is assumed to
 * have either been requeued or freed.
 */
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	struct udp_sock *up = udp_sk(sk);
	int rc;
	int is_udplite = IS_UDPLITE(sk);

	// IPsec policy check
	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset(skb);
	// UDP encapsulation handling (e.g. IPsec NAT traversal)
	if (up->encap_type) {
		/*
		 * This is an encapsulation socket so pass the skb to
		 * the socket's udp_encap_rcv() hook. Otherwise, just
		 * fall through and pass this up the UDP socket.
		 * up->encap_rcv() returns the following value:
		 * =0 if skb was successfully passed to the encap
		 *    handler or was discarded by it.
		 * >0 if skb should be passed on to UDP.
		 * <0 if skb should be resubmitted as proto -N
		 */

		/* if we're overly short, let UDP handle it */
		if (skb->len > sizeof(struct udphdr) &&
		    up->encap_rcv != NULL) {
			int ret;

			ret = (*up->encap_rcv)(sk, skb);
			if (ret <= 0) {
				UDP_INC_STATS_BH(sock_net(sk),
						 UDP_MIB_INDATAGRAMS,
						 is_udplite);
				return -ret;
			}
		}

		/* FALLTHROUGH -- it's a UDP Packet */
	}

	// UDP-Lite partial checksum coverage handling
	if ((is_udplite & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {

		/*
		 * MIB statistics other than incrementing the error count are
		 * disabled for the following two types of errors: these depend
		 * on the application settings, not on the functioning of the
		 * protocol stack as such.
		 *
		 * RFC 3828 here recommends (sec 3.3): "There should also be a
		 * way ... to ... at least let the receiving application block
		 * delivery of packets with coverage values less than a value
		 * provided by the application."
		 */
		if (up->pcrlen == 0) {          /* full coverage was set  */
			LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
				"%d while full coverage %d requested\n",
				UDP_SKB_CB(skb)->cscov, skb->len);
			goto drop;
		}
		/* The next case involves violating the min. coverage requested
		 * by the receiver. This is subtle: if receiver wants x and x is
		 * greater than the buffersize/MTU then receiver will complain
		 * that it wants x while sender emits packets of smaller size y.
		 * Therefore the above ...()->partial_cov statement is essential.
		 */
		if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
			LIMIT_NETDEBUG(KERN_WARNING
				"UDPLITE: coverage %d too small, need min %d\n",
				UDP_SKB_CB(skb)->cscov, up->pcrlen);
			goto drop;
		}
	}

	// If a socket filter is installed, the checksum must be verified up front so
	// that the filter only ever sees packets whose checksum is valid
	if (sk->sk_filter) {
		if (udp_lib_checksum_complete(skb))
			goto drop;
	}

	rc = 0;

	// Lock the socket
	bh_lock_sock(sk);
	// If the socket is not currently held by a user-space process, the skb can
	// go straight onto the receive queue
	if (!sock_owned_by_user(sk))
		rc = __udp_queue_rcv_skb(sk, skb);
	else
		// The socket is currently held by a process, so park the skb on the
		// backlog queue; backlog skbs are moved to the receive queue in release_sock()
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	return rc;

drop:
	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
	kfree_skb(skb);
	return -1;
}
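The up->pcrlen value checked in the UDP-Lite branch above is the receiver's minimum checksum coverage. For reference, a UDP-Lite receiver would request it roughly like this (a sketch based on the kernel's UDP-Lite documentation; the option constants live in <linux/udp.h> and may need to be defined by hand with older user-space headers, and the chosen coverage value is arbitrary):

#include <netinet/in.h>
#include <sys/socket.h>

#ifndef IPPROTO_UDPLITE
#define IPPROTO_UDPLITE    136	/* also used as the setsockopt level for UDP-Lite */
#endif
#ifndef UDPLITE_RECV_CSCOV
#define UDPLITE_RECV_CSCOV  11	/* value taken from <linux/udp.h> */
#endif

int open_udplite_receiver(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDPLITE);
	int min_cov = 20;	/* refuse datagrams whose checksum covers fewer bytes */

	/* This is what ends up in up->pcrlen, checked in udp_queue_rcv_skb() above. */
	setsockopt(fd, IPPROTO_UDPLITE, UDPLITE_RECV_CSCOV,
		   &min_cov, sizeof(min_cov));
	return fd;
}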

2.1 Queueing data on the receive queue

static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int is_udplite = IS_UDPLITE(sk);
	int rc;

	// Hand the skb to sock_queue_rcv_skb()
	if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
		/* Note that an ENOMEM error is charged twice */
		if (rc == -ENOMEM) {
			// The packet was dropped because of memory pressure; account for it
			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
					 is_udplite);
			atomic_inc(&sk->sk_drops);
		}
		goto drop;
	}

	return 0;

drop:
	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
	kfree_skb(skb);
	return -1;
}

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err = 0;
	int skb_len;

	// Refuse the packet if accepting it would push receive memory past the
	// socket's receive buffer limit
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf) {
		err = -ENOMEM;
		goto out;
	}
	// Run the socket filter callback, if one is installed; a filter rejection
	// fails the receive immediately
	err = sk_filter(sk, skb);
	if (err)
		goto out;
	// Charge the skb's memory to the socket's accounting; if memory cannot be
	// scheduled the receive fails
	if (!sk_rmem_schedule(sk, skb->truesize)) {
		err = -ENOBUFS;
		goto out;
	}

	skb->dev = NULL;
	// The socket takes ownership of the incoming skb
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;
	// Append the skb to the receive queue
	skb_queue_tail(&sk->sk_receive_queue, skb);
	// Invoke the callback to notify processes that may be blocked waiting for data
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
out:
	return err;
}
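A drop at the sk_rcvbuf check above is counted in the UDP MIB (it shows up as RcvbufErrors/InErrors on the Udp line of /proc/net/snmp). It can be provoked from user space simply by shrinking the receive buffer and never reading, as in the following sketch (the 4 KB value is arbitrary; the kernel doubles and clamps whatever is requested):

#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

/* Sketch: a socket that will start dropping datagrams once its (small)
 * receive buffer fills up, because nothing ever reads from it. */
int make_drop_prone_socket(unsigned short port)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int rcvbuf = 4096;	/* arbitrary small value */
	struct sockaddr_in addr = {
		.sin_family      = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
		.sin_port        = htons(port),
	};

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	/* With no recvfrom() calls, sk_rmem_alloc eventually reaches sk_rcvbuf
	 * and sock_queue_rcv_skb() starts returning -ENOMEM for new datagrams. */
	return fd;
}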

2.2 Queueing data on the backlog queue

During bottom-half processing, if the socket is currently locked by a process, the data is first placed on the backlog queue and handled later, when the process releases the socket. This design lets the softirq finish as quickly as possible. (A sketch of how the backlog is drained follows the code below.)

/* The per-socket spinlock must be held here. */
// The caller must already hold the sk_lock.slock spinlock
static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	// Append the skb to the tail of the backlog queue
	if (!sk->sk_backlog.tail) {
		sk->sk_backlog.head = sk->sk_backlog.tail = skb;
	} else {
		sk->sk_backlog.tail->next = skb;
		sk->sk_backlog.tail = skb;
	}
	skb->next = NULL;
}
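For completeness, the drain path mentioned above looks roughly like this (paraphrased from __release_sock() in kernels of this era; for UDP, sk_backlog_rcv() ends up calling __udp_queue_rcv_skb() through the protocol's backlog_rcv hook):

static void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		// Detach the whole backlog, then process it without the
		// bottom-half lock so softirqs can keep queueing new skbs
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			// For UDP this invokes __udp_queue_rcv_skb(), which
			// moves the skb onto sk_receive_queue
			sk_backlog_rcv(sk, skb);
			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);
}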

2.3 Waking up blocked processes

After the data has been placed on the receive queue, any processes blocked waiting for data need to be woken up. This is done through the sk->sk_data_ready() callback seen above; for UDP, that callback is sock_def_readable():

static void sock_def_readable(struct sock *sk, int len)
{
	// Take the callback read lock first
	read_lock(&sk->sk_callback_lock);
	// If any processes are blocked waiting on the socket, wake them up
	if (sk_has_sleeper(sk))
		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	read_unlock(&sk->sk_callback_lock);
}

static inline int sk_has_sleeper(struct sock *sk)
{
	/*
	 * We need to be sure we are in sync with the
	 * add_wait_queue modifications to the wait queue.
	 *
	 * This memory barrier is paired in the sock_poll_wait.
	 */
	smp_mb__after_lock();
	// Blocked processes sleep on the sk->sk_sleep wait queue
	return sk->sk_sleep && waitqueue_active(sk->sk_sleep);
}
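From user space, this wakeup is what makes a blocking recvfrom(), select()/poll(), or epoll_wait() on the socket return. A minimal poll()-based sketch (the port number is an arbitrary illustration value):

#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in addr;
	struct pollfd pfd;
	char buf[2048];

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(5001);	/* arbitrary example port */
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	pfd.fd      = fd;
	pfd.events  = POLLIN;
	pfd.revents = 0;

	/* poll() sleeps on the socket's wait queue; the
	 * wake_up_interruptible_sync_poll(..., POLLIN | ...) call in
	 * sock_def_readable() is what wakes it up. */
	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
		printf("readable: %zd bytes\n", recv(fd, buf, sizeof(buf), 0));
	return 0;
}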