Linux内核网络栈实现分析（十）网络层之IP协议（下）

Linux内核网络栈实现分析（十）网络层之IP协议（下）
本文分析基于Linux Kernel 1.2.13

原创作品，转载请标明http://blog.csdn.net/yming0221/article/details/7552455

更多请查看专栏，地址http://blog.csdn.net/column/details/linux-kernel-net.html

作者：闫明

注：标题中的“（上）”、“（下）”表示分析过程所依据的数据包传递方向：“（上）”表示从底层向上分析，“（下）”表示从上层向下分析。

上篇博文分析到，传输层最终会调用ip_queue_xmit()函数，将发送数据的任务交给网络层，下面就来分析该函数：

该函数的主要函数调用关系图如下：
[cpp] view plain copy
1. /*
2. * Queues a packet to be sent, and starts the transmitter
3. * if necessary. if free = 1 then we free the block after
4. * transmit, otherwise we don't. If free==2 we not only
5. * free the block but also don't assign a new ip seq number.
6. * This routine also needs to put in the total length,
7. * and compute the checksum
8. */
10. void ip_queue_xmit(struct sock *sk, //发送数据的队列所对应的sock结构
11. struct device *dev,//发送该数据包的网卡设备
12. struct sk_buff *skb,//封装好的sk_buff结构，要发送的数据在该结构中
13. int free)//主要配合TCP协议使用，用于数据包的重发，UDP等协议调用时free=1
14. {
15. struct iphdr *iph;//IP数据报首部指针
16. unsigned char *ptr;
18. /* Sanity check */
19. if (dev == NULL)
20. {
21. printk("IP: ip_queue_xmit dev = NULL\n");
22. return;
23. }
25. IS_SKB(skb);
27. /*
28. * Do some book-keeping in the packet for later
29. */
32. skb->dev = dev;//进一步完善sk_buff的相应字段
33. skb->when = jiffies;//用于TCP协议的超时重传
35. /*
36. * Find the IP header and set the length. This is bad
37. * but once we get the skb data handling code in the
38. * hardware will push its header sensibly and we will
39. * set skb->ip_hdr to avoid this mess and the fixed
40. * header length problem
41. */
43. ptr = skb->data;//指针指向sk_buff中的数据部分
44. ptr += dev->hard_header_len;//hard_header_len为硬件首部长度，在net_init.c的函数eth_setup()函数中设置的，dev->hard_header_len = ETH_HLEN; 以太网首部长度为14
45. iph = (struct iphdr *)ptr;//ptr此时已经指向IP数据报的首部
46. skb->ip_hdr = iph;
47. iph->tot_len = ntohs(skb->len-dev->hard_header_len);//计算IP数据报的总长度
49. #ifdef CONFIG_IP_FIREWALL
50. if(ip_fw_chk(iph, dev, ip_fw_blk_chain, ip_fw_blk_policy, 0) != 1)
51. /* just don't send this packet */
52. return;
53. #endif
55. /*
56. * No reassigning numbers to fragments...
57. */
59. if(free!=2)
60. iph->id = htons(ip_id_count++);
61. else
62. free=1;
64. /* All buffers without an owner socket get freed */
65. if (sk == NULL)
66. free = 1;
68. skb->free = free;//设置skb的free值，free=1，发送后立即释放；free=2，不但释放缓存，而且不分配新的序列号
70. /*
71. * Do we need to fragment. Again this is inefficient.
72. * We need to somehow lock the original buffer and use
73. * bits of it.
74. */
75. //数据帧中的数据部分必须小于等于MTU
76. if(skb->len > dev->mtu + dev->hard_header_len)//发送的数据长度大于数据帧的数据部分和帧首部之和，则需要分片
77. {
78. ip_fragment(sk,skb,dev,0);//对数据报分片后继续调用ip_queue_xmit()函数发送数据
79. IS_SKB(skb);
80. kfree_skb(skb,FREE_WRITE);
81. return;
82. }
84. /*
85. * Add an IP checksum
86. */
88. ip_send_check(iph);//计算并填写IP数据报首部校验和
90. /*
91. * Print the frame when debugging
92. */
94. /*
95. * More debugging. You cannot queue a packet already on a list
96. * Spot this and moan loudly.
97. */
98. if (skb->next != NULL)//说明该数据包仍然存在于某个缓存队列
99. {
100. printk("ip_queue_xmit: next != NULL\n");
101. skb_unlink(skb);//将其从缓存链表中删除，否则可能导致内核错误
102. }
104. /*
105. * If a sender wishes the packet to remain unfreed
106. * we add it to his send queue. This arguably belongs
107. * in the TCP level since nobody else uses it. BUT
108. * remember IPng might change all the rules.
109. */
111. if (!free)//free=0
112. {
113. unsigned long flags;
114. /* The socket now has more outstanding blocks */
116. sk->packets_out++;
118. /* Protect the list for a moment */
119. save_flags(flags);
120. cli();
122. if (skb->link3 != NULL)//link3指向数据报的重发队列
123. {
124. printk("ip.c: link3 != NULL\n");
125. skb->link3 = NULL;
126. }
127. //sk中send_tail和send_head是用户缓存的单向链表表尾和表头
128. if (sk->send_head == NULL)
129. {
130. sk->send_tail = skb;
131. sk->send_head = skb;
132. }
133. else
134. {
135. sk->send_tail->link3 = skb;//link3指针用于数据包的连接
136. sk->send_tail = skb;
137. }
138. /* skb->link3 is NULL */
140. /* Interrupt restore */
141. restore_flags(flags);
142. }
143. else
144. /* Remember who owns the buffer */
145. skb->sk = sk;
147. /*
148. * If the indicated interface is up and running, send the packet.
149. */
151. ip_statistics.IpOutRequests++;
152. #ifdef CONFIG_IP_ACCT
153. ip_acct_cnt(iph,dev, ip_acct_chain);
154. #endif
156. #ifdef CONFIG_IP_MULTICAST //这部分是IP数据报的多播处理
158. /*
159. * Multicasts are looped back for other local users
160. */
162. .......................................
163. #endif
164. if((dev->flags&IFF_BROADCAST) && iph->daddr==dev->pa_brdaddr && !(dev->flags&IFF_LOOPBACK))//广播数据包的处理
165. ip_loopback(dev,skb);
167. if (dev->flags & IFF_UP)//设备状态正常
168. {
169. /*
170. * If we have an owner use its priority setting,
171. * otherwise use NORMAL
172. */
173. //调用设备接口层函数发送数据: dev_queue_xmit()函数
174. if (sk != NULL)
175. {
176. dev_queue_xmit(skb, dev, sk->priority);
177. }
178. else
179. {
180. dev_queue_xmit(skb, dev, SOPRI_NORMAL);
181. }
182. }
183. else//设备状态不正常
184. {
185. ip_statistics.IpOutDiscards++;
186. if (free)
187. kfree_skb(skb, FREE_WRITE);
188. }
189. }
这个函数中调用ip_fragment()函数对长度过长的数据包进行分片，这里不对该函数作详细分析，仅列出其代码：
[cpp] view plain copy
1. void ip_fragment(struct sock *sk, struct sk_buff *skb, struct device *dev, int is_frag)
2. {
3. struct iphdr *iph;
4. unsigned char *raw;
5. unsigned char *ptr;
6. struct sk_buff *skb2;
7. int left, mtu, hlen, len;
8. int offset;
9. unsigned long flags;
11. /*
12. * Point into the IP datagram header.
13. */
15. raw = skb->data;
16. iph = (struct iphdr *) (raw + dev->hard_header_len);
18. skb->ip_hdr = iph;
20. /*
21. * Setup starting values.
22. */
24. hlen = (iph->ihl * sizeof(unsigned long));
25. left = ntohs(iph->tot_len) - hlen; /* Space per frame */
26. hlen += dev->hard_header_len; /* Total header size */
27. mtu = (dev->mtu - hlen); /* Size of data space */
28. ptr = (raw + hlen); /* Where to start from */
30. /*
31. * Check for any "DF" flag. [DF means do not fragment]
32. */
34. if (ntohs(iph->frag_off) & IP_DF)
35. {
36. /*
37. * Reply giving the MTU of the failed hop.
38. */
39. ip_statistics.IpFragFails++;
40. icmp_send(skb,ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, dev->mtu, dev);
41. return;
42. }
44. /*
45. * The protocol doesn't seem to say what to do in the case that the
46. * frame + options doesn't fit the mtu. As it used to fall down dead
47. * in this case we were fortunate it didn't happen
48. */
50. if(mtu<8)
51. {
52. /* It's wrong but it's better than nothing */
53. icmp_send(skb,ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED,dev->mtu, dev);
54. ip_statistics.IpFragFails++;
55. return;
56. }
58. /*
59. * Fragment the datagram.
60. */
62. /*
63. * The initial offset is 0 for a complete frame. When
64. * fragmenting fragments it's wherever this one starts.
65. */
67. if (is_frag & 2)
68. offset = (ntohs(iph->frag_off) & 0x1fff) << 3;
69. else
70. offset = 0;
73. /*
74. * Keep copying data until we run out.
75. */
77. while(left > 0)
78. {
79. len = left;
80. /* IF: it doesn't fit, use 'mtu' - the data space left */
81. if (len > mtu)
82. len = mtu;
83. /* IF: we are not sending upto and including the packet end
84. then align the next start on an eight byte boundary */
85. if (len < left)
86. {
87. len/=8;
88. len*=8;
89. }
90. /*
91. * Allocate buffer.
92. */
94. if ((skb2 = alloc_skb(len + hlen,GFP_ATOMIC)) == NULL)
95. {
96. printk("IP: frag: no memory for new fragment!\n");
97. ip_statistics.IpFragFails++;
98. return;
99. }
101. /*
102. * Set up data on packet
103. */
105. skb2->arp = skb->arp;
106. if(skb->free==0)
107. printk("IP fragmenter: BUG free!=1 in fragmenter\n");
108. skb2->free = 1;
109. skb2->len = len + hlen;
110. skb2->h.raw=(char *) skb2->data;
111. /*
112. * Charge the memory for the fragment to any owner
113. * it might possess
114. */
116. save_flags(flags);
117. if (sk)
118. {
119. cli();
120. sk->wmem_alloc += skb2->mem_len;
121. skb2->sk=sk;
122. }
123. restore_flags(flags);
124. skb2->raddr = skb->raddr; /* For rebuild_header - must be here */
126. /*
127. * Copy the packet header into the new buffer.
128. */
130. memcpy(skb2->h.raw, raw, hlen);
132. /*
133. * Copy a block of the IP datagram.
134. */
135. memcpy(skb2->h.raw + hlen, ptr, len);
136. left -= len;
138. skb2->h.raw+=dev->hard_header_len;
140. /*
141. * Fill in the new header fields.
142. */
143. iph = (struct iphdr *)(skb2->h.raw/*+dev->hard_header_len*/);
144. iph->frag_off = htons((offset >> 3));
145. /*
146. * Added AC : If we are fragmenting a fragment thats not the
147. * last fragment then keep MF on each bit
148. */
149. if (left > 0 || (is_frag & 1))
150. iph->frag_off |= htons(IP_MF);
151. ptr += len;
152. offset += len;
154. /*
155. * Put this fragment into the sending queue.
156. */
158. ip_statistics.IpFragCreates++;
160. ip_queue_xmit(sk, dev, skb2, 2);//还是调用ip_queue_xmit()函数来发送分片后的数据
161. }
162. ip_statistics.IpFragOKs++;
163. }
网络层的发送函数调用了设备接口层（相当于网络模型中的链路层）的发送函数dev_queue_xmit()。

该函数的调用关系如下：
[cpp] view plain copy
1. /*
2. * Send (or queue for sending) a packet.
3. *
4. * IMPORTANT: When this is called to resend frames. The caller MUST
5. * already have locked the sk_buff. Apart from that we do the
6. * rest of the magic.
7. */
9. void dev_queue_xmit(struct sk_buff *skb, struct device *dev, int pri)
10. {
11. unsigned long flags;
12. int nitcount;
13. struct packet_type *ptype;
14. int where = 0; /* used to say if the packet should go */
15. /* at the front or the back of the */
16. /* queue - front is a retransmit try */
17. /* where=0 表示是刚从上层传递的新数据包；where=1 表示从硬件队列中取出的数据包*/
19. if (dev == NULL)
20. {
21. printk("dev.c: dev_queue_xmit: dev = NULL\n");
22. return;
23. }
25. if(pri>=0 && !skb_device_locked(skb))//锁定该skb再进行操作，避免造成内核的不一致情况
26. skb_device_lock(skb); /* Shove a lock on the frame */
27. #ifdef CONFIG_SLAVE_BALANCING
28. save_flags(flags);
29. cli();
30. if(dev->slave!=NULL && dev->slave->pkt_queue < dev->pkt_queue &&
31. (dev->slave->flags & IFF_UP))
32. dev=dev->slave;
33. restore_flags(flags);
34. #endif
35. #ifdef CONFIG_SKB_CHECK
36. IS_SKB(skb);
37. #endif
38. skb->dev = dev;
40. /*
41. * This just eliminates some race conditions, but not all...
42. */
44. if (skb->next != NULL) //这种条件似乎永远不能成立，因为发送数据包前，数据包已经从缓存队列摘下
45. {//以防内核代码有BUG
46. /*
47. * Make sure we haven't missed an interrupt.
48. */
49. printk("dev_queue_xmit: worked around a missed interrupt\n");
50. start_bh_atomic();
51. dev->hard_start_xmit(NULL, dev);
52. end_bh_atomic();
53. return;
54. }
56. /*
57. * Negative priority is used to flag a frame that is being pulled from the
58. * queue front as a retransmit attempt. It therefore goes back on the queue
59. * start on a failure.
60. */
62. if (pri < 0) //优先级小于0表示是从硬件队列中取出的数据包
63. {
64. pri = -pri-1;
65. where = 1;
66. }
68. if (pri >= DEV_NUMBUFFS)
69. {
70. printk("bad priority in dev_queue_xmit.\n");
71. pri = 1;
72. }
74. /*
75. * If the address has not been resolved. Call the device header rebuilder.
76. * This can cover all protocols and technically not just ARP either.
77. */
79. if (!skb->arp && dev->rebuild_header(skb->data, dev, skb->raddr, skb)) {//用于ARP协议，并重建MAC帧首部
80. return;
81. }
83. save_flags(flags);
84. cli();
85. if (!where) {//表示是新数据包，需要将其加入设备队列中
86. #ifdef CONFIG_SLAVE_BALANCING
87. skb->in_dev_queue=1;//该数据包在设备队列
88. #endif
89. skb_queue_tail(dev->buffs + pri,skb);//将发送数据包加入硬件队列
90. skb_device_unlock(skb); /* Buffer is on the device queue and can be freed safely */
91. skb = skb_dequeue(dev->buffs + pri);//从硬件队列中取出一个数据包
92. skb_device_lock(skb); /* New buffer needs locking down */
93. #ifdef CONFIG_SLAVE_BALANCING
94. skb->in_dev_queue=0;
95. #endif
96. }
97. restore_flags(flags);
99. /* copy outgoing packets to any sniffer packet handlers */
100. if(!where)//对于新的数据包，则遍历网络层协议队列，内核支持混杂模式
101. {
102. for (nitcount= dev_nit, ptype = ptype_base; nitcount > 0 && ptype != NULL; ptype = ptype->next)
103. {
104. /* Never send packets back to the socket
105. * they originated from - MvS (miquels@drinkel.ow.org)
106. */
107. if (ptype->type == htons(ETH_P_ALL) &&
108. (ptype->dev == dev || !ptype->dev) &&
109. ((struct sock *)ptype->data != skb->sk))
110. {
111. struct sk_buff *skb2;
112. if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL)
113. break;
114. /*
115. * The protocol knows this has (for other paths) been taken off
116. * and adds it back.
117. */
118. skb2->len-=skb->dev->hard_header_len;
119. ptype->func(skb2, skb->dev, ptype);//IP层函数对应func为ip_rcv()，将发送的数据回送一份给对应的网络层协议
120. nitcount--;//用于及时退出循环
121. }
122. }
123. }
124. start_bh_atomic();//开始原子操作
125. if (dev->hard_start_xmit(skb, dev) == 0) {//调用硬件的发送函数发送数据
126. end_bh_atomic();//结束原子操作
127. /*
128. * Packet is now solely the responsibility of the driver
129. */
130. return;//到这里说明数据包成功发送
131. }
132. //数据包没有成功发送，进行处理，将数据包重新加入硬件队列
133. end_bh_atomic();
135. /*
136. * Transmission failed, put skb back into a list. Once on the list it's safe and
137. * no longer device locked (it can be freed safely from the device queue)
138. */
139. cli();
140. #ifdef CONFIG_SLAVE_BALANCING
141. skb->in_dev_queue=1;
142. dev->pkt_queue++;
143. #endif
144. skb_device_unlock(skb);//对SKB解锁
145. skb_queue_head(dev->buffs + pri,skb);//这次采用头插法插入硬件发送队列
146. restore_flags(flags);
147. }
具体的硬件发送函数dev->hard_start_xmit的实现将在下篇博文中分析。
相关阅读:
Object-C（自学1）
在vue-cli@3.X中配置代理解决开发环境的跨域问题
 记一次发布/更新npm包的过程及包版本管理
 MAC OS上开启Nginx静态文件服务器
 vuecli3打包部署非根目录下配置vue.config.js publicPath
使用Anywhere开启一个nodejs静态文件服务器
 搭建node服务端并使用express()创建简单数据接口，最后返回前端请求的所需数据
 对正反向代理对理解
 Mac查看Python安装路径和版本
 onBlur方法在iOS和Android平台上的差异
原文地址：https://www.cnblogs.com/wangfengju/p/6173197.html