[转自 https://www.cnblogs.com/hustcat/archive/2009/09/17/1568738.html
https://www.cnblogs.com/hustcat/archive/2009/09/17/1568765.html ]
socket入门(1)
1、TCP/IP参考模型
为了实现各种网络的互连,国际标准化组织(ISO)制定了开放式系统互连(OSI)参考模型。尽管OSI的体系结构从理论上讲是比较完整的,但实际上,完全符合OSI各层协议的商用产品却很少进入市场。而使用TCP/IP 协议的产品却大量涌入市场,几乎所有的工作站都配有TCP/IP协议,使得TCP/IP 成为计算机网络的实际的国际标准。
2、套接字(socket)
socket是操作系统的重要组成部分之一,它是网络应用程序的基础。从层次上来说,它位于应用层,是操作系统为应用程序员提供的API,通过它,应用程序可以访问传输层协议。
- socket 位于传输层协议之上,屏蔽了不同网络协议之间的差异;
- socket是网络编程的入口,它提供了大量的系统调用,构成了网络程序的主体;
- 在Linux系统中,socket属于文件系统的一部分,网络通信可以被看作是对文件的读取,使得我们对网络的控制和对文件的控制一样方便。
2.1、套接字地址
在传输层上,通信端点可由Internet上3个参数描述:所用的协议、IP地址和端口号。这些内容由sockaddr描述:
typedef unsigned short sa_family_t;

/* Generic socket address. */
struct sockaddr {
	sa_family_t sa_family;	/* address family, AF_xxx */
	char sa_data[14];	/* 14 bytes of protocol address */
};

/* See /usr/include/netinet/in.h */
/*
 * Socket address of the INET address family.
 * The fixed-width types (uint16_t, uint32_t) come from <stdint.h>; the same
 * uint16_t is used for the port field and for the padding computation below
 * so the two can never disagree.
 */
struct in_addr {
	uint32_t s_addr;
};

struct sockaddr_in {
	sa_family_t sin_family;		/* Address family: AF_INET */
	uint16_t sin_port;		/* Port number */
	struct in_addr sin_addr;	/* Internet address (IP address) */
	/* Pad to size of 'struct sockaddr'. */
	unsigned char sin_zero[sizeof (struct sockaddr) -
			       sizeof (sa_family_t) -
			       sizeof (uint16_t) -
			       sizeof (struct in_addr)];
};
Linux 支持的套接字地址族:
| 套接字地址族 | 描述 |
| --- | --- |
| UNIX | UNIX 域套接字 |
| INET | 通过 TCP/IP 协议支持的 Internet 地址族 |
| AX25 | Amateur radio X25 |
| APPLETALK | Appletalk DDP |
| IPX | Novell IPX |
| X25 | X25 |
Linux 所支持的BSD套接字类型:
| BSD 套接字类型 | 描述 |
| --- | --- |
| 流(stream) | 这种套接字提供了可靠的双向顺序数据流,可保证数据不会在传输过程中丢失、破坏或重复出现。流套接字通过 INET 地址族的 TCP 协议实现。 |
| 数据报(datagram) | 这种套接字也提供双向的数据传输,但是并不对数据的传输提供担保,也就是说,数据可能会以错误的顺序传递,甚至丢失或破坏。这种类型的套接字通过 INET 地址族的 UDP 协议实现。 |
| 原始(raw) | 利用这种类型的套接字,进程可以直接访问底层协议(因此称为原始)。例如,可在某个以太网设备上打开原始套接字,然后获取原始的 IP 数据传输信息。 |
| 可靠发送的消息 | 和数据报套接字类似,但保证数据被正确传输到目的端。 |
| 顺序数据包 | 和流套接字类似,但数据包大小是固定的。 |
| 数据包(packet) | 这并不是标准的 BSD 套接字类型,它是 Linux 专有的 BSD 套接字扩展,可允许进程直接在设备级访问数据包。 |
2.2、套接字操作
套接字(更确切的说是BSD套接字)为应用程序提供了基本的API,这些API是编写网络应用程序的基础。
socket入门(2)
3、套接字的实现
套接字最先是在UNIX的BSD版本实现的,所以也叫做BSD套接字,它隐藏了各个协议之间的差异,并向上提供统一的接口。Linux中实现套接字的基本结构:
3.1、BSD套接字
3.1.1、核心数据结构
为了实现BSD套接字,内核提供一个重要的数据结构struct socket,它的定义如下:
1 //BSD套接字(include/linux/net.h) 2 struct socket { 3 socket_state state; //套接字状态 4 unsigned long flags; 5 struct proto_ops *ops; //操作函数集 6 struct fasync_struct *fasync_list; 7 struct file *file;//每个BSD套接字都有一个inode结点,通过文件对象与其关联起来 8 struct sock *sk; //socket内部结构,与具体的协议簇(比如PF_INET)相关 9 wait_queue_head_t wait; 10 short type; //套接字类型:如SOCK_STREAM, SOCK_DGRAM, SOCK_RAW, SOCK_RDM, SOCK_SEQPACKET, and SOCK_PACKET 11 unsigned char passcred; 12 }; 13 14 //BSD套接字操作函数集 15 struct proto_ops { 16 int family; 17 struct module *owner; 18 int (*release) (struct socket *sock); 19 int (*bind) (struct socket *sock, 20 struct sockaddr *myaddr, 21 int sockaddr_len); 22 int (*connect) (struct socket *sock, 23 struct sockaddr *vaddr, 24 int sockaddr_len, int flags); 25 int (*socketpair)(struct socket *sock1, 26 struct socket *sock2); 27 int (*accept) (struct socket *sock, 28 struct socket *newsock, int flags); 29 int (*getname) (struct socket *sock, 30 struct sockaddr *addr, 31 int *sockaddr_len, int peer); 32 unsigned int (*poll) (struct file *file, struct socket *sock, 33 struct poll_table_struct *wait); 34 int (*ioctl) (struct socket *sock, unsigned int cmd, 35 unsigned long arg); 36 int (*listen) (struct socket *sock, int len); 37 int (*shutdown) (struct socket *sock, int flags); 38 int (*setsockopt)(struct socket *sock, int level, 39 int optname, char __user *optval, int optlen); 40 int (*getsockopt)(struct socket *sock, int level, 41 int optname, char __user *optval, int __user *optlen); 42 int (*sendmsg) (struct kiocb *iocb, struct socket *sock, 43 struct msghdr *m, size_t total_len); 44 int (*recvmsg) (struct kiocb *iocb, struct socket *sock, 45 struct msghdr *m, size_t total_len, 46 int flags); 47 int (*mmap) (struct file *file, struct socket *sock, 48 struct vm_area_struct * vma); 49 ssize_t (*sendpage) (struct socket *sock, struct page *page, 50 int offset, size_t size, int flags); 51 }; 52 //BSD套接字状态 53 typedef enum { 54 SS_FREE = 0, /* not allocated */ 55 SS_UNCONNECTED, /* 
unconnected to any socket */ 56 SS_CONNECTING, /* in process of connecting */ 57 SS_CONNECTED, /* connected to socket */ 58 SS_DISCONNECTING /* in process of disconnecting */ 59 } socket_state;
3.1.2、BSD套接字初始化
1 //net/socket.c 2 //BSD套接字的初始化 3 void __init sock_init(void) 4 { 5 int i; 6 7 /* 8 * Initialize all address (protocol) families. 9 */ 10 11 for (i = 0; i < NPROTO; i++) 12 net_families[i] = NULL; //协议簇数组初始化 13 14 /* 15 * Initialize sock SLAB cache. 16 */ 17 //分配sock缓存 18 sk_init(); 19 20 #ifdef SLAB_SKB 21 /* 22 * Initialize skbuff SLAB cache 23 */ 24 skb_init(); 25 #endif 26 27 /* 28 * Initialize the protocols module. 29 */ 30 31 init_inodecache(); 32 33 //注册sockfs文件系统 34 register_filesystem(&sock_fs_type); 35 //安装sockfs 36 sock_mnt = kern_mount(&sock_fs_type); 37 /* The real protocol initialization is performed when 38 * do_initcalls is run. 39 */ 40 41 #ifdef CONFIG_NETFILTER 42 netfilter_init(); 43 #endif 44 } 45 46 47 //net/socket.c 48 //sockfs文件系统的安装点 49 static struct vfsmount *sock_mnt; 50 //sockfs文件系统类型 51 static struct file_system_type sock_fs_type = { 52 .name = "sockfs", 53 .get_sb = sockfs_get_sb, 54 .kill_sb = kill_anon_super, 55 }; 56 //地址簇及协议信息 57 static struct net_proto_family *net_families[NPROTO];
sock_init在系统初始化时被调用:
3.1.3、BSD套接字的系统调用
实际上,Linux内核只提供了一个与套接字相关的系统调用,即sys_socketcall,应用程序的所有套接字调用都会映射到这个系统调用上。
1 //BSD套接字调用入口(net/socket.c) 2 asmlinkage long sys_socketcall(int call, unsigned long __user *args) 3 { 4 unsigned long a[6]; 5 unsigned long a0,a1; 6 int err; 7 8 if(call<1||call>SYS_RECVMSG) 9 return -EINVAL; 10 11 /* copy_from_user should be SMP safe. */ 12 if (copy_from_user(a, args, nargs[call]))//从用户区拷贝参数 13 return -EFAULT; 14 15 a0=a[0]; 16 a1=a[1]; 17 18 switch(call) //调用相应的函数 19 { 20 case SYS_SOCKET: 21 err = sys_socket(a0,a1,a[2]); 22 break; 23 case SYS_BIND: 24 err = sys_bind(a0,(struct sockaddr __user *)a1, a[2]); 25 break; 26 case SYS_CONNECT: 27 err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]); 28 break; 29 case SYS_LISTEN: 30 err = sys_listen(a0,a1); 31 break; 32 case SYS_ACCEPT: 33 err = sys_accept(a0,(struct sockaddr __user *)a1, (int __user *)a[2]); 34 break; 35 case SYS_GETSOCKNAME: 36 err = sys_getsockname(a0,(struct sockaddr __user *)a1, (int __user *)a[2]); 37 break; 38 case SYS_GETPEERNAME: 39 err = sys_getpeername(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); 40 break; 41 case SYS_SOCKETPAIR: 42 err = sys_socketpair(a0,a1, a[2], (int __user *)a[3]); 43 break; 44 case SYS_SEND: 45 err = sys_send(a0, (void __user *)a1, a[2], a[3]); 46 break; 47 case SYS_SENDTO: 48 err = sys_sendto(a0,(void __user *)a1, a[2], a[3], 49 (struct sockaddr __user *)a[4], a[5]); 50 break; 51 case SYS_RECV: 52 err = sys_recv(a0, (void __user *)a1, a[2], a[3]); 53 break; 54 case SYS_RECVFROM: 55 err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3], 56 (struct sockaddr __user *)a[4], (int __user *)a[5]); 57 break; 58 case SYS_SHUTDOWN: 59 err = sys_shutdown(a0,a1); 60 break; 61 case SYS_SETSOCKOPT: 62 err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); 63 break; 64 case SYS_GETSOCKOPT: 65 err = sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]); 66 break; 67 case SYS_SENDMSG: 68 err = sys_sendmsg(a0, (struct msghdr __user *) a1, a[2]); 69 break; 70 case SYS_RECVMSG: 71 err = sys_recvmsg(a0, (struct msghdr __user *) a1, 
a[2]); 72 break; 73 default: 74 err = -EINVAL; 75 break; 76 } 77 return err; 78 } 79 80 //include/asm/unistd.h 81 #define __NR_socketcall 102 //系统调用号
下面来看一下sys_socket的实现:
1 //net/socket.c 2 /*创建socket 3 **首先建立一个socket数据结构,然后将其“映射”到一个已打开的文件. 4 */ 5 asmlinkage long sys_socket(int family, int type, int protocol) 6 { 7 int retval; 8 struct socket *sock; 9 //创建socket 10 retval = sock_create(family, type, protocol, &sock); 11 if (retval < 0) 12 goto out; 13 //将socket映射到文件描述符 14 retval = sock_map_fd(sock); 15 if (retval < 0) 16 goto out_release; 17 18 out: 19 /* It may be already another descriptor 8) Not kernel problem. */ 20 return retval; 21 22 out_release: 23 sock_release(sock); 24 return retval; 25 } 26 27 int sock_create(int family, int type, int protocol, struct socket **res) 28 { 29 return __sock_create(family, type, protocol, res, 0); 30 } 31 32 static int __sock_create(int family, int type, int protocol, struct socket **res, int kern) 33 { 34 int i; 35 int err; 36 struct socket *sock; 37 38 /* 39 * Check protocol is in range 40 */ 41 //检查协议是否可用 42 if (family < 0 || family >= NPROTO) 43 return -EAFNOSUPPORT; 44 if (type < 0 || type >= SOCK_MAX) 45 return -EINVAL; 46 47 /* Compatibility. 48 49 This uglymoron is moved from INET layer to here to avoid 50 deadlock in module load. 51 */ 52 if (family == PF_INET && type == SOCK_PACKET) { 53 static int warned; 54 if (!warned) { 55 warned = 1; 56 printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET) ", current->comm); 57 } 58 family = PF_PACKET; 59 } 60 61 err = security_socket_create(family, type, protocol, kern); 62 if (err) 63 return err; 64 65 #if defined(CONFIG_KMOD) 66 /* Attempt to load a protocol module if the find failed. 67 * 68 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user 69 * requested real, full-featured networking support upon configuration. 70 * Otherwise module support will break! 
71 */ 72 if (net_families[family]==NULL) 73 { 74 request_module("net-pf-%d",family); 75 } 76 #endif 77 78 net_family_read_lock(); 79 if (net_families[family] == NULL) { 80 i = -EAFNOSUPPORT; 81 goto out; 82 } 83 84 /* 85 * Allocate the socket and allow the family to set things up. if 86 * the protocol is 0, the family is instructed to select an appropriate 87 * default. 88 */ 89 //从sockfs分配一个inode,并为之分配一个套接字结构 90 if (!(sock = sock_alloc())) 91 { 92 printk(KERN_WARNING "socket: no more sockets "); 93 i = -ENFILE; /* Not exactly a match, but its the 94 closest posix thing */ 95 goto out; 96 } 97 //设置类型 98 sock->type = type; 99 100 /* 101 * We will call the ->create function, that possibly is in a loadable 102 * module, so we have to bump that loadable module refcnt first. 103 */ 104 i = -EAFNOSUPPORT; 105 if (!try_module_get(net_families[family]->owner)) 106 goto out_release; 107 108 //调用具体协议的create函数 109 if ((i = net_families[family]->create(sock, protocol)) < 0) 110 goto out_module_put; 111 /* 112 * Now to bump the refcnt of the [loadable] module that owns this 113 * socket at sock_release time we decrement its refcnt. 114 */ 115 if (!try_module_get(sock->ops->owner)) { 116 sock->ops = NULL; 117 goto out_module_put; 118 } 119 /* 120 * Now that we're done with the ->create function, the [loadable] 121 * module can have its refcnt decremented 122 */ 123 module_put(net_families[family]->owner); 124 *res = sock; 125 security_socket_post_create(sock, family, type, protocol, kern); 126 127 out: 128 net_family_read_unlock(); 129 return i; 130 out_module_put: 131 module_put(net_families[family]->owner); 132 out_release: 133 sock_release(sock); 134 goto out; 135 } 136 /////////////////////////////////////////////////////////// 137 138 int sock_map_fd(struct socket *sock) 139 { 140 int fd; 141 struct qstr this; 142 char name[32]; 143 144 /* 145 * Find a file descriptor suitable for return to the user. 
146 */ 147 //分配一个没有使用的描述符 148 fd = get_unused_fd(); 149 if (fd >= 0) { 150 struct file *file = get_empty_filp(); 151 152 if (!file) { 153 put_unused_fd(fd); 154 fd = -ENFILE; 155 goto out; 156 } 157 158 sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino); 159 this.name = name; 160 this.len = strlen(name); 161 this.hash = SOCK_INODE(sock)->i_ino; 162 163 //从sockfs文件系统中分配一个目录项对象 164 file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this); 165 if (!file->f_dentry) { 166 put_filp(file); 167 put_unused_fd(fd); 168 fd = -ENOMEM; 169 goto out; 170 } 171 file->f_dentry->d_op = &sockfs_dentry_operations; 172 173 //将目录项对象与sock的索引节点关联起来 174 d_add(file->f_dentry, SOCK_INODE(sock)); 175 file->f_vfsmnt = mntget(sock_mnt); 176 file->f_mapping = file->f_dentry->d_inode->i_mapping; 177 178 //设置sock对应的文件对象 179 sock->file = file; 180 181 //设置文件对象的操作函数 182 file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops; 183 file->f_mode = FMODE_READ | FMODE_WRITE; 184 file->f_flags = O_RDWR; 185 file->f_pos = 0; 186 fd_install(fd, file); 187 } 188 189 out: 190 return fd; 191 }
3.2、INET套接字
INET套接字就是支持 Internet 地址族的套接字,它位于TCP协议之上, BSD套接字之下,如下:
3.2.1、数据结构
1 //include/net/sock.h 2 //与特定协议相关的socket 3 struct sock { 4 /* 5 * Now struct tcp_tw_bucket also uses sock_common, so please just 6 * don't add nothing before this first member (__sk_common) --acme 7 */ 8 struct sock_common __sk_common; 9 #define sk_family __sk_common.skc_family 10 #define sk_state __sk_common.skc_state 11 #define sk_reuse __sk_common.skc_reuse 12 #define sk_bound_dev_if __sk_common.skc_bound_dev_if 13 #define sk_node __sk_common.skc_node 14 #define sk_bind_node __sk_common.skc_bind_node 15 #define sk_refcnt __sk_common.skc_refcnt 16 volatile unsigned char sk_zapped; 17 unsigned char sk_shutdown; 18 unsigned char sk_use_write_queue; 19 unsigned char sk_userlocks; 20 socket_lock_t sk_lock; 21 int sk_rcvbuf; 22 wait_queue_head_t *sk_sleep; 23 struct dst_entry *sk_dst_cache; 24 rwlock_t sk_dst_lock; 25 struct xfrm_policy *sk_policy[2]; 26 atomic_t sk_rmem_alloc; 27 struct sk_buff_head sk_receive_queue; 28 atomic_t sk_wmem_alloc; 29 struct sk_buff_head sk_write_queue; 30 atomic_t sk_omem_alloc; 31 int sk_wmem_queued; 32 int sk_forward_alloc; 33 unsigned int sk_allocation; 34 int sk_sndbuf; 35 unsigned long sk_flags; 36 char sk_no_check; 37 unsigned char sk_debug; 38 unsigned char sk_rcvtstamp; 39 unsigned char sk_no_largesend; 40 int sk_route_caps; 41 unsigned long sk_lingertime; 42 int sk_hashent; 43 /* 44 * The backlog queue is special, it is always used with 45 * the per-socket spinlock held and requires low latency 46 * access. Therefore we special case it's implementation. 
47 */ 48 struct { 49 struct sk_buff *head; 50 struct sk_buff *tail; 51 } sk_backlog; 52 rwlock_t sk_callback_lock; 53 struct sk_buff_head sk_error_queue; 54 55 struct proto *sk_prot; 56 57 int sk_err, 58 sk_err_soft; 59 unsigned short sk_ack_backlog; 60 unsigned short sk_max_ack_backlog; 61 __u32 sk_priority; 62 unsigned short sk_type; 63 unsigned char sk_localroute; 64 unsigned char sk_protocol; 65 struct ucred sk_peercred; 66 int sk_rcvlowat; 67 long sk_rcvtimeo; 68 long sk_sndtimeo; 69 struct sk_filter *sk_filter; 70 void *sk_protinfo; 71 kmem_cache_t *sk_slab; 72 struct timer_list sk_timer; 73 struct timeval sk_stamp; 74 struct socket *sk_socket; 75 void *sk_user_data; 76 struct module *sk_owner; 77 struct page *sk_sndmsg_page; 78 __u32 sk_sndmsg_off; 79 struct sk_buff *sk_send_head; 80 int sk_write_pending; 81 void *sk_security; 82 __u8 sk_queue_shrunk; 83 /* three bytes hole, try to pack */ 84 void (*sk_state_change)(struct sock *sk); 85 void (*sk_data_ready)(struct sock *sk, int bytes); 86 void (*sk_write_space)(struct sock *sk); 87 void (*sk_error_report)(struct sock *sk); 88 int (*sk_backlog_rcv)(struct sock *sk, 89 struct sk_buff *skb); 90 void (*sk_destruct)(struct sock *sk); 91 }; 92 93 //底层协议的操作函数 94 struct proto { 95 void (*close)(struct sock *sk, 96 long timeout); 97 int (*connect)(struct sock *sk, 98 struct sockaddr *uaddr, 99 int addr_len); 100 int (*disconnect)(struct sock *sk, int flags); 101 102 struct sock * (*accept) (struct sock *sk, int flags, int *err); 103 104 int (*ioctl)(struct sock *sk, int cmd, 105 unsigned long arg); 106 int (*init)(struct sock *sk); 107 int (*destroy)(struct sock *sk); 108 void (*shutdown)(struct sock *sk, int how); 109 int (*setsockopt)(struct sock *sk, int level, 110 int optname, char __user *optval, 111 int optlen); 112 int (*getsockopt)(struct sock *sk, int level, 113 int optname, char __user *optval, 114 int __user *option); 115 int (*sendmsg)(struct kiocb *iocb, struct sock *sk, 116 struct msghdr *msg, size_t 
len); 117 int (*recvmsg)(struct kiocb *iocb, struct sock *sk, 118 struct msghdr *msg, 119 size_t len, int noblock, int flags, 120 int *addr_len); 121 int (*sendpage)(struct sock *sk, struct page *page, 122 int offset, size_t size, int flags); 123 int (*bind)(struct sock *sk, 124 struct sockaddr *uaddr, int addr_len); 125 126 int (*backlog_rcv) (struct sock *sk, 127 struct sk_buff *skb); 128 129 /* Keeping track of sk's, looking them up, and port selection methods. */ 130 void (*hash)(struct sock *sk); 131 void (*unhash)(struct sock *sk); 132 int (*get_port)(struct sock *sk, unsigned short snum); 133 134 /* Memory pressure */ 135 void (*enter_memory_pressure)(void); 136 atomic_t *memory_allocated; /* Current allocated memory. */ 137 atomic_t *sockets_allocated; /* Current number of sockets. */ 138 /* 139 * Pressure flag: try to collapse. 140 * Technical note: it is used by multiple contexts non atomically. 141 * All the sk_stream_mem_schedule() is of this nature: accounting 142 * is strict, actions are advisory and have some latency. 143 */ 144 int *memory_pressure; 145 int *sysctl_mem; 146 int *sysctl_wmem; 147 int *sysctl_rmem; 148 int max_header; 149 150 kmem_cache_t *slab; 151 int slab_obj_size; 152 153 struct module *owner; 154 155 char name[32]; 156 157 struct { 158 int inuse; 159 u8 __pad[SMP_CACHE_BYTES - sizeof(int)]; 160 } stats[NR_CPUS]; 161 };
inet_init()函数:
1 //net/ipv4/af_inet.c 2 /*系统初始化时被调用 3 **调用路径:start_kernel() -->init() -->do_basic_setup() -->do_initcalls()-->inet_init() 4 */ 5 static int __init inet_init(void) 6 { 7 struct sk_buff *dummy_skb; 8 struct inet_protosw *q; 9 struct list_head *r; 10 int rc = -EINVAL; 11 12 if (sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)) { 13 printk(KERN_CRIT "%s: panic ", __FUNCTION__); 14 goto out; 15 } 16 17 rc = sk_alloc_slab(&tcp_prot, "tcp_sock"); 18 if (rc) { 19 sk_alloc_slab_error(&tcp_prot); 20 goto out; 21 } 22 rc = sk_alloc_slab(&udp_prot, "udp_sock"); 23 if (rc) { 24 sk_alloc_slab_error(&udp_prot); 25 goto out_tcp_free_slab; 26 } 27 rc = sk_alloc_slab(&raw_prot, "raw_sock"); 28 if (rc) { 29 sk_alloc_slab_error(&raw_prot); 30 goto out_udp_free_slab; 31 } 32 33 /* 34 * Tell SOCKET that we are alive 35 */ 36 //注册Internet协议簇的相关信息 37 (void)sock_register(&inet_family_ops); 38 39 /* 40 * Add all the base protocols. 41 */ 42 //添加基本的协议 43 if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) 44 printk(KERN_CRIT "inet_init: Cannot add ICMP protocol "); 45 if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) 46 printk(KERN_CRIT "inet_init: Cannot add UDP protocol "); 47 if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) 48 printk(KERN_CRIT "inet_init: Cannot add TCP protocol "); 49 #ifdef CONFIG_IP_MULTICAST 50 if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) 51 printk(KERN_CRIT "inet_init: Cannot add IGMP protocol "); 52 #endif 53 54 /* Register the socket-side information for inet_create. */ 55 for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) 56 INIT_LIST_HEAD(r); 57 58 //将inetsw_array中元素加入到inetsw链表中 59 for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) 60 inet_register_protosw(q); 61 62 /* 63 * Set the ARP module up 64 */ 65 66 arp_init(); //ARP协议初始化 67 68 /* 69 * Set the IP module up 70 */ 71 72 ip_init(); //IP协议初始化 73 74 tcp_v4_init(&inet_family_ops); 75 76 /* Setup TCP slab cache for open requests. 
*/ 77 tcp_init(); 78 79 80 /* 81 * Set the ICMP layer up 82 */ 83 84 icmp_init(&inet_family_ops); 85 86 /* 87 * Initialise the multicast router 88 */ 89 #if defined(CONFIG_IP_MROUTE) 90 ip_mr_init(); 91 #endif 92 /* 93 * Initialise per-cpu ipv4 mibs 94 */ 95 96 if(init_ipv4_mibs()) 97 printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs "); ; 98 99 ipv4_proc_init(); 100 101 ipfrag_init(); 102 103 rc = 0; 104 out: 105 return rc; 106 out_tcp_free_slab: 107 sk_free_slab(&tcp_prot); 108 out_udp_free_slab: 109 sk_free_slab(&udp_prot); 110 goto out; 111 } 112 113 //net/ipv4/af_inet.c 114 //INET协议簇信息 115 static struct net_proto_family inet_family_ops = { 116 .family = PF_INET, 117 .create = inet_create, 118 .owner = THIS_MODULE, 119 }; 120 121 static struct list_head inetsw[SOCK_MAX]; 122 //该数组中的所有元素都会插入到inetsw的链表中 123 static struct inet_protosw inetsw_array[] = 124 { 125 { 126 .type = SOCK_STREAM, 127 .protocol = IPPROTO_TCP, 128 .prot = &tcp_prot, 129 .ops = &inet_stream_ops, 130 .capability = -1, 131 .no_check = 0, 132 .flags = INET_PROTOSW_PERMANENT, 133 }, 134 135 { 136 .type = SOCK_DGRAM, 137 .protocol = IPPROTO_UDP, 138 .prot = &udp_prot, 139 .ops = &inet_dgram_ops, 140 .capability = -1, 141 .no_check = UDP_CSUM_DEFAULT, 142 .flags = INET_PROTOSW_PERMANENT, 143 }, 144 145 146 { 147 .type = SOCK_RAW, 148 .protocol = IPPROTO_IP, /* wild card */ 149 .prot = &raw_prot, 150 .ops = &inet_sockraw_ops, 151 .capability = CAP_NET_RAW, 152 .no_check = UDP_CSUM_DEFAULT, 153 .flags = INET_PROTOSW_REUSE, 154 } 155 }; 156 157 //流套接字操作函数 158 struct proto_ops inet_stream_ops = { 159 .family = PF_INET, 160 .owner = THIS_MODULE, 161 .release = inet_release, 162 .bind = inet_bind, 163 .connect = inet_stream_connect, 164 .socketpair = sock_no_socketpair, 165 .accept = inet_accept, 166 .getname = inet_getname, 167 .poll = tcp_poll, 168 .ioctl = inet_ioctl, 169 .listen = inet_listen, 170 .shutdown = inet_shutdown, 171 .setsockopt = sock_common_setsockopt, 172 .getsockopt = 
sock_common_getsockopt, 173 .sendmsg = inet_sendmsg, 174 .recvmsg = sock_common_recvmsg, 175 .mmap = sock_no_mmap, 176 .sendpage = tcp_sendpage 177 }; 178 //tcp协议 179 static struct net_protocol tcp_protocol = { 180 .handler = tcp_v4_rcv, 181 .err_handler = tcp_v4_err, 182 .no_policy = 1, 183 }; 184 185 static struct net_protocol udp_protocol = { 186 .handler = udp_rcv, 187 .err_handler = udp_err, 188 .no_policy = 1, 189 }; 190 191 static struct net_protocol icmp_protocol = { 192 .handler = icmp_rcv, 193 }; 194 195 196 //net/ipv4/tcp_ipv4.c 197 //tcp协议的操作函数 198 struct proto tcp_prot = { 199 .name = "TCP", 200 .owner = THIS_MODULE, 201 .close = tcp_close, 202 .connect = tcp_v4_connect, 203 .disconnect = tcp_disconnect, 204 .accept = tcp_accept, 205 .ioctl = tcp_ioctl, 206 .init = tcp_v4_init_sock, 207 .destroy = tcp_v4_destroy_sock, 208 .shutdown = tcp_shutdown, 209 .setsockopt = tcp_setsockopt, 210 .getsockopt = tcp_getsockopt, 211 .sendmsg = tcp_sendmsg, 212 .recvmsg = tcp_recvmsg, 213 .backlog_rcv = tcp_v4_do_rcv, 214 .hash = tcp_v4_hash, 215 .unhash = tcp_unhash, 216 .get_port = tcp_v4_get_port, 217 .enter_memory_pressure = tcp_enter_memory_pressure, 218 .sockets_allocated = &tcp_sockets_allocated, 219 .memory_allocated = &tcp_memory_allocated, 220 .memory_pressure = &tcp_memory_pressure, 221 .sysctl_mem = sysctl_tcp_mem, 222 .sysctl_wmem = sysctl_tcp_wmem, 223 .sysctl_rmem = sysctl_tcp_rmem, 224 .max_header = MAX_TCP_HEADER, 225 .slab_obj_size = sizeof(struct tcp_sock), 226 };
sock_register()函数:
1 //注册协议簇 2 int sock_register(struct net_proto_family *ops) 3 { 4 int err; 5 6 if (ops->family >= NPROTO) { 7 printk(KERN_CRIT "protocol %d >= NPROTO(%d) ", ops->family, NPROTO); 8 return -ENOBUFS; 9 } 10 net_family_write_lock(); 11 err = -EEXIST; 12 if (net_families[ops->family] == NULL) { 13 net_families[ops->family]=ops; 14 err = 0; 15 } 16 net_family_write_unlock(); 17 printk(KERN_INFO "NET: Registered protocol family %d ", 18 ops->family); 19 return err; 20 }
inet_create()函数
1 //创建一个INET套接字 2 static int inet_create(struct socket *sock, int protocol) 3 { 4 struct sock *sk; 5 struct list_head *p; 6 struct inet_protosw *answer; 7 struct inet_opt *inet; 8 struct proto *answer_prot; 9 unsigned char answer_flags; 10 char answer_no_check; 11 int err; 12 13 sock->state = SS_UNCONNECTED; 14 15 /* Look for the requested type/protocol pair. */ 16 answer = NULL; 17 rcu_read_lock(); 18 list_for_each_rcu(p, &inetsw[sock->type]) { 19 answer = list_entry(p, struct inet_protosw, list); 20 21 /* Check the non-wild match. */ 22 if (protocol == answer->protocol) { 23 if (protocol != IPPROTO_IP) 24 break; 25 } else { 26 /* Check for the two wild cases. */ 27 if (IPPROTO_IP == protocol) { 28 protocol = answer->protocol; 29 break; 30 } 31 if (IPPROTO_IP == answer->protocol) 32 break; 33 } 34 answer = NULL; 35 } 36 37 err = -ESOCKTNOSUPPORT; 38 if (!answer) 39 goto out_rcu_unlock; 40 err = -EPERM; 41 if (answer->capability > 0 && !capable(answer->capability)) 42 goto out_rcu_unlock; 43 err = -EPROTONOSUPPORT; 44 if (!protocol) 45 goto out_rcu_unlock; 46 47 //BSD socket的操作函数 48 sock->ops = answer->ops; 49 answer_prot = answer->prot; 50 51 answer_no_check = answer->no_check; 52 answer_flags = answer->flags; 53 rcu_read_unlock(); 54 55 BUG_TRAP(answer_prot->slab != NULL); 56 57 err = -ENOBUFS; 58 sk = sk_alloc(PF_INET, GFP_KERNEL, 59 answer_prot->slab_obj_size, 60 answer_prot->slab); 61 if (sk == NULL) 62 goto out; 63 64 err = 0; 65 //特定协议套接字的操作函数 66 sk->sk_prot = answer_prot; 67 sk->sk_no_check = answer_no_check; 68 if (INET_PROTOSW_REUSE & answer_flags) 69 sk->sk_reuse = 1; 70 71 inet = inet_sk(sk); 72 73 if (SOCK_RAW == sock->type) { 74 inet->num = protocol; 75 if (IPPROTO_RAW == protocol) 76 inet->hdrincl = 1; 77 } 78 79 if (ipv4_config.no_pmtu_disc) 80 inet->pmtudisc = IP_PMTUDISC_DONT; 81 else 82 inet->pmtudisc = IP_PMTUDISC_WANT; 83 84 inet->id = 0; 85 //将sock与sk关联起来 86 sock_init_data(sock, sk); 87 sk_set_owner(sk, sk->sk_prot->owner); 88 89 
sk->sk_destruct = inet_sock_destruct; 90 sk->sk_family = PF_INET; 91 sk->sk_protocol = protocol; 92 sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; 93 94 inet->uc_ttl = -1; 95 inet->mc_loop = 1; 96 inet->mc_ttl = 1; 97 inet->mc_index = 0; 98 inet->mc_list = NULL; 99 100 #ifdef INET_REFCNT_DEBUG 101 atomic_inc(&inet_sock_nr); 102 #endif 103 104 if (inet->num) { 105 /* It assumes that any protocol which allows 106 * the user to assign a number at socket 107 * creation time automatically 108 * shares. 109 */ 110 inet->sport = htons(inet->num); 111 /* Add to protocol hash chains. */ 112 sk->sk_prot->hash(sk); 113 } 114 //调用init函数 115 if (sk->sk_prot->init) { 116 err = sk->sk_prot->init(sk); 117 if (err) 118 sk_common_release(sk); 119 } 120 out: 121 return err; 122 out_rcu_unlock: 123 rcu_read_unlock(); 124 goto out; 125 }