结论
基于unix“一切皆文件”的思想,创建名为sockfs的伪文件系统(挂载点为socket:,由于不是/,所以无法直接看到,但可以通过ls -l /proc/$pid/fd/看到),实现通用文件系统的操作接口,这些接口中有socket对象的创建接口,令socket对象在创建后与file对象关联并返回相应的fd,从而使得通用的与文件相关的syscall(read/write)可以用于socket。
核心数据结构
/** * struct socket - general BSD socket * @state: socket state (%SS_CONNECTED, etc) * @type: socket type (%SOCK_STREAM, etc) * @flags: socket flags (%SOCK_NOSPACE, etc) * @ops: protocol specific socket operations * @file: File back pointer for gc * @sk: internal networking protocol agnostic socket representation * @wq: wait queue for several uses */struct socket { socket_state state; short type; unsigned long flags; struct socket_wq *wq; struct file *file; struct sock *sk; const struct proto_ops *ops;};
初始化
core_initcall(sock_init); /* early initcall */static int __init sock_init(void){ ... err = register_filesystem(&sock_fs_type); if (err) goto out_fs; sock_mnt = kern_mount(&sock_fs_type); ...}static struct dentry *sockfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data){ return mount_pseudo_xattr(fs_type, "socket:", &sockfs_ops, sockfs_xattr_handlers, &sockfs_dentry_operations, SOCKFS_MAGIC);}static struct vfsmount *sock_mnt __read_mostly;static struct file_system_type sock_fs_type = { .name = "sockfs", .mount = sockfs_mount, .kill_sb = kill_anon_super,};static const struct super_operations sockfs_ops = { .alloc_inode = sock_alloc_inode, .destroy_inode = sock_destroy_inode, .statfs = simple_statfs,};
从代码中得知,伪文件系统名为sockfs,挂载点为socket:。
创建socket
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol){ return __sys_socket(family, type, protocol);}int __sys_socket(int family, int type, int protocol){ int retval; struct socket *sock; int flags; /* Check the SOCK_* constants for consistency. */ BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; type &= SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; retval = sock_create(family, type, protocol, &sock); if (retval < 0) return retval; return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));}int sock_create(int family, int type, int protocol, struct socket **res){ return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);}EXPORT_SYMBOL(sock_create);int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern){ ... sock = sock_alloc(); if (!sock) { net_warn_ratelimited("socket: no more sockets\n"); return -ENFILE; /* Not exactly a match, but its the closest posix thing */ } sock->type = type;#ifdef CONFIG_MODULES /* Attempt to load a protocol module if the find failed. * * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user * requested real, full-featured networking support upon configuration. * Otherwise module support will break! */ if (rcu_access_pointer(net_families[family]) == NULL) request_module("net-pf-%d", family);#endif rcu_read_lock(); pf = rcu_dereference(net_families[family]); err = -EAFNOSUPPORT; if (!pf) goto out_release; /* * We will call the ->create function, that possibly is in a loadable * module, so we have to bump that loadable module refcnt first. */ if (!try_module_get(pf->owner)) goto out_release; /* Now protected by module ref count */ rcu_read_unlock(); err = pf->create(net, sock, protocol, kern); if (err < 0) goto out_module_put; /* * Now to bump the refcnt of the [loadable] module that owns this * socket at sock_release time we decrement its refcnt. */ if (!try_module_get(sock->ops->owner)) goto out_module_busy; /* * Now that we're done with the ->create function, the [loadable] * module can have its refcnt decremented */ module_put(pf->owner); err = security_socket_post_create(sock, family, type, protocol, kern); if (err) goto out_sock_release; *res = sock; return 0; ...}EXPORT_SYMBOL(__sock_create);struct socket *sock_alloc(void){ struct inode *inode; struct socket *sock; inode = new_inode_pseudo(sock_mnt->mnt_sb); if (!inode) return NULL; sock = SOCKET_I(inode); inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_op = &sockfs_inode_ops; return sock;}EXPORT_SYMBOL(sock_alloc);static int sock_map_fd(struct socket *sock, int flags){ struct file *newfile; int fd = get_unused_fd_flags(flags); if (unlikely(fd < 0)) { sock_release(sock); return fd; } newfile = sock_alloc_file(sock, flags, NULL); if (likely(!IS_ERR(newfile))) { fd_install(fd, newfile); return fd; } put_unused_fd(fd); return PTR_ERR(newfile);}struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname){ struct file *file; if (!dname) dname = sock->sk ? sock->sk->sk_prot_creator->name : ""; file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname, O_RDWR | (flags & O_NONBLOCK), &socket_file_ops); if (IS_ERR(file)) { sock_release(sock); return file; } sock->file = file; file->private_data = sock; return file;}EXPORT_SYMBOL(sock_alloc_file);
用户通过socket这个syscall创建socket,在sockfs上使创建的socket与某个file关联(file->private_data = sock),并返回fd。这个fd通过file对象关联着socket,从而socket可以像文件一样被操作。
读写socket
static const struct file_operations socket_file_ops = { .owner = THIS_MODULE, .llseek = no_llseek, .read_iter = sock_read_iter, .write_iter = sock_write_iter, .poll = sock_poll, .unlocked_ioctl = sock_ioctl,#ifdef CONFIG_COMPAT .compat_ioctl = compat_sock_ioctl,#endif .mmap = sock_mmap, .release = sock_close, .fasync = sock_fasync, .sendpage = sock_sendpage, .splice_write = generic_splice_sendpage, .splice_read = sock_splice_read,};static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to){ struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; struct msghdr msg = {.msg_iter = *to, .msg_iocb = iocb}; ssize_t res; if (file->f_flags & O_NONBLOCK) msg.msg_flags = MSG_DONTWAIT; if (iocb->ki_pos != 0) return -ESPIPE; if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0; res = sock_recvmsg(sock, &msg, msg.msg_flags); *to = msg.msg_iter; return res;}static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from){ struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; struct msghdr msg = {.msg_iter = *from, .msg_iocb = iocb}; ssize_t res; if (iocb->ki_pos != 0) return -ESPIPE; if (file->f_flags & O_NONBLOCK) msg.msg_flags = MSG_DONTWAIT; if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; res = sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res;}static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags){ return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags);}int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags){ int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags); return err ?: sock_recvmsg_nosec(sock, msg, flags);}EXPORT_SYMBOL(sock_recvmsg);static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags){ return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags);}int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags){ int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags); return err ?: sock_recvmsg_nosec(sock, msg, flags);}EXPORT_SYMBOL(sock_recvmsg);
通过sock->ops实现特定议的读写。