1、调用:从用户态陷入到内核态:
两件事:1.将用户参数拷贝至内核态;
2.调用do_mount完成挂载工作;
/*
 * mount(2) system-call entry point.
 *
 * Copies the caller's arguments from user space into kernel memory and then
 * hands them to do_mount(), which performs the actual mount work.
 *
 * Returns the result of do_mount(), or the negative error of whichever
 * copy-in step failed first.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	unsigned long data_page;
	struct filename *kernel_dir;
	char *kernel_type;
	char *kernel_dev;
	int ret;

	/* Filesystem type being mounted, e.g. nfs or ext3. */
	ret = copy_mount_string(type, &kernel_type);
	if (ret < 0)
		goto out;

	/* Path of the mount point. */
	kernel_dir = getname(dir_name);
	if (IS_ERR(kernel_dir)) {
		ret = PTR_ERR(kernel_dir);
		goto free_type;
	}

	/* Name of the device being mounted. */
	ret = copy_mount_string(dev_name, &kernel_dev);
	if (ret < 0)
		goto put_dir;

	/* Mount option data, i.e. what is passed with -o. */
	ret = copy_mount_options(data, &data_page);
	if (ret < 0)
		goto free_dev;

	/* Everything is in kernel memory now: do the mount. */
	ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags,
		       (void *) data_page);

	/* Release the copies in reverse order of acquisition. */
	free_page(data_page);
free_dev:
	kfree(kernel_dev);
put_dir:
	putname(kernel_dir);
free_type:
	kfree(kernel_type);
out:
	return ret;
}
备注:struct filename *kernel_dir
/*
 * Type of the object getname() hands back for the mount-point path in the
 * mount system call (kernel_dir); do_mount() is given its ->name member.
 */
struct filename {
	const char *name;		/* pointer to actual string */
	const __user char *uptr;	/* original userland pointer */
	struct audit_names *aname;	/* NOTE(review): presumably audit bookkeeping for this name - confirm */
	bool separate;			/* should "name" be freed? */
};
2、do_mount
在do_mount()函数中,做了一些标志位的检查、安全性检查等附加工作,然后根据不同的flag来调用不同的挂载函数,
这里面调用了下面两个主要的函数:
long do_mount(const char *dev_name, const char *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; int retval = 0; int mnt_flags = 0; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; /* Basic sanity checks */ if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) return -EINVAL; if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0;
/* 通过dir_name获取path,path中包含mount目录的dentry,LOOKUP_FLLOW查找标志,遇到链接继续查找 */ retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); if (retval) return retval; /* 安全相关 */ retval = security_sb_mount(dev_name, &path, type_page, flags, data_page); if (!retval && !may_mount()) retval = -EPERM; if (retval) goto dput_out; /* 对挂载标志的检查和初始化 */ /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; if (flags & MS_NODEV) mnt_flags |= MNT_NODEV; if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; if (flags & MS_NOATIME) mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; /* The default atime for remount is preservation */ if ((flags & MS_REMOUNT) && ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_STRICTATIME)) == 0)) { mnt_flags &= ~MNT_ATIME_MASK; mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; } flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); if (flags & MS_REMOUNT)
/* 重新加载文件系统。这允许你改变现存文件系统的mountflag和数据,不用先卸载再挂上文件系统 */ retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) /* 执行bind挂载,使文件或者子目录树在文件系统内的另一个点上可视 */ retval = do_loopback(&path, dev_name, flags & MS_REC); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&path, flags); else if (flags & MS_MOVE) /* 移动子目录树 */ retval = do_move_mount(&path, dev_name); else retval = do_new_mount(&path, type_page, flags, mnt_flags, dev_name, data_page); dput_out: path_put(&path); return retval; }
3、do_new_mount()函数主要分成两大部分:
1、vfs_kern_mount 建立vfsmount对象和superblock对象,必要时从设备上获取文件系统元数据;
2、do_add_mount 将vfsmount对象加入到mount树和Hash Table中,以便在内存中形成一棵树结构;
1 static int do_new_mount(struct path *path, const char *fstype, int flags, 2 int mnt_flags, const char *name, void *data) 3 { 4 struct file_system_type *type; 5 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; 6 struct vfsmount *mnt; 7 int err; 8 9 if (!fstype) 10 return -EINVAL; 11 /* 根据类型名称获取该文件系统结构 file_system_type */ 12 type = get_fs_type(fstype); 13 if (!type) 14 return -ENODEV; 15 16 if (user_ns != &init_user_ns) { 17 if (!(type->fs_flags & FS_USERNS_MOUNT)) { 18 put_filesystem(type); 19 return -EPERM; 20 } 21 /* Only in special cases allow devices from mounts 22 * created outside the initial user namespace. 23 */ 24 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { 25 flags |= MS_NODEV; 26 mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; 27 } 28 } 29 30 mnt = vfs_kern_mount(type, flags, name, data); 31 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && 32 !mnt->mnt_sb->s_subtype) 33 mnt = fs_set_subtype(mnt, fstype); 34 35 put_filesystem(type); 36 if (IS_ERR(mnt)) 37 return PTR_ERR(mnt); 38 39 err = do_add_mount(real_mount(mnt), path, mnt_flags); 40 if (err) 41 mntput(mnt); 42 return err; 43 }
4、vfs_kern_mount:
调用vfs_kern_mount()完成主要挂载
1 struct vfsmount * 2 vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) 3 { 4 struct mount *mnt; 5 struct dentry *root; 6 7 if (!type) 8 return ERR_PTR(-ENODEV); 9 /* 分配挂载结构的对象空间 */ 10 mnt = alloc_vfsmnt(name); 11 if (!mnt) 12 return ERR_PTR(-ENOMEM); 13 14 if (flags & MS_KERNMOUNT) 15 mnt->mnt.mnt_flags = MNT_INTERNAL; 16 /* 获取挂载设备上文件系统的根目录 */ 17 root = mount_fs(type, flags, name, data); 18 if (IS_ERR(root)) { 19 free_vfsmnt(mnt); 20 return ERR_CAST(root); 21 } 22 /* 设置挂载点的dentry结构为读取的块设备的根目录 */ 23 mnt->mnt.mnt_root = root; 24 mnt->mnt.mnt_sb = root->d_sb; 25 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 26 mnt->mnt_parent = mnt; 27 br_write_lock(&vfsmount_lock); 28 list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); 29 br_write_unlock(&vfsmount_lock); 30 return &mnt->mnt; 31 }
5、mount_fs:
主要是调用文件系统自身定义的mount函数
1 struct dentry * 2 mount_fs(struct file_system_type *type, int flags, const char *name, void *data) 3 { 4 struct dentry *root; 5 struct super_block *sb; 6 char *secdata = NULL; 7 int error = -ENOMEM; 8 9 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { 10 secdata = alloc_secdata(); 11 if (!secdata) 12 goto out; 13 14 error = security_sb_copy_data(data, secdata); 15 if (error) 16 goto out_free_secdata; 17 } 18 19 root = type->mount(type, flags, name, data); 20 if (IS_ERR(root)) { 21 error = PTR_ERR(root); 22 goto out_free_secdata; 23 } 24 sb = root->d_sb; 25 BUG_ON(!sb); 26 WARN_ON(!sb->s_bdi); 27 WARN_ON(sb->s_bdi == &default_backing_dev_info); 28 sb->s_flags |= MS_BORN; 29 /* 与安全相关,忽略 */ 30 error = security_sb_kern_mount(sb, flags, secdata); 31 if (error) 32 goto out_sb; 33 34 /* 35 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE 36 * but s_maxbytes was an unsigned long long for many releases. Throw 37 * this warning for a little while to try and catch filesystems that 38 * violate this rule. 39 */ 40 WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " 41 "negative value (%lld) ", type->name, sb->s_maxbytes); 42 43 up_write(&sb->s_umount); 44 free_secdata(secdata); 45 return root; 46 out_sb: 47 dput(root); 48 deactivate_locked_super(sb); 49 out_free_secdata: 50 free_secdata(secdata); 51 out: 52 return ERR_PTR(error); 53 }
6、type->mount在定义文件系统类型的时候指定,如ocfs2:即ocfs2_mount方法
1 static struct file_system_type ocfs2_fs_type = { 2 .owner = THIS_MODULE, 3 .name = "ocfs2", 4 .mount = ocfs2_mount, 5 .kill_sb = ocfs2_kill_sb, 6 7 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 8 .next = NULL 9 };
1 static struct dentry *ocfs2_mount(struct file_system_type *fs_type, 2 int flags, 3 const char *dev_name, 4 void *data) 5 { 6 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); 7 } 8 /* 主要完成sb内存对象的初始化和加入到全局sb的链表中 */ 9 struct dentry *mount_bdev(struct file_system_type *fs_type, 10 int flags, const char *dev_name, void *data, 11 int (*fill_super)(struct super_block *, void *, int)) 12 { 13 struct block_device *bdev; 14 struct super_block *s; 15 fmode_t mode = FMODE_READ | FMODE_EXCL; 16 int error = 0; 17 18 if (!(flags & MS_RDONLY)) 19 mode |= FMODE_WRITE; 20 /* 通过名称获取挂载设备的bdev对象 */ 21 bdev = blkdev_get_by_path(dev_name, mode, fs_type); 22 if (IS_ERR(bdev)) 23 return ERR_CAST(bdev); 24 25 /* 26 * once the super is inserted into the list by sget, s_umount 27 * will protect the lockfs code from trying to start a snapshot 28 * while we are mounting 29 */ 30 mutex_lock(&bdev->bd_fsfreeze_mutex); 31 if (bdev->bd_fsfreeze_count > 0) { 32 mutex_unlock(&bdev->bd_fsfreeze_mutex); 33 error = -EBUSY; 34 goto error_bdev; 35 }
/* 获取sb对象:查找或者创建 */ 36 s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC, 37 bdev); 38 mutex_unlock(&bdev->bd_fsfreeze_mutex); 39 if (IS_ERR(s)) 40 goto error_s; 41 /* 被mount文件系统的根目录已经存在 */ 42 if (s->s_root) { 43 if ((flags ^ s->s_flags) & MS_RDONLY) { 44 deactivate_locked_super(s); 45 error = -EBUSY; 46 goto error_bdev; 47 } 48 49 /* 50 * s_umount nests inside bd_mutex during 51 * __invalidate_device(). blkdev_put() acquires 52 * bd_mutex and can't be called under s_umount. Drop 53 * s_umount temporarily. This is safe as we're 54 * holding an active reference. 55 */ 56 up_write(&s->s_umount); 57 blkdev_put(bdev, mode); 58 down_write(&s->s_umount); 59 } else { 60 char b[BDEVNAME_SIZE]; 61 /* 不存在,则从磁盘读取sb的元数据并填充到sb的内存结构体中,并接入到全局sb中 */ 62 s->s_mode = mode; 63 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 64 sb_set_blocksize(s, block_size(bdev));
/* ocfs2_fill_super 函数 */ 65 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 66 if (error) { 67 deactivate_locked_super(s); 68 goto error; 69 } 70 71 s->s_flags |= MS_ACTIVE; 72 bdev->bd_super = s; 73 } 74 /* 返回mount的文件系统的根目录 */ 75 return dget(s->s_root); 76 77 error_s: 78 error = PTR_ERR(s); 79 error_bdev: 80 blkdev_put(bdev, mode); 81 error: 82 return ERR_PTR(error); 83 }
上述流程即:获取挂载设备的文件系统的root,然后挂到vfsmount对象上,而vfsmount中记录挂载点的元数据信息,这样便可将设备的根目录挂到挂载点下;
7、回到3中do_add_mount函数:准备将得到的mnt结构加入全局文件系统树
do_new_mount--> do_add_mount--> graft_tree--> attach_recursive_mnt
1 static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) 2 { 3 struct mountpoint *mp; 4 struct mount *parent; 5 int err; 6 7 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); 8 /* lock_mount确定本次挂载要挂载到哪个父挂载实例parent的哪个挂载点mp上 */ 9 mp = lock_mount(path); 10 if (IS_ERR(mp)) 11 return PTR_ERR(mp); 12 /* 通过vfs找到mount */ 13 parent = real_mount(path->mnt); 14 err = -EINVAL; 15 if (unlikely(!check_mnt(parent))) { 16 /* that's acceptable only for automounts done in private ns */ 17 if (!(mnt_flags & MNT_SHRINKABLE)) 18 goto unlock; 19 /* ... and for those we'd better have mountpoint still alive */ 20 if (!parent->mnt_ns) 21 goto unlock; 22 } 23 24 /* Refuse the same filesystem on the same mount point */ 25 err = -EBUSY; 26 if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && 27 path->mnt->mnt_root == path->dentry) 28 goto unlock; 29 /* 新挂载的文件系统不能是个链接符号 */ 30 err = -EINVAL; 31 if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) 32 goto unlock; 33 /* 把newmnt加入到全局文件系统树中 */ 34 newmnt->mnt.mnt_flags = mnt_flags; 35 err = graft_tree(newmnt, parent, mp); 36 37 unlock: 38 unlock_mount(mp); 39 return err; 40 }
8、graft_tree():
从这个单词可以看出来,是将将要mount的目录树与当前目录的文件系统的目录树连接起来,很像嫁接技术,而原来文件系统的目录树没损伤;
先做一些检查类的操作,然后执行关键函数:attach_recursive_mnt(mnt, p, mp, NULL);
1 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) 2 { 3 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) 4 return -EINVAL; 5 6 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != 7 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) 8 return -ENOTDIR; 9 10 return attach_recursive_mnt(mnt, p, mp, NULL); 11 }
1 static int attach_recursive_mnt(struct mount *source_mnt, 2 struct mount *dest_mnt, 3 struct mountpoint *dest_mp, 4 struct path *parent_path) 5 { 6 LIST_HEAD(tree_list); 7 struct mount *child, *p; 8 int err; 9 10 if (IS_MNT_SHARED(dest_mnt)) { 11 err = invent_group_ids(source_mnt, true); 12 if (err) 13 goto out; 14 } 15 err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); /* 将新mount目录的mnt_list与source_mnt的mnt_list连接起来 */ 16 if (err) 17 goto out_cleanup_ids; 18 19 br_write_lock(&vfsmount_lock); 20 21 if (IS_MNT_SHARED(dest_mnt)) { 22 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 23 set_mnt_shared(p); 24 } 25 if (parent_path) { 26 detach_mnt(source_mnt, parent_path); 27 attach_mnt(source_mnt, dest_mnt, dest_mp); 28 touch_mnt_namespace(source_mnt->mnt_ns); 29 } else { 30 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); /* 嫁接完成 */ 31 commit_tree(source_mnt); /* 将新文件系统的vfs_mount加入到mount_hashtable中 */ 32 } 33 34 list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { 35 list_del_init(&child->mnt_hash); 36 commit_tree(child); 37 } 38 br_write_unlock(&vfsmount_lock); 39 40 return 0; 41 42 out_cleanup_ids: 43 if (IS_MNT_SHARED(dest_mnt)) 44 cleanup_group_ids(source_mnt, NULL); 45 out: 46 return err; 47 }