open系统调用flow之do_last()
本文章来分析下open file系统调用最后一个主要的函数do_last()
open系统调用来到do_last()时,主要看其nd参数,这个结构体里的path、last成员,此时的path表示open file完整路径最低一级路径,last表示open file的file name,比如open file /data/test/test.txt,此时path结构体表示/data/test的路径;而last qstr struct表示test.txt
有了这个前提之后再来分析下do_last()
和之前link_path_walk()里lookup dentry类似,do_last同样会去查找last file对应的dentry。先调用lookup_fast()看看last所表示的file是否有对应的dentry,如果有,则会直接跳到finish_lookup,不会再走lookup_open() flow;如果没有则会走lookup_open() flow。
不管dcache里是否有对应的dentry,都会call到step_into(),这个函数根据path结构体更新nd.path,即nd.path表示last文件。
如果是走的lookup_open() flow,则接下来会调用vfs_open(),至于如果在dcache里找到了对应的dentry是否还会call vfs_open()没有确认过。
/* * Handle the last step of open() */ static int do_last(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; kuid_t dir_uid = nd->inode->i_uid; umode_t dir_mode = nd->inode->i_mode; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; int acc_mode = op->acc_mode; unsigned seq; struct inode *inode; struct path path; int error; nd->flags &= ~LOOKUP_PARENT; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { error = handle_dots(nd, nd->last_type); if (unlikely(error)) return error; goto finish_open; } if (!(open_flag & O_CREAT)) { //因为是是分析的file已经存在的case,所以是没有O_CREAT flag的 if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ error = lookup_fast(nd, &path, &inode, &seq); if (likely(error > 0)) goto finish_lookup; if (error < 0) return error; BUG_ON(nd->inode != dir->d_inode); BUG_ON(nd->flags & LOOKUP_RCU); } else { /* create side of things */ /* * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED * has been cleared when we got to the last component we are * about to look up */ error = complete_walk(nd); if (error) return error; audit_inode(nd->name, dir, LOOKUP_PARENT); /* trailing slashes? */ if (unlikely(nd->last.name[nd->last.len])) return -EISDIR; } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { error = mnt_want_write(nd->path.mnt); if (!error) got_write = true; /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. */ } if (open_flag & O_CREAT) inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); error = lookup_open(nd, &path, file, op, got_write); if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); if (error) goto out; if (file->f_mode & FMODE_OPENED) { if ((file->f_mode & FMODE_CREATED) || !S_ISREG(file_inode(file)->i_mode)) will_truncate = false; audit_inode(nd->name, file->f_path.dentry, 0); goto opened; } if (file->f_mode & FMODE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; will_truncate = false; acc_mode = 0; path_to_nameidata(&path, nd); goto finish_open_created; } /* * If atomic_open() acquired write access it is dropped now due to * possible mount and symlink following (this might be optimized away if * necessary...) */ if (got_write) { mnt_drop_write(nd->path.mnt); got_write = false; } error = follow_managed(&path, nd); if (unlikely(error < 0)) return error; if (unlikely(d_is_negative(path.dentry))) { path_to_nameidata(&path, nd); return -ENOENT; } /* * create/update audit record if it already exists. */ audit_inode(nd->name, path.dentry, 0); if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { path_to_nameidata(&path, nd); return -EEXIST; } seq = 0; /* out of RCU mode, so the value doesn't matter */ inode = d_backing_inode(path.dentry); finish_lookup: error = step_into(nd, &path, 0, inode, seq); if (unlikely(error)) return error; finish_open: /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); if (error) return error; audit_inode(nd->name, nd->path.dentry, 0); if (open_flag & O_CREAT) { error = -EISDIR; if (d_is_dir(nd->path.dentry)) goto out; error = may_create_in_sticky(dir_mode, dir_uid, d_backing_inode(nd->path.dentry)); if (unlikely(error)) goto out; } error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) goto out; if (!d_is_reg(nd->path.dentry)) will_truncate = false; if (will_truncate) { error = mnt_want_write(nd->path.mnt); if (error) goto out; got_write = true; } finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ error = vfs_open(&nd->path, file); if (error) goto out; opened: error = ima_file_check(file, op->acc_mode); if (!error && will_truncate) error = handle_truncate(file); out: if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } if (got_write) mnt_drop_write(nd->path.mnt); return error; }
lookup_open()首先调用了d_lookup(),这个lookup同样是在dcache里查找last所表示的文件是否有对应的dentry,这个lookup是需要操作rcu锁以及dentry的d_lock自旋锁的。如果没有找到,说明这个文件之前没有被open过,还没有建立对应的dentry,则调用d_alloc_parallel给last分配一个dentry。
然后会走到no_open label执行dir_inode->i_op->lookup,这个lookup是具体文件系统的lookup函数。这部分alloc dentry和lookup和之前在分析link_path_walk flow时分析的lookup_slow() flow一样,这里不再赘述。
最后设置下path结构体,将path结构体里的dentry、mnt成员更新为表示当前文件:
static int lookup_open(struct nameidata *nd, struct path *path, struct file *file, const struct open_flags *op, bool got_write) { struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return -ENOENT; file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); for (;;) { if (!dentry) { dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry)) return PTR_ERR(dentry); } if (d_in_lookup(dentry)) break; error = d_revalidate(dentry, nd->flags); if (likely(error > 0)) break; if (error) goto out_dput; d_invalidate(dentry); dput(dentry); dentry = NULL; } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ goto out_no_open; } /* * Checking write permission is tricky, bacuse we don't know if we are * going to actually need it: O_CREAT opens should work as long as the * file exists. But checking existence breaks atomicity. The trick is * to check access and if not granted clear O_CREAT from the flags. * * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ if (open_flag & O_CREAT) { if (!IS_POSIXACL(dir->d_inode)) mode &= ~current_umask(); if (unlikely(!got_write)) { create_error = -EROFS; open_flag &= ~O_CREAT; if (open_flag & (O_EXCL | O_TRUNC)) goto no_open; /* No side effects, safe to clear O_CREAT */ } else { create_error = may_o_create(&nd->path, dentry, mode); if (create_error) { open_flag &= ~O_CREAT; if (open_flag & O_EXCL) goto no_open; } } } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) && unlikely(!got_write)) { /* * No O_CREATE -> atomicity not a requirement -> fall * back to lookup + open */ goto no_open; } if (dir_inode->i_op->atomic_open) { #这个是atomic open error = atomic_open(nd, dentry, path, file, op, open_flag, mode); if (unlikely(error == -ENOENT) && create_error) error = create_error; return error; } no_open: if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags); d_lookup_done(dentry); if (unlikely(res)) { if (IS_ERR(res)) { error = PTR_ERR(res); goto out_dput; } dput(dentry); dentry = res; } } /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { error = -EACCES; goto out_dput; } error = dir_inode->i_op->create(dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; fsnotify_create(dir_inode, dentry); } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; goto out_dput; } out_no_open: path->dentry = dentry; path->mnt = nd->path.mnt; return 0; out_dput: dput(dentry); return error; }
vfs_open,将表示file的path赋值给此file的f_path成员,之后就可以根据file struct得到此file的dentry(f_path.dentry)以及此文件所在的文件系统vfsmount(f_path.mnt)了。根据这个dentry又可以得到这个file的inode struct。
然后调用do_dentry_open()
int vfs_open(const struct path *path, struct file *file) { file->f_path = *path; return do_dentry_open(file, d_backing_inode(path->dentry), NULL); }
do_dentry_open参数说明:
f: 即表示此file的file struct;
inode: 即此file的inode struct;
open: 这里它为null
do_dentry_open()里会调用fops_get(inode->i_fop),将返回值赋值给f->f_op,fops_get()的返回值,对于文件系统来说,它是没有设置module类型的owner成员的,同时CONFIG_MODULES和CONFIG_MODULE_UNLOAD(表示是否支持卸载已经加载的ko)一般是define了的,所以是使用的module.c里的try_module_get(),在这个函数里,如果module指针是null,则直接return true,所以对于文件系统来说,是直接将inode->i_fop赋值给f->fop,即设置file的file_operations,这个file_operations是具体文件系统提供的,以ext4 fs为例,它是ext4_file_operations。然后如果具体文件系统提供的file_operations函数集里有open函数,则调用这个open函数,对应ext4 fs,它是有提供的,它是ext4_file_open(),传给它的参数是inode和f,即此file的indoe、file struct。至此open系统调用过程基本结束!
static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; int error; path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; /* Ensure that we skip any errors that predate opening of the file */ f->f_wb_err = filemap_sample_wb_err(f->f_mapping); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; f->f_op = &empty_fops; return 0; } /* Any file opened for execve()/uselib() has to be a regular file. */ if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) { error = -EACCES; goto cleanup_file; } if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); if (unlikely(error)) goto cleanup_file; error = __mnt_want_write(f->f_path.mnt); if (unlikely(error)) { put_write_access(inode); goto cleanup_file; } f->f_mode |= FMODE_WRITER; } /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); if (unlikely(WARN_ON(!f->f_op))) { error = -ENODEV; goto cleanup_all; } error = security_file_open(f); if (error) goto cleanup_all; error = break_lease(locks_inode(f), f->f_flags); if (error) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; if (!open) open = f->f_op->open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } f->f_mode |= FMODE_OPENED; if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(inode); if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); /* NB: we're sure to have correct a_ops only after f_op->open */ if (f->f_flags & O_DIRECT) { if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) return -EINVAL; } return 0; cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); if (f->f_mode & FMODE_WRITER) { put_write_access(inode); __mnt_drop_write(f->f_path.mnt); } cleanup_file: path_put(&f->f_path); f->f_path.mnt = NULL; f->f_path.dentry = NULL; f->f_inode = NULL; return error; }