虚拟文件系统(VFS)为用户空间提供了文件系统相关的接口,用户程序可以通过标准的Unix文件系统调用对不同介质上的不同文件系统进行读写操作。
通用文件系统接口
VFS使得用户可以直接使用open()、read()和write()等系统调用,而无需考虑具体的文件系统和实际物理介质。标准系统调用也可以在不同的介质和文件系统之间执行,VFS负责这种不同介质和不同文件系统之间的协调,并向上层提供一种通用的访问方法。
之所以这种通用接口对所有类型的文件系统都可以操作,是因为内核在它的底层文件系统之上建立了一个抽象层。这个抽象层提供了一个通用文件系统模型,支持各种文件系统。VFS定义了所有文件系统都支持的基本数据结构和接口,而实际文件系统都实现了这些基本接口。由于实际文件系统的代码在统一的接口和数据结构下隐藏了实现的细节,所以在VFS层和内核的其他部分来看,所有的文件系统都是相同的。
VFS中有四个主要的对象模型,分别是:
- 超级块对象:代表一个已安装的文件系统;
- 索引节点对象:代表一个文件;
- 目录项对象:代表路径的一个组成部分;
- 文件对象:代表由进程打开的文件。注意目录也是文件。
内核为每种对象都定义了对应的操作对象(操作函数表),用来描述内核针对该对象可以使用的方法。
超级块对象
每种文件系统都必须实现超级块,用于存储特定文件系统的信息,通常对应于存放在磁盘特定扇区中的文件系统超级块或文件系统控制块。对于非基于磁盘的文件系统,会在使用现场创建超级块并保存在内存中。
超级块用struct super_block结构体表示:
1400struct super_block { 1401 struct list_head s_list; /* Keep this first 指向超级块链表的指针 */ 1402 dev_t s_dev; /* search index; _not_ kdev_t 设备标识符 */ 1403 unsigned char s_dirt; /* 修改(脏)标志 */ 1404 unsigned char s_blocksize_bits; /* 块大小 单位bits */ 1405 unsigned long s_blocksize; /* 块大小 单位Bytes*/ 1406 loff_t s_maxbytes; /* Max file size */ 1407 struct file_system_type *s_type; 1408 const struct super_operations *s_op; /* 超级块方法 */ 1409 const struct dquot_operations *dq_op; /* 磁盘限额方法 */ 1410 const struct quotactl_ops *s_qcop; /* 限额控制方法 */ 1411 const struct export_operations *s_export_op; /* 导出方法 */ 1412 unsigned long s_flags; 1413 unsigned long s_magic; 1414 struct dentry *s_root; 1415 struct rw_semaphore s_umount; 1416 struct mutex s_lock; 1417 int s_count; 1418 atomic_t s_active; 1419#ifdef CONFIG_SECURITY 1420 void *s_security; 1421#endif 1422 const struct xattr_handler **s_xattr; 1423 1424 struct list_head s_inodes; /* all inodes */ 1425 struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */ 1426#ifdef CONFIG_SMP 1427 struct list_head __percpu *s_files; 1428#else 1429 struct list_head s_files; 1430#endif 1431 /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */ 1432 struct list_head s_dentry_lru; /* unused dentry lru */ 1433 int s_nr_dentry_unused; /* # of dentry on lru */ 1434 1435 /* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */ 1436 spinlock_t s_inode_lru_lock ____cacheline_aligned_in_smp; 1437 struct list_head s_inode_lru; /* unused inode lru */ 1438 int s_nr_inodes_unused; /* # of inodes on lru */ 1439 1440 struct block_device *s_bdev; 1441 struct backing_dev_info *s_bdi; 1442 struct mtd_info *s_mtd; 1443 struct list_head s_instances; 1444 struct quota_info s_dquot; /* Diskquota specific options */ 1445 1446 int s_frozen; 1447 wait_queue_head_t s_wait_unfrozen; 1448 1449 char s_id[32]; /* Informational name */ 1450 u8 s_uuid[16]; /* UUID */ 1451 1452 void *s_fs_info; /* Filesystem private info */ 1453 fmode_t 
s_mode; 1454 1455 /* Granularity of c/m/atime in ns. 1456 Cannot be worse than a second */ 1457 u32 s_time_gran; 1458 1459 /* 1460 * The next field is for VFS *only*. No filesystems have any business 1461 * even looking at it. You had been warned. 1462 */ 1463 struct mutex s_vfs_rename_mutex; /* Kludge */ 1464 1465 /* 1466 * Filesystem subtype. If non-empty the filesystem type field 1467 * in /proc/mounts will be "type.subtype" 1468 */ 1469 char *s_subtype; 1470 1471 /* 1472 * Saved mount options for lazy filesystems using 1473 * generic_show_options() 1474 */ 1475 char __rcu *s_options; 1476 const struct dentry_operations *s_d_op; /* default d_op for dentries */ 1477 1478 /* 1479 * Saved pool identifier for cleancache (-1 means none) 1480 */ 1481 int cleancache_poolid; 1482 1483 struct shrinker s_shrink; /* per-sb shrinker handle */ 1484};
超级块对象中的s_op定义了超级块的操作函数表,用super_operations结构体表示,其中的每一项都定义了一种操作的函数指针:
1658struct super_operations { 1659 struct inode *(*alloc_inode)(struct super_block *sb); 1660 void (*destroy_inode)(struct inode *); 1661 1662 void (*dirty_inode) (struct inode *, int flags); 1663 int (*write_inode) (struct inode *, struct writeback_control *wbc); 1664 int (*drop_inode) (struct inode *); 1665 void (*evict_inode) (struct inode *); 1666 void (*put_super) (struct super_block *); 1667 void (*write_super) (struct super_block *); 1668 int (*sync_fs)(struct super_block *sb, int wait); 1669 int (*freeze_fs) (struct super_block *); 1670 int (*unfreeze_fs) (struct super_block *); 1671 int (*statfs) (struct dentry *, struct kstatfs *); 1672 int (*remount_fs) (struct super_block *, int *, char *); 1673 void (*umount_begin) (struct super_block *); 1674 1675 int (*show_options)(struct seq_file *, struct vfsmount *); 1676 int (*show_devname)(struct seq_file *, struct vfsmount *); 1677 int (*show_path)(struct seq_file *, struct vfsmount *); 1678 int (*show_stats)(struct seq_file *, struct vfsmount *); 1679#ifdef CONFIG_QUOTA 1680 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); 1681 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); 1682#endif 1683 int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); 1684 int (*nr_cached_objects)(struct super_block *); 1685 void (*free_cached_objects)(struct super_block *, int); 1686};
具体的文件系统会提供自己的super_operations实现,并让超级块对象的s_op指针指向它,VFS通过该指针调用对应文件系统的操作函数。上述的操作表也不必全部实现,文件系统可以将不需要的函数指针设置为NULL。另外,上述函数均由VFS在进程上下文中调用,除dirty_inode()外,必要时均可阻塞。
索引节点对象
索引节点对象包含了内核在操作文件和目录时需要的全部信息,这些信息可以从磁盘索引节点直接读入。
索引节点使用inode结构体表示,一个索引节点代表了文件系统中的一个文件,它也可以是设备或者管道这样的特殊文件。
744/* 745 * Keep mostly read-only and often accessed (especially for 746 * the RCU path lookup and 'stat' data) fields at the beginning 747 * of the 'struct inode' 748 */ 749struct inode { 750 umode_t i_mode; 751 unsigned short i_opflags; 752 uid_t i_uid; 753 gid_t i_gid; 754 unsigned int i_flags; 755 756#ifdef CONFIG_FS_POSIX_ACL 757 struct posix_acl *i_acl; 758 struct posix_acl *i_default_acl; 759#endif 760 761 const struct inode_operations *i_op; 762 struct super_block *i_sb; 763 struct address_space *i_mapping; 764 765#ifdef CONFIG_SECURITY 766 void *i_security; 767#endif 768 769 /* Stat data, not accessed from path walking */ 770 unsigned long i_ino; 771 /* 772 * Filesystems may only read i_nlink directly. They shall use the 773 * following functions for modification: 774 * 775 * (set|clear|inc|drop)_nlink 776 * inode_(inc|dec)_link_count 777 */ 778 union { 779 const unsigned int i_nlink; 780 unsigned int __i_nlink; 781 }; 782 dev_t i_rdev; 783 struct timespec i_atime; 784 struct timespec i_mtime; 785 struct timespec i_ctime; 786 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ 787 unsigned short i_bytes; 788 blkcnt_t i_blocks; 789 loff_t i_size; 790 791#ifdef __NEED_I_SIZE_ORDERED 792 seqcount_t i_size_seqcount; 793#endif 794 795 /* Misc */ 796 unsigned long i_state; 797 struct mutex i_mutex; 798 799 unsigned long dirtied_when; /* jiffies of first dirtying */ 800 801 struct hlist_node i_hash; 802 struct list_head i_wb_list; /* backing dev IO list */ 803 struct list_head i_lru; /* inode LRU list */ 804 struct list_head i_sb_list; 805 union { 806 struct list_head i_dentry; 807 struct rcu_head i_rcu; 808 }; 809 atomic_t i_count; 810 unsigned int i_blkbits; 811 u64 i_version; 812 atomic_t i_dio_count; 813 atomic_t i_writecount; 814 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ 815 struct file_lock *i_flock; 816 struct address_space i_data; 817#ifdef CONFIG_QUOTA 818 struct dquot *i_dquot[MAXQUOTAS]; 819#endif 820 struct 
list_head i_devices; 821 union { 822 struct pipe_inode_info *i_pipe; 823 struct block_device *i_bdev; 824 struct cdev *i_cdev; 825 }; 826 827 __u32 i_generation; 828 829#ifdef CONFIG_FSNOTIFY 830 __u32 i_fsnotify_mask; /* all events this inode cares about */ 831 struct hlist_head i_fsnotify_marks; 832#endif 833 834#ifdef CONFIG_IMA 835 atomic_t i_readcount; /* struct files open RO */ 836#endif 837 void *i_private; /* fs or device private pointer */ 838}; 839
其中,i_op定义了索引节点对象的所有操作方法:
1613struct inode_operations { 1614 struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); 1615 void * (*follow_link) (struct dentry *, struct nameidata *); 1616 int (*permission) (struct inode *, int); 1617 struct posix_acl * (*get_acl)(struct inode *, int); 1618 1619 int (*readlink) (struct dentry *, char __user *,int); 1620 void (*put_link) (struct dentry *, struct nameidata *, void *); 1621 1622 int (*create) (struct inode *,struct dentry *,int, struct nameidata *); 1623 int (*link) (struct dentry *,struct inode *,struct dentry *); 1624 int (*unlink) (struct inode *,struct dentry *); 1625 int (*symlink) (struct inode *,struct dentry *,const char *); 1626 int (*mkdir) (struct inode *,struct dentry *,int); 1627 int (*rmdir) (struct inode *,struct dentry *); 1628 int (*mknod) (struct inode *,struct dentry *,int,dev_t); 1629 int (*rename) (struct inode *, struct dentry *, 1630 struct inode *, struct dentry *); 1631 void (*truncate) (struct inode *); 1632 int (*setattr) (struct dentry *, struct iattr *); 1633 int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); 1634 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); 1635 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); 1636 ssize_t (*listxattr) (struct dentry *, char *, size_t); 1637 int (*removexattr) (struct dentry *, const char *); 1638 void (*truncate_range)(struct inode *, loff_t, loff_t); 1639 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, 1640 u64 len); 1641} ____cacheline_aligned;
目录项对象
VFS把目录当作文件看待,而路径的每个部分则用目录项来表示。在路径中,包括普通文件在内,每个部分都是目录项对象。目录项对象的主要目的是为了方便路径名的查找等操作。
目录项用struct dentry 表示,它没有对应的磁盘数据结构,由于它并不会保存到磁盘上,所以也没有脏标志。
116struct dentry { 117 /* RCU lookup touched fields */ 118 unsigned int d_flags; /* protected by d_lock */ 119 seqcount_t d_seq; /* per dentry seqlock */ 120 struct hlist_bl_node d_hash; /* lookup hash list */ 121 struct dentry *d_parent; /* parent directory */ 122 struct qstr d_name; 123 struct inode *d_inode; /* Where the name belongs to - NULL is 124 * negative */ 125 unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ 126 127 /* Ref lookup also touches following */ 128 unsigned int d_count; /* protected by d_lock */ 129 spinlock_t d_lock; /* per dentry lock */ 130 const struct dentry_operations *d_op; 131 struct super_block *d_sb; /* The root of the dentry tree */ 132 unsigned long d_time; /* used by d_revalidate */ 133 void *d_fsdata; /* fs-specific data */ 134 135 struct list_head d_lru; /* LRU list */ 136 /* 137 * d_child and d_rcu can share memory 138 */ 139 union { 140 struct list_head d_child; /* child of parent list */ 141 struct rcu_head d_rcu; 142 } d_u; 143 struct list_head d_subdirs; /* our children */ 144 struct list_head d_alias; /* inode alias list */ 145};
如果VFS遍历路径名中的所有元素并将它们逐个解析成目录项对象,这将是一件非常耗时的工作。因此,内核将目录项对象缓存在目录项缓存中。目录项缓存包括三部分:
- “被使用的”目录项链表:通过索引节点的i_dentry项,把与同一个索引节点相关联的目录项对象连接起来;
- “最近被使用的”目录项双向链表;
- 散列表,用来快速将指定路径解析为相关目录项对象。
内核通过dentry_operations定义了目录项操作函数列表:
159struct dentry_operations { 160 int (*d_revalidate)(struct dentry *, struct nameidata *); 161 int (*d_hash)(const struct dentry *, const struct inode *, 162 struct qstr *); 163 int (*d_compare)(const struct dentry *, const struct inode *, 164 const struct dentry *, const struct inode *, 165 unsigned int, const char *, const struct qstr *); 166 int (*d_delete)(const struct dentry *); 167 void (*d_release)(struct dentry *); 168 void (*d_prune)(struct dentry *); 169 void (*d_iput)(struct dentry *, struct inode *); 170 char *(*d_dname)(struct dentry *, char *, int); 171 struct vfsmount *(*d_automount)(struct path *); 172 int (*d_manage)(struct dentry *, bool); 173} ____cacheline_aligned;
文件对象
文件对象代表已打开的文件,同一个文件可能对应多个文件对象(多个进程打开该文件),而一个文件对应的索引节点和目录项对象是唯一的。
文件对象通过结构体struct file表示,文件对象没有对应的磁盘数据,它在文件打开的时候创建,文件关闭的时候销毁。文件对象中也不记录脏标志(由inode记录)。
964struct file { 965 /* 966 * fu_list becomes invalid after file_free is called and queued via 967 * fu_rcuhead for RCU freeing 968 */ 969 union { 970 struct list_head fu_list; 971 struct rcu_head fu_rcuhead; 972 } f_u; 973 struct path f_path; 974#define f_dentry f_path.dentry 975#define f_vfsmnt f_path.mnt 976 const struct file_operations *f_op; 977 978 /* 979 * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR. 980 * Must not be taken from IRQ context. 981 */ 982 spinlock_t f_lock; 983#ifdef CONFIG_SMP 984 int f_sb_list_cpu; 985#endif 986 atomic_long_t f_count; 987 unsigned int f_flags; 988 fmode_t f_mode; 989 loff_t f_pos; 990 struct fown_struct f_owner; 991 const struct cred *f_cred; 992 struct file_ra_state f_ra; 993 994 u64 f_version; 995#ifdef CONFIG_SECURITY 996 void *f_security; 997#endif 998 /* needed for tty driver, and maybe others */ 999 void *private_data; 1000
1001#ifdef CONFIG_EPOLL 1002 /* Used by fs/eventpoll.c to link all the hooks to this file */ 1003 struct list_head f_ep_links; 1004#endif /* #ifdef CONFIG_EPOLL */ 1005 struct address_space *f_mapping; 1006#ifdef CONFIG_DEBUG_WRITECOUNT 1007 unsigned long f_mnt_write_state; 1008#endif 1009};
下面的结构定义了文件对象的操作函数列表:
1583struct file_operations { 1584 struct module *owner; 1585 loff_t (*llseek) (struct file *, loff_t, int); 1586 ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); 1587 ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); 1588 ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); 1589 ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); 1590 int (*readdir) (struct file *, void *, filldir_t); 1591 unsigned int (*poll) (struct file *, struct poll_table_struct *); 1592 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); 1593 long (*compat_ioctl) (struct file *, unsigned int, unsigned long); 1594 int (*mmap) (struct file *, struct vm_area_struct *); 1595 int (*open) (struct inode *, struct file *); 1596 int (*flush) (struct file *, fl_owner_t id); 1597 int (*release) (struct inode *, struct file *); 1598 int (*fsync) (struct file *, loff_t, loff_t, int datasync); 1599 int (*aio_fsync) (struct kiocb *, int datasync); 1600 int (*fasync) (int, struct file *, int); 1601 int (*lock) (struct file *, int, struct file_lock *); 1602 ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); 1603 unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 1604 int (*check_flags)(int); 1605 int (*flock) (struct file *, int, struct file_lock *); 1606 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); 1607 ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); 1608 int (*setlease)(struct file *, long, struct file_lock **); 1609 long (*fallocate)(struct file *file, int mode, loff_t offset, 1610 loff_t len); 1611};
参考:
《Linux内核设计与实现》
http://lxr.linux.no