• linux文件系统初探--Day8~11


    Day8中并没有代码的更新,只是介绍了一下struct file,以及open等系统调用关联文件系统中的很多操作。struct file保存内核看到的文件的特征信息。

    Day9中实现了支持page cache的文件操作,重点涉及file_operations的实现以及页缓存的相关知识。

    file_operations

    file_operations的定义如下:

    struct file_operations {
    	struct module *owner;
    	loff_t (*llseek) (struct file *, loff_t, int);
    	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
    	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
    	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
    	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
    	int (*iterate) (struct file *, struct dir_context *);
    	int (*iterate_shared) (struct file *, struct dir_context *);
    	__poll_t (*poll) (struct file *, struct poll_table_struct *);
    	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
    	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
    	int (*mmap) (struct file *, struct vm_area_struct *);
    	unsigned long mmap_supported_flags;
    	int (*open) (struct inode *, struct file *);
    	int (*flush) (struct file *, fl_owner_t id);
    	int (*release) (struct inode *, struct file *);
    	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
    	int (*fasync) (int, struct file *, int);
    	int (*lock) (struct file *, int, struct file_lock *);
    	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
    	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
    	int (*check_flags)(int);
    	int (*flock) (struct file *, int, struct file_lock *);
    	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
    	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
    	int (*setlease)(struct file *, long, struct file_lock **, void **);
    	long (*fallocate)(struct file *file, int mode, loff_t offset,
    			  loff_t len);
    	void (*show_fdinfo)(struct seq_file *m, struct file *f);
    #ifndef CONFIG_MMU
    	unsigned (*mmap_capabilities)(struct file *);
    #endif
    	ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
    			loff_t, size_t, unsigned int);
    	int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
    			u64);
    	int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
    			u64);
    	int (*fadvise)(struct file *, loff_t, loff_t, int);
    } __randomize_layout;
    

    而下面是file.c中的实现:

    struct file_operations sfs_file_operations = {
    	.read           = do_sync_read,
    	.aio_read	= generic_file_aio_read,
    	.write          = do_sync_write,
    	.aio_write	= generic_file_aio_write,
    	.mmap           = generic_file_mmap,
    	.fsync          = simple_sync_file,
    	.sendfile       = generic_file_sendfile,
    	.llseek         = generic_file_llseek,
    };
    

    read和write分别负责读写数据,参数分别为文件描述符、数据缓冲区、数据大小和文件偏移量。这里追一下read_write.c中read系统调用的代码就会发现,do_sync_read可用new_sync_read替代,do_sync_write类似。而且我们发现,实际上new_sync_read也调用了generic_file_read_iter;同时查看了ext2,ext4和通用块设备的file_operations,发现它们都没有设置read和write成员,所以我们在实验中也不设置read和write。

    aio_read和aio_write用于异步读写操作,在新版本内核中,更名为read_iter和write_iter,对应的generic函数在filemap.c中,更名为generic_file_read_iter和generic_file_write_iter。这两个函数在这里不做具体分析。

    mmap将文件内容映射到虚拟地址空间,减少了一次拷贝操作。举例来说,通常情况下,一个文件写操作需要经过以下几次拷贝:用户态->内核态->写文件,而mmap则省略了用户态到内核态的拷贝,可以直接从用户态访问具体文件。具体实现是generic_file_mmap,没变化。

    fsync用于同步内存中和存储介质上的数据。这里不妨使用libfs.c中的generic_file_fsync。

    sendfile这个成员函数在新版本内核中被去掉了,关于sendfile的相关知识,可以查看这篇文章,总体上来讲,就是绕过用户态内核态的切换和复制,直接用DMA从一个内核缓冲区传到另一个内核缓冲区。所以在本次实验中直接去掉。

    llseek很简单,就是调整文件描述符到指定的位置,改变当前的读写位置。generic_file_llseek在read_write.c中,具体代码在这里不细说了。

    address_space

    个人理解是:地址空间是内核针对缓存的一个统一抽象。

    地址空间是内核中最关键的数据结构之一,可以认为是内核最基本的抽象机制之一,其重要性堪比进程、文件等抽象结构。

    address_space定义如下:

    struct address_space {
    	struct inode		*host;		/* owner: inode, block_device */
    	struct radix_tree_root	i_pages;	/* cached pages */
    	atomic_t		i_mmap_writable;/* count VM_SHARED mappings */
    	struct rb_root_cached	i_mmap;		/* tree of private and shared mappings */
    	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
    	/* Protected by the i_pages lock */
    	unsigned long		nrpages;	/* number of total pages */
    	/* number of shadow or DAX exceptional entries */
    	unsigned long		nrexceptional;
    	pgoff_t			writeback_index;/* writeback starts here */
    	const struct address_space_operations *a_ops;	/* methods */
    	unsigned long		flags;		/* error bits */
    	spinlock_t		private_lock;	/* for use by the address_space */
    	gfp_t			gfp_mask;	/* implicit gfp mask for allocations */
    	struct list_head	private_list;	/* for use by the address_space */
    	void			*private_data;	/* ditto */
    	errseq_t		wb_err;
    } __attribute__((aligned(sizeof(long)))) __randomize_layout;
    

    host保存当前地址空间的所有者,inode或者是块设备。

    i_pages指向一个基数树的根,这个基数树列出了当前地址空间中所有的物理内存页。

    nrpages保存缓存页的总数。

    a_ops指向关于address_space的操作结构,定义了一些处理地址空间的特定操作。下文详述。

    其他的内容我们目前不关注。必须指出的是,新内核中移除了backing_dev_info这个成员。该成员原本主要保存后备存储器的设备信息,而后备存储器指定了地址空间中页的数据的来源,这部分功能在新内核中是怎样实现的还不清楚。

    address_space_operations

    address_space_operations定义如下:

    struct address_space_operations {
    	int (*writepage)(struct page *page, struct writeback_control *wbc);
    	int (*readpage)(struct file *, struct page *);
    
    	/* Write back some dirty pages from this mapping. */
    	int (*writepages)(struct address_space *, struct writeback_control *);
    
    	/* Set a page dirty.  Return true if this dirtied it */
    	int (*set_page_dirty)(struct page *page);
    
    	/*
    	 * Reads in the requested pages. Unlike ->readpage(), this is
    	 * PURELY used for read-ahead!.
    	 */
    	int (*readpages)(struct file *filp, struct address_space *mapping,
    			struct list_head *pages, unsigned nr_pages);
    
    	int (*write_begin)(struct file *, struct address_space *mapping,
    				loff_t pos, unsigned len, unsigned flags,
    				struct page **pagep, void **fsdata);
    	int (*write_end)(struct file *, struct address_space *mapping,
    				loff_t pos, unsigned len, unsigned copied,
    				struct page *page, void *fsdata);
    
    	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
    	sector_t (*bmap)(struct address_space *, sector_t);
    	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
    	int (*releasepage) (struct page *, gfp_t);
    	void (*freepage)(struct page *);
    	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
    	/*
    	 * migrate the contents of a page to the specified target. If
    	 * migrate_mode is MIGRATE_ASYNC, it must not block.
    	 */
    	int (*migratepage) (struct address_space *,
    			struct page *, struct page *, enum migrate_mode);
    	bool (*isolate_page)(struct page *, isolate_mode_t);
    	void (*putback_page)(struct page *);
    	int (*launder_page) (struct page *);
    	int (*is_partially_uptodate) (struct page *, unsigned long,
    					unsigned long);
    	void (*is_dirty_writeback) (struct page *, bool *, bool *);
    	int (*error_remove_page)(struct address_space *, struct page *);
    
    	/* swapfile support */
    	int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
    				sector_t *span);
    	void (*swap_deactivate)(struct file *file);
    };
    

    在file.c中,具体实现如下:

    struct address_space_operations sfs_aops = {
    	.readpage       = simple_readpage,
    	.write_begin	= simple_write_begin,
    	.write_end	= simple_write_end
    };
    

    readpage从存储器将一页读入页帧。

    write_begin和write_end执行由write系统调用触发的写操作,begin将事务数据存储到日志,end将执行实际的写操作,在写入时,内核必须保证两个函数成对使用,并且顺序正确,否则日志机制会失效。当前将写操作划分为两部分已经成为一个约定俗成的传统。

    注意,原来的老版本代码中write_begin和write_end分别命名为:prepare_write和commit_write。

    实验结果

    可见,基本的文件夹创建、文件读写操作目前都已经支持。

    Day10 11

    samplefs整体已经完成构建,在day10和11中也只是对address_space_operations中的readpages和writepages进行了介绍,以及在inode_operations中加入了对硬链接和软链接的处理。

    参考资料

    linux文件系统二 VFS读写流程
    linux内核 address_space 结构

  • 相关阅读:
    spark on yarn模式下内存资源管理(笔记1)
    面试题10.3-变态跳台阶
    面试题10.2-青蛙跳
    面试题9-斐波那契数列
    面试题9-用两个栈来实现一个队列,完成队列的Push和Pop操作
    面试题6:输入一个链表,按链表值从尾到头的顺序返回一个ArrayList
    鸢尾花数据集-iris.data
    class之cls
    python 装饰器
    supervisor python开发的进程管理工具
  • 原文地址:https://www.cnblogs.com/LuoboLiam/p/14367373.html
Copyright © 2020-2023  润新知