• Linux Hugetlbfs Kernel Source Analysis ----- (1) Hugetlbfs Initialization


    1. Introduction

      To implement virtual memory management, the operating system manages memory in pages. Since paging was introduced, the default page size has been 4096 bytes (4KB). Although the page size is in principle configurable, the vast majority of operating system implementations still use the 4KB default. When an application needs several gigabytes, or even tens of gigabytes, of memory, 4KB pages become a serious constraint on performance.

      The CPU contains a small dedicated cache, the TLB (Translation Lookaside Buffer), for caching virtual-to-physical address translations, and its capacity is limited. With the default 4KB page size, each TLB entry covers only 4KB of address space, so a large working set needs many entries, which causes frequent TLB misses and page faults and significantly degrades application performance. If the operating system pages in units of 2MB or larger instead, the number of TLB misses and page faults drops sharply and performance improves noticeably; this is the direct reason the Linux kernel introduced huge page support. The benefit is easy to quantify. Suppose an application needs 2MB of memory. With 4KB pages it needs 512 pages, hence 512 TLB entries and 512 page table entries, and the operating system must go through at least 512 TLB misses and 512 page faults to map the whole 2MB into physical memory. With 2MB pages, a single TLB miss and a single page fault establish the mapping for the entire 2MB, and no further TLB misses or page faults occur during execution (assuming no TLB entry eviction and no swapping).
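      To make the arithmetic concrete, here is a small userspace sketch (an illustration only, not taken from the kernel sources):

     /* Entries and first-touch faults needed to map 2MB with 4KB vs. 2MB pages. */
     #include <stdio.h>

     int main(void)
     {
         const unsigned long map_len = 2UL * 1024 * 1024;  /* 2MB mapping   */
         const unsigned long base_pg = 4UL * 1024;         /* 4KB base page */
         const unsigned long huge_pg = 2UL * 1024 * 1024;  /* 2MB huge page */

         /* one TLB entry, one page table entry, one first-touch fault per page */
         printf("4KB pages: %lu\n", map_len / base_pg);    /* 512 */
         printf("2MB pages: %lu\n", map_len / huge_pg);    /* 1   */
         return 0;
     }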

      To support huge pages at minimal cost, Linux provides 2MB huge pages through the special hugetlbfs file system. Exposing huge pages through a file system lets applications choose the virtual page size they need, instead of being forced onto 2MB pages everywhere.

    2. Using HugePages

      The examples in this article are taken from the documentation shipped with the Linux kernel source (Documentation/vm/hugetlbpage.txt). Before using hugetlbfs, the kernel must be configured (make menuconfig) with the CONFIG_HUGETLB_PAGE and CONFIG_HUGETLBFS options, both of which can be found in the File systems configuration menu.

      After the kernel has been built and boots successfully, mount the hugetlbfs special file system onto a directory in the root file system so that it becomes accessible (the mount point must exist first). The commands are:

      mkdir -p /mnt/huge
      mount -t hugetlbfs none /mnt/huge

      From then on, any file created under /mnt/huge/ is mapped into memory using 2MB as the paging unit. Note that files in hugetlbfs do not support the read/write system calls (read(), write(), and so on); they are normally accessed through memory mapping. To illustrate, the following example is also adapted, slightly simplified, from the kernel document mentioned above.

     Listing 1. A Linux huge page example

     #include <fcntl.h>
     #include <stdio.h>
     #include <sys/mman.h>
     #include <unistd.h>

     #define MAP_LENGTH      (10*1024*1024)

     int main()
     {
         int fd;
         void *addr;

         /* create a file in hugetlbfs; O_CREAT requires a mode argument */
         fd = open("/mnt/huge/test", O_CREAT | O_RDWR, 0755);
         if (fd < 0) {
             perror("Err: ");
             return -1;
         }

         /* map the file into the address space of the current process */
         addr = mmap(0, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
         if (addr == MAP_FAILED) {
             perror("Err: ");
             close(fd);
             unlink("/mnt/huge/test");
             return -1;
         }

         /* from now on, application data can be stored on huge pages via addr */

         munmap(addr, MAP_LENGTH);
         close(fd);
         unlink("/mnt/huge/test");
         return 0;
     }
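      The program can be compiled with gcc and typically needs to run as root. The huge page pool must also be large enough: MAP_LENGTH is 10MB, i.e. five 2MB pages, and for a shared mapping hugetlbfs reserves the pages at mmap() time, so mmap() fails with ENOMEM if the pool cannot cover the reservation.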
     

      Huge page statistics are available through the proc special file system (/proc). For example, /proc/sys/vm/nr_hugepages reports the number of huge pages currently configured in the kernel; writing to it changes that number:

      echo 20 > /proc/sys/vm/nr_hugepages
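      Pool usage can also be inspected via /proc/meminfo, which exposes the HugePages_Total, HugePages_Free, HugePages_Rsvd, HugePages_Surp, and Hugepagesize fields:

      grep Huge /proc/meminfo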

    3. Hugetlbfs Initialization (based on Linux-3.4.51)

    1. hugetlb initialization

      hugetlb initialization is performed by the hugetlb_init() function. It mainly initializes the global hstates[] array and creates the related sysfs directories and files.
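      For reference, below is an abridged sketch of struct hstate (include/linux/hugetlb.h), trimmed to the fields that the code in this article actually touches; the real definition contains more:

     struct hstate {
         unsigned int order;               /* one huge page = 2^order base pages */
         unsigned long mask;
         unsigned long max_huge_pages;     /* pool size requested via boot/sysctl */
         unsigned long nr_huge_pages;      /* pages currently in the pool */
         unsigned long free_huge_pages;    /* pool pages not yet handed out */
         unsigned long resv_huge_pages;    /* reserved for existing mappings */
         unsigned long surplus_huge_pages; /* overcommit pages, freed when idle */
         struct list_head hugepage_freelists[MAX_NUMNODES]; /* per-node free lists */
         char name[HSTATE_NAME_LEN];       /* e.g. "hugepages-2048kB" */
     };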

     static int __init hugetlb_init(void)
     {
         /* Some platforms decide whether they support huge pages at boot
          * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
          * there is no such support.
          */
         if (HPAGE_SHIFT == 0)
             return 0;

         if (!size_to_hstate(default_hstate_size)) {
             default_hstate_size = HPAGE_SIZE;  /* default size: 2MB */
             if (!size_to_hstate(default_hstate_size))
                 /* Initialize the hstates[] array; it holds a single entry.
                  * HUGETLB_PAGE_ORDER = 9, i.e. h->order = 9. */
                 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
         }
         /* hstates[] holds a single entry, so default_hstate_idx = 0 */
         default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
         /* The default maximum page count is 0 unless hugepages= was
          * given on the kernel command line */
         if (default_hstate_max_huge_pages)
             default_hstate.max_huge_pages = default_hstate_max_huge_pages;

         /* Allocates max_huge_pages pages for each hstate; with the
          * default maximum of 0, nothing is allocated here */
         hugetlb_init_hstates();
         /* Pick up huge pages already allocated from bootmem during early
          * boot (gigantic pages with order >= MAX_ORDER requested via
          * hugepages=) and add them to the hstate pools */
         gather_bootmem_prealloc();
         /* Print the post-initialization summary */
         report_hugepages();
         /* Create the /sys/kernel/mm/hugepages directories and files */
         hugetlb_sysfs_init();
         /* Create the per-node /sys/devices/system/node/nodeN/hugepages
          * directories and files */
         hugetlb_register_all_nodes();
         return 0;
     }
     module_init(hugetlb_init);
    In addition, the default huge page size can be set with the kernel boot parameter default_hugepagesz. For example, default_hugepagesz=4M sets default_hstate_size to 4MB (the value must be a huge page size supported by the architecture). The kernel implementation is:
      
    static int __init hugetlb_default_setup(char *s)
    {
        default_hstate_size = memparse(s, &s);
        return 1;
    }
    __setup("default_hugepagesz=", hugetlb_default_setup);
    A huge page is implemented by treating 2^order physically contiguous 4KB base pages as a single compound page; for a 2MB huge page, order = 9, i.e. 512 base pages.
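    As a quick illustration of the order arithmetic (a userspace sketch; order_for_size() is a hypothetical helper, not a kernel function):

     #include <stdio.h>

     /* smallest order such that (4096UL << order) >= size */
     static unsigned int order_for_size(unsigned long size)
     {
         unsigned int order = 0;
         while ((4096UL << order) < size)
             order++;
         return order;
     }

     int main(void)
     {
         printf("2MB -> order %u\n", order_for_size(2UL << 20)); /* 9, HUGETLB_PAGE_ORDER on x86 */
         printf("1GB -> order %u\n", order_for_size(1UL << 30)); /* 18 */
         return 0;
     }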

    The number of huge pages can also be set with the kernel boot parameter hugepages, for example hugepages=1024. The kernel implementation is:
     static int __init hugetlb_nrpages_setup(char *s)
     {
         unsigned long *mhp;
         static unsigned long *last_mhp;

         /*
          * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
          * so this hugepages= parameter goes to the "default hstate".
          */
         if (!max_hstate)
             mhp = &default_hstate_max_huge_pages;
         else
             mhp = &parsed_hstate->max_huge_pages;

         if (mhp == last_mhp) {
             printk(KERN_WARNING "hugepages= specified twice without "
                 "interleaving hugepagesz=, ignoring\n");
             return 1;
         }

         if (sscanf(s, "%lu", mhp) <= 0)
             *mhp = 0;

         /*
          * Global state is always initialized later in hugetlb_init.
          * But we need to allocate >= MAX_ORDER hstates here early to still
          * use the bootmem allocator.
          */
         /* For the default 2MB hstate, parsed_hstate->order = 9 < MAX_ORDER = 11,
          * so hugetlb_hstate_alloc_pages() is not called here. The actual pages
          * are allocated later, in hugetlb_init() -> hugetlb_init_hstates() ->
          * hugetlb_hstate_alloc_pages(). Only gigantic pages (order >= MAX_ORDER)
          * must be allocated now, while the bootmem allocator is still usable. */
         if (max_hstate && parsed_hstate->order >= MAX_ORDER)
             hugetlb_hstate_alloc_pages(parsed_hstate);

         last_mhp = mhp;
         return 1;
     }
     __setup("hugepages=", hugetlb_nrpages_setup);
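    Putting the parameters together, a boot command line such as the following reserves 512 two-megabyte pages at boot (hugepagesz= itself is parsed by architecture-specific code, e.g. setup_hugepagesz() on x86):

      default_hugepagesz=2M hugepagesz=2M hugepages=512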
    
    

    The number of huge pages can also be changed at run time, e.g. echo 20 > /proc/sys/vm/nr_hugepages, which goes through the proc/sysctl machinery.
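    Equivalently, sysctl -w vm.nr_hugepages=20 writes the same sysctl entry. Either way, the handler invoked in the kernel is hugetlb_sysctl_handler():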

    int hugetlb_sysctl_handler(struct ctl_table *table, int write,
              void __user *buffer, size_t *length, loff_t *ppos)
    {
        return hugetlb_sysctl_handler_common(false, table, write,
                                buffer, length, ppos);
    }
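    The real work happens in hugetlb_sysctl_handler_common(), which copies the new value from user space and then resizes the pool: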
    
    
     static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
                  struct ctl_table *table, int write,
                  void __user *buffer, size_t *length, loff_t *ppos)
     {
         struct hstate *h = &default_hstate;
         unsigned long tmp;
         int ret;

         tmp = h->max_huge_pages;

         if (write && h->order >= MAX_ORDER)
             return -EINVAL;

         table->data = &tmp;
         table->maxlen = sizeof(unsigned long);
         /* Copy the value written from user space into table->data
          * (i.e. tmp), with range checking */
         ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
         if (ret)
             goto out;

         if (write) {
             NODEMASK_ALLOC(nodemask_t, nodes_allowed,
                            GFP_KERNEL | __GFP_NORETRY);
             if (!(obey_mempolicy &&
                        init_nodemask_of_mempolicy(nodes_allowed))) {
                 NODEMASK_FREE(nodes_allowed);
                 nodes_allowed = &node_states[N_HIGH_MEMORY];
             }
             /* Set the new maximum and allocate (or free) actual pages */
             h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);

             if (nodes_allowed != &node_states[N_HIGH_MEMORY])
                 NODEMASK_FREE(nodes_allowed);
         }
     out:
         return ret;
     }
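    set_max_huge_pages() performs the actual resizing: to grow the pool it first reclaims surplus pages and then allocates fresh huge pages; to shrink it, it releases free pages back to the buddy allocator and marks whatever cannot be freed yet as surplus: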
    
    
     static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
                             nodemask_t *nodes_allowed)
     {
         unsigned long min_count, ret;

         if (h->order >= MAX_ORDER)
             return h->max_huge_pages;

         /*
          * Increase the pool size
          * First take pages out of surplus state.  Then make up the
          * remaining difference by allocating fresh huge pages.
          *
          * We might race with alloc_buddy_huge_page() here and be unable
          * to convert a surplus huge page to a normal huge page. That is
          * not critical, though, it just means the overall size of the
          * pool might be one hugepage larger than it needs to be, but
          * within all the constraints specified by the sysctls.
          */
         spin_lock(&hugetlb_lock);
         while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
             if (!adjust_pool_surplus(h, nodes_allowed, -1))
                 break;
         }

         while (count > persistent_huge_pages(h)) {
             /*
              * If this allocation races such that we no longer need the
              * page, free_huge_page will handle it by freeing the page
              * and reducing the surplus.
              */
             spin_unlock(&hugetlb_lock);
             /* Allocate a fresh huge page from the buddy allocator */
             ret = alloc_fresh_huge_page(h, nodes_allowed);
             spin_lock(&hugetlb_lock);
             if (!ret)
                 goto out;

             /* Bail for signals. Probably ctrl-c from user */
             if (signal_pending(current))
                 goto out;
         }

         /*
          * Decrease the pool size
          * First return free pages to the buddy allocator (being careful
          * to keep enough around to satisfy reservations).  Then place
          * pages into surplus state as needed so the pool will shrink
          * to the desired size as pages become free.
          *
          * By placing pages into the surplus state independent of the
          * overcommit value, we are allowing the surplus pool size to
          * exceed overcommit. There are few sane options here. Since
          * alloc_buddy_huge_page() is checking the global counter,
          * though, we'll note that we're not allowed to exceed surplus
          * and won't grow the pool anywhere else. Not until one of the
          * sysctls are changed, or the surplus pages go out of use.
          */
         min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
         min_count = max(count, min_count);
         try_to_free_low(h, min_count, nodes_allowed);
         while (min_count < persistent_huge_pages(h)) {
             if (!free_pool_huge_page(h, nodes_allowed, 0))
                 break;
         }
         while (count < persistent_huge_pages(h)) {
             if (!adjust_pool_surplus(h, nodes_allowed, 1))
                 break;
         }
     out:
         ret = persistent_huge_pages(h);
         spin_unlock(&hugetlb_lock);
         return ret;
     }
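    The persistent_huge_pages() helper used above excludes surplus pages from the pool count (from mm/hugetlb.c):

     static unsigned long persistent_huge_pages(struct hstate *h)
     {
         return h->nr_huge_pages - h->surplus_huge_pages;
     }

    alloc_fresh_huge_page() then walks the allowed NUMA nodes round-robin until one of them can satisfy the allocation: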
     static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
     {
         struct page *page;
         int start_nid;
         int next_nid;
         int ret = 0;

         start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
         next_nid = start_nid;

         do {
             /* Allocate 2^h->order contiguous 4KB pages from this node's
              * zonelist and return the first page; on failure, try the
              * next allowed node */
             page = alloc_fresh_huge_page_node(h, next_nid);
             if (page) {
                 ret = 1;
                 break;
             }
             next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
         } while (next_nid != start_nid);

         if (ret)
             count_vm_event(HTLB_BUDDY_PGALLOC);
         else
             count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

         return ret;
     }
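    The per-node allocation is where the compound page is actually built: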
     static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
     {
         struct page *page;

         if (h->order >= MAX_ORDER)
             return NULL;

         /* __GFP_COMP: allocate 2^h->order contiguous 4KB pages as a
          * compound page, return the head page, and set PG_compound */
         page = alloc_pages_exact_node(nid,
             htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
                             __GFP_REPEAT|__GFP_NOWARN,
             huge_page_order(h));
         if (page) {
             if (arch_prepare_hugepage(page)) {
                 __free_pages(page, huge_page_order(h));
                 return NULL;
             }
             /* 1. Store free_huge_page() as the compound-page destructor
              *    in the second page's lru.next field.
              * 2. When the last reference is dropped, put_page() invokes
              *    free_huge_page() -> enqueue_huge_page(), which returns
              *    the page to h->hugepage_freelists[nid]. */
             prep_new_huge_page(h, page, nid);
         }

         return page;
     }

    2. hugetlbfs initialization

     Creating hugetlbfs mainly means establishing the VFS-level mappings among super_block, dentry, and inode, and tying them to the hstates[] array initialized by hugetlb_init(), and therefore to the allocated huge pages. Concretely, kern_mount() ends up in hugetlbfs_fill_super(), which records the hstate for the mount (default_hstate by default) in the superblock's hugetlbfs_sb_info and creates the root inode and dentry. [The original post includes a diagram of these relationships, which is not reproduced here.]

     static int __init init_hugetlbfs_fs(void)
     {
         int error;
         struct vfsmount *vfsmount;

         /* Initialize the hugetlbfs writeback (backing_dev_info) structure */
         error = bdi_init(&hugetlbfs_backing_dev_info);
         if (error)
             return error;

         error = -ENOMEM;
         /* Create the hugetlbfs_inode_cachep slab cache; hugetlbfs inodes
          * are allocated from it later */
         hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
                         sizeof(struct hugetlbfs_inode_info),
                         0, 0, init_once);
         if (hugetlbfs_inode_cachep == NULL)
             goto out2;

         /* Add hugetlbfs_fs_type to the global file_systems list */
         error = register_filesystem(&hugetlbfs_fs_type);
         if (error)
             goto out;

         /* Create the hugetlbfs super_block, dentry, and inode, link them
          * to one another, and link them to hugetlbfs_fs_type,
          * default_hstate, and hugetlbfs_inode_cachep */
         vfsmount = kern_mount(&hugetlbfs_fs_type);

         if (!IS_ERR(vfsmount)) {
             hugetlbfs_vfsmount = vfsmount;
             return 0;
         }

         error = PTR_ERR(vfsmount);

      out:
         kmem_cache_destroy(hugetlbfs_inode_cachep);
      out2:
         bdi_destroy(&hugetlbfs_backing_dev_info);
         return error;
     }

    Corrections and suggestions are welcome.

    References:

    http://www.ibm.com/developerworks/cn/linux/l-cn-hugetlb/
