• linux下内存分配时overcommit使用


    一、用户态地址分配主要路径
              用户态所有地址空间的申请主要经过mmap系统调用,也有一些是基于brk系统调用,对于mmap系统调用,它在某些条件下会执行申请空间合理性的判断,而brk则是一定进行合理性判断,下面是mmap相关的一些代码do_mmap_pgoff,它主要就是在于如果新分配的空间是私有可写空间,就会进行一次当前系统页面状况的检查,也就是执行security_vm_enough_memory函数:
    if (accountable && (!(flags & MAP_NORESERVE) || // 一般用户态申请都是满足这个条件,从而进入下面分支
        sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
    if (vm_flags & VM_SHARED) {
    /* Check memory availability in shmem_file_setup? */
    vm_flags |= VM_ACCOUNT;
    } else if (vm_flags & VM_WRITE) {//私有可写空间进行剩余空间检查
    /*
     * Private writable mapping: check memory availability
     */
    charged = len >> PAGE_SHIFT;
    if (security_vm_enough_memory(charged))
    return -ENOMEM;
    vm_flags |= VM_ACCOUNT;
    }
    }
    二、检查时主要代码security_vm_enough_memory
    security_vm_enough_memory函数相关的主要代码如下,默认情况下,系统的sysctl_overcommit_memory配置为OVERCOMMIT_GUESS,也就是零:
    security_vm_enough_memory-->>cap_vm_enough_memory-->>__vm_enough_memory
    if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
    unsigned long n;
     
    free = global_page_state(NR_FILE_PAGES); // 可写回文件
    free += nr_swap_pages;
     
    /*
     * Any slabs which are created with the
     * SLAB_RECLAIM_ACCOUNT flag claim to have contents
     * which are reclaimable, under pressure.  The dentry
     * cache and most inode caches should fall into this
     */
    free += global_page_state(NR_SLAB_RECLAIMABLE);//可回收内核中slab结构
     
    /*
     * Leave the last 3% for root
     */
    if (!cap_sys_admin)
    free -= free / 32;
     
    if (free > pages)
    return 0;
     
    /*
     * nr_free_pages() is very expensive on large systems,
     * only call if we're about to fail.
     */
    n = nr_free_pages();//当前空闲页面
     
    /*
     * Leave reserved pages. The pages are not for anonymous pages.
     */
    if (n <= totalreserve_pages)
    goto error;
    else
    n -= totalreserve_pages;
     
    /*
     * Leave the last 3% for root
     */
    if (!cap_sys_admin)
    n -= n / 32;
    free += n;
     
    if (free > pages)
    return 0;
     
    goto error;
    }
    此时判断逻辑就是判断系统当前 文件页面(即这些页面有对应的文件,可以写回文件从而释放内存页面) + 可回收slab空间(内核结构) + 空闲页面数量(不需要做任何处理可以马上使用的页面)。在注释中说明,nr_free_pages在一些内存(节点)较多的系统中消耗可能会比较大,所以当前两种类型页面数量不能满足时才会判断空闲页面。这里还减少了系统保留页面totalreserve_pages。
    三、这些空间的查看:
    1、可用空间计算
    总大小为1715983个页面
    tsecer@harry :cat /proc/vmstat  |head -20
    nr_free_pages 129372
    nr_inactive_anon 214454
    nr_active_anon 719778
    nr_inactive_file 469531
    nr_active_file 425033
    nr_unevictable 0
    nr_mlock 0
    nr_anon_pages 308366
    nr_mapped 296513
    nr_file_pages 1520481
    nr_dirty 124
    nr_writeback 0
    nr_slab_reclaimable 66130
    nr_slab_unreclaimable 5662
    nr_page_table_pages 4093
    nr_kernel_stack 829
    nr_unstable 0
    nr_bounce 0
    nr_vmscan_write 86855
    nr_writeback_temp 0
    2、reserve空间的计算
    内核中计算代码
    /*
     * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
     * or min_free_kbytes changes.
     */
    static void calculate_totalreserve_pages(void)
    {
    struct pglist_data *pgdat;
    unsigned long reserve_pages = 0;
    enum zone_type i, j;
     
    for_each_online_pgdat(pgdat) {
    for (i = 0; i < MAX_NR_ZONES; i++) {
    struct zone *zone = pgdat->node_zones + i;
    unsigned long max = 0;
     
    /* Find valid and maximum lowmem_reserve in the zone */
    for (j = i; j < MAX_NR_ZONES; j++) {
    if (zone->lowmem_reserve[j] > max)
    max = zone->lowmem_reserve[j];
    }
     
    /* we treat pages_high as reserved pages. */
    max += zone->pages_high;
     
    if (max > zone->present_pages)//下面的例子中zone      DMA将会满足这个分支
    max = zone->present_pages;
    reserve_pages += max;
    }
    }
    totalreserve_pages = reserve_pages;
    }
     
    用户态查看这些变量,计算其总大小为 7 + 3833 + 2011 + 4292 + 2292 = 12435
    tsecer@harry :cat /proc/zoneinfo 
    Node 0, zone      DMA
      pages free     3979
            min      5
            low      6
            high     7
            scanned  0
            spanned  4096
            present  3833
        nr_free_pages 3979
        nr_inactive_anon 0
        nr_active_anon 0
        nr_inactive_file 0
        nr_active_file 0
        nr_unevictable 0
        nr_mlock     0
        nr_anon_pages 0
        nr_mapped    0
        nr_file_pages 0
        nr_dirty     0
        nr_writeback 0
        nr_slab_reclaimable 0
        nr_slab_unreclaimable 0
        nr_page_table_pages 0
        nr_kernel_stack 0
        nr_unstable  0
        nr_bounce    0
        nr_vmscan_write 0
        nr_writeback_temp 0
        nr_isolated_anon 0
        nr_isolated_file 0
        nr_shmem     0
        numa_hit     0
        numa_miss    0
        numa_foreign 0
        numa_interleave 0
        numa_local   0
        numa_other   0
            protection: (0, 3768, 8060, 8060)
      pagesets
        cpu: 0
                  count: 0
                  high:  0
                  batch: 1
      vm stats threshold: 4
        cpu: 1
                  count: 0
                  high:  0
                  batch: 1
      vm stats threshold: 4
      all_unreclaimable: 1
      prev_priority:     12
      start_pfn:         0
      inactive_ratio:    1
    Node 0, zone    DMA32
      pages free     73888
            min      1341
            low      1676
            high     2011
            scanned  0
            spanned  1044480
            present  964664
        nr_free_pages 73888
        nr_inactive_anon 91477
        nr_active_anon 250603
        nr_inactive_file 247542
        nr_active_file 222216
        nr_unevictable 0
        nr_mlock     0
        nr_anon_pages 88383
        nr_mapped    138528
        nr_file_pages 723455
        nr_dirty     54
        nr_writeback 0
        nr_slab_reclaimable 37898
        nr_slab_unreclaimable 2103
        nr_page_table_pages 1120
        nr_kernel_stack 447
        nr_unstable  0
        nr_bounce    0
        nr_vmscan_write 14094
        nr_writeback_temp 0
        nr_isolated_anon 0
        nr_isolated_file 0
        nr_shmem     253697
        numa_hit     4482015936
        numa_miss    0
        numa_foreign 0
        numa_interleave 0
        numa_local   4482015936
        numa_other   0
            protection: (0, 0, 4292, 4292)
      pagesets
        cpu: 0
                  count: 168
                  high:  186
                  batch: 31
      vm stats threshold: 24
        cpu: 1
                  count: 156
                  high:  186
                  batch: 31
      vm stats threshold: 24
      all_unreclaimable: 0
      prev_priority:     12
      start_pfn:         4096
      inactive_ratio:    5
    Node 0, zone   Normal
      pages free     41986
            min      1528
            low      1910
            high     2292
            scanned  0
            spanned  1114112
            present  1098880
        nr_free_pages 41986
        nr_inactive_anon 122977
        nr_active_anon 469482
        nr_inactive_file 230289
        nr_active_file 203672
        nr_unevictable 0
        nr_mlock     0
        nr_anon_pages 220252
        nr_mapped    158002
        nr_file_pages 806178
        nr_dirty     233
        nr_writeback 0
        nr_slab_reclaimable 28248
        nr_slab_unreclaimable 3556
        nr_page_table_pages 2978
        nr_kernel_stack 383
        nr_unstable  0
        nr_bounce    0
        nr_vmscan_write 72761
        nr_writeback_temp 0
        nr_isolated_anon 0
        nr_isolated_file 0
        nr_shmem     372212
        numa_hit     3099991491
        numa_miss    0
        numa_foreign 0
        numa_interleave 8025
        numa_local   3099991491
        numa_other   0
            protection: (0, 0, 0, 0)
      pagesets
        cpu: 0
                  count: 82
                  high:  186
                  batch: 31
      vm stats threshold: 28
        cpu: 1
                  count: 143
                  high:  186
                  batch: 31
      vm stats threshold: 28
      all_unreclaimable: 0
      prev_priority:     12
      start_pfn:         1048576
      inactive_ratio:    6
    四、测试下效果
    tsecer@harry :cat overcommit.c 
    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Overcommit probe: allocate argv[1] * 4096 bytes (argv[1] is a page
     * count) with malloc, repeated argv[2] times (default 1), printing the
     * address returned each time. A NULL address means the kernel's
     * overcommit check (__vm_enough_memory) rejected the request.
     *
     * Fixes vs. the original: guard against missing argv[1] (the original
     * dereferenced it unconditionally), use %zu for size_t (the original
     * %ld is a format-specifier mismatch, undefined behavior), restore the
     * newline evidently lost in transcription, and return an exit status.
     */
    int main(int argc, char *argv[])
    {
        int iloop = 1;

        if (argc < 2)
        {
            fprintf(stderr, "usage: %s <pages> [loops]\n", argv[0]);
            return 1;
        }
        if (argc >= 3)
        {
            iloop = atoi(argv[2]);
        }

        /* The requested length is loop-invariant; compute it once. */
        size_t len = (size_t)atol(argv[1]) * 0x1000;

        for (int i = 0; i < iloop; i++)
        {
            void *pm = malloc(len);
            printf("len %zu addr %p\n", len, pm);
        }
        return 0;
    }
    tsecer@harry :g++ overcommit.c 
    tsecer@harry :cat /proc/vmstat | egrep "nr_file_pages|nr_slab_reclaimable|nr_free_pages" | awk '{sum += $2} END {print sum}'
    1655451
    tsecer@harry :./a.out $((1655557 - 11435))
    len 6734323712 addr (nil)
    tsecer@harry :./a.out $((1655557 - 12435))
    len 6730227712 addr (nil)
    tsecer@harry :./a.out $((1655557 - 13435))
    len 6726131712 addr 0x7fc7d335b010
    tsecer@harry :./a.out $((1655557 - 13435)) 100 |head 这里的现象是只要没有超过这个阈值,虚拟空间就可以一直申请
    len 6726131712 addr 0x7fc76f8cb010
    len 6726131712 addr 0x7fc5dea40010
    len 6726131712 addr 0x7fc44dbb5010
    len 6726131712 addr 0x7fc2bcd2a010
    len 6726131712 addr 0x7fc12be9f010
    len 6726131712 addr 0x7fbf9b014010
    len 6726131712 addr 0x7fbe0a189010
    len 6726131712 addr 0x7fbc792fe010
    len 6726131712 addr 0x7fbae8473010
    len 6726131712 addr 0x7fb9575e8010
    五、oomkiller
    out_of_memory-->>__out_of_memory
    tsecer@harry :cat /proc/sys/vm/oom_kill_allocating_task 
    0
    /*
     * Must be called with tasklist_lock held for read.
     */
    static void __out_of_memory(gfp_t gfp_mask, int order)
    {
    struct task_struct *p;
    unsigned long points;
     
    if (sysctl_oom_kill_allocating_task)
    if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
    "Out of memory (oom_kill_allocating_task)"))
    return;
    /*
     * Rambo mode: Shoot down a process and hope it solves whatever
     * issues we may have.
     */
    p = select_bad_process(&points, NULL);
    这个select_bad_process的选择里面代码大部分是工程上的一些微调,主要原则就是优先杀死虚拟内存占用量最高的进程,通常也就是我们的主逻辑进程,下面是选择代码,备份在这里便于查阅:
     
    /**
     * badness - calculate a numeric value for how bad this task has been
     * @p: task struct of which task we should calculate
     * @uptime: current uptime in seconds
     *
     * The formula used is relatively simple and documented inline in the
     * function. The main rationale is that we want to select a good task
     * to kill when we run out of memory.
     *
     * Good in this context means that:
     * 1) we lose the minimum amount of work done
     * 2) we recover a large amount of memory
     * 3) we don't kill anything innocent of eating tons of memory
     * 4) we want to kill the minimum amount of processes (one)
     * 5) we try to kill the process the user expects us to kill, this
     *    algorithm has been meticulously tuned to meet the principle
     *    of least surprise ... (be careful when you change it)
     */
     
    /*
     * Compute the OOM "badness" score for task p; higher means a more
     * attractive kill candidate. Returns 0 for tasks that must not be
     * killed (oom_adj == OOM_DISABLE, or no mm), ULONG_MAX for tasks
     * flagged PF_OOM_ORIGIN. The base score is the task's total virtual
     * memory, then adjusted by children, CPU/run time, nice level,
     * capabilities, NUMA overlap, and finally oom_adj.
     */
    unsigned long badness(struct task_struct *p, unsigned long uptime)
    {
    unsigned long points, cpu_time, run_time;
    struct mm_struct *mm;
    struct task_struct *child;
    int oom_adj = p->signal->oom_adj;
    struct task_cputime task_time;
    unsigned long utime;
    unsigned long stime;
     
    /* Tasks with oom_adj == OOM_DISABLE are never OOM-kill candidates. */
    if (oom_adj == OOM_DISABLE)
    return 0;
     
    /* task_lock protects p->mm against concurrent release. */
    task_lock(p);
    mm = p->mm;
    if (!mm) {
    /* Kernel thread or task that already released its mm: score 0. */
    task_unlock(p);
    return 0;
    }
     
    /*
     * The memory size of the process is the basis for the badness.
     */
    points = mm->total_vm;
     
    /*
     * After this unlock we can no longer dereference local variable `mm'
     */
    task_unlock(p);
     
    /*
     * swapoff can easily use up all memory, so kill those first.
     */
    if (p->flags & PF_OOM_ORIGIN)
    return ULONG_MAX;
     
    /*
     * Processes which fork a lot of child processes are likely
     * a good choice. We add half the vmsize of the children if they
     * have an own mm. This prevents forking servers to flood the
     * machine with an endless amount of children. In case a single
     * child is eating the vast majority of memory, adding only half
     * to the parents will make the child our kill candidate of choice.
     */
    list_for_each_entry(child, &p->children, sibling) {
    task_lock(child);
    /* Only count children with their own mm (skip threads/vfork sharing). */
    if (child->mm != mm && child->mm)
    points += child->mm->total_vm/2 + 1;
    task_unlock(child);
    }
     
    /*
     * CPU time is in tens of seconds and run time is in thousands
             * of seconds. There is no particular reason for this other than
             * that it turned out to work very well in practice.
     */
    thread_group_cputime(p, &task_time);
    utime = cputime_to_jiffies(task_time.utime);
    stime = cputime_to_jiffies(task_time.stime);
    cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
     
     
    /* Wall-clock age of the task, scaled down by 2^10. */
    if (uptime >= p->start_time.tv_sec)
    run_time = (uptime - p->start_time.tv_sec) >> 10;
    else
    run_time = 0;
     
    /* Long-running / CPU-heavy tasks are presumed more valuable: divide. */
    if (cpu_time)
    points /= int_sqrt(cpu_time);
    if (run_time)
    points /= int_sqrt(int_sqrt(run_time));
     
    /*
     * Niced processes are most likely less important, so double
     * their badness points.
     */
    if (task_nice(p) > 0)
    points *= 2;
     
    /*
     * Superuser processes are usually more important, so we make it
     * less likely that we kill those.
     */
    if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
        has_capability_noaudit(p, CAP_SYS_RESOURCE))
    points /= 4;
     
    /*
     * We don't want to kill a process with direct hardware access.
     * Not only could that mess up the hardware, but usually users
     * tend to only have this flag set on applications they think
     * of as important.
     */
    if (has_capability_noaudit(p, CAP_SYS_RAWIO))
    points /= 4;
     
    /*
     * If p's nodes don't overlap ours, it may still help to kill p
     * because p may have allocated or otherwise mapped memory on
     * this node before. However it will be less likely.
     */
    if (!has_intersects_mems_allowed(p))
    points /= 8;
     
    /*
     * Adjust the score by oom_adj: a positive value scales the score up
     * exponentially (left shift), a negative value scales it down.
     */
    if (oom_adj) {
    if (oom_adj > 0) {
    /* Ensure a nonzero score so the shift has an effect. */
    if (!points)
    points = 1;
    points <<= oom_adj;
    } else
    points >>= -(oom_adj);
    }
     
    #ifdef DEBUG
    /* NOTE(review): format string appears to have lost its trailing
     * newline in blog transcription — confirm against kernel source. */
    printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points ",
    p->pid, p->comm, points);
    #endif
    return points;
    }
     
    /*
     * Simple selection loop. We chose the process with the highest
     * number of 'points'. We expect the caller will lock the tasklist.
     *
     * (not docbooked, we don't want this one cluttering up the manual)
     */
    /*
     * Scan all processes and pick the one with the highest badness() score
     * as the OOM-kill victim. Returns the chosen task, NULL if none is
     * eligible, or ERR_PTR(-1UL) to tell the caller to back off (a victim
     * is already dying or exiting). *ppoints receives the winning score.
     * Caller must hold tasklist_lock for read.
     */
    static struct task_struct *select_bad_process(unsigned long *ppoints,
    struct mem_cgroup *mem)
    {
    struct task_struct *p;
    struct task_struct *chosen = NULL;
    struct timespec uptime;
    *ppoints = 0;
     
    do_posix_clock_monotonic_gettime(&uptime);
    for_each_process(p) {
    unsigned long points;
     
    /*
     * skip kernel threads and tasks which have already released
     * their mm.
     */
    if (!p->mm)
    continue;
    /* skip the init task */
    if (is_global_init(p))
    continue;
    /* When invoked for a memcg OOM, only consider tasks in that cgroup. */
    if (mem && !task_in_mem_cgroup(p, mem))
    continue;
     
    /*
     * This task already has access to memory reserves and is
     * being killed. Don't allow any other task access to the
     * memory reserve.
     *
     * Note: this may have a chance of deadlock if it gets
     * blocked waiting for another task which itself is waiting
     * for memory. Is there a better alternative?
     */
    if (test_tsk_thread_flag(p, TIF_MEMDIE))
    return ERR_PTR(-1UL);
     
    /*
     * This is in the process of releasing memory so wait for it
     * to finish before killing some other task by mistake.
     *
     * However, if p is the current task, we allow the 'kill' to
     * go ahead if it is exiting: this will simply set TIF_MEMDIE,
     * which will allow it to gain access to memory reserves in
     * the process of exiting and releasing its resources.
     * Otherwise we could get an easy OOM deadlock.
     */
    if (p->flags & PF_EXITING) {
    if (p != current)
    return ERR_PTR(-1UL);
     
    /* current is exiting: force-select it with the maximum score. */
    chosen = p;
    *ppoints = ULONG_MAX;
    }
     
    /* Tasks that opted out of OOM killing are never scored. */
    if (p->signal->oom_adj == OOM_DISABLE)
    continue;
     
    /* Keep the highest-scoring task seen so far. */
    points = badness(p, uptime.tv_sec);
    if (points > *ppoints || !chosen) {
    chosen = p;
    *ppoints = points;
    }
    }
     
    return chosen;
    }
  • 相关阅读:
    demo04-默认标签
    demo03-段落标签
    demo02-标题标签
    demo01-注释标签
    前端基础介绍
    xadmin的详细使用
    设置Linux环境变量中文显示乱码
    ES应用
    HTTP协议
    jboss
  • 原文地址:https://www.cnblogs.com/tsecer/p/10487647.html
Copyright © 2020-2023  润新知