一、用户态地址分配主要路径
用户态所有地址空间的申请主要经过mmap系统调用，也有一些是基于brk系统调用。对于mmap系统调用，它在某些条件下会执行申请空间合理性的判断，而brk则一定进行合理性判断。下面是mmap相关的代码do_mmap_pgoff，其要点在于：如果新分配的空间是私有可写空间，就会进行一次当前系统页面状况的检查，也就是执行security_vm_enough_memory函数：
if (accountable && (!(flags & MAP_NORESERVE) || // 一般用户态申请都是满足这个条件，从而进入下面分支。
sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
if (vm_flags & VM_SHARED) {
/* Check memory availability in shmem_file_setup? */
vm_flags |= VM_ACCOUNT;
} else if (vm_flags & VM_WRITE) {//私有可写空间进行剩余空间检查
/*
* Private writable mapping: check memory availability
*/
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory(charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
}
二、检查时主要代码security_vm_enough_memory
security_vm_enough_memory函数相关的主要代码如下,默认情况下,系统的sysctl_overcommit_memory配置为OVERCOMMIT_GUESS,也就是零:
security_vm_enough_memory-->>cap_vm_enough_memory-->>__vm_enough_memory
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
unsigned long n;
free = global_page_state(NR_FILE_PAGES); // 可写回文件的页面（page cache）
free += nr_swap_pages;
/*
* Any slabs which are created with the
* SLAB_RECLAIM_ACCOUNT flag claim to have contents
* which are reclaimable, under pressure. The dentry
* cache and most inode caches should fall into this
*/
free += global_page_state(NR_SLAB_RECLAIMABLE);//可回收内核中slab结构
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
free -= free / 32;
if (free > pages)
return 0;
/*
* nr_free_pages() is very expensive on large systems,
* only call if we're about to fail.
*/
n = nr_free_pages();//当前空闲页面
/*
* Leave reserved pages. The pages are not for anonymous pages.
*/
if (n <= totalreserve_pages)
goto error;
else
n -= totalreserve_pages;
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
n -= n / 32;
free += n;
if (free > pages)
return 0;
goto error;
}
此时判断逻辑就是判断系统当前 文件页面(即这些页面有对应的文件,可以写回文件从而释放内存页面) + 可回收slab空间(内核结构) + 空闲页面数量(不需要做任何处理可以马上使用的页面)。在注释中说明,nr_free_pages在一些内存(节点)较多的系统中消耗可能会比较大,所以当前两种类型页面数量不能满足时才会判断空闲页面。这里还减少了系统保留页面totalreserve_pages。
三、这些空间的查看:
1、可用空间计算
总大小为1715983个页面
tsecer@harry :cat /proc/vmstat |head -20
nr_free_pages 129372
nr_inactive_anon 214454
nr_active_anon 719778
nr_inactive_file 469531
nr_active_file 425033
nr_unevictable 0
nr_mlock 0
nr_anon_pages 308366
nr_mapped 296513
nr_file_pages 1520481
nr_dirty 124
nr_writeback 0
nr_slab_reclaimable 66130
nr_slab_unreclaimable 5662
nr_page_table_pages 4093
nr_kernel_stack 829
nr_unstable 0
nr_bounce 0
nr_vmscan_write 86855
nr_writeback_temp 0
2、reserve空间的计算
内核中计算代码
/*
* calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
* or min_free_kbytes changes.
*/
static void calculate_totalreserve_pages(void)
{
	struct pglist_data *pgdat;
	unsigned long reserve_pages = 0;
	enum zone_type i, j;

	/* Walk every zone of every online node and accumulate the pages that
	 * must stay free: the largest lowmem_reserve protection plus the
	 * zone's high watermark. */
	for_each_online_pgdat(pgdat) {
		for (i = 0; i < MAX_NR_ZONES; i++) {
			struct zone *zone = pgdat->node_zones + i;
			unsigned long max = 0;

			/* Find valid and maximum lowmem_reserve in the zone */
			for (j = i; j < MAX_NR_ZONES; j++) {
				if (zone->lowmem_reserve[j] > max)
					max = zone->lowmem_reserve[j];
			}

			/* we treat pages_high as reserved pages. */
			max += zone->pages_high;

			/* A zone cannot reserve more pages than it actually has.
			 * (In the /proc/zoneinfo example below, zone DMA takes
			 * this branch.) */
			if (max > zone->present_pages)
				max = zone->present_pages;

			reserve_pages += max;
		}
	}
	totalreserve_pages = reserve_pages;
}
用户态查看这些变量,计算其总大小为 7 + 3833 + 2011 + 4292 + 2292 = 12435
tsecer@harry :cat /proc/zoneinfo
Node 0, zone DMA
pages free 3979
min 5
low 6
high 7
scanned 0
spanned 4096
present 3833
nr_free_pages 3979
nr_inactive_anon 0
nr_active_anon 0
nr_inactive_file 0
nr_active_file 0
nr_unevictable 0
nr_mlock 0
nr_anon_pages 0
nr_mapped 0
nr_file_pages 0
nr_dirty 0
nr_writeback 0
nr_slab_reclaimable 0
nr_slab_unreclaimable 0
nr_page_table_pages 0
nr_kernel_stack 0
nr_unstable 0
nr_bounce 0
nr_vmscan_write 0
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 0
numa_hit 0
numa_miss 0
numa_foreign 0
numa_interleave 0
numa_local 0
numa_other 0
protection: (0, 3768, 8060, 8060)
pagesets
cpu: 0
count: 0
high: 0
batch: 1
vm stats threshold: 4
cpu: 1
count: 0
high: 0
batch: 1
vm stats threshold: 4
all_unreclaimable: 1
prev_priority: 12
start_pfn: 0
inactive_ratio: 1
Node 0, zone DMA32
pages free 73888
min 1341
low 1676
high 2011
scanned 0
spanned 1044480
present 964664
nr_free_pages 73888
nr_inactive_anon 91477
nr_active_anon 250603
nr_inactive_file 247542
nr_active_file 222216
nr_unevictable 0
nr_mlock 0
nr_anon_pages 88383
nr_mapped 138528
nr_file_pages 723455
nr_dirty 54
nr_writeback 0
nr_slab_reclaimable 37898
nr_slab_unreclaimable 2103
nr_page_table_pages 1120
nr_kernel_stack 447
nr_unstable 0
nr_bounce 0
nr_vmscan_write 14094
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 253697
numa_hit 4482015936
numa_miss 0
numa_foreign 0
numa_interleave 0
numa_local 4482015936
numa_other 0
protection: (0, 0, 4292, 4292)
pagesets
cpu: 0
count: 168
high: 186
batch: 31
vm stats threshold: 24
cpu: 1
count: 156
high: 186
batch: 31
vm stats threshold: 24
all_unreclaimable: 0
prev_priority: 12
start_pfn: 4096
inactive_ratio: 5
Node 0, zone Normal
pages free 41986
min 1528
low 1910
high 2292
scanned 0
spanned 1114112
present 1098880
nr_free_pages 41986
nr_inactive_anon 122977
nr_active_anon 469482
nr_inactive_file 230289
nr_active_file 203672
nr_unevictable 0
nr_mlock 0
nr_anon_pages 220252
nr_mapped 158002
nr_file_pages 806178
nr_dirty 233
nr_writeback 0
nr_slab_reclaimable 28248
nr_slab_unreclaimable 3556
nr_page_table_pages 2978
nr_kernel_stack 383
nr_unstable 0
nr_bounce 0
nr_vmscan_write 72761
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 372212
numa_hit 3099991491
numa_miss 0
numa_foreign 0
numa_interleave 8025
numa_local 3099991491
numa_other 0
protection: (0, 0, 0, 0)
pagesets
cpu: 0
count: 82
high: 186
batch: 31
vm stats threshold: 28
cpu: 1
count: 143
high: 186
batch: 31
vm stats threshold: 28
all_unreclaimable: 0
prev_priority: 12
start_pfn: 1048576
inactive_ratio: 6
四、测试下效果
tsecer@harry :cat overcommit.c
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char *argv[])
{
	/*
	 * Usage: a.out <pages> [loops]
	 *
	 * Attempts malloc(<pages> * 4KiB) [loops] times (default: once) and
	 * prints the returned address — "(nil)" on failure — to probe the
	 * kernel's overcommit threshold.
	 */
	if (argc < 2)
	{
		/* Bug fix: the original dereferenced argv[1] unconditionally,
		 * crashing (NULL deref) when run with no arguments. */
		fprintf(stderr, "usage: %s <pages> [loops]\n", argv[0]);
		return 1;
	}

	int iloop = 1;
	if (argc >= 3)
	{
		iloop = atoi(argv[2]);
	}

	/* The size is loop-invariant, so compute it once. 0x1000 = 4KiB page. */
	size_t len = (size_t)atol(argv[1]) * 0x1000;
	for (int i = 0; i < iloop; i++)
	{
		void *pm = malloc(len);
		/* Fixes vs. original: the "\n" escape was lost in transcription
		 * (string literal split across two lines), and %ld mismatched
		 * size_t — %zu is the correct specifier. */
		printf("len %zu addr %p\n", len, pm);
	}
	return 0;
}
tsecer@harry :g++ overcommit.c
tsecer@harry :cat /proc/vmstat | egrep "nr_file_pages|nr_slab_reclaimable|nr_free_pages" | awk '{sum += $2} END {print sum}'
1655451
tsecer@harry :./a.out $((1655557 - 11435))
len 6734323712 addr (nil)
tsecer@harry :./a.out $((1655557 - 12435))
len 6730227712 addr (nil)
tsecer@harry :./a.out $((1655557 - 13435))
len 6726131712 addr 0x7fc7d335b010
tsecer@harry :./a.out $((1655557 - 13435)) 100 |head 这里的现象是只要没有超过这个阈值,虚拟空间就可以一直申请。
len 6726131712 addr 0x7fc76f8cb010
len 6726131712 addr 0x7fc5dea40010
len 6726131712 addr 0x7fc44dbb5010
len 6726131712 addr 0x7fc2bcd2a010
len 6726131712 addr 0x7fc12be9f010
len 6726131712 addr 0x7fbf9b014010
len 6726131712 addr 0x7fbe0a189010
len 6726131712 addr 0x7fbc792fe010
len 6726131712 addr 0x7fbae8473010
len 6726131712 addr 0x7fb9575e8010
五、oomkiller
out_of_memory-->>__out_of_memory
tsecer@harry :cat /proc/sys/vm/oom_kill_allocating_task
0
/*
* Must be called with tasklist_lock held for read.
*/
static void __out_of_memory(gfp_t gfp_mask, int order)
{
struct task_struct *p;
unsigned long points;
if (sysctl_oom_kill_allocating_task)
if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
"Out of memory (oom_kill_allocating_task)"))
return;
/*
* Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have.
*/
p = select_bad_process(&points, NULL);
这个select_bad_process的选择里面代码大部分是工程上的一些微调,主要原则就是优先杀死虚拟内存占用量最高的进程,通常也就是我们的主逻辑进程,下面是选择代码,备份在这里便于查阅:
/**
* badness - calculate a numeric value for how bad this task has been
* @p: task struct of which task we should calculate
* @uptime: current uptime in seconds
*
* The formula used is relatively simple and documented inline in the
* function. The main rationale is that we want to select a good task
* to kill when we run out of memory.
*
* Good in this context means that:
* 1) we lose the minimum amount of work done
* 2) we recover a large amount of memory
* 3) we don't kill anything innocent of eating tons of memory
* 4) we want to kill the minimum amount of processes (one)
* 5) we try to kill the process the user expects us to kill, this
* algorithm has been meticulously tuned to meet the principle
* of least surprise ... (be careful when you change it)
*/
/*
 * Compute the OOM "badness" score for task @p; the task with the highest
 * score is the kill candidate. @uptime is the current uptime in seconds.
 * The base score is the task's total virtual memory, then adjusted by
 * children's usage, CPU/run time, nice value, capabilities and oom_adj.
 */
unsigned long badness(struct task_struct *p, unsigned long uptime)
{
	unsigned long points, cpu_time, run_time;
	struct mm_struct *mm;
	struct task_struct *child;
	int oom_adj = p->signal->oom_adj;
	struct task_cputime task_time;
	unsigned long utime;
	unsigned long stime;

	/* Tasks with oom_adj == OOM_DISABLE are never kill candidates. */
	if (oom_adj == OOM_DISABLE)
		return 0;

	task_lock(p);
	mm = p->mm;
	if (!mm) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The memory size of the process is the basis for the badness.
	 */
	points = mm->total_vm;

	/*
	 * After this unlock we can no longer dereference local variable `mm'
	 */
	task_unlock(p);

	/*
	 * swapoff can easily use up all memory, so kill those first.
	 */
	if (p->flags & PF_OOM_ORIGIN)
		return ULONG_MAX;

	/*
	 * Processes which fork a lot of child processes are likely
	 * a good choice. We add half the vmsize of the children if they
	 * have an own mm. This prevents forking servers to flood the
	 * machine with an endless amount of children. In case a single
	 * child is eating the vast majority of memory, adding only half
	 * to the parents will make the child our kill candidate of choice.
	 */
	list_for_each_entry(child, &p->children, sibling) {
		task_lock(child);
		if (child->mm != mm && child->mm)
			points += child->mm->total_vm/2 + 1;
		task_unlock(child);
	}

	/*
	 * CPU time is in tens of seconds and run time is in thousands
	 * of seconds. There is no particular reason for this other than
	 * that it turned out to work very well in practice.
	 */
	thread_group_cputime(p, &task_time);
	utime = cputime_to_jiffies(task_time.utime);
	stime = cputime_to_jiffies(task_time.stime);
	cpu_time = (utime + stime) >> (SHIFT_HZ + 3);

	if (uptime >= p->start_time.tv_sec)
		run_time = (uptime - p->start_time.tv_sec) >> 10;
	else
		run_time = 0;

	/* CPU-heavy / long-running tasks get reduced scores: killing them
	 * would discard more completed work. */
	if (cpu_time)
		points /= int_sqrt(cpu_time);
	if (run_time)
		points /= int_sqrt(int_sqrt(run_time));

	/*
	 * Niced processes are most likely less important, so double
	 * their badness points.
	 */
	if (task_nice(p) > 0)
		points *= 2;

	/*
	 * Superuser processes are usually more important, so we make it
	 * less likely that we kill those.
	 */
	if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
	    has_capability_noaudit(p, CAP_SYS_RESOURCE))
		points /= 4;

	/*
	 * We don't want to kill a process with direct hardware access.
	 * Not only could that mess up the hardware, but usually users
	 * tend to only have this flag set on applications they think
	 * of as important.
	 */
	if (has_capability_noaudit(p, CAP_SYS_RAWIO))
		points /= 4;

	/*
	 * If p's nodes don't overlap ours, it may still help to kill p
	 * because p may have allocated or otherwise mapped memory on
	 * this node before. However it will be less likely.
	 */
	if (!has_intersects_mems_allowed(p))
		points /= 8;

	/*
	 * Adjust the score by oom_adj.
	 */
	if (oom_adj) {
		if (oom_adj > 0) {
			if (!points)
				points = 1;
			points <<= oom_adj;
		} else
			points >>= -(oom_adj);
	}

#ifdef DEBUG
	/* Fix: the "\n" escape was lost when this code was transcribed,
	 * splitting the format string across two lines. */
	printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
		p->pid, p->comm, points);
#endif
	return points;
}
/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. We expect the caller will lock the tasklist.
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
static struct task_struct *select_bad_process(unsigned long *ppoints,
						struct mem_cgroup *mem)
{
	struct task_struct *p;
	struct task_struct *chosen = NULL;
	struct timespec uptime;

	*ppoints = 0;
	do_posix_clock_monotonic_gettime(&uptime);

	/* Scan every process and pick the one with the highest badness(). */
	for_each_process(p) {
		unsigned long points;

		/*
		 * skip kernel threads and tasks which have already released
		 * their mm.
		 */
		if (!p->mm)
			continue;
		/* skip the init task */
		if (is_global_init(p))
			continue;
		/* When scoped to a memory cgroup, only consider its members. */
		if (mem && !task_in_mem_cgroup(p, mem))
			continue;

		/*
		 * This task already has access to memory reserves and is
		 * being killed. Don't allow any other task access to the
		 * memory reserve.
		 *
		 * Note: this may have a chance of deadlock if it gets
		 * blocked waiting for another task which itself is waiting
		 * for memory. Is there a better alternative?
		 */
		if (test_tsk_thread_flag(p, TIF_MEMDIE))
			return ERR_PTR(-1UL);

		/*
		 * This is in the process of releasing memory so wait for it
		 * to finish before killing some other task by mistake.
		 *
		 * However, if p is the current task, we allow the 'kill' to
		 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
		 * which will allow it to gain access to memory reserves in
		 * the process of exiting and releasing its resources.
		 * Otherwise we could get an easy OOM deadlock.
		 */
		if (p->flags & PF_EXITING) {
			if (p != current)
				return ERR_PTR(-1UL);

			chosen = p;
			*ppoints = ULONG_MAX;
		}

		/* Tasks explicitly exempted via oom_adj are never chosen. */
		if (p->signal->oom_adj == OOM_DISABLE)
			continue;

		points = badness(p, uptime.tv_sec);
		if (points > *ppoints || !chosen) {
			chosen = p;
			*ppoints = points;
		}
	}

	return chosen;
}