一、用户态地址分配主要路径
用户态所有地址空间的申请主要经过mmap系统调用，也有一些是基于brk系统调用。对于mmap系统调用，它在某些条件下会执行申请空间合理性的判断，而brk则一定进行合理性判断。下面是mmap相关的代码do_mmap_pgoff，其要点在于：如果新分配的空间是私有可写空间，就会进行一次当前系统页面状况的检查，也就是执行security_vm_enough_memory函数：
if (accountable && (!(flags & MAP_NORESERVE) || // 一般用户态申请都是满足这个条件，从而进入下面分支。
sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
if (vm_flags & VM_SHARED) {
/* Check memory availability in shmem_file_setup? */
vm_flags |= VM_ACCOUNT;
} else if (vm_flags & VM_WRITE) {//私有可写空间进行剩余空间检查
/*
* Private writable mapping: check memory availability
*/
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory(charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
}
二、检查时主要代码security_vm_enough_memory
security_vm_enough_memory函数相关的主要代码如下,默认情况下,系统的sysctl_overcommit_memory配置为OVERCOMMIT_GUESS,也就是零:
security_vm_enough_memory-->>cap_vm_enough_memory-->>__vm_enough_memory
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
unsigned long n;
free = global_page_state(NR_FILE_PAGES); // 可写回文件的页面（page cache）
free += nr_swap_pages;
/*
* Any slabs which are created with the
* SLAB_RECLAIM_ACCOUNT flag claim to have contents
* which are reclaimable, under pressure. The dentry
* cache and most inode caches should fall into this
*/
free += global_page_state(NR_SLAB_RECLAIMABLE);//可回收内核中slab结构
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
free -= free / 32;
if (free > pages)
return 0;
/*
* nr_free_pages() is very expensive on large systems,
* only call if we're about to fail.
*/
n = nr_free_pages();//当前空闲页面
/*
* Leave reserved pages. The pages are not for anonymous pages.
*/
if (n <= totalreserve_pages)
goto error;
else
n -= totalreserve_pages;
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
n -= n / 32;
free += n;
if (free > pages)
return 0;
goto error;
}
此时判断逻辑就是判断系统当前 文件页面(即这些页面有对应的文件,可以写回文件从而释放内存页面) + 可回收slab空间(内核结构) + 空闲页面数量(不需要做任何处理可以马上使用的页面)。在注释中说明,nr_free_pages在一些内存(节点)较多的系统中消耗可能会比较大,所以当前两种类型页面数量不能满足时才会判断空闲页面。这里还减少了系统保留页面totalreserve_pages。
三、这些空间的查看:
1、可用空间计算
总大小为1715983个页面
tsecer@harry :cat /proc/vmstat |head -20
nr_free_pages 129372
nr_inactive_anon 214454
nr_active_anon 719778
nr_inactive_file 469531
nr_active_file 425033
nr_unevictable 0
nr_mlock 0
nr_anon_pages 308366
nr_mapped 296513
nr_file_pages 1520481
nr_dirty 124
nr_writeback 0
nr_slab_reclaimable 66130
nr_slab_unreclaimable 5662
nr_page_table_pages 4093
nr_kernel_stack 829
nr_unstable 0
nr_bounce 0
nr_vmscan_write 86855
nr_writeback_temp 0
2、reserve空间的计算
内核中计算代码
/*
* calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
* or min_free_kbytes changes.
*/
static void calculate_totalreserve_pages(void)
{
	struct pglist_data *pgdat;
	unsigned long reserve_pages = 0;
	enum zone_type i, j;

	/* Walk every zone of every online node and accumulate the pages that
	 * must stay free: the largest lowmem_reserve protection plus the
	 * zone's high watermark. */
	for_each_online_pgdat(pgdat) {
		for (i = 0; i < MAX_NR_ZONES; i++) {
			struct zone *zone = pgdat->node_zones + i;
			unsigned long max = 0;

			/* Find valid and maximum lowmem_reserve in the zone */
			for (j = i; j < MAX_NR_ZONES; j++) {
				if (zone->lowmem_reserve[j] > max)
					max = zone->lowmem_reserve[j];
			}

			/* we treat pages_high as reserved pages. */
			max += zone->pages_high;

			/* A zone cannot reserve more pages than it actually has.
			 * (In the /proc/zoneinfo example below, zone DMA takes
			 * this branch.) */
			if (max > zone->present_pages)
				max = zone->present_pages;

			reserve_pages += max;
		}
	}
	totalreserve_pages = reserve_pages;
}
用户态查看这些变量,计算其总大小为 7 + 3833 + 2011 + 4292 + 2292 = 12435
tsecer@harry :cat /proc/zoneinfo
Node 0, zone DMA
pages free 3979
min 5
low 6
high 7
scanned 0
spanned 4096
present 3833
nr_free_pages 3979
nr_inactive_anon 0
nr_active_anon 0
nr_inactive_file 0
nr_active_file 0
nr_unevictable 0
nr_mlock 0
nr_anon_pages 0
nr_mapped 0
nr_file_pages 0
nr_dirty 0
nr_writeback 0
nr_slab_reclaimable 0
nr_slab_unreclaimable 0
nr_page_table_pages 0
nr_kernel_stack 0
nr_unstable 0
nr_bounce 0
nr_vmscan_write 0
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 0
numa_hit 0
numa_miss 0
numa_foreign 0
numa_interleave 0
numa_local 0
numa_other 0
protection: (0, 3768, 8060, 8060)
pagesets
cpu: 0
count: 0
high: 0
batch: 1
vm stats threshold: 4
cpu: 1
count: 0
high: 0
batch: 1
vm stats threshold: 4
all_unreclaimable: 1
prev_priority: 12
start_pfn: 0
inactive_ratio: 1
Node 0, zone DMA32
pages free 73888
min 1341
low 1676
high 2011
scanned 0
spanned 1044480
present 964664
nr_free_pages 73888
nr_inactive_anon 91477
nr_active_anon 250603
nr_inactive_file 247542
nr_active_file 222216
nr_unevictable 0
nr_mlock 0
nr_anon_pages 88383
nr_mapped 138528
nr_file_pages 723455
nr_dirty 54
nr_writeback 0
nr_slab_reclaimable 37898
nr_slab_unreclaimable 2103
nr_page_table_pages 1120
nr_kernel_stack 447
nr_unstable 0
nr_bounce 0
nr_vmscan_write 14094
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 253697
numa_hit 4482015936
numa_miss 0
numa_foreign 0
numa_interleave 0
numa_local 4482015936
numa_other 0
protection: (0, 0, 4292, 4292)
pagesets
cpu: 0
count: 168
high: 186
batch: 31
vm stats threshold: 24
cpu: 1
count: 156
high: 186
batch: 31
vm stats threshold: 24
all_unreclaimable: 0
prev_priority: 12
start_pfn: 4096
inactive_ratio: 5
Node 0, zone Normal
pages free 41986
min 1528
low 1910
high 2292
scanned 0
spanned 1114112
present 1098880
nr_free_pages 41986
nr_inactive_anon 122977
nr_active_anon 469482
nr_inactive_file 230289
nr_active_file 203672
nr_unevictable 0
nr_mlock 0
nr_anon_pages 220252
nr_mapped 158002
nr_file_pages 806178
nr_dirty 233
nr_writeback 0
nr_slab_reclaimable 28248
nr_slab_unreclaimable 3556
nr_page_table_pages 2978
nr_kernel_stack 383
nr_unstable 0
nr_bounce 0
nr_vmscan_write 72761
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 372212
numa_hit 3099991491
numa_miss 0
numa_foreign 0
numa_interleave 8025
numa_local 3099991491
numa_other 0
protection: (0, 0, 0, 0)
pagesets
cpu: 0
count: 82
high: 186
batch: 31
vm stats threshold: 28
cpu: 1
count: 143
high: 186
batch: 31
vm stats threshold: 28
all_unreclaimable: 0
prev_priority: 12
start_pfn: 1048576
inactive_ratio: 6
四、测试下效果
tsecer@harry :cat overcommit.c
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char *argv[])
{
	/*
	 * Usage: a.out <pages> [loops]
	 *
	 * Attempts malloc(<pages> * 4KiB) [loops] times (default: once) and
	 * prints the returned address — "(nil)" on failure — to probe the
	 * kernel's overcommit threshold.
	 */
	if (argc < 2)
	{
		/* Bug fix: the original dereferenced argv[1] unconditionally,
		 * crashing (NULL deref) when run with no arguments. */
		fprintf(stderr, "usage: %s <pages> [loops]\n", argv[0]);
		return 1;
	}

	int iloop = 1;
	if (argc >= 3)
	{
		iloop = atoi(argv[2]);
	}

	/* The size is loop-invariant, so compute it once. 0x1000 = 4KiB page. */
	size_t len = (size_t)atol(argv[1]) * 0x1000;
	for (int i = 0; i < iloop; i++)
	{
		void *pm = malloc(len);
		/* Fixes vs. original: the "\n" escape was lost in transcription
		 * (string literal split across two lines), and %ld mismatched
		 * size_t — %zu is the correct specifier. */
		printf("len %zu addr %p\n", len, pm);
	}
	return 0;
}
tsecer@harry :g++ overcommit.c
tsecer@harry :cat /proc/vmstat | egrep "nr_file_pages|nr_slab_reclaimable|nr_free_pages" | awk '{sum += $2} END {print sum}'
1655451
tsecer@harry :./a.out $((1655557 - 11435))
len 6734323712 addr (nil)
tsecer@harry :./a.out $((1655557 - 12435))
len 6730227712 addr (nil)
tsecer@harry :./a.out $((1655557 - 13435))
len 6726131712 addr 0x7fc7d335b010
tsecer@harry :./a.out $((1655557 - 13435)) 100 |head 这里的现象是只要没有超过这个阈值,虚拟空间就可以一直申请。
len 6726131712 addr 0x7fc76f8cb010
len 6726131712 addr 0x7fc5dea40010
len 6726131712 addr 0x7fc44dbb5010
len 6726131712 addr 0x7fc2bcd2a010
len 6726131712 addr 0x7fc12be9f010
len 6726131712 addr 0x7fbf9b014010
len 6726131712 addr 0x7fbe0a189010
len 6726131712 addr 0x7fbc792fe010
len 6726131712 addr 0x7fbae8473010
len 6726131712 addr 0x7fb9575e8010
五、oomkiller
out_of_memory-->>__out_of_memory
tsecer@harry :cat /proc/sys/vm/oom_kill_allocating_task
0
/*
* Must be called with tasklist_lock held for read.
*/
static void __out_of_memory(gfp_t gfp_mask, int order)
{
struct task_struct *p;
unsigned long points;
if (sysctl_oom_kill_allocating_task)
if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
"Out of memory (oom_kill_allocating_task)"))
return;
/*
* Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have.
*/
p = select_bad_process(&points, NULL);
这个select_bad_process的选择里面代码大部分是工程上的一些微调,主要原则就是优先杀死虚拟内存占用量最高的进程,通常也就是我们的主逻辑进程,下面是选择代码,备份在这里便于查阅:
/**
* badness - calculate a numeric value for how bad this task has been
* @p: task struct of which task we should calculate
* @uptime: current uptime in seconds
*
* The formula used is relatively simple and documented inline in the
* function. The main rationale is that we want to select a good task
* to kill when we run out of memory.
*
* Good in this context means that:
* 1) we lose the minimum amount of work done
* 2) we recover a large amount of memory
* 3) we don't kill anything innocent of eating tons of memory
* 4) we want to kill the minimum amount of processes (one)
* 5) we try to kill the process the user expects us to kill, this
* algorithm has been meticulously tuned to meet the principle
* of least surprise ... (be careful when you change it)
*/
/*
 * Compute the OOM "badness" score for task @p; the task with the highest
 * score is the kill candidate. @uptime is the current uptime in seconds.
 * The base score is the task's total virtual memory, then adjusted by
 * children's usage, CPU/run time, nice value, capabilities and oom_adj.
 */
unsigned long badness(struct task_struct *p, unsigned long uptime)
{
	unsigned long points, cpu_time, run_time;
	struct mm_struct *mm;
	struct task_struct *child;
	int oom_adj = p->signal->oom_adj;
	struct task_cputime task_time;
	unsigned long utime;
	unsigned long stime;

	/* Tasks with oom_adj == OOM_DISABLE are never kill candidates. */
	if (oom_adj == OOM_DISABLE)
		return 0;

	task_lock(p);
	mm = p->mm;
	if (!mm) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The memory size of the process is the basis for the badness.
	 */
	points = mm->total_vm;

	/*
	 * After this unlock we can no longer dereference local variable `mm'
	 */
	task_unlock(p);

	/*
	 * swapoff can easily use up all memory, so kill those first.
	 */
	if (p->flags & PF_OOM_ORIGIN)
		return ULONG_MAX;

	/*
	 * Processes which fork a lot of child processes are likely
	 * a good choice. We add half the vmsize of the children if they
	 * have an own mm. This prevents forking servers to flood the
	 * machine with an endless amount of children. In case a single
	 * child is eating the vast majority of memory, adding only half
	 * to the parents will make the child our kill candidate of choice.
	 */
	list_for_each_entry(child, &p->children, sibling) {
		task_lock(child);
		if (child->mm != mm && child->mm)
			points += child->mm->total_vm/2 + 1;
		task_unlock(child);
	}

	/*
	 * CPU time is in tens of seconds and run time is in thousands
	 * of seconds. There is no particular reason for this other than
	 * that it turned out to work very well in practice.
	 */
	thread_group_cputime(p, &task_time);
	utime = cputime_to_jiffies(task_time.utime);
	stime = cputime_to_jiffies(task_time.stime);
	cpu_time = (utime + stime) >> (SHIFT_HZ + 3);

	if (uptime >= p->start_time.tv_sec)
		run_time = (uptime - p->start_time.tv_sec) >> 10;
	else
		run_time = 0;

	/* CPU-heavy / long-running tasks get reduced scores: killing them
	 * would discard more completed work. */
	if (cpu_time)
		points /= int_sqrt(cpu_time);
	if (run_time)
		points /= int_sqrt(int_sqrt(run_time));

	/*
	 * Niced processes are most likely less important, so double
	 * their badness points.
	 */
	if (task_nice(p) > 0)
		points *= 2;

	/*
	 * Superuser processes are usually more important, so we make it
	 * less likely that we kill those.
	 */
	if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
	    has_capability_noaudit(p, CAP_SYS_RESOURCE))
		points /= 4;

	/*
	 * We don't want to kill a process with direct hardware access.
	 * Not only could that mess up the hardware, but usually users
	 * tend to only have this flag set on applications they think
	 * of as important.
	 */
	if (has_capability_noaudit(p, CAP_SYS_RAWIO))
		points /= 4;

	/*
	 * If p's nodes don't overlap ours, it may still help to kill p
	 * because p may have allocated or otherwise mapped memory on
	 * this node before. However it will be less likely.
	 */
	if (!has_intersects_mems_allowed(p))
		points /= 8;

	/*
	 * Adjust the score by oom_adj.
	 */
	if (oom_adj) {
		if (oom_adj > 0) {
			if (!points)
				points = 1;
			points <<= oom_adj;
		} else
			points >>= -(oom_adj);
	}

#ifdef DEBUG
	/* Fix: the "\n" escape was lost when this code was transcribed,
	 * splitting the format string across two lines. */
	printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
		p->pid, p->comm, points);
#endif
	return points;
}
/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. We expect the caller will lock the tasklist.
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
static struct task_struct *select_bad_process(unsigned long *ppoints,
						struct mem_cgroup *mem)
{
	struct task_struct *p;
	struct task_struct *chosen = NULL;
	struct timespec uptime;

	*ppoints = 0;
	do_posix_clock_monotonic_gettime(&uptime);

	/* Scan every process and pick the one with the highest badness(). */
	for_each_process(p) {
		unsigned long points;

		/*
		 * skip kernel threads and tasks which have already released
		 * their mm.
		 */
		if (!p->mm)
			continue;
		/* skip the init task */
		if (is_global_init(p))
			continue;
		/* When scoped to a memory cgroup, only consider its members. */
		if (mem && !task_in_mem_cgroup(p, mem))
			continue;

		/*
		 * This task already has access to memory reserves and is
		 * being killed. Don't allow any other task access to the
		 * memory reserve.
		 *
		 * Note: this may have a chance of deadlock if it gets
		 * blocked waiting for another task which itself is waiting
		 * for memory. Is there a better alternative?
		 */
		if (test_tsk_thread_flag(p, TIF_MEMDIE))
			return ERR_PTR(-1UL);

		/*
		 * This is in the process of releasing memory so wait for it
		 * to finish before killing some other task by mistake.
		 *
		 * However, if p is the current task, we allow the 'kill' to
		 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
		 * which will allow it to gain access to memory reserves in
		 * the process of exiting and releasing its resources.
		 * Otherwise we could get an easy OOM deadlock.
		 */
		if (p->flags & PF_EXITING) {
			if (p != current)
				return ERR_PTR(-1UL);

			chosen = p;
			*ppoints = ULONG_MAX;
		}

		/* Tasks explicitly exempted via oom_adj are never chosen. */
		if (p->signal->oom_adj == OOM_DISABLE)
			continue;

		points = badness(p, uptime.tv_sec);
		if (points > *ppoints || !chosen) {
			chosen = p;
			*ppoints = points;
		}
	}

	return chosen;
}