至此,内存初始化部分已看完,遗留问题:
1、对于unicore或者mips的页表建立都很清楚,但是对于ARM我不清楚:
初始化部分涉及的页表映射建立,我都以unicore架构为准,ARM的页表映射从原理上讲easy,问题在于ARM的页表中没有引入Dirty、Accessed位,因此,对于如何在基于ARM架构的Linux系统上实现页回收就有些疑问,上次和同学看下代码,ARM使用了软件的方法解决了该问题,但是具体方法自己并不清楚. 当然对于新的ARM架构可能在页表项上已支持Dirty、Accessed位。
2、kswapd的原理、实现?
3、对于zone->free_area的free_list的链表中的页的添加顺序(tail or head),以及在其它的缓存机制中加入链表时——加入链表尾和链表头的区别?冷页、热页?
4、ZONE_MOVABLE区,希望关于这个区是我自己看错了,或者是对于嵌入式系统该区就没有利用,因为在我跟踪内存管理初始化的过程中,该区基本没用,但确实存在该区:为什么要引入ZONE_MOVABLE、有什么用?
5、zone:lowmem_reserve有什么用?
6、我回避了文件系统初始化部分,以及init_post的实现过程,我暂时不想把它们混入内存初始化部分。
7、etc.
kmem_cache_init_late之后的代码至kswapd_init
start_kernel()
|-->page_address_init() | |-->setup_arch(&command_line); | |-->setup_per_cpu_areas(); | |-->build_all_zonelists() | |-->page_alloc_init() | |-->pidhash_init() | |-->vfs_caches_init_early() | |-->mm_init() | |-->....... | |-->init_IRQ() | |-->...... | |-->gfp_allowed_mask = __GFP_BITS_MASK; | |-->kmem_cache_init_late(); | |-->...... | |-->setup_per_cpu_pageset(); | 各个CPU申请内存时,如果需要获取页,则从各个zone中自己 | per_cpu_pageset获取(zone->pageset[]), 此处完成初始化. | |-->...... | |-->anon_vma_init(); |-->anon_vma_cachep = kmem_cache_create("anon_vma",
| sizeof(struct anon_vma), | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); | |-->anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); | 即anon_vma_chain_cachep = kmem_cache_create("anon_vma_chain", | sizeof(struct anon_vma_chain), | __alignof__(struct anon_vma_chain), | SLAB_PANIC, NULL); | |-->fork_init(totalram_pages); | |-->proc_caches_init(); | |-->...... | |-->buffer_init(); | 没看 |-->...... | |-->vfs_caches_init(totalram_pages); | 虚拟文件系统初始化 | |-->...... | |-->rest_init();
void setup_per_cpu_pageset(void) |-->struct zone *zone = NULL; | int cpu = 0; | |--for_each_populated_zone(zone) |--{ | 遍历所有的zone(zone->present_pages需不为0,注意ZONE_MOVABLE) | | zone->pageset = alloc_percpu(struct per_cpu_pageset); | 原来的pageset职责是由全局的boot_pageset变量担当的,现在进行重新申请. | 关于alloc_percpu的percpu资源初始化是在setup_per_cpu_areas中完成的, | 这是通过bootmem完成的资源分配. | | for_each_possible_cpu(cpu) | { | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | setup_pageset(pcp, zone_batchsize(zone)); | | 此处我们可以关注下对于boot_pageset的初始化: | setup_pageset(&per_cpu(boot_pageset, cpu), 0);最后的参数是0 | | if(percpu_pagelist_fraction) setup_pagelist_highmark(pcp, | (zone->present_pages / percpu_pagelist_fraction));
| 关于percpu_pagelist_fraction是通过proc来配置的,此处我们可以认为 | percpu_pagelist_fraction是0 | | } |--}
void fork_init(unsigned long mempages)
|-->task_struct_cachep = kmem_cache_create("task_struct",
| sizeof(struct task_struct), | ARCH_MIN_TASKALIGN, SLAB_PANIC|SLAB_NOTRACK, NULL); | |-->max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); | if(max_threads < 20) max_threads = 20; | |-->init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; | init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; | init_task.signal->rlim[RLIMIT_SIGPENDING] = | init_task.signal->rlim[RLIMIT_NPROC];
void proc_caches_init(void) |-->sighand_cachep = kmem_cache_create("sighand_cache", | sizeof(struct sighand_struct), 0, | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| | SLAB_NOTRACK, sighand_ctor); | |-->signal_cachep = kmem_cache_create("signal_cache", | sizeof(struct signal_struct), 0, | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | |-->files_cachep = kmem_cache_create("files_cache", | sizeof(struct files_struct), 0, | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | |-->fs_cachep = kmem_cache_create("fs_cache", | sizeof(struct fs_struct), 0, | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | |-->mm_cachep = kmem_cache_create("mm_struct", | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | |-->vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); | 即kmem_cache_create("vm_area_struct", | sizeof(struct vm_area_struct),
| __alignof__(struct vm_area_struct), SLAB_PANIC, NULL); |-->mmap_init();
void mmap_init(void) |-->int ret; | ret = percpu_counter_init(&vm_committed_as, 0); | |-->__percpu_counter_init(&vm_committed_as, 0); int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, struct lock_class_key *key) |-->spin_lock_init(&fbc->lock); |-->fbc->count = amount; |-->fbc->counters = alloc_percpu(s32); |-->return 0;
void rest_init(void)
|-->kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); | |-->pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); | |-->kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); | |-->cpu_idle();
int kernel_init(void *unused) |-->smp_prepare_cpus(setup_max_cpus); | |-->wakeup_secondary(); | |-->smp_init() | |-->cpu_up(cpu) | | |-->_cpu_up(cpu, 0) | | | |-->__cpu_up(cpu); | | | |-->boot_secondary(cpu, idle) | |-->do_basic_setup(); |-->...... | 并不意味这不重要,此处先回避相应的函数 | |-->do_initcalls(); | 对于这个函数,我们主要关注下: | rootfs_initcall(populate_rootfs); | module_init(init_per_zone_wmark_min); | module_init(kswapd_init); | | |-->...... | |-->init_post();
int init_post(void) |-->free_initmem() |-->totalram_pages += free_area(__phys_to_pfn(__pa(__init_begin)), | __phys_to_pfn(__pa(__init_end)), | "init"); | 释放内核初始化相关部分的空间. | |-->system_state = SYSTEM_RUNNING; | |-->run_init_process(...)
我们此处重点关注: rootfs_initcall(populate_rootfs); module_init(init_per_zone_wmark_min); module_init(kswapd_init). 为什么? 1、populate_rootfs因为不知对于__initramfs_start ~ __initramfs_end的处理, 所以看下,当然不会太深入,这可是文件系统啊; 2、init_per_zone_wmark_min涉及zone->watermark[]初始化; 3、kswapd_init页回收机制初始化(这个没有深入,我想看会资料后,再详细做下记录). 直接编译进内核: #define __define_initcall(level, fn, id) static initcall_t __initcall_##fn##id __used __attribute__((__section__(".initcall" level ".init"))) = fn #define rootfs_initcall(fn) __define_initcall("rootfs", fn, rootfs) #define device_initcall(fn) __define_initcall("6", fn, 6) #define __initcall(fn) device_initcall(fn) #define module_init(x) __initcall(x) 链接脚本中的__early_initcall_end和__initcall_end void do_initcalls(void) |-->initcall_t *fn; |-->for(fn = __early_initcall_end; fn < __initcall_end; fn++) | do_one_initcall(*fn); | |-->flush_scheduled_work();
int populate_rootfs(void) |-->char *err = unpack_to_rootfs(__initramfs_start, | __initramfs_end - __initramfs_start); | 关注下usr目录下的Makefile就明白了. | |-->if(initrd_start) |--{ | err = unpack_to_rootfs((char*)initrd_start, | initrd_end - initrd_start); | 此处没有深入下去,文件系统以后解决. | | if(!err){free_initrd(); return 0;} | else ..... | ...... |--} | return 0; 这个没有深入,我想看会资料后,再详细做下记录 int kswapd_init(void) |-->swap_setup(); |-->kswapd_run(0); |-->contig_page_data.kswapd = kthread_run(kswapd,
| &contig_page_data, "kswapd0");
int init_per_zone_wmark_min(void) |-->unsigned long lowmem_kbytes; | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); | nr_free_buffer_pages()获取ZONE_DMA和ZONE_NORMAL区的页数. | 当然,如果没有ZONE_DMA,则只获取ZONE_NORMAL区的页数 | |-->min_free_kbytes = int_sqrt(lowmem_kbytes * 16); | |-->if(min_free_kbytes < 128) min_free_kbytes = 128; |-->if(min_free_kbytes > 65536) min_free_kbytes = 65536; | |-->setup_per_zone_wmarks(); |-->setup_per_zone_lowmem_reserve(); |-->setup_per_zone_inactive_ratio(); |-->return 0;
关于该函数的理解,可能有误: 每个zone都可能被耗尽,为了解决这种情况, 就在比自己低阶的zone中,把自身的一部分内存保存在低阶的 zone->lowmem_reserve[]中,以备自身被耗尽. void setup_per_zone_lowmem_reserve(void) |-->struct pglist_data *pgdat; | enum zone_type j, idx; | |-->考虑UMA |-->pgdat = &contig_page_data; |-->for(j = 0; j < MAX_NR_ZONES; j++) |--{ | struct zone *zone = pgdat->node_zones + j; | unsigned long present_pages = zone->present_pages; | zone->lowmem_reserve[j] = 0; | idx = j; | while(idx) | { | struct zone *lower_zone; | idx--; | if(sysctl_lowmem_reserve_ratio[idx] < 1) | sysctl_lowmem_reserve_ratio[idx] = 1; | | lower_zone = pgdat->node_zones + idx; | lower_zone->lowmem_reserve[j] = present_pages / | sysctl_lowmem_reserve_ratio[idx]; | present_pages += lower_zone->present_pages; | } |--} | |-->calculate_totalreserve_pages();
void setup_per_zone_inactive_ratio(void) |-->for_each_zone(zone) |-->calculate_zone_inactive_ratio(zone);
void calculate_zone_inactive_ratio(struct zone *zone) |-->unsigned int gb, ratio; | |-->gb = zone->present_pages >> (30 - PAGE_SHIFT); | |-->if(gb) ratio = int_sqrt(10 * gb); | else ratio = 1; | |-->zone->inactive_ratio = ratio;
void setup_per_zone_wmarks(void) |-->unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 获取min_free_kbytes所对应的页数. | |-->unsigned long lowmem_pages = 0; | struct zone *zone = NULL; | unsigned long flags = 0; | |-->for_each_zone(zone) |--{ | if(!is_highmem(zone)) lowmem_pages += zone->present_pages; | 如注释所言,获取非ZONE_HIGHMEM的zone中的present_pages总和. |--} | |--for_each_zone(zone) |--{ | u64 tmp; | tmp = (u64)pages_min * zone->present_pages; | do_div(tmp, lowmem_pages); //tmp = tmp / lowmem_pages | 对于以上两句,以通常的数学意义容易理解: | pages_min * (zone->present_pages / lowmem_pages); | if(is_highmem(zone)) | { | 对于ZONE_HIGHMEM做特殊处理. | int min_pages = zone->present_pages / 1024; | if(min_pages < SWAP_CLUSTER_MAX) | min_pages = SWAP_CLUSTER_MAX; | if(min_pages > 128) | min_pages = 128; | zone->watermark[WMARK_MIN] = min_pages; | } | else | zone->watermark[WMARK_MIN] = tmp; | | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | setup_zone_migrate_reserve(zone); |--} | |--calculate_totalreserve_pages(); |
void setup_zone_migrate_reserve(struct zone *zone) |-->unsigned long start_pfn, pfn, end_pfn; | struct page *page; | unsigned long block_migratetype; | int reserve; | |-->start_pfn = zone->zone_start_pfn; | zone->zone_start_pfn是该zone中起始的那个页在物理内存中位置. | end_pfn = start_pfn + zone->spanned_pages; | |-->reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) | >> pageblock_order; | reserve = min(2, reserve); | 以pageblock_order的整数倍内存作为MIGRATE_RESERVE大小. | |-->for(pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) |--{ | page = pfn_to_page(pfn); | if(PageReserved(page)) continue; | block_migratetype = get_pageblock_migratetype(page); | | if(reserve > 0 && block_migratetype == MIGRATE_RESERVE) | { | 初始化过程中,到现在为止,我们只用到了MIGRATE_UNMOVABLE, | MIGRATE_MOVABLE,之所以要检查,因为该函数可能通过proc | 被调用. | reserve--; continue; | } | | if(reserve > 0 && block_migratetype == MIGRATE_MOVABLE) | { | set_pageblock_migratetype(page, MIGRATE_RESERVE); | move_freepages_block(zone, page, MIGRATE_RESERVE); | 从MIGRATE_MOVABLE上迁移页块到MIGRATE_RESERVE链表上. | reserve--; | continue; | }
|
| if(block_migratetype == MIGRATE_RESERVE)
| {
| set_pageblock_migratetype(page, MIGRATE_MOVABLE); | move_freepages_block(zone, page, MIGRATE_MOVABLE); | } |--}
int zone_batchsize(struct zone *zone) |-->int batch; | |-->batch = zone->present_pages / 1024; | if(batch * PAGE_SIZE > 512 * 1024) | batch = (512 * 1024) / PAGE_SIZE; | batch /= 4; | if(batch < 1) batch = 1; | batch = rounddown_pow_of_two(batch + batch/2) - 1; | return batch;
void calculate_totalreserve_pages(void)
|-->struct pglist_data *pgdat = NULL; | unsigned long reserve_pages = 0; | enum zone_type i, j; | |-->考虑UMA |--pgdat = &contig_page_data; |--for(i = 0; i < MAX_NR_ZONES; i++) |--{ | struct zone *zone = pgdat->node_zones + i; | unsigned long max = 0; | for(j = i; j < MAX_NR_ZONES; j++) | { | if(zone->lowmem_reserve[j] > max) | max = zone->lowmem_reserve[j]; | 在初始化过程中,setup_per_zone_lowmem_reserve是后于setup_per_zone_wmarks | 执行的,因此,首次执行calculate_totalreserve_pages时,
| zone->lowmem_reserve[]为0,执行setup_per_zone_lowmem_reserve
| 后zone->lowmem_reserve[]被修正.
| 注意,在setup_per_zone_lowmem_reserve中也会执行该函数. | } | | max += high_wmark_pages(zone); | | if(max > zone->present_pages) | max = zone->present_pages; | | reserve_pages += max; |--} | |-->totalreserve_pages = reserve_pages;
已是十月份,毕设+论文……后阶段,内容会更新得较慢,更多地从机制上阐述内核,关于具体的策略实现不会再如此详细。