作者
彭东林
pengdonglin137@163.com
平台
Linux-4.10.17
Qemu + vexpress-ca9
概述
通过配置内核,会在/sys/kernel/debug下产生一个名为kernel_page_tables的文件,查看这个文件可以知道当前内核页表的映射信息。
正文
一、配置内核
首先配置内核,使其支持导出内核页表到debugfs下面:
Kernel hacking --->
---> [*] Export kernel pagetable layout to userspace via debugfs
配置完后,重新编译内核,并用新内核启动,就会在/sys/kernel/debug下看到kernel_page_tables文件:
然后cat该文件,可以获得如下信息:
1 [root@vexpress debug]# cat kernel_page_tables 2 ---[ Modules ]--- 3 0xbfe01000-0xbfe1e000 116K RW NX SHD MEM/CACHED/WBWA 4 ---[ Kernel Mapping ]--- 5 0xc0000000-0xc0100000 1M RW NX SHD 6 0xc0100000-0xc0700000 6M ro x SHD 7 0xc0700000-0xc0900000 2M ro NX SHD 8 0xc0900000-0xf0000000 759M RW NX SHD 9 ---[ vmalloc() Area ]--- 10 0xf0800000-0xf0801000 4K RW NX SHD DEV/SHARED 11 0xf0802000-0xf0803000 4K RW NX SHD DEV/SHARED 12 0xf0804000-0xf0805000 4K RW NX SHD DEV/SHARED 13 0xf0806000-0xf0807000 4K RW NX SHD DEV/SHARED 14 0xf0808000-0xf0809000 4K RW NX SHD DEV/SHARED 15 0xf080a000-0xf080b000 4K RW NX SHD DEV/SHARED 16 0xf080c000-0xf080d000 4K RW NX SHD DEV/SHARED 17 0xf0814000-0xf0815000 4K RW NX SHD DEV/SHARED 18 0xf0816000-0xf0817000 4K RW NX SHD DEV/SHARED 19 0xf0818000-0xf0819000 4K RW NX SHD DEV/SHARED 20 0xf081a000-0xf081b000 4K RW NX SHD DEV/SHARED 21 0xf081c000-0xf085c000 256K RW NX SHD MEM/BUFFERABLE/WC 22 0xf085d000-0xf085e000 4K RW NX SHD DEV/SHARED 23 0xf085f000-0xf0860000 4K RW NX SHD DEV/SHARED 24 0xf0861000-0xf0862000 4K RW NX SHD DEV/SHARED 25 0xf0875000-0xf0876000 4K RW NX SHD DEV/SHARED 26 0xf0879000-0xf087a000 4K RW NX SHD DEV/SHARED 27 0xf087d000-0xf087e000 4K RW NX SHD DEV/SHARED 28 0xf0889000-0xf088a000 4K RW NX SHD DEV/SHARED 29 0xf088b000-0xf088c000 4K RW NX SHD DEV/SHARED 30 0xf088d000-0xf0898000 44K RW NX SHD MEM/CACHED/WBWA 31 0xf0899000-0xf08db000 264K RW NX SHD MEM/CACHED/WBWA 32 0xf08dc000-0xf08e7000 44K RW NX SHD MEM/CACHED/WBWA 33 0xf08e8000-0xf0908000 128K RW NX SHD MEM/CACHED/WBWA 34 0xf0909000-0xf0929000 128K RW NX SHD MEM/CACHED/WBWA 35 0xf092a000-0xf094a000 128K RW NX SHD MEM/CACHED/WBWA 36 0xf094b000-0xf096b000 128K RW NX SHD MEM/CACHED/WBWA 37 0xf096c000-0xf098c000 128K RW NX SHD MEM/CACHED/WBWA 38 0xf098d000-0xf09ad000 128K RW NX SHD MEM/CACHED/WBWA 39 0xf09ae000-0xf09ce000 128K RW NX SHD MEM/CACHED/WBWA 40 0xf09cf000-0xf09ef000 128K RW NX SHD MEM/CACHED/WBWA 41 0xf09f0000-0xf09f1000 4K RW NX SHD DEV/SHARED 42 
0xf09f2000-0xf09f3000 4K RW NX SHD DEV/SHARED 43 0xf09f4000-0xf09f5000 4K RW NX SHD DEV/SHARED 44 0xf09f6000-0xf0b76000 1536K RW NX SHD MEM/BUFFERABLE/WC 45 0xf0b77000-0xf0b78000 4K RW NX SHD DEV/SHARED 46 0xf0b79000-0xf0b7a000 4K RW NX SHD DEV/SHARED 47 0xf0b80000-0xf1380000 8M RW NX SHD DEV/SHARED 48 0xf1385000-0xf1386000 4K RW NX SHD DEV/SHARED 49 0xf1387000-0xf1388000 4K RW NX SHD DEV/SHARED 50 0xf1389000-0xf138a000 4K RW NX SHD DEV/SHARED 51 0xf138b000-0xf138c000 4K RW NX SHD DEV/SHARED 52 0xf138d000-0xf138e000 4K RW NX SHD DEV/SHARED 53 0xf1390000-0xf13a0000 64K RW NX SHD DEV/SHARED 54 0xf13aa000-0xf13b5000 44K RW NX SHD MEM/CACHED/WBWA 55 0xf13b6000-0xf13b9000 12K RW NX SHD MEM/CACHED/WBWA 56 0xf1400000-0xf5400000 64M RW NX SHD DEV/SHARED 57 0xf5401000-0xf5425000 144K RW NX SHD MEM/CACHED/WBWA 58 0xf5480000-0xf7480000 32M RW NX SHD DEV/SHARED 59 0xf8009000-0xf800a000 4K RW NX SHD DEV/SHARED 60 0xf8080000-0xfc080000 64M RW NX SHD DEV/SHARED 61 ---[ vmalloc() End ]--- 62 ---[ Fixmap Area ]--- 63 0xffecd000-0xffecf000 8K RW NX SHD MEM/CACHED/WBWA 64 0xffedd000-0xffedf000 8K RW NX SHD MEM/CACHED/WBWA 65 0xffeed000-0xffeef000 8K RW NX SHD MEM/CACHED/WBWA 66 0xffefd000-0xffeff000 8K RW NX SHD MEM/CACHED/WBWA 67 ---[ Vectors ]--- 68 0xffff0000-0xffff1000 4K USR ro x SHD MEM/CACHED/WBWA 69 0xffff1000-0xffff2000 4K ro x SHD MEM/CACHED/WBWA 70 ---[ Vectors End ]---
上面每一行的含义:被映射的虚拟地址的范围、大小以及该段内存所具备的属性
二、解释
1、基础知识
参考文档:
我们使用的开发板是用Qemu虚拟的一个四核的Cortex-A9的板子,物理内存1GB、物理内存起始地址0x6000_0000,使用了段式和页式两种页表。
段式
这种映射方式一次映射1MB,只需要一级段表就可以实现虚拟到物理的映射
段式映射过程:
页式
这种情况下需要两级页表才能实现虚拟到物理内存的映射
第1级:(一个表项可以映射1MB)
第2级:(一个页表项映射4KB)
页表式映射过程:
2、代码分析
文件:arch/arm/mm/dump.c
定义了两个结构体数组变量section_bits和pte_bits,其中section_bits用于第一种段映射,可以根据段表项的[19:0]表示的该段的属性返回相应的字符串。pte_bits用于第二种页表映射,可以根据页表项的[11:0]表示的该页的属性获得对应的字符串。
section_bits:
static const struct prot_bits section_bits[] = { { .mask = PMD_SECT_APX | PMD_SECT_AP_READ | PMD_SECT_AP_WRITE, // (1<<15) | (1<<11)|(1<<10) .val = PMD_SECT_APX | PMD_SECT_AP_WRITE, // (1<<15) | (1<<10) .set = " ro", }, { .mask = PMD_SECT_APX | PMD_SECT_AP_READ | PMD_SECT_AP_WRITE, // (1<<15) | (1<<11)|(1<<10) .val = PMD_SECT_AP_WRITE, // (1<<10) .set = " RW", }, { .mask = PMD_SECT_APX | PMD_SECT_AP_READ | PMD_SECT_AP_WRITE, // (1<<15) | (1<<11)|(1<<10) .val = PMD_SECT_AP_READ, // (1<<11) .set = "USR ro", }, { .mask = PMD_SECT_APX | PMD_SECT_AP_READ | PMD_SECT_AP_WRITE, // (1<<15) | (1<<11)|(1<<10) .val = PMD_SECT_AP_READ | PMD_SECT_AP_WRITE, // (1<<11)|(1<<10) .set = "USR RW", }, { .mask = PMD_SECT_XN, // (1<<4) .val = PMD_SECT_XN, // (1<<4) .set = "NX", .clear = "x ", }, { .mask = PMD_SECT_S, // (1<<16) .val = PMD_SECT_S, // (1<<16) .set = "SHD", .clear = " ", }, };
pte_bits:
static const struct prot_bits pte_bits[] = { { .mask = L_PTE_USER, .val = L_PTE_USER, .set = "USR", .clear = " ", }, { .mask = L_PTE_RDONLY, .val = L_PTE_RDONLY, .set = "ro", .clear = "RW", }, { .mask = L_PTE_XN, .val = L_PTE_XN, .set = "NX", .clear = "x ", }, { .mask = L_PTE_SHARED, .val = L_PTE_SHARED, .set = "SHD", .clear = " ", }, { .mask = L_PTE_MT_MASK, .val = L_PTE_MT_UNCACHED, .set = "SO/UNCACHED", }, { .mask = L_PTE_MT_MASK, .val = L_PTE_MT_BUFFERABLE, .set = "MEM/BUFFERABLE/WC", }, { .mask = L_PTE_MT_MASK, .val = L_PTE_MT_WRITETHROUGH, .set = "MEM/CACHED/WT", }, { .mask = L_PTE_MT_MASK, .val = L_PTE_MT_WRITEBACK, .set = "MEM/CACHED/WBRA", }, { .mask = L_PTE_MT_MASK, .val = L_PTE_MT_MINICACHE, .set = "MEM/MINICACHE", }, { .mask = L_PTE_MT_MASK, .val = L_PTE_MT_WRITEALLOC, .set = "MEM/CACHED/WBWA", }, { .mask = L_PTE_MT_MASK, .val = L_PTE_MT_DEV_SHARED, .set = "DEV/SHARED", }, { .mask = L_PTE_MT_MASK, .val = L_PTE_MT_DEV_NONSHARED, .set = "DEV/NONSHARED", }, { .mask = L_PTE_MT_MASK, .val = L_PTE_MT_DEV_WC, .set = "DEV/WC", }, { .mask = L_PTE_MT_MASK, .val = L_PTE_MT_DEV_CACHED, .set = "DEV/CACHED", }, };
然后将这两个结构体数组放到pg_level中,将来遍历kernel页表时会用到
static struct pg_level pg_level[] = { { }, { /* pgd */ }, { /* pud */ }, { /* pmd */ .bits = section_bits, .num = ARRAY_SIZE(section_bits), }, { /* pte */ .bits = pte_bits, .num = ARRAY_SIZE(pte_bits), }, };
设置pg_level的mask成员,代码如下:
1 static int ptdump_init(void) 2 { 3 struct dentry *pe; 4 unsigned i, j; 5 6 for (i = 0; i < ARRAY_SIZE(pg_level); i++) 7 if (pg_level[i].bits) 8 for (j = 0; j < pg_level[i].num; j++) 9 pg_level[i].mask |= pg_level[i].bits[j].mask; 10 11 address_markers[2].start_address = VMALLOC_START; 12 13 pe = debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, 14 &ptdump_fops); 15 return pe ? 0 : -ENOMEM; 16 } 17 __initcall(ptdump_init);
第6到第9行,设置pg_level[i]的mask(即该级所有属性位掩码的并集,note_page中用它从表项里提取属性位),用于提高运行效率
第11行,address_markers[2]表示的是vmalloc区域的起始地址,因为vmalloc区域的大小不固定,会根据实际的物理内存的尺寸发生变化,所以需要在运行时设置
第13行会在/sys/kernel/debug下创建名为kernel_page_tables的文件
此外,在该文件中还定义了aarch32下Linux内核内存布局,第一列是每个区域的起始虚拟地址,第二列为该区域的名称
1 static struct addr_marker address_markers[] = { 2 { MODULES_VADDR, "Modules" }, 3 { PAGE_OFFSET, "Kernel Mapping" }, 4 { 0, "vmalloc() Area" }, 5 { VMALLOC_END, "vmalloc() End" }, 6 { FIXADDR_START, "Fixmap Area" }, 7 { CONFIG_VECTORS_BASE, "Vectors" }, 8 { CONFIG_VECTORS_BASE + PAGE_SIZE * 2, "Vectors End" }, 9 { -1, NULL }, 10 };
这部分可以参考Documentation/arm/memory.txt:
Start | End | Use |
ffff8000 | ffffffff | copy_user_page / clear_user_page use. |
ffff4000 | ffffffff | cache aliasing on ARMv6 and later CPUs |
ffff1000 | ffff7fff | Reserved. Platforms must not use this address range. |
ffff0000 | ffff0fff | CPU vector page. The CPU vectors are mapped here if the CPU supports vector relocation (control register V bit.) |
fffe8000 | fffeffff | DTCM mapping area for platforms with DTCM mounted inside the CPU. |
fffe0000 | fffe7fff | ITCM mapping area for platforms with ITCM mounted inside the CPU |
ffc00000 | ffefffff | Fixmap mapping region. Addresses provided by fix_to_virt() will be located here |
VMALLOC_START | VMALLOC_END-1 | vmalloc() / ioremap() space. Memory returned by vmalloc/ioremap will be dynamically placed in this region. Machine specific static mappings are also located here through iotable_init(). VMALLOC_START is based upon the value of the high_memory variable, and VMALLOC_END is equal to 0xff800000. |
PAGE_OFFSET | high_memory-1 | Kernel direct-mapped RAM region. This maps the platforms RAM, and typically maps all platform RAM in a 1:1 relationship |
PKMAP_BASE | PAGE_OFFSET-1 | Permanent kernel mappings One way of mapping HIGHMEM pages into kernel space. |
MODULES_VADDR | MODULES_END-1 | Kernel module space Kernel modules inserted via insmod are placed here using dynamic mappings. |
00001000 | TASK_SIZE-1 | User space mappings Per-thread mappings are placed here via the mmap() system call. |
00000000 | 00000fff | CPU vector page / null pointer trap CPUs which do not support vector remapping place their vector page here. NULL pointer dereferences by both the kernel and user space are also caught via this mapping. |
当读取kernel_page_tables文件时,函数ptdump_show被调用,该函数又进一步调用了walk_pgd,下面从walk_pgd开始分析:
1 static void walk_pgd(struct seq_file *m) 2 { 3 pgd_t *pgd = swapper_pg_dir; 4 struct pg_state st; 5 unsigned long addr; 6 unsigned i; 7 8 memset(&st, 0, sizeof(st)); 9 st.seq = m; 10 st.marker = address_markers; 11 12 for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { 13 addr = i * PGDIR_SIZE; 14 if (!pgd_none(*pgd)) { 15 walk_pud(&st, pgd, addr); 16 } else { 17 note_page(&st, addr, 1, pgd_val(*pgd)); 18 } 19 } 20 21 note_page(&st, 0, 0, 0); 22 }
第3行的swapper_pg_dir就是内核一级表的虚拟起始地址,也就是0xC0004000,范围是0xC0004000~0xC0008000
第12到19行,遍历解析内核映射表,输出映射信息
第12行的PTRS_PER_PGD为2048,第13行的PGDIR_SIZE是2MB。在ARM Linux中,pgd_t的定义如下:
typedef struct { pmdval_t pgd[2]; } pgd_t;
即,一个pgd实际存放了两个连续的一级段表项,每一个可以映射1MB,所以一共2MB。
第13行的addr表示的是虚拟地址,从0开始,按照2MB对齐
第14行,由于目前最多只使用了两级页表,没有单独的pud和pmd,它们实际上都跟pgd合并了,所以pgd_none返回0。
第15行,开始进入walk_pud,由于没有pud和pmd,该函数会调用到walk_pmd, 然后在这个函数中会根据一级表项的类型(段表or页表)执行不同的操作
第21行,确保输出信息的完整
下面简单看看walk_pud:
1 static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) 2 { 3 pud_t *pud = pud_offset(pgd, 0); 4 unsigned long addr; 5 unsigned i; 6 7 for (i = 0; i < PTRS_PER_PUD; i++, pud++) { 8 addr = start + i * PUD_SIZE; 9 if (!pud_none(*pud)) { 10 walk_pmd(st, pud, addr); 11 } else { 12 note_page(st, addr, 2, pud_val(*pud)); 13 } 14 } 15 }
第7行的PTRS_PER_PUD是1
第9行的pud_none返回0
所以walk_pud的for循环只循环一次
下面分析walk_pmd,这里的start实际就是前面的2MB对齐的虚拟地址
1 static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) 2 { 3 pmd_t *pmd = pmd_offset(pud, 0); 4 unsigned long addr; 5 unsigned i; 6 7 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { 8 addr = start + i * PMD_SIZE; 9 printk("%s: addr: 0x%lx, pmd: 0x%p, pmd[0]: 0x%x, pmd[1]: 0x%x ", 10 __func__, addr, pmd, pmd[0], pmd[1]); 11 if (pmd_none(*pmd) || pmd_large(*pmd) || !pmd_present(*pmd)) 12 note_page(st, addr, 3, pmd_val(*pmd)); 13 else 14 walk_pte(st, pmd, addr); 15 16 if (SECTION_SIZE < PMD_SIZE && pmd_large(pmd[1])) 17 note_page(st, addr + SECTION_SIZE, 3, pmd_val(pmd[1])); 18 } 19 }
第7行PTRS_PER_PMD是1,所以这里的for循环也只循环一次
第11行的pmd_none判断pmd所指的一级表项是否为空,对于内核页表小于MODULES_VADDR的那些虚拟地址对应的pmd都是0;pmd_large判断是段表还是两级页表的第一级,这里实际上是判断表项的[1:0],如果是段表,bit0任意,bit1为1;如果是两级页表的第一级,bit0是1,bit1是0。最后一个!pmd_present在这里跟pmd_none是一个逻辑
如果是段表,第12行的note_page会被调用,这个处理的是pmd[0],处理完后,第16行判断pmd[1]是否也是段表,如果是的话,第17行再处理pmd[1],这里的SECTION_SIZE是1MB
如果是二级页表的第一级的话,walk_pte会被调用,这个函数会找到第二级页表的基地址,然后遍历其中的每一个二级页表项
下面分析一下walk_pte
1 static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) 2 { 3 pte_t *pte = pte_offset_kernel(pmd, 0); 4 unsigned long addr; 5 unsigned i; 6 7 for (i = 0; i < PTRS_PER_PTE; i++, pte++) { 8 addr = start + i * PAGE_SIZE; 9 printk("-- %s: start: 0x%lx, pte: 0x%p, *pte: 0x%x ", __func__, addr, pte, pte_val(*pte)); 10 note_page(st, addr, 4, pte_val(*pte)); 11 } 12 }
第7行的PTRS_PER_PTE是512。也就是这里遍历了pmd[0]和pmd[1]所指向的二级页表,如果pmd[0]是二级页表的第一级,那么pmd[1]也是二级页表的第一级
下面分析note_page,读取kernel_page_tables时的输出信息都来自该函数:
1 static void note_page(struct pg_state *st, unsigned long addr, unsigned level, u64 val) 2 { 3 static const char units[] = "KMGTPE"; 4 u64 prot = val & pg_level[level].mask; 5 6 if (!st->level) { 7 st->level = level; 8 st->current_prot = prot; 9 seq_printf(st->seq, "---[ %s ]--- ", st->marker->name); 10 printk("---[ %s ]--- ", st->marker->name); 11 } else if (prot != st->current_prot || level != st->level || 12 addr >= st->marker[1].start_address) { 13 const char *unit = units; 14 unsigned long delta; 15 16 if (st->current_prot) { 17 seq_printf(st->seq, "0x%08lx-0x%08lx ", 18 st->start_address, addr); 19 printk("0x%08lx-0x%08lx ", 20 st->start_address, addr); 21 22 delta = (addr - st->start_address) >> 10; 23 while (!(delta & 1023) && unit[1]) { 24 delta >>= 10; 25 unit++; 26 } 27 seq_printf(st->seq, "%9lu%c", delta, *unit); 28 if (pg_level[st->level].bits) 29 dump_prot(st, pg_level[st->level].bits, pg_level[st->level].num); 30 seq_printf(st->seq, " "); 31 } 32 33 if (addr >= st->marker[1].start_address) { 34 st->marker++; 35 seq_printf(st->seq, "---[ %s ]--- ", st->marker->name); 36 printk("---[ %s ]--- ", st->marker->name); 37 } 38 st->start_address = addr; 39 st->current_prot = prot; 40 st->level = level; 41 } 42 }
第11行的if判断比较重要,如果正在遍历的内存段的属性跟前一个不同的话,那么会输出前一个内存段的信息。利用这里的逻辑,虚拟地址连续且属性相同的内存段会合并输出信息
第22到26行,计算内存段的大小
第27行输出内存段的大小
第29行将该内存段的属性转换成对应的字符串
1 static void dump_prot(struct pg_state *st, const struct prot_bits *bits, size_t num) 2 { 3 unsigned i; 4 5 for (i = 0; i < num; i++, bits++) { 6 const char *s; 7 8 if ((st->current_prot & bits->mask) == bits->val) 9 s = bits->set; 10 else 11 s = bits->clear; 12 13 if (s) 14 seq_printf(st->seq, " %s", s); 15 } 16 }
函数dump_prot会根据current_prot指定的属性(对于段表,来自*pmd,对于页表,来自*pte),输出对应的字符串。
附件是读取kernel_page_tables时打印的log,可以帮助理解上面的程序。
完。