• 内核态内存映射


    内核页表

    和用户态页表不同,在系统初始化的时候,我们就要创建内核页表了。我们从内核页表的根 swapper_pg_dir 开始找线索。

    // arch/x86/include/asm/pgtable_64.h
    /* Declarations of the kernel's page-table arrays; the sized ones hold 512 entries each. */
    extern pud_t level3_kernel_pgt[512];
    extern pud_t level3_ident_pgt[512]; // corresponds to the direct-mapping region
    extern pmd_t level2_kernel_pgt[512]; // corresponds to the kernel code region
    extern pmd_t level2_fixmap_pgt[512]; // corresponds to the fixmap region
    extern pmd_t level2_ident_pgt[512];
    extern pte_t level1_fixmap_pgt[512];
    extern pgd_t init_top_pgt[];
    
    #define swapper_pg_dir init_top_pgt // alias for the kernel's top-level page directory (pgd)

    内核页表的顶级目录 init_top_pgt 定义在 __INITDATA 里面。咱们讲过 ELF 的格式,也讲过虚拟内存空间的布局:它们都有代码段,还有一些已初始化的全局变量放在 .init 区域,__INITDATA 说的就是这个区域。

    可以看到,页表的根其实是全局变量,这就使得我们初始化的时候,甚至内存管理还没有初始化的时候,很容易就可以定位到。

    // arch/x86/kernel/head_64.S
    
    __INITDATA
    
    /*
     * Top-level kernel page directory. Three slots are written explicitly;
     * each .org pads with zeros up to the requested offset.
     * .quad declares the contents of one entry; .org jumps to a given position.
     */
    NEXT_PAGE(init_top_pgt)
      /* first slot -> level3_ident_pgt */
      .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
      /* advance to slot PGD_PAGE_OFFSET, which also -> level3_ident_pgt */
      .org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
      .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
      /* advance to slot PGD_START_KERNEL -> level3_kernel_pgt */
      .org    init_top_pgt + PGD_START_KERNEL*8, 0
      /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
      .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
    
    
    /* Level-3 table referenced from init_top_pgt: only the first entry is populated. */
    NEXT_PAGE(level3_ident_pgt)
      /* first entry -> level2_ident_pgt */
      .quad  level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
      /* the remaining 511 eight-byte entries are zero */
      .fill  511, 8, 0
    /*
     * Level-2 table referenced from level3_ident_pgt, generated by PMDS.
     * NOTE(review): PMDS is defined outside this excerpt; its arguments look
     * like (start address, flags, entry count) -- confirm against head_64.S.
     */
    NEXT_PAGE(level2_ident_pgt)
      /* Since I easily can, map the first 1G.
       * Don't set NX because code runs from these pages.
       */
      PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
    
    
    /* Level-3 table referenced from the PGD_START_KERNEL slot of init_top_pgt. */
    NEXT_PAGE(level3_kernel_pgt)
      /* the first L3_START_KERNEL eight-byte entries are zero */
      .fill  L3_START_KERNEL,8,0
      /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
      /* next entry -> level2_kernel_pgt, the one after it -> level2_fixmap_pgt */
      .quad  level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
      .quad  level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
    
    
    /*
     * Level-2 table referenced from level3_kernel_pgt; PMDS is asked for
     * KERNEL_IMAGE_SIZE/PMD_SIZE entries starting at address 0.
     */
    NEXT_PAGE(level2_kernel_pgt)
      /*
       * 512 MB kernel mapping. We spend a full page on this pagetable
       * anyway.
       *
       * The kernel code+data+bss must not be bigger than that.
       *
       * (NOTE: at +512MB starts the module area, see MODULES_VADDR.
       *  If you want to increase this then increase MODULES_VADDR
       *  too.)
       */
      PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
        KERNEL_IMAGE_SIZE/PMD_SIZE)
    
    
    /*
     * Level-2 table for the fixmap region: 506 zero entries, one entry
     * pointing at level1_fixmap_pgt, then 5 zero entries (512 in total).
     */
    NEXT_PAGE(level2_fixmap_pgt)
      .fill  506,8,0
      .quad  level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
      /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
      .fill  5,8,0
    
    
    /*
     * Level-1 table for the fixmap region, referenced from level2_fixmap_pgt.
     * All 512 eight-byte entries start out as zero.
     */
    NEXT_PAGE(level1_fixmap_pgt)
      .fill  512,8,0
    // Table indices used by the page-table definitions in this file.
    // __PAGE_OFFSET_BASE: start address of the kernel in the virtual address space
    // __START_KERNEL_map: start address of the kernel code segment in the virtual address space
    PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
    PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
    L3_START_KERNEL = pud_index(__START_KERNEL_map)

    如果是用户态进程页表,会有 mm_struct 指向进程顶级目录 pgd,对于内核来讲,也定义了一个 mm_struct,指向 swapper_pg_dir。

    /*
     * The mm_struct defined for the kernel itself. Its pgd member is set to
     * swapper_pg_dir, the root of the kernel page table.
     */
    struct mm_struct init_mm = {
      .mm_rb    = RB_ROOT,
      .pgd    = swapper_pg_dir, /* kernel's top-level page directory */
      .mm_users  = ATOMIC_INIT(2),
      .mm_count  = ATOMIC_INIT(1),
      .mmap_sem  = __RWSEM_INITIALIZER(init_mm.mmap_sem),
      .page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
      .mmlist    = LIST_HEAD_INIT(init_mm.mmlist),
      .user_ns  = &init_user_ns,
      INIT_MM_CONTEXT(init_mm)
    };

    定义完了内核页表,接下来是初始化内核页表,在系统启动的时候 start_kernel 会调用 setup_arch。

    在 setup_arch 中,load_cr3(swapper_pg_dir) 说明内核页表要开始起作用了,随后刷新 TLB,并初始化 init_mm 的成员变量;其中最重要的一步是调用 init_mem_mapping,它最终会调用 kernel_physical_mapping_init。

    在 kernel_physical_mapping_init 里,我们先通过 __va 将物理地址转换为虚拟地址,然后再创建虚拟地址和物理地址的映射页表。

    /*
     * Excerpt of setup_arch(); each "......" line marks code elided from the
     * original. The visible steps switch to swapper_pg_dir, record the kernel
     * image boundaries in init_mm, and then call init_mem_mapping().
     */
    void __init setup_arch(char **cmdline_p)
    {
      /*
       * copy kernel address range established so far and switch
       * to the proper swapper page table
       */
      clone_pgd_range(swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
          initial_page_table + KERNEL_PGD_BOUNDARY,
          KERNEL_PGD_PTRS);
    
    
      /* Load swapper_pg_dir through load_cr3(), then flush the TLB. */
      load_cr3(swapper_pg_dir);
      __flush_tlb_all();
      ......
      /* Store the _text/_etext/_edata symbols and _brk_end into init_mm. */
      init_mm.start_code = (unsigned long) _text;
      init_mm.end_code = (unsigned long) _etext;
      init_mm.end_data = (unsigned long) _edata;
      init_mm.brk = _brk_end;
      ......
      /* Set up the memory mappings (body not shown in this excerpt). */
      init_mem_mapping();
      ......
    }
    
    
    /*
     * Create page table mapping for the physical memory for specific physical
     * addresses. The virtual and physical addresses have to be aligned on PMD level
     * down. It returns the last physical address mapped.
     *
     * @paddr_start:    first physical address to map
     * @paddr_end:      end of the physical range; the loop stops once the
     *                  virtual address reaches __va(paddr_end)
     * @page_size_mask: handed unchanged to phys_p4d_init()
     */
    unsigned long __meminit
    kernel_physical_mapping_init(unsigned long paddr_start,
               unsigned long paddr_end,
               unsigned long page_size_mask)
    {
      unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
    
      paddr_last = paddr_end;
      /* Convert the physical range to virtual addresses with __va(). */
      vaddr = (unsigned long)__va(paddr_start);
      vaddr_end = (unsigned long)__va(paddr_end);
      vaddr_start = vaddr;
    
      /* Walk the range one top-level (pgd) entry at a time. */
      for (; vaddr < vaddr_end; vaddr = vaddr_next) {
        pgd_t *pgd = pgd_offset_k(vaddr);
        p4d_t *p4d;
    
        /* Start of the next PGDIR_SIZE-aligned region. */
        vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
    
        /* Entry already populated: reuse the table it points to. */
        if (pgd_val(*pgd)) {
          p4d = (p4d_t *)pgd_page_vaddr(*pgd);
          paddr_last = phys_p4d_init(p4d, __pa(vaddr),
                   __pa(vaddr_end),
                   page_size_mask);
          continue;
        }
    
        /* Entry empty: allocate a new table, fill it, then hook it in. */
        p4d = alloc_low_page();
        paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
                 page_size_mask);
    
        p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
      }
      __flush_tlb_all();
    
      return paddr_last;
    }

    vmalloc 和 kmap_atomic 原理

    在用户态可以通过 malloc 函数分配内存;malloc 在分配比较大的内存的时候,底层调用的是 mmap,我们也可以直接通过 mmap 做内存映射。在内核里面也有相应的函数。

    在虚拟地址空间里面,有个 vmalloc 区域,从 VMALLOC_START 开始到 VMALLOC_END,可以用于映射一段物理内存。

    /**
     *  vmalloc  -  allocate virtually contiguous memory
     *  @size:    allocation size
     *
     *  Pages sufficient to cover @size are taken from the page level
     *  allocator and mapped into contiguous kernel virtual space.
     *
     *  Callers needing tight control over the page level allocator and
     *  protection flags should use __vmalloc() instead.
     *
     *  Returns whatever __vmalloc_node_flags() returns for
     *  (@size, NUMA_NO_NODE, GFP_KERNEL).
     */
    void *vmalloc(unsigned long size)
    {
      void *area;
    
      area = __vmalloc_node_flags(size, NUMA_NO_NODE, GFP_KERNEL);
      return area;
    }
    
    
    /*
     * Forward the request to __vmalloc_node_range(), with the address window
     * fixed to [VMALLOC_START, VMALLOC_END] and 0 as the seventh argument.
     * All other arguments are passed through unchanged.
     */
    static void *__vmalloc_node(unsigned long size, unsigned long align,
              gfp_t gfp_mask, pgprot_t prot,
              int node, const void *caller)
    {
      void *area;
    
      area = __vmalloc_node_range(size, align,
            VMALLOC_START, VMALLOC_END,
            gfp_mask, prot,
            0, node, caller);
      return area;
    }
    /*
     * Excerpt of kmap_atomic_prot(); each "......" line marks elided code
     * (including the declarations of vaddr and idx). Returns a virtual
     * address for @page.
     */
    void *kmap_atomic_prot(struct page *page, pgprot_t prot)
    {
      ......
      // On 64-bit, where there is no high memory, call page_address(),
      // which in turn calls lowmem_page_address().
      // In other words, low-memory pages are mapped directly via __va.
      if (!PageHighMem(page))
        return page_address(page);
      ......
      // On 32-bit with high memory, set_pte must be called to create a
      // temporary mapping through the kernel page table.
      vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
      set_pte(kmap_pte-idx, mk_pte(page, prot));
      ......
      return (void *)vaddr;
    }
    
    
    /* Map @page by delegating to kmap_atomic_prot() with kmap_prot as the protection. */
    void *kmap_atomic(struct page *page)
    {
      void *mapped = kmap_atomic_prot(page, kmap_prot);
    
      return mapped;
    }
    
    
    /* Return the virtual address that page_to_virt() yields for @page. */
    static __always_inline void *lowmem_page_address(const struct page *page)
    {
      void *virt = page_to_virt(page);
    
      return virt;
    }
    
    
    /* Convert a struct page pointer to a virtual address: page -> pfn -> physical -> __va. */
    #define page_to_virt(x)  __va(PFN_PHYS(page_to_pfn(x)))

    可以看出,kmap_atomic 和 vmalloc 不同。

    kmap_atomic 发现,没有页表的时候,就直接创建页表进行映射了。

    而 vmalloc 没有,它只分配了内核的虚拟地址。所以,访问它的时候,会产生缺页异常。

    内核态的缺页异常还是会调用 do_page_fault,但是会走到 vmalloc_fault。这个函数并不复杂,主要用于关联内核页表项。

    /*
     * 32-bit:
     *
     *   Handle a fault on the vmalloc or module mapping area
     *
     * @address: the faulting virtual address
     *
     * Returns 0 when the page-table entry for @address is present after
     * synchronization. Returns -1 when @address lies outside
     * [VMALLOC_START, VMALLOC_END), when vmalloc_sync_one() yields no pmd,
     * or when the pte is not present.
     */
    static noinline int vmalloc_fault(unsigned long address)
    {
      unsigned long pgd_paddr;
      pmd_t *pmd_k;
      pte_t *pte_k;
    
    
      /* Make sure we are in vmalloc area: */
      if (!(address >= VMALLOC_START && address < VMALLOC_END))
        return -1;
    
    
      /*
       * Synchronize this task's top level page-table
       * with the 'reference' page table.
       *
       * Do _not_ use "current" here. We might be inside
       * an interrupt in the middle of a task switch..
       */
      pgd_paddr = read_cr3_pa();
      pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
      if (!pmd_k)
        return -1;
    
    
      /* The pte itself must be present for the fault to count as handled. */
      pte_k = pte_offset_kernel(pmd_k, address);
      if (!pte_present(*pte_k))
        return -1;
    
    
      return 0;
    }
  • 相关阅读:
    【redis】主从复制
    【redis】订阅功能
    【redis】基础
    MySQL【十二】pymysql操作数据库
    MySQL【十一】创建索引
    MySQL【十】认识索引
    MySQL【九】树
    MySQL【八】多表查询
    ubuntu 制作ISO模块
    ubuntu 开机自启动
  • 原文地址:https://www.cnblogs.com/sunnycindy/p/14974917.html
Copyright © 2020-2023  润新知