/*
* linux/mm/memory.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
/*
* demand-loading started 01.12.91 - seems it is high on the list of
* things wanted, and it should be easy to implement. - Linus
*/
/*
* Ok, demand-loading was easy, shared pages a little bit tricker. Shared
* pages started 02.12.91, seems to work. - Linus.
*
* Tested sharing by executing about 30 /bin/sh: under the old kernel it
* would have taken more than the 6M I have free, but it worked well as
* far as I could see.
*
* Also corrected some "invalidate()"s - I wasn't doing enough of them.
*/
/*
* Real VM (paging to/from disk) started 18.12.91. Much more work and
* thought has to go into this. Oh, well..
* 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
* Found it. Everything seems to work now.
* 20.12.91 - Ok, making the swap-device changeable like the root.
*/
#include <asm/system.h>
#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/head.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
// Upper bound of managed physical memory; starts at 0 and is assigned in mem_init().
unsigned long high_memory = 0;
// Page table number 0 (defined elsewhere).
extern unsigned long pg0[1024]; /* page table for 0-4MB for everybody */
// Sound-driver memory initialisation; called from mem_init() when CONFIG_SOUND is defined.
extern void sound_mem_init(void);
// Kernel-error reporting helper; called from do_page_fault().
extern void die_if_kernel(char *,struct pt_regs *,long);
// Number of swap pages.
int nr_swap_pages = 0;
// Number of free pages.
int nr_free_pages = 0;
// Head of the free-page list.
unsigned long free_page_list = 0;
/*
* The secondary free_page_list is used for malloc() etc things that
* may need pages during interrupts etc. Normal get_free_page() operations
* don't touch it, so it stays as a kind of "panic-list", that can be
* accessed when all other mm tricks have failed.
*/
int nr_secondary_pages = 0;
unsigned long secondary_page_list = 0;
/*
 * copy_page(from, to): copy one page, i.e. 1024 32-bit words, from the
 * address 'from' to the address 'to' using "rep movsl".
 *
 * The trailing backslash is essential: without it the macro expands to
 * nothing and the __asm__ statement is left stranded at file scope.
 */
#define copy_page(from,to) \
__asm__("cld ; rep ; movsl": :"S" (from),"D" (to),"c" (1024):"cx","di","si")
// Per-page usage map indexed with MAP_NR(); the array itself is set up in mem_init().
unsigned short * mem_map = NULL;
// True when addr lies below p->end_code.
#define CODE_SPACE(addr,p) ((addr) < (p)->end_code)
/*
* oom() prints a message (so that the user knows why the process died),
* and gives the process an untrappable SIGSEGV.
*/
//输出内存越界消息,并发送信号,关闭进程----------------------------------------------ok
void oom(struct task_struct * task)
{
printk("
out of memory
");
task->sigaction[SIGKILL-1].sa_handler = NULL;
task->blocked &= ~(1<<(SIGKILL-1));
send_sig(SIGKILL,task,1);
}
/*
 * Release the page table referenced by one page-directory entry, together
 * with every page or swap entry that the table still references.
 *
 * page_dir: pointer to the page-directory entry. The entry is cleared
 * before it is examined further. An entry that is >= high_memory or lacks
 * PAGE_PRESENT is reported and stays cleared; a page table that lives in a
 * MAP_PAGE_RESERVED page is left untouched.
 */
static void free_one_table(unsigned long * page_dir)
{
	int j;
	unsigned long pg_table = *page_dir;	/* snapshot of the directory entry */
	unsigned long * page_table;

	if (!pg_table)
		return;
	*page_dir = 0;
	if (pg_table >= high_memory || !(pg_table & PAGE_PRESENT)) {
		printk("Bad page table: [%p]=%08lx\n",page_dir,pg_table);
		return;
	}
	if (mem_map[MAP_NR(pg_table)] & MAP_PAGE_RESERVED)
		return;
	page_table = (unsigned long *) (pg_table & PAGE_MASK);
	for (j = 0 ; j < PTRS_PER_PAGE ; j++,page_table++) {
		unsigned long pg = *page_table;

		if (!pg)
			continue;
		*page_table = 0;
		/* present entries go to free_page(), all others to swap_free() */
		if (pg & PAGE_PRESENT)
			free_page(PAGE_MASK & pg);
		else
			swap_free(pg);
	}
	/* finally release the page that held the table itself */
	free_page(PAGE_MASK & pg_table);
}
/*
* This function clears all user-level page tables of a process - this
* is needed by execve(), so that old pages aren't in the way. Note that
* unlike 'free_page_tables()', this function still leaves a valid
* page-table-tree in memory: it just removes the user pages. The two
* functions are similar, but there is a fundamental difference.
*/
//清空页表---清空进程所占的用户空间页表----------------------------------------------ok
void clear_page_tables(struct task_struct * tsk)
{
int i;
unsigned long pg_dir;
unsigned long * page_dir;
if (!tsk)
return;
if (tsk == task[0])
panic("task[0] (swapper) doesn't support exec()
");
//获取到页表目录地址
pg_dir = tsk->tss.cr3;
//页表目录中第一项
page_dir = (unsigned long *) pg_dir;
//页表目录不正确
if (!page_dir || page_dir == swapper_pg_dir) {
printk("Trying to clear kernel page-directory: not good
");
return;
}
//页表目录所在内存页,被多个进程所共享
if (mem_map[MAP_NR(pg_dir)] > 1) {
unsigned long * new_pg;
//申请一页内存
if (!(new_pg = (unsigned long*) get_free_page(GFP_KERNEL))) {
oom(tsk);
return;
}
//遍历后面的256项,循环拷贝
for (i = 768 ; i < 1024 ; i++)
new_pg[i] = page_dir[i];
//释放页目录表
free_page(pg_dir);
//重新指向页目录表
tsk->tss.cr3 = (unsigned long) new_pg;
return;
}
//遍历释放虚拟低地址空间的0~3G,768项,非共享
for (i = 0 ; i < 768 ; i++,page_dir++)
free_one_table(page_dir);
invalidate();
return;
}
/*
 * This function frees up all page tables of a process when it exits.
 *
 * tsk: the exiting task; NULL is ignored and task[0] causes a panic.
 *
 * The task is switched over to swapper_pg_dir first (and %cr3 is reloaded
 * when tsk is the running task). A shared directory (mem_map count > 1)
 * only loses one reference through free_page(); a private one has all
 * PTRS_PER_PAGE entries released and is then freed itself.
 */
void free_page_tables(struct task_struct * tsk)
{
	int i;
	unsigned long pg_dir;
	unsigned long * page_dir;

	if (!tsk)
		return;
	if (tsk == task[0]) {
		printk("task[0] (swapper) killed: unable to recover\n");
		panic("Trying to free up swapper memory space");
	}
	pg_dir = tsk->tss.cr3;
	if (!pg_dir || pg_dir == (unsigned long) swapper_pg_dir) {
		printk("Trying to free kernel page-directory: not good\n");
		return;
	}
	tsk->tss.cr3 = (unsigned long) swapper_pg_dir;
	if (tsk == current)
		__asm__ __volatile__("movl %0,%%cr3": :"a" (tsk->tss.cr3));
	if (mem_map[MAP_NR(pg_dir)] > 1) {
		free_page(pg_dir);
		return;
	}
	page_dir = (unsigned long *) pg_dir;
	for (i = 0 ; i < PTRS_PER_PAGE ; i++,page_dir++)
		free_one_table(page_dir);
	free_page(pg_dir);
	invalidate();
}
/*
 * clone_page_tables() clones the page table for a process - both
 * processes will have the exact same pages in memory. There are
 * probably races in the memory management with cloning, but we'll
 * see..
 *
 * tsk receives the page directory of 'current'; the mem_map count of
 * that directory page is raised by one. Always returns 0.
 */
int clone_page_tables(struct task_struct * tsk)
{
	unsigned long shared_dir = current->tss.cr3;

	tsk->tss.cr3 = shared_dir;
	mem_map[MAP_NR(shared_dir)] += 1;
	return 0;
}
/*
* copy_page_tables() just copies the whole process memory range:
* note the special handling of RESERVED (ie kernel) pages, which
* means that they are always shared by all processes.
*
* A new page directory is allocated for tsk and filled from the page
* directory of 'current'. Returns 0 on success and -ENOMEM when the
* directory or one of the page tables cannot be allocated.
*/
// Copy the page tables: the page directory first, then each page table, then the page entries.
int copy_page_tables(struct task_struct * tsk)
{
int i;
unsigned long old_pg_dir, *old_page_dir;// source page directory (current's)
unsigned long new_pg_dir, *new_page_dir;// destination page directory (tsk's)
if (!(new_pg_dir = get_free_page(GFP_KERNEL))) // allocate one page for the new page directory
return -ENOMEM;
// the source is the directory 'current' is using
old_pg_dir = current->tss.cr3;
tsk->tss.cr3 = new_pg_dir;
old_page_dir = (unsigned long *) old_pg_dir;
new_page_dir = (unsigned long *) new_pg_dir;
for (i = 0 ; i < PTRS_PER_PAGE ; i++,old_page_dir++,new_page_dir++) {
int j;
unsigned long old_pg_table, *old_page_table;
unsigned long new_pg_table, *new_page_table;
old_pg_table = *old_page_dir;
// empty directory slot: nothing to copy
if (!old_pg_table)
continue;
// a corrupt entry is reported and cleared in the source directory
if (old_pg_table >= high_memory || !(old_pg_table & PAGE_PRESENT)) {
printk("copy_page_tables: bad page table: "
"probable memory corruption");
*old_page_dir = 0;
continue;
}
// page tables in reserved pages are shared by copying the directory entry
if (mem_map[MAP_NR(old_pg_table)] & MAP_PAGE_RESERVED) {
*new_page_dir = old_pg_table;
continue;
}
// on failure everything built so far for tsk is torn down again
if (!(new_pg_table = get_free_page(GFP_KERNEL))) {
free_page_tables(tsk);
return -ENOMEM;
}
old_page_table = (unsigned long *) (PAGE_MASK & old_pg_table);
new_page_table = (unsigned long *) (PAGE_MASK & new_pg_table);
for (j = 0 ; j < PTRS_PER_PAGE ; j++,old_page_table++,new_page_table++) {
unsigned long pg;
pg = *old_page_table;
if (!pg)
continue;
// non-present entries are handed to swap_duplicate()
if (!(pg & PAGE_PRESENT)) {
*new_page_table = swap_duplicate(pg);
continue;
}
// entries that are both writable and COW lose PAGE_RW in both copies
if ((pg & (PAGE_RW | PAGE_COW)) == (PAGE_RW | PAGE_COW))
pg &= ~PAGE_RW;
*new_page_table = pg;
// reserved pages are neither write-protected in the source nor counted
if (mem_map[MAP_NR(pg)] & MAP_PAGE_RESERVED)
continue;
*old_page_table = pg;
mem_map[MAP_NR(pg)]++;
}
// the directory entry is installed only after the table has been filled
*new_page_dir = new_pg_table | PAGE_TABLE;
}
invalidate();
return 0;
}
/*
* a more complete version of free_page_tables which performs with page
* granularity.
*/
//释放一定范围的页表
int unmap_page_range(unsigned long from, unsigned long size)
{
unsigned long page, page_dir;
unsigned long *page_table, *dir;
unsigned long poff, pcnt, pc;
if (from & ~PAGE_MASK) {
printk("unmap_page_range called with wrong alignment
");
return -EINVAL;
}
//计算size有多少页
size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
//计算开始地址所在的页目录项的位置
dir = PAGE_DIR_OFFSET(current->tss.cr3,from);
//计算偏移
poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
//计算页数
if ((pcnt = PTRS_PER_PAGE - poff) > size)
pcnt = size;
//
for ( ; size > 0; ++dir, size -= pcnt,
pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size)) {
if (!(page_dir = *dir)) {
poff = 0;
continue;
}
if (!(page_dir & PAGE_PRESENT)) {
printk("unmap_page_range: bad page directory.");
continue;
}
//找到页表
page_table = (unsigned long *)(PAGE_MASK & page_dir);
if (poff) {
page_table += poff;
poff = 0;
}
//遍历释放
for (pc = pcnt; pc--; page_table++) {
if ((page = *page_table) != 0) {
*page_table = 0;
if (1 & page) {
if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED))
if (current->rss > 0)
--current->rss;
free_page(PAGE_MASK & page);
} else
swap_free(page);
}
}
if (pcnt == PTRS_PER_PAGE) {
*dir = 0;
free_page(PAGE_MASK & page_dir);
}
}
invalidate();
return 0;
}
//映射
int zeromap_page_range(unsigned long from, unsigned long size, int mask)
{
unsigned long *page_table, *dir;
unsigned long poff, pcnt;
unsigned long page;
if (mask) {
if ((mask & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT) {
printk("zeromap_page_range: mask = %08x
",mask);
return -EINVAL;
}
mask |= ZERO_PAGE;
}
if (from & ~PAGE_MASK) {
printk("zeromap_page_range: from = %08lx
",from);
return -EINVAL;
}
dir = PAGE_DIR_OFFSET(current->tss.cr3,from);
size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
if ((pcnt = PTRS_PER_PAGE - poff) > size)
pcnt = size;
while (size > 0) {
if (!(PAGE_PRESENT & *dir)) {
/* clear page needed here? SRB. */
if (!(page_table = (unsigned long*) get_free_page(GFP_KERNEL))) {
invalidate();
return -ENOMEM;
}
if (PAGE_PRESENT & *dir) {
free_page((unsigned long) page_table);
page_table = (unsigned long *)(PAGE_MASK & *dir++);
} else
*dir++ = ((unsigned long) page_table) | PAGE_TABLE;
} else
page_table = (unsigned long *)(PAGE_MASK & *dir++);
page_table += poff;
poff = 0;
for (size -= pcnt; pcnt-- ;) {
if ((page = *page_table) != 0) {
*page_table = 0;
if (page & PAGE_PRESENT) {
if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED))
if (current->rss > 0)
--current->rss;
free_page(PAGE_MASK & page);
} else
swap_free(page);
}
*page_table++ = mask;
}
pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size);
}
invalidate();
return 0;
}
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
* in null mappings (currently treated as "copy-on-access")
*/
//映射一段物理内存到请求页,旧的映射被移除,任何引用到不存在的页的源于
//空映射
//重新映射
int remap_page_range(unsigned long from, unsigned long to, unsigned long size, int mask)
{
unsigned long *page_table, *dir;
unsigned long poff, pcnt;
unsigned long page;
if (mask) {
if ((mask & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT) {
printk("remap_page_range: mask = %08x
",mask);
return -EINVAL;
}
}
if ((from & ~PAGE_MASK) || (to & ~PAGE_MASK)) {
printk("remap_page_range: from = %08lx, to=%08lx
",from,to);
return -EINVAL;
}
dir = PAGE_DIR_OFFSET(current->tss.cr3,from);
size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
if ((pcnt = PTRS_PER_PAGE - poff) > size)
pcnt = size;
while (size > 0) {
if (!(PAGE_PRESENT & *dir)) {
/* clearing page here, needed? SRB. */
if (!(page_table = (unsigned long*) get_free_page(GFP_KERNEL))) {
invalidate();
return -1;
}
*dir++ = ((unsigned long) page_table) | PAGE_TABLE;
}
else
page_table = (unsigned long *)(PAGE_MASK & *dir++);
if (poff) {
page_table += poff;
poff = 0;
}
for (size -= pcnt; pcnt-- ;) {
if ((page = *page_table) != 0) {
*page_table = 0;
if (PAGE_PRESENT & page) {
if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED))
if (current->rss > 0)
--current->rss;
free_page(PAGE_MASK & page);
} else
swap_free(page);
}
/*
* the first condition should return an invalid access
* when the page is referenced. current assumptions
* cause it to be treated as demand allocation in some
* cases.
*/
if (!mask)
*page_table++ = 0; /* not present */
else if (to >= high_memory)
*page_table++ = (to | mask);
else if (!mem_map[MAP_NR(to)])
*page_table++ = 0; /* not present */
else {
*page_table++ = (to | mask);
if (!(mem_map[MAP_NR(to)] & MAP_PAGE_RESERVED)) {
++current->rss;
mem_map[MAP_NR(to)]++;
}
}
to += PAGE_SIZE;
}
pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size);
}
invalidate();
return 0;
}
/*
* This function puts a page in memory at the wanted address.
* It returns the physical address of the page gotten, 0 if
* out of memory (either when trying to access page-table or
* page.)
*/
//放置一页
unsigned long put_page(struct task_struct * tsk,unsigned long page,
unsigned long address,int prot)
{
unsigned long *page_table;
if ((prot & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT)
printk("put_page: prot = %08x
",prot);
if (page >= high_memory) {
printk("put_page: trying to put page %08lx at %08lx
",page,address);
return 0;
}
page_table = PAGE_DIR_OFFSET(tsk->tss.cr3,address);
if ((*page_table) & PAGE_PRESENT)
page_table = (unsigned long *) (PAGE_MASK & *page_table);
else {
printk("put_page: bad page directory entry
");
oom(tsk);
*page_table = BAD_PAGETABLE | PAGE_TABLE;
return 0;
}
page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
if (*page_table) {
printk("put_page: page already exists
");
*page_table = 0;
invalidate();
}
*page_table = page | prot;
/* no need for invalidate */
return page;
}
/*
* The previous function doesn't work very well if you also want to mark
* the page dirty: exec.c wants this, as it has earlier changed the page,
* and we want the dirty-status to be correct (for VM). Thus the same
* routine, but this time we mark it dirty too.
*/
//放置脏页
unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
{
unsigned long tmp, *page_table;
if (page >= high_memory)
printk("put_dirty_page: trying to put page %08lx at %08lx
",page,address);
if (mem_map[MAP_NR(page)] != 1)
printk("mem_map disagrees with %08lx at %08lx
",page,address);
page_table = PAGE_DIR_OFFSET(tsk->tss.cr3,address);
if (PAGE_PRESENT & *page_table)
page_table = (unsigned long *) (PAGE_MASK & *page_table);
else {
if (!(tmp = get_free_page(GFP_KERNEL)))
return 0;
if (PAGE_PRESENT & *page_table) {
free_page(tmp);
page_table = (unsigned long *) (PAGE_MASK & *page_table);
} else {
*page_table = tmp | PAGE_TABLE;
page_table = (unsigned long *) tmp;
}
}
page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
if (*page_table) {
printk("put_dirty_page: page already exists
");
*page_table = 0;
invalidate();
}
*page_table = page | (PAGE_DIRTY | PAGE_PRIVATE);
/* no need for invalidate */
return page;
}
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
*
* Note that we do many checks twice (look at do_wp_page()), as
* we have to be careful about race-conditions.
*
* Goto-purists beware: the only reason for goto's here is that it results
* in better assembly code.. The "default" path will see no jumps at all.
*/
//写保护
static void __do_wp_page(unsigned long error_code, unsigned long address,
struct task_struct * tsk, unsigned long user_esp)
{
unsigned long *pde, pte, old_page, prot;
unsigned long new_page;
new_page = __get_free_page(GFP_KERNEL);
pde = PAGE_DIR_OFFSET(tsk->tss.cr3,address);
pte = *pde;
if (!(pte & PAGE_PRESENT))
goto end_wp_page;
if ((pte & PAGE_TABLE) != PAGE_TABLE || pte >= high_memory)
goto bad_wp_pagetable;
pte &= PAGE_MASK;
pte += PAGE_PTR(address);
old_page = *(unsigned long *) pte;
if (!(old_page & PAGE_PRESENT))
goto end_wp_page;
if (old_page >= high_memory)
goto bad_wp_page;
if (old_page & PAGE_RW)
goto end_wp_page;
tsk->min_flt++;
prot = (old_page & ~PAGE_MASK) | PAGE_RW;
old_page &= PAGE_MASK;
if (mem_map[MAP_NR(old_page)] != 1) {
if (new_page) {
if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED)
++tsk->rss;
copy_page(old_page,new_page);
*(unsigned long *) pte = new_page | prot;
free_page(old_page);
invalidate();
return;
}
free_page(old_page);
oom(tsk);
*(unsigned long *) pte = BAD_PAGE | prot;
invalidate();
return;
}
*(unsigned long *) pte |= PAGE_RW;
invalidate();
if (new_page)
free_page(new_page);
return;
bad_wp_page:
printk("do_wp_page: bogus page at address %08lx (%08lx)
",address,old_page);
*(unsigned long *) pte = BAD_PAGE | PAGE_SHARED;
send_sig(SIGKILL, tsk, 1);
goto end_wp_page;
bad_wp_pagetable:
printk("do_wp_page: bogus page-table at address %08lx (%08lx)
",address,pte);
*pde = BAD_PAGETABLE | PAGE_TABLE;
send_sig(SIGKILL, tsk, 1);
end_wp_page:
if (new_page)
free_page(new_page);
return;
}
/*
 * check that a page table change is actually needed, and call
 * the low-level function only in that case..
 *
 * error_code: fault error code, stored in the tss when SIGSEGV is sent.
 * address:    faulting virtual address.
 * tsk:        task whose page tables are examined.
 * user_esp:   non-zero for a fault raised from user mode (see
 *             do_page_fault()); __verify_write() passes 0.
 *
 * A write to a present, read-only page that is not PAGE_COW sends SIGSEGV
 * when the fault came from the running task in user mode. A page with a
 * mem_map count of 1 is simply made writable and dirty; anything else is
 * handed to __do_wp_page().
 */
void do_wp_page(unsigned long error_code, unsigned long address,
	struct task_struct * tsk, unsigned long user_esp)
{
	unsigned long page;
	unsigned long * pg_table;

	pg_table = PAGE_DIR_OFFSET(tsk->tss.cr3,address);
	page = *pg_table;
	if (!page)
		return;
	if ((page & PAGE_PRESENT) && page < high_memory) {
		pg_table = (unsigned long *) ((page & PAGE_MASK) + PAGE_PTR(address));
		page = *pg_table;
		if (!(page & PAGE_PRESENT))
			return;
		if (page & PAGE_RW)
			return;
		if (!(page & PAGE_COW)) {
			if (user_esp && tsk == current) {
				current->tss.cr2 = address;
				current->tss.error_code = error_code;
				current->tss.trap_no = 14;
				send_sig(SIGSEGV, tsk, 1);
				return;
			}
		}
		if (mem_map[MAP_NR(page)] == 1) {
			*pg_table |= PAGE_RW | PAGE_DIRTY;
			invalidate();
			return;
		}
		__do_wp_page(error_code, address, tsk, user_esp);
		return;
	}
	printk("bad page directory entry %08lx\n",page);
	*pg_table = 0;
}
/*
 * Run do_wp_page() on every page touched by the byte range
 * [start, start + size) in the address space of 'current'.
 * Always returns 0.
 */
int __verify_write(unsigned long start, unsigned long size)
{
	unsigned long addr = start & PAGE_MASK;
	/* index of the last page relative to the first one, plus one */
	unsigned long pages = ((size - 1 + (start & ~PAGE_MASK)) >> PAGE_SHIFT) + 1;

	for ( ; pages != 0 ; pages--, addr += PAGE_SIZE)
		do_wp_page(1,addr,current,0);
	return 0;
}
/*
 * Map a freshly allocated page at 'address' in task 'tsk' with
 * PAGE_PRIVATE protection. When no page is available the task goes
 * through oom() and BAD_PAGE is mapped instead. The page is handed back
 * to free_page() if put_page() refuses it.
 */
static inline void get_empty_page(struct task_struct * tsk, unsigned long address)
{
	unsigned long fresh = get_free_page(GFP_KERNEL);

	if (!fresh) {
		oom(tsk);
		fresh = BAD_PAGE;
	}
	if (put_page(tsk,fresh,address,PAGE_PRIVATE))
		return;
	free_page(fresh);
}
/*
* try_to_share() checks the page at address "address" in the task "p",
* to see if it exists, and if it is clean. If so, share it with the current
* task.
*
* NOTE! This assumes we have checked that p != current, and that they
* share the same executable or library.
*
* We may want to fix this to allow page sharing for PIC pages at different
* addresses so that ELF will really perform properly. As long as the vast
* majority of sharable libraries load at fixed addresses this is not a
* big concern. Any sharing of pages between the buffer cache and the
* code space reduces the need for this as well. - ERY
*
* address:    virtual address of the wanted page (same in both tasks).
* tsk:        task that receives the page.
* p:          task that may already have the page.
* error_code: fault error code; the PAGE_RW bit selects copy vs. share.
* newpage:    spare page supplied by the caller, or 0.
*
* Returns 1 when tsk's entry was filled, 0 otherwise.
*/
// Try to share a page with task p.
static int try_to_share(unsigned long address, struct task_struct * tsk,
struct task_struct * p, unsigned long error_code, unsigned long newpage)
{
unsigned long from;
unsigned long to;
unsigned long from_page;
unsigned long to_page;
/* both start out as the address of the page-directory entry */
from_page = (unsigned long)PAGE_DIR_OFFSET(p->tss.cr3,address);
to_page = (unsigned long)PAGE_DIR_OFFSET(tsk->tss.cr3,address);
/* is there a page-directory at from? */
from = *(unsigned long *) from_page;
if (!(from & PAGE_PRESENT))
return 0;
from &= PAGE_MASK;
/* from_page now addresses the page-table entry in p */
from_page = from + PAGE_PTR(address);
from = *(unsigned long *) from_page;
/* is the page clean and present? */
if ((from & (PAGE_PRESENT | PAGE_DIRTY)) != PAGE_PRESENT)
return 0;
if (from >= high_memory)
return 0;
if (mem_map[MAP_NR(from)] & MAP_PAGE_RESERVED)
return 0;
/* is the destination ok? */
to = *(unsigned long *) to_page;
if (!(to & PAGE_PRESENT))
return 0;
to &= PAGE_MASK;
/* to_page now addresses the page-table entry in tsk; it must be empty */
to_page = to + PAGE_PTR(address);
if (*(unsigned long *) to_page)
return 0;
/* share them if read - do COW immediately otherwise */
if (error_code & PAGE_RW) {
if(!newpage) /* did the page exist? SRB. */
return 0;
copy_page((from & PAGE_MASK),newpage);
to = newpage | PAGE_PRIVATE;
} else {
/* shared read-only: count the extra user and drop PAGE_RW in p as well */
mem_map[MAP_NR(from)]++;
from &= ~PAGE_RW;
to = from;
if(newpage) /* only if it existed. SRB. */
free_page(newpage);
}
*(unsigned long *) from_page = from;
*(unsigned long *) to_page = to;
invalidate();
return 1;
}
/*
 * share_page() tries to find a process that could share a page with
 * the current one. Address is the address of the wanted page relative
 * to the current data space.
 *
 * We first check if it is at all feasible by checking executable->i_count.
 * It should be >1 if there are other tasks sharing this inode.
 *
 * area:       mapping the fault belongs to. May be NULL, in which case
 *             only tasks running the same executable inode are tried.
 * tsk:        task that wants the page.
 * inode:      inode backing the page.
 * address, error_code, newpage: handed on to try_to_share().
 *
 * Returns 1 when try_to_share() succeeded for some task, 0 otherwise.
 */
int share_page(struct vm_area_struct * area, struct task_struct * tsk,
	struct inode * inode,
	unsigned long address, unsigned long error_code, unsigned long newpage)
{
	struct task_struct ** p;

	if (!inode || inode->i_count < 2)
		return 0;
	/* test 'area' before dereferencing it; the loop below allows NULL */
	if (area && !area->vm_ops)
		return 0;
	for (p = &LAST_TASK ; p > &FIRST_TASK ; --p) {
		if (!*p)
			continue;
		if (tsk == *p)
			continue;
		if (inode != (*p)->executable) {
			struct vm_area_struct * mpnt;

			if (!area)
				continue;
			/* Now see if there is something in the VMM that
			   we can share pages with */
			for (mpnt = (*p)->mmap; mpnt; mpnt = mpnt->vm_next) {
				if (mpnt->vm_ops == area->vm_ops &&
				    mpnt->vm_inode->i_ino == area->vm_inode->i_ino &&
				    mpnt->vm_inode->i_dev == area->vm_inode->i_dev) {
					if (mpnt->vm_ops->share(mpnt, area, address))
						break;
				}
			}
			if (!mpnt)
				continue;	/* Nope. Nuthin here */
		}
		if (try_to_share(address,tsk,*p,error_code,newpage))
			return 1;
	}
	return 0;
}
/*
 * fill in an empty page-table if none exists.
 *
 * tsk:     task whose page directory is consulted.
 * address: virtual address whose page table is wanted.
 *
 * Returns the page-directory entry (table address plus flag bits), or 0
 * after an allocation failure; in that case oom(current) has been called
 * and the entry points at BAD_PAGETABLE. The directory entry is looked up
 * and checked a second time after the allocation.
 */
static inline unsigned long get_empty_pgtable(struct task_struct * tsk,unsigned long address)
{
	unsigned long page;
	unsigned long *p;

	p = PAGE_DIR_OFFSET(tsk->tss.cr3,address);
	if (PAGE_PRESENT & *p)
		return *p;
	if (*p) {
		printk("get_empty_pgtable: bad page-directory entry\n");
		*p = 0;
	}
	page = get_free_page(GFP_KERNEL);
	p = PAGE_DIR_OFFSET(tsk->tss.cr3,address);
	if (PAGE_PRESENT & *p) {
		free_page(page);
		return *p;
	}
	if (*p) {
		printk("get_empty_pgtable: bad page-directory entry\n");
		*p = 0;
	}
	if (page) {
		*p = page | PAGE_TABLE;
		return *p;
	}
	/* NOTE(review): oom() is given 'current' here, not 'tsk' - confirm intended */
	oom(current);
	*p = BAD_PAGETABLE | PAGE_TABLE;
	return 0;
}
/*
* Handle a fault on a page that is not present.
*
* error_code: fault error code; bit 2 (value 4) is tested as "user level access".
* address:    faulting virtual address.
* tsk:        task whose address space is affected.
* user_esp:   not used in this function.
*/
void do_no_page(unsigned long error_code, unsigned long address,
struct task_struct *tsk, unsigned long user_esp)
{
unsigned long tmp;
unsigned long page;
struct vm_area_struct * mpnt;
/* make sure a page table exists; 0 means the allocation failed */
page = get_empty_pgtable(tsk,address);
if (!page)
return;
/* 'page' becomes the address of the page-table entry */
page &= PAGE_MASK;
page += PAGE_PTR(address);
tmp = *(unsigned long *) page;
/* already present: nothing to do */
if (tmp & PAGE_PRESENT)
return;
++tsk->rss;
/* a non-zero, non-present entry is handed to swap_in() */
if (tmp) {
++tsk->maj_flt;
swap_in((unsigned long *) page);
return;
}
address &= 0xfffff000;
tmp = 0;
/*
* Walk the task's mappings. The loop stops at the first area that starts
* above the address; 'tmp' keeps the end of the last area below it.
*/
for (mpnt = tsk->mmap; mpnt != NULL; mpnt = mpnt->vm_next) {
if (address < mpnt->vm_start)
break;
if (address >= mpnt->vm_end) {
tmp = mpnt->vm_end;
continue;
}
/* inside an area without a nopage handler: map an empty page */
if (!mpnt->vm_ops || !mpnt->vm_ops->nopage) {
++tsk->min_flt;
get_empty_page(tsk,address);
return;
}
mpnt->vm_ops->nopage(error_code, mpnt, address);
return;
}
/* faults taken on behalf of another task always get a page */
if (tsk != current)
goto ok_no_page;
/* between end_data and brk */
if (address >= tsk->end_data && address < tsk->brk)
goto ok_no_page;
/*
* The next area is the stack area, the address is nearer to its start than
* to the end of the previous area, and the stack limit allows it: move
* vm_start down to the faulting address.
*/
if (mpnt && mpnt == tsk->stk_vma &&
address - tmp > mpnt->vm_start - address &&
tsk->rlim[RLIMIT_STACK].rlim_cur > mpnt->vm_end - address) {
mpnt->vm_start = address;
goto ok_no_page;
}
/* here tsk == current, so tsk->tss and current->tss are the same object */
tsk->tss.cr2 = address;
current->tss.error_code = error_code;
current->tss.trap_no = 14;
send_sig(SIGSEGV,tsk,1);
if (error_code & 4) /* user level access? */
return;
/* non-user access falls through and still gets a page */
ok_no_page:
++tsk->min_flt;
get_empty_page(tsk,address);
}
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
*/
//页保护
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
unsigned long address;
unsigned long user_esp = 0;
unsigned int bit;
/* get the address */
__asm__("movl %%cr2,%0":"=r" (address));
if (address < TASK_SIZE) {
if (error_code & 4) { /* user mode access? */
if (regs->eflags & VM_MASK) {
bit = (address - 0xA0000) >> PAGE_SHIFT;
if (bit < 32)
current->screen_bitmap |= 1 << bit;
} else
user_esp = regs->esp;
}
if (error_code & 1)
do_wp_page(error_code, address, current, user_esp);
else
do_no_page(error_code, address, current, user_esp);
return;
}
address -= TASK_SIZE;
if (wp_works_ok < 0 && address == 0 && (error_code & PAGE_PRESENT)) {
wp_works_ok = 1;
pg0[0] = PAGE_SHARED;
printk("This processor honours the WP bit even when in supervisor mode. Good.
");
return;
}
if (address < PAGE_SIZE) {
printk("Unable to handle kernel NULL pointer dereference");
pg0[0] = PAGE_SHARED;
} else
printk("Unable to handle kernel paging request");
printk(" at address %08lx
",address);
die_if_kernel("Oops", regs, error_code);
do_exit(SIGKILL);
}
//----------------------------------------------------------------------------------------------------------
/*
* BAD_PAGE is the page that is used for page faults when linux
* is out-of-memory. Older versions of linux just did a
* do_exit(), but using this instead means there is less risk
* for a process dying in kernel mode, possibly leaving a inode
* unused etc..
*
* BAD_PAGETABLE is the accompanying page-table: it is initialized
* to point to BAD_PAGE entries.
*
* ZERO_PAGE is a special page that is used for zero-initialized
* data and COW.
*/
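// BAD_PAGE is the page used for page faults when linux is out of memory. Older versions simply exited the process;
// using this page instead lowers the risk of a process dying in kernel mode.
// BAD_PAGETABLE is the accompanying page table; it is initialized to point to BAD_PAGE entries.
// ZERO_PAGE is a special page used for zero-initialized data and COW (copy-on-write).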
// Fill empty_bad_page_table with PTRS_PER_PAGE copies of (BAD_PAGE + PAGE_TABLE)
// and return its address.
unsigned long __bad_pagetable(void)
{
extern char empty_bad_page_table[PAGE_SIZE];
__asm__ __volatile__("cld ; rep ; stosl": // store eax at [edi], advancing edi by 4 each time, ecx times
:"a" (BAD_PAGE + PAGE_TABLE), // eax: the value written into every entry
"D" ((long) empty_bad_page_table), // edi: start of the table being filled
"c" (PTRS_PER_PAGE) // ecx: number of entries
:"di","cx");
return (unsigned long) empty_bad_page_table;
}
// Zero-fill empty_bad_page (PTRS_PER_PAGE 32-bit stores of 0) and return its address.
unsigned long __bad_page(void)
{
extern char empty_bad_page[PAGE_SIZE];
__asm__ __volatile__("cld ; rep ; stosl":
:"a" (0),
"D" ((long) empty_bad_page),
"c" (PTRS_PER_PAGE)
:"di","cx");
return (unsigned long) empty_bad_page;
}
// Zero-fill empty_zero_page (PTRS_PER_PAGE 32-bit stores of 0) and return its address.
unsigned long __zero_page(void)
{
extern char empty_zero_page[PAGE_SIZE];
__asm__ __volatile__("cld ; rep ; stosl":
:"a" (0),
"D" ((long) empty_zero_page),
"c" (PTRS_PER_PAGE)
:"di","cx");
return (unsigned long) empty_zero_page;
}
//显示内存信息
void show_mem(void)
{
int i,free = 0,total = 0,reserved = 0;
int shared = 0;
printk("Mem-info:
");
printk("Free pages: %6dkB
",nr_free_pages<<(PAGE_SHIFT-10));
printk("Secondary pages: %6dkB
",nr_secondary_pages<<(PAGE_SHIFT-10));
printk("Free swap: %6dkB
",nr_swap_pages<<(PAGE_SHIFT-10));
i = high_memory >> PAGE_SHIFT;
while (i-- > 0) {
total++;
if (mem_map[i] & MAP_PAGE_RESERVED)
reserved++;
else if (!mem_map[i])
free++;
else
shared += mem_map[i]-1;
}
printk("%d pages of RAM
",total);
printk("%d free pages
",free);
printk("%d reserved pages
",reserved);
printk("%d pages shared
",shared);
show_buffers();
}
/*
* paging_init() sets up the page tables - note that the first 4MB are
* already mapped by head.S.
*
* This routines also unmaps the page at virtual kernel address 0, so
* that we can trap those pesky NULL-reference errors in the kernel.
*
* NOTE(review): no entry is cleared in this function itself; in this file
* pg0[0] is cleared at the end of mem_init().
*
* start_mem: first free address; page tables that are missing are taken
*            from here. The advanced value is returned.
* end_mem:   end of the memory to be mapped.
*/
// Initialise the page tables.
unsigned long paging_init(unsigned long start_mem, unsigned long end_mem)
{
unsigned long * pg_dir;
unsigned long * pg_table;
unsigned long tmp;
unsigned long address;
/*
* Physical page 0 is special; it's not touched by Linux since BIOS
* and SMM (for laptops with [34]86/SL chips) may need it. It is read
* and write protected to detect null pointer references in the
* kernel.
*/
#if 0
memset((void *) 0, 0, PAGE_SIZE);
#endif
// round the start of free memory up to a page boundary
start_mem = PAGE_ALIGN(start_mem);
address = 0;
// start at the first entry of the swapper page directory
pg_dir = swapper_pg_dir;
// one iteration per page table until end_mem is covered
while (address < end_mem) {
// the matching entry 768 slots further on
tmp = *(pg_dir + 768); /* at virtual addr 0xC0000000 */
// no page table there yet
if (!tmp) {
// take the next free page as the page table
tmp = start_mem | PAGE_TABLE;
// install it in the directory
*(pg_dir + 768) = tmp;
// that page is now used up
start_mem += PAGE_SIZE;
}
// the same table is entered in the low slot as well
*pg_dir = tmp; /* also map it in at 0x0000000 for init */
// next directory entry
pg_dir++;
// address of the page table
pg_table = (unsigned long *) (tmp & PAGE_MASK);
// fill in every entry of the table
for (tmp = 0 ; tmp < PTRS_PER_PAGE ; tmp++,pg_table++) {
// addresses below end_mem are mapped to themselves, the rest is left empty
if (address < end_mem)
*pg_table = address | PAGE_SHARED;
else
*pg_table = 0;
// next page
address += PAGE_SIZE;
}
}
invalidate();
return start_mem;
}
/*
 * Set up mem_map and the free-page list, print the memory summary and
 * probe whether the WP bit is honoured in supervisor mode.
 *
 * start_low_mem: first usable address in low memory; pages from here up
 *                to 0xA0000 are marked free.
 * start_mem:     first usable address in main memory; mem_map is placed
 *                here (16-byte aligned) and the pages after it, up to
 *                end_mem, are marked free.
 * end_mem:       end of memory; rounded down to a page boundary and
 *                stored in high_memory.
 */
void mem_init(unsigned long start_low_mem,
	      unsigned long start_mem, unsigned long end_mem)
{
	int codepages = 0;
	int reservedpages = 0;
	int datapages = 0;
	unsigned long tmp;
	unsigned short * p;
	extern int etext;

	cli();
	end_mem &= PAGE_MASK;
	high_memory = end_mem;
	/* align the map to 16 bytes */
	start_mem += 0x0000000f;
	start_mem &= ~0x0000000f;
	/* one unsigned short per page below end_mem */
	tmp = MAP_NR(end_mem);
	mem_map = (unsigned short *) start_mem;
	p = mem_map + tmp;
	start_mem = (unsigned long) p;
	/* everything starts out reserved ... */
	while (p > mem_map)
		*--p = MAP_PAGE_RESERVED;
	/* ... then the two usable ranges are opened up */
	start_low_mem = PAGE_ALIGN(start_low_mem);
	start_mem = PAGE_ALIGN(start_mem);
	while (start_low_mem < 0xA0000) {
		mem_map[MAP_NR(start_low_mem)] = 0;
		start_low_mem += PAGE_SIZE;
	}
	while (start_mem < end_mem) {
		mem_map[MAP_NR(start_mem)] = 0;
		start_mem += PAGE_SIZE;
	}
#ifdef CONFIG_SOUND
	sound_mem_init();
#endif
	free_page_list = 0;
	nr_free_pages = 0;
	for (tmp = 0 ; tmp < end_mem ; tmp += PAGE_SIZE) {
		if (mem_map[MAP_NR(tmp)]) {
			if (tmp >= 0xA0000 && tmp < 0x100000)
				reservedpages++;
			else if (tmp < (unsigned long) &etext)
				codepages++;
			else
				datapages++;
			continue;
		}
		/* free page: its first word links to the previous list head */
		*(unsigned long *) tmp = free_page_list;
		free_page_list = tmp;
		nr_free_pages++;
	}
	tmp = nr_free_pages << PAGE_SHIFT;
	printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data)\n",
		tmp >> 10,
		end_mem >> 10,
		codepages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datapages << (PAGE_SHIFT-10));
	/* test if the WP bit is honoured in supervisor mode */
	wp_works_ok = -1;
	pg0[0] = PAGE_READONLY;
	invalidate();
	/* write to address 0; do_page_fault() sets wp_works_ok if this faults */
	__asm__ __volatile__("movb 0,%%al ; movb %%al,0": : :"ax", "memory");
	pg0[0] = 0;
	invalidate();
	if (wp_works_ok < 0)
		wp_works_ok = 0;
	return;
}
/*
 * Fill in the memory fields of *val from mem_map.
 *
 * Reserved pages are skipped. Every other page counts towards totalram;
 * a page with a zero count is free, otherwise its count minus one is
 * added to sharedram. The three totals are converted from pages to bytes
 * at the end; bufferram is copied from buffermem.
 */
void si_meminfo(struct sysinfo *val)
{
	int page_nr;

	val->totalram = 0;
	val->freeram = 0;
	val->sharedram = 0;
	val->bufferram = buffermem;

	/* walk from the last page down to page 0 */
	for (page_nr = high_memory >> PAGE_SHIFT ; page_nr-- > 0 ; ) {
		if (mem_map[page_nr] & MAP_PAGE_RESERVED)
			continue;
		val->totalram++;
		if (mem_map[page_nr])
			val->sharedram += mem_map[page_nr]-1;
		else
			val->freeram++;
	}

	val->totalram <<= PAGE_SHIFT;
	val->freeram <<= PAGE_SHIFT;
	val->sharedram <<= PAGE_SHIFT;
	return;
}
/* This handles a generic mmap of a disk file */
/*
* error_code: fault error code; the PAGE_RW bit marks a write access.
* area:       the mapping that contains the faulting address.
* address:    faulting virtual address (rounded down to a page here).
*
* The page is first looked for in another task via share_page(); if that
* fails it is read from the file's blocks and mapped with put_page().
*/
void file_mmap_nopage(int error_code, struct vm_area_struct * area, unsigned long address)
{
struct inode * inode = area->vm_inode;
unsigned int block;
unsigned long page;
int nr[8];
int i, j;
int prot = area->vm_page_prot;
address &= PAGE_MASK;
/* byte offset into the file, converted to a block number */
block = address - area->vm_start + area->vm_offset;
block >>= inode->i_sb->s_blocksize_bits;
/* the result is checked only after the share attempt, which accepts 0 */
page = get_free_page(GFP_KERNEL);
if (share_page(area, area->vm_task, inode, address, error_code, page)) {
++area->vm_task->min_flt;
return;
}
++area->vm_task->maj_flt;
if (!page) {
oom(current);
put_page(area->vm_task, BAD_PAGE, address, PAGE_PRIVATE);
return;
}
/* collect the block numbers that make up one page (nr[] has room for 8) */
for (i=0, j=0; i< PAGE_SIZE ; j++, block++, i += inode->i_sb->s_blocksize)
nr[j] = bmap(inode,block);
if (error_code & PAGE_RW)
prot |= PAGE_RW | PAGE_DIRTY;
page = bread_page(page, inode->i_dev, nr, inode->i_sb->s_blocksize, prot);
/* read-only result: try sharing once more now that the read is done */
if (!(prot & PAGE_RW)) {
if (share_page(area, area->vm_task, inode, address, error_code, page))
return;
}
if (put_page(area->vm_task,page,address,prot))
return;
/* put_page() refused the page */
free_page(page);
oom(current);
}
/*
 * Close hook of the file_mmap operations: hand the inode of the mapping,
 * if it has one, back with iput().
 */
void file_mmap_free(struct vm_area_struct * area)
{
	if (area->vm_inode)
		iput(area->vm_inode);
#if 0
	if (area->vm_inode)
		printk("Free inode %x:%d (%d)\n",area->vm_inode->i_dev,
			area->vm_inode->i_ino, area->vm_inode->i_count);
#endif
}
/*
 * Compare the contents of the mmap entries, and decide if we are allowed to
 * share the pages
 *
 * Returns 1 only when both areas agree on inode, start, end, offset and
 * page protection; 0 otherwise. 'address' is not consulted.
 */
int file_mmap_share(struct vm_area_struct * area1,
	struct vm_area_struct * area2,
	unsigned long address)
{
	return area1->vm_inode == area2->vm_inode &&
	       area1->vm_start == area2->vm_start &&
	       area1->vm_end == area2->vm_end &&
	       area1->vm_offset == area2->vm_offset &&
	       area1->vm_page_prot == area2->vm_page_prot;
}
// Operations table for generic file mappings; slots without a handler are NULL.
struct vm_operations_struct file_mmap = {
NULL, /* open */
file_mmap_free, /* close */
file_mmap_nopage, /* nopage */
NULL, /* wppage */
file_mmap_share, /* share */
NULL, /* unmap */
};