• 《Linux 内核完全剖析》sched.c sched.h 代码分析笔记


    版权声明:本文为博主原创文章。未经博主同意不得转载。

    https://blog.csdn.net/u011368821/article/details/25129835

    sched.c sched.h 代码分析笔记

    首先上header file

    sched.h

    
    
    #ifndef _SCHED_H
    #define _SCHED_H
    
    #define HZ 100
    
    #define NR_TASKS    64
    #define TASK_SIZE    0x04000000
    #define LIBRARY_SIZE    0x00400000
    
    #if (TASK_SIZE & 0x3fffff)
    #error "TASK_SIZE must be multiple of 4M"
    #endif
    
    #if (LIBRARY_SIZE & 0x3fffff)
    #error "LIBRARY_SIZE must be a multiple of 4M"
    #endif
    
    #if (LIBRARY_SIZE >= (TASK_SIZE/2))
    #error "LIBRARY_SIZE too damn big!"
    #endif
    
    #if (((TASK_SIZE>>16)*NR_TASKS) != 0x10000)
    #error "TASK_SIZE*NR_TASKS must be 4GB"
    #endif
    
    #define LIBRARY_OFFSET (TASK_SIZE - LIBRARY_SIZE)
    
    #define CT_TO_SECS(x)    ((x) / HZ)
    #define CT_TO_USECS(x)    (((x) % HZ) * 1000000/HZ)
    
    #define FIRST_TASK task[0]
    #define LAST_TASK task[NR_TASKS-1]
    
    #include <linux/head.h>
    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <sys/param.h>
    #include <sys/time.h>
    #include <sys/resource.h>
    #include <signal.h>
    
    #if (NR_OPEN > 32)
    #error "Currently the close-on-exec-flags and select masks are in one long, max 32 files/proc"
    #endif
    
    #define TASK_RUNNING        0
    #define TASK_INTERRUPTIBLE    1
    #define TASK_UNINTERRUPTIBLE    2
    #define TASK_ZOMBIE        3
    #define TASK_STOPPED        4
    
    #ifndef NULL
    #define NULL ((void *) 0)
    #endif
    
    extern int copy_page_tables(unsigned long from, unsigned long to, long size);
    extern int free_page_tables(unsigned long from, unsigned long size);
    
    extern void sched_init(void);
    extern void schedule(void);
    extern void trap_init(void);
    extern void panic(const char * str);
    extern int tty_write(unsigned minor,char * buf,int count);
    
    typedef int (*fn_ptr)();
    
    /*
     * Save area for the math co-processor state of a task; embedded at the
     * end of struct tss_struct.  Seven longs of control/status information
     * are followed by 20 longs (80 bytes) of register storage.
     * NOTE(review): the field abbreviations presumably follow the 80387
     * FSAVE image (control word, status word, tag word, instruction
     * pointer/selector, operand offset/selector) - confirm against the
     * CPU manual.
     */
    struct i387_struct {
        long    cwd;
        long    swd;
        long    twd;
        long    fip;
        long    fcs;
        long    foo;
        long    fos;
        long    st_space[20];    /* 8*10 bytes for each FP-reg = 80 bytes */
    };
    
    /*
     * Per-task state segment image.  One instance lives in every
     * struct task_struct (member "tss"); INIT_TASK fills it in for task 0
     * and sched_init() installs a descriptor pointing at it.
     * Fields commented "16 high bits zero" hold 16-bit values stored in a
     * 32-bit slot.  The co-processor save area is appended after the
     * register fields.
     */
    struct tss_struct {
        long    back_link;    /* 16 high bits zero */
        long    esp0;         /* INIT_TASK: PAGE_SIZE+(long)&init_task */
        long    ss0;        /* 16 high bits zero */
        long    esp1;
        long    ss1;        /* 16 high bits zero */
        long    esp2;
        long    ss2;        /* 16 high bits zero */
        long    cr3;          /* INIT_TASK: (long)&pg_dir */
        long    eip;
        long    eflags;
        long    eax,ecx,edx,ebx;
        long    esp;
        long    ebp;
        long    esi;
        long    edi;
        long    es;        /* 16 high bits zero */
        long    cs;        /* 16 high bits zero */
        long    ss;        /* 16 high bits zero */
        long    ds;        /* 16 high bits zero */
        long    fs;        /* 16 high bits zero */
        long    gs;        /* 16 high bits zero */
        long    ldt;        /* 16 high bits zero */
        long    trace_bitmap;    /* bits: trace 0, bitmap 16-31 */
        struct i387_struct i387;    /* co-processor save area */
    };
    
    /*
     * Per-process descriptor.  show_task() treats the bytes that follow this
     * structure, up to 4096 bytes from its start, as the task's kernel
     * stack.  The order of the members must match the positional
     * initializer in INIT_TASK.
     */
    struct task_struct {
    /* these are hardcoded - don't touch */
        long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
        long counter;    /* schedule() runs the TASK_RUNNING task with the largest counter */
        long priority;    /* schedule() refills: counter = counter/2 + priority */
        long signal;    /* bitmap of pending signals, bit (signr-1) */
        struct sigaction sigaction[32];
        long blocked;    /* bitmap of masked signals */
    /* various fields */
        int exit_code;
        unsigned long start_code,end_code,end_data,brk,start_stack;
        long pid,pgrp,session,leader;
        int    groups[NGROUPS];
        /*
         * pointers to parent process, youngest child, younger sibling,
         * older sibling, respectively.  (p->father can be replaced with
         * p->p_pptr->pid)
         */
        struct task_struct    *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
        unsigned short uid,euid,suid;
        unsigned short gid,egid,sgid;
        unsigned long timeout,alarm;    /* compared against jiffies in schedule(); 0 = not armed */
        long utime,stime,cutime,cstime,start_time;
        struct rlimit rlim[RLIM_NLIMITS];
        unsigned int flags;    /* per process flags, defined below */
        unsigned short used_math;
    /* file system info */
        int tty;        /* -1 if no tty, so it must be signed */
        unsigned short umask;
        struct m_inode * pwd;
        struct m_inode * root;
        struct m_inode * executable;
        struct m_inode * library;
        unsigned long close_on_exec;    /* one bit per open file, hence the NR_OPEN <= 32 check above */
        struct file * filp[NR_OPEN];
    /* ldt for this task 0 - zero 1 - cs 2 - ds&ss */
        struct desc_struct ldt[3];
    /* tss for this task */
        struct tss_struct tss;
    };
    
    /*
     * Per process flags
     */
    #define PF_ALIGNWARN    0x00000001    /* Print alignment warning msgs */
                        /* Not implemented yet, only for 486*/
    
    /*
     *  INIT_TASK is used to set up the first task table, touch at
     * your own risk!. Base=0, limit=0x9ffff (=640kB)
     *
     * The expansion is a positional initializer for struct task_struct, so
     * the values below must stay in exactly the same order as the members
     * of that structure.  Every line of the macro except the last ends in a
     * backslash; without the continuations the macro would expand to
     * nothing and the initializer would be parsed as stray file-scope
     * tokens.
     */
    #define INIT_TASK \
    /* state etc */    { 0,15,15, \
    /* signals */    0,{{},},0, \
    /* ec,brk... */    0,0,0,0,0,0, \
    /* pid etc.. */    0,0,0,0, \
    /* suppl grps*/ {NOGROUP,}, \
    /* proc links*/ &init_task.task,0,0,0, \
    /* uid etc */    0,0,0,0,0,0, \
    /* timeout */    0,0,0,0,0,0,0, \
    /* rlimits */   { {0x7fffffff, 0x7fffffff}, {0x7fffffff, 0x7fffffff},  \
              {0x7fffffff, 0x7fffffff}, {0x7fffffff, 0x7fffffff}, \
              {0x7fffffff, 0x7fffffff}, {0x7fffffff, 0x7fffffff}}, \
    /* flags */    0, \
    /* math */    0, \
    /* fs info */    -1,0022,NULL,NULL,NULL,NULL,0, \
    /* filp */    {NULL,}, \
        { \
            {0,0}, \
    /* ldt */    {0x9f,0xc0fa00}, \
            {0x9f,0xc0f200}, \
        }, \
    /*tss*/    {0,PAGE_SIZE+(long)&init_task,0x10,0,0,0,0,(long)&pg_dir,\
         0,0,0,0,0,0,0,0, \
         0,0,0x17,0x17,0x17,0x17,0x17,0x17, \
         _LDT(0),0x80000000, \
            {} \
        }, \
    }
    
    extern struct task_struct *task[NR_TASKS];
    extern struct task_struct *last_task_used_math;
    extern struct task_struct *current;
    extern unsigned long volatile jiffies;
    extern unsigned long startup_time;
    extern int jiffies_offset;
    
    #define CURRENT_TIME (startup_time+(jiffies+jiffies_offset)/HZ)
    
    extern void add_timer(long jiffies, void (*fn)(void));
    extern void sleep_on(struct task_struct ** p);
    extern void interruptible_sleep_on(struct task_struct ** p);
    extern void wake_up(struct task_struct ** p);
    extern int in_group_p(gid_t grp);
    
    /*
     * Entry into gdt where to find first TSS. 0-nul, 1-cs, 2-ds, 3-syscall
     * 4-TSS0, 5-LDT0, 6-TSS1 etc ...
     *
     * Each descriptor is 8 bytes, hence the "<<3" that turns an entry index
     * into a byte offset.  Every task owns two consecutive entries (TSS then
     * LDT), hence the "<<4" (16 bytes) per task number n.
     * ltr(n)/lldt(n) load the resulting value for task n with the "ltr" and
     * "lldt" instructions.
     */
    #define FIRST_TSS_ENTRY 4
    #define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1)
    #define _TSS(n) ((((unsigned long) n)<<4)+(FIRST_TSS_ENTRY<<3))
    #define _LDT(n) ((((unsigned long) n)<<4)+(FIRST_LDT_ENTRY<<3))
    #define ltr(n) __asm__("ltr %%ax"::"a" (_TSS(n)))
    #define lldt(n) __asm__("lldt %%ax"::"a" (_LDT(n)))
    /*
     * str(n) - store the number of the currently running task into n.
     * Reads the task register with "str", subtracts the byte offset of the
     * first TSS entry (FIRST_TSS_ENTRY<<3) and divides by 16, i.e. the
     * inverse of _TSS(n).  eax is preloaded with 0 because "str %ax" only
     * writes the low 16 bits.
     */
    #define str(n) \
    __asm__("str %%ax\n\t" \
        "subl %2,%%eax\n\t" \
        "shrl $4,%%eax" \
        :"=a" (n) \
        :"a" (0),"i" (FIRST_TSS_ENTRY<<3))
    /*
     *    switch_to(n) should switch tasks to task nr n, first
     * checking that n isn't the current task, in which case it does nothing.
     * This also clears the TS-flag if the task we switched to has used
     * the math co-processor latest.
     *
     * Operands: %0 = __tmp.a, %1 = __tmp.b, edx = _TSS(n), ecx = task[n].
     * The selector is stored into __tmp.b and "ljmp %0" jumps through the
     * a/b pair; the instructions after the ljmp run when the task that
     * executed switch_to() is scheduled again.
     */
    #define switch_to(n) {\
    struct {long a,b;} __tmp; \
    __asm__("cmpl %%ecx,_current\n\t" \
        "je 1f\n\t" \
        "movw %%dx,%1\n\t" \
        "xchgl %%ecx,_current\n\t" \
        "ljmp %0\n\t" \
        "cmpl %%ecx,_last_task_used_math\n\t" \
        "jne 1f\n\t" \
        "clts\n" \
        "1:" \
        ::"m" (*&__tmp.a),"m" (*&__tmp.b), \
        "d" (_TSS(n)),"c" ((long) task[n])); \
    }
    
    #define PAGE_ALIGN(n) (((n)+0xfff)&0xfffff000)
    
    /*
     * _set_base(addr,base) - write a 32-bit base address into the 8-byte
     * descriptor at byte pointer addr.  The base is scattered over three
     * places: its low 16 bits go to bytes 2-3, bits 16-23 to byte 4 and
     * bits 24-31 to byte 7.  edx is rotated in the process, hence the
     * "dx" clobber.
     */
    #define _set_base(addr,base) \
    __asm__("movw %%dx,%0\n\t" \
        "rorl $16,%%edx\n\t" \
        "movb %%dl,%1\n\t" \
        "movb %%dh,%2" \
        ::"m" (*((addr)+2)), \
          "m" (*((addr)+4)), \
          "m" (*((addr)+7)), \
          "d" (base) \
        :"dx")
    
    /*
     * _set_limit(addr,limit) - write a segment limit into the 8-byte
     * descriptor at byte pointer addr.  The low 16 bits of limit go to
     * bytes 0-1; the next bits are OR-ed into byte 6 after that byte's high
     * nibble has been preserved with "andb $0xf0".  The caller is expected
     * to pass a limit whose bits above bit 19 are zero (set_limit() passes
     * (limit-1)>>12).
     */
    #define _set_limit(addr,limit) \
    __asm__("movw %%dx,%0\n\t" \
        "rorl $16,%%edx\n\t" \
        "movb %1,%%dh\n\t" \
        "andb $0xf0,%%dh\n\t" \
        "orb %%dh,%%dl\n\t" \
        "movb %%dl,%1" \
        ::"m" (*(addr)), \
          "m" (*((addr)+6)), \
          "d" (limit) \
        :"dx")
    
    #define set_base(ldt,base) _set_base( ((char *)&(ldt)) , base )
    #define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , (limit-1)>>12 )
    
    /*
     * _get_base(addr) - inverse of _set_base(): reassemble the 32-bit base
     * address from bytes 7, 4 and 2-3 of the descriptor at byte pointer addr
     * and yield it as the value of the statement expression.
     */
    #define _get_base(addr) ({\
    unsigned long __base; \
    __asm__("movb %3,%%dh\n\t" \
        "movb %2,%%dl\n\t" \
        "shll $16,%%edx\n\t" \
        "movw %1,%%dx" \
        :"=d" (__base) \
        :"m" (*((addr)+2)), \
         "m" (*((addr)+4)), \
         "m" (*((addr)+7))); \
    __base;})
    
    #define get_base(ldt) _get_base( ((char *)&(ldt)) )
    
    /*
     * get_limit(segment) - yield the size in bytes of the segment named by
     * the given selector: the "lsll" instruction loads the segment limit
     * and "incl" turns the limit (last valid offset) into a length.
     */
    #define get_limit(segment) ({ \
    unsigned long __limit; \
    __asm__("lsll %1,%0\n\tincl %0":"=r" (__limit):"r" (segment)); \
    __limit;})
    
    #endif


    对于_TSS 和 _LDT两个宏定义

    #define FIRST_TSS_ENTRY 4
    #define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1)
    #define _TSS(n) ((((unsigned long) n)<<4)+(FIRST_TSS_ENTRY<<3))
    #define _LDT(n) ((((unsigned long) n)<<4)+(FIRST_LDT_ENTRY<<3))
    TSS0 是 GDT 中的第 5 个描述符(索引为 4,索引 0 是空描述符),每个描述符占 8 字节,于是 TSS0 在 GDT 中的字节偏移为 FIRST_TSS_ENTRY<<3。每个任务在 GDT 中占用 TSS 和 LDT 两个描述符(共 16 字节),所以任务 n 的偏移还要再加上 n<<4。








                每一个任务都有两个堆栈,分别用于用户态和内核态程序的执行,并且分别称为用户态堆栈和内核态堆栈,它们处于不同的CPU特权级中。这两个堆栈之间的主要区别在于任务的内核态堆栈非常小,所能保存的数据量最多不能超过(4096 - 任务数据结构大小)个字节,大约为3KB。而任务的用户态堆栈却可以在用户的64MB空间内延伸。




    show_task

    
    
    /*
     * show_task() - print a debugging summary of one task via printk().
     *
     * nr: slot number of the task in the task[] array (only printed).
     * p:  the task to describe; p->p_pptr is dereferenced unconditionally.
     *
     * The task structure and its kernel stack share one 4096-byte area, so
     * the bytes between the end of *p and the end of that area are the
     * kernel stack.
     */
    void show_task(int nr,struct task_struct * p)
    {
        /* j = number of bytes available to the kernel stack */
        int i,j = 4096-sizeof(struct task_struct);

        printk("%d: pid=%d, state=%d, father=%d, child=%d, ",nr,p->pid,
            p->state, p->p_pptr->pid, p->p_cptr ? p->p_cptr->pid : -1);
        /*
         * Count the leading zero bytes right after the task structure:
         * an estimate of how much of the kernel stack was never written.
         */
        i=0;
        while (i<j && !((char *)(p+1))[i])
            i++;
        printk("%d/%d chars free in kstack\n\r",i,j);
        /*
         * Long index 1019 is the fifth long from the top of the 4096-byte
         * area, i.e. inside the kernel stack, not inside p->tss.
         * NOTE(review): presumably the user-mode EIP pushed on kernel entry
         * (ss, esp, eflags, cs, eip from the top down) - confirm.
         */
        printk("   PC=%08X.", *(1019 + (unsigned long *) p));
        if (p->p_ysptr || p->p_osptr)
            printk("   Younger sib=%d, older sib=%d\n\r",
                p->p_ysptr ? p->p_ysptr->pid : -1,
                p->p_osptr ? p->p_osptr->pid : -1);
        else
            printk("\n\r");
    }

    关于show_task讨论的一些帖子

    http://www.oldlinux.org/oldlinux/viewthread.php?tid=12182

    http://www.oldlinux.org/oldlinux/viewthread.php?tid=14683


    show_state

    
    
    //调用show_task,打印全部非空进程的信息
    void show_state(void)
    {
        int i;
    
        printk("
    Task-info:
    
    ");
        for (i=0;i<NR_TASKS;i++)
            if (task[i])//扫描task数组。非空即打印相应task[i]进程相关信息
                show_task(i,task[i]);
    }
    
    



                  在内核中的调度程序用于选择系统中下一个要执行的进程。

    这样的选择执行机制是多任务操作系统的基础。调度程序可以看作是在所有处于运行状态的进程之间分配CPU执行时间的管理代码。

    Linux进程是抢占式的。但被抢占的进程仍处于TASK_RUNNING状态,只是暂时没有被CPU执行。进程的抢占发生在进程处于用户态执行阶段,在内核态执行时是不能被抢占的。(0.12内核不支持内核态抢占,现在的内核似乎已经支持了)


                 schedule()函数首先扫描任务数组,通过比较每个就绪状态任务的剩余运行时间滴答计数counter的值,来确定当前哪个进程运行的时间最少:哪个进程的counter值最大,就表示它已经运行的时间最短,于是选中该进程,并使用任务切换宏switch_to()切换到该进程执行。

    schedule()

    
    
    /*
     * schedule() - pick the next task to run and switch to it.
     *
     * Pass 1 walks task[NR_TASKS-1] down to task[1] and wakes
     * TASK_INTERRUPTIBLE tasks whose timeout expired or that have a
     * deliverable signal; an expired alarm raises SIGALRM.
     * Pass 2 selects the TASK_RUNNING task with the largest counter
     * (again task[0] is never examined).  If every runnable task has
     * counter == 0, all counters are recomputed and the selection repeats.
     * If no task is runnable at all, next stays 0 and task 0 is run.
     */
    void schedule(void)
    {
    	int i,next,c;
    	struct task_struct ** p;
    
    /* check alarm, wake up any interruptible tasks that have got a signal */
    
    	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)/* scan backwards from the last slot; slot 0 is excluded */
    		if (*p) {/* slot is in use */
    			if ((*p)->timeout && (*p)->timeout < jiffies) {/* timeout is an absolute tick value; jiffies has passed it, so it expired */
    				(*p)->timeout = 0;
    				/* an expired timeout wakes the task only if it sleeps interruptibly */
    				if ((*p)->state == TASK_INTERRUPTIBLE)
    					(*p)->state = TASK_RUNNING;
    			}
    			if ((*p)->alarm && (*p)->alarm < jiffies) { /* alarm time has passed: post SIGALRM and disarm */
    				(*p)->signal |= (1<<(SIGALRM-1));
    				(*p)->alarm = 0;
    			}
    			/*
    			 * Wake an interruptible sleeper if a pending signal is
    			 * not blocked.  NOTE(review): _BLOCKABLE is defined
    			 * elsewhere; presumably every signal except SIGKILL and
    			 * SIGSTOP, so those two cannot be masked - confirm.
    			 */
    			if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
    			(*p)->state==TASK_INTERRUPTIBLE)
    				(*p)->state=TASK_RUNNING;
    		}
     
    /* this is the scheduler proper: */
    
    	while (1) {
    		c = -1;
    		next = 0;
    		i = NR_TASKS;
    		p = &task[NR_TASKS];
    		while (--i) {/* i runs NR_TASKS-1 .. 1; remember the runnable task with the largest counter */
    			if (!*--p)/* empty slot */
    				continue;
    			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
    			/* counter is compared against the best value seen so far; TASK_RUNNING also covers tasks that are ready but not on the CPU */
    				c = (*p)->counter, next = i;
    		}
    		if (c) break;/* c > 0: a task with time left was found; c == -1: nothing runnable, next == 0 */
    		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)/* c == 0: every runnable task used up its slice */
    			if (*p)
    				(*p)->counter = ((*p)->counter >> 1) +
    						(*p)->priority; /* refill all tasks, sleeping ones too: counter = counter/2 + priority */
    	}
    	switch_to(next);/* hand the CPU to task number next */
    }
    



           每当选择出一个新的能够执行的进程时,switch_to()宏执行实际进程切换操作。

    该宏会把CPU的当前进程状态(context)切换成新进程的状态。

           在切换之前,switch_to首先检查要切换到的进程是否就是当前进程。如果是,就什么也不做,直接退出。如果不是,就把内核全局变量current置为新任务的指针,然后用ljmp长跳转到由新任务的TSS段选择符组成的地址处,造成CPU执行任务切换操作。此时,CPU会把其全部寄存器的状态保存到当前任务寄存器TR中的TSS段选择符所指向的当前进程任务数据结构的tss结构中,然后把新任务TSS段选择符所指向的新任务数据结构中tss结构里的寄存器信息恢复到CPU中,系统就正式开始执行新切换的任务。



    switch_to


    
    
    /*
     * Annotated copy of switch_to(n).  Block comments are used instead of
     * "//" because a line comment inside a backslash-continued macro would
     * swallow the rest of the definition.
     */
    #define switch_to(n) {\
    struct {long a,b;} __tmp; \
    __asm__("cmpl %%ecx,_current\n\t"    /* is task n already the current task? */ \
        "je 1f\n\t"                      /* yes: nothing to do, leave */ \
        "movw %%dx,%1\n\t"               /* store the 16-bit TSS selector of task n in __tmp.b */ \
        "xchgl %%ecx,_current\n\t"       /* current = task[n]; ecx = the task being switched out */ \
        "ljmp %0\n\t"                    /* far jump through __tmp: this performs the task switch */ \
        /* Execution continues here only when the switched-out task is    */ \
        /* scheduled again: its saved EIP points at the next instruction. */ \
        "cmpl %%ecx,_last_task_used_math\n\t" /* was this task the last one to use the co-processor? */ \
        "jne 1f\n\t"                     /* no: leave the TS flag alone */ \
        "clts\n"                         /* yes: clear the TS flag */ \
        "1:" \
        ::"m" (*&__tmp.a),"m" (*&__tmp.b), \
        "d" (_TSS(n)),"c" ((long) task[n])); \
    }

    为什么会执行这句话

    
    
    cmpl %%ecx,_last_task_used_math

    既然任务切换时CPU会恢复寄存器现场,那么它当然也会保存寄存器现场了。

    这些寄存器现场都会被写入原任务的tss结构里。值得注意的是,EIP会指向引起任务切换的指令ljmp的下一条指令cmpl,所以很明显,当原任务有朝一日再次被调度执行时,它将从EIP所指的地方继续执行,而这个地方恰好就是cmpl指令!
    比較实用的一个帖子:

    http://www.oldlinux.org/oldlinux/archiver/?tid-5390.html





    sys_pause

    
    
    /*
     * sys_pause() - mark the current task TASK_INTERRUPTIBLE and call the
     * scheduler.  Always returns 0 once the task runs again.
     */
    int sys_pause(void)
    {
        current->state = TASK_INTERRUPTIBLE;
        schedule();
        return 0;
    }



                                                 

    __sleep_on

    
    
    static inline void __sleep_on(struct task_struct **p, int state)
    //看的时候一定要记住,这个_sleep_on 的作用就是把当前进程正等待资源响应或者不在内存时先让他schedule一下,
    //让别的程序先执行一段时间的,
    //等到自己等待的资源响应之后,这个时候才跳过if推断,执行后面的语句
    {
        struct task_struct *tmp;
    
        if (!p)//常规检查p 为0的时候直接返回
            return;
        if (current == &(init_task.task)) //假设当前进程是
            panic("task[0] trying to sleep");
        tmp = *p;// tmp 指向原等待队列的头指针
        *p = current; //*p 指向等待队列的头指针,把current放入等待队列
        current->state = state;
    repeat:    schedule();
        if (*p && *p != current) {
        //假设*p是 等待队列的头指针。不进入。否则goto一直反复schedule,直到当前current进程是*p
            (**p).state = 0;
            current->state = TASK_UNINTERRUPTIBLE;
            goto repeat;
        }
        if (!*p)
            printk("Warning: *P = NULL
    
    ");
        if (*p = tmp) // 恢复原来的等待队列,*p 指向原来的等待队列头,逐出current进程
            tmp->state=0; //TASK_RUNNING
    }
    

    interruptible_sleep_on

    
    
    /*
     * interruptible_sleep_on() - sleep on the wait queue *p in state
     * TASK_INTERRUPTIBLE (schedule() wakes such a task on an unblocked
     * signal or an expired timeout).
     */
    void interruptible_sleep_on(struct task_struct **p)
    {
        __sleep_on(p,TASK_INTERRUPTIBLE);
    }


    sleep_on

    
    
    /*
     * sleep_on() - sleep on the wait queue *p in state
     * TASK_UNINTERRUPTIBLE; only an explicit wake_up() makes the task
     * runnable again.
     */
    void sleep_on(struct task_struct **p)
    {
        __sleep_on(p,TASK_UNINTERRUPTIBLE);
    }
    

    wake_up

    
    
    /*
     * wake_up() - make the task at the head of wait queue *p runnable.
     * A NULL p or an empty queue is ignored.  Waking a stopped or zombie
     * task is reported with printk() but still carried out.
     */
    void wake_up(struct task_struct **p)
    {
        if (!p)
            return;
        if (!*p)
            return;

        if ((*p)->state == TASK_STOPPED)
            printk("wake_up: TASK_STOPPED");
        if ((*p)->state == TASK_ZOMBIE)
            printk("wake_up: TASK_ZOMBIE");

        (*p)->state = 0;    /* 0 == TASK_RUNNING */
    }
    


    get_pid,getppid,getuid,geteuid,getgid,sys_nice

    
    
    /* sys_getpid() - return the pid of the current task. */
    int sys_getpid(void)
    {
        return current->pid;
    }
    
    /* sys_getppid() - return the pid of the current task's parent (p_pptr). */
    int sys_getppid(void)
    {
    
        return current->p_pptr->pid;
    }
    
    /* sys_getuid() - return the uid field of the current task. */
    int sys_getuid(void)
    {
        return current->uid;
    }
    
    /* sys_geteuid() - return the euid field of the current task. */
    int sys_geteuid(void)
    {
        return current->euid;
    }
    
    /* sys_getgid() - return the gid field of the current task. */
    int sys_getgid(void)
    {
        return current->gid;
    }
    
    /* sys_getegid() - return the egid field of the current task. */
    int sys_getegid(void)
    {
        return current->egid;
    }
    
    /*
     * sys_nice() - lower the current task's priority by increment.
     * The change is applied only if the resulting priority stays above
     * zero; otherwise the priority is left untouched.  Always returns 0.
     */
    int sys_nice(long increment)
    {
        long lowered = current->priority - increment;

        if (lowered > 0)
            current->priority = lowered;

        return 0;
    }



    sched_init

    
    
    void sched_init(void)//schedule 的初始化 被main.c 调用,真心之仅仅能大概看懂。非常多初始化设置不知道为什么
    {
        int i;
        struct desc_struct * p;
    
        if (sizeof(struct sigaction) !=16)
    
           panic("Struct sigactionMUST be 16 bytes");
    
        set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));
    
        set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));
    
        p = gdt+2+FIRST_TSS_ENTRY;
    
        for(i=1;i<NR_TASKS;i++) {
    
        //从1開始,跳过了进程init。保护好刚已经设置好的init_task
    
        //任务清零。描写叙述符清零
    
           task[i] = NULL;
    
           p->a=p->b=0; //偏址清零
    
           p++;
    
           p->a=p->b=0; //TSS 清零
    
           p++;
    
        }
    /* Clear NT, so that we won't have troubles with that later on */ //从这里我就不知道发生鸟神马。。。

    T-T __asm__("pushfl ; andl$0xffffbfff,(%esp) ; popfl"); ltr(0); lldt(0); outb_p(0x36,0x43); /* binary, mode 3, LSB/MSB, ch 0 */ outb_p(LATCH & 0xff , 0x40); /* LSB */ outb(LATCH >> 8 , 0x40); /* MSB */ set_intr_gate(0x20,&timer_interrupt); outb(inb_p(0x21)&~0x01,0x21); set_system_gate(0x80,&system_call); }







  • 相关阅读:
    Sqlite数据库的加密
    利用Cache缓存数据DataTable数据提高大数据量访问性能
    MYSQL数据库主主同步实战
    【误人子弟】《ASP.NET常用语句120条,(很实用的)
    Debian 添加 FreeBSD 内核支持
    FIREBIRD的基本数据类型
    扩大Portable Ubuntu的系统可用空间
    SQL语句 按年龄段分组统计人数问题
    aspx 页面,master页面与ascx用户控件传值的问题
    MonoDevelop 2.0 Released 发布
  • 原文地址:https://www.cnblogs.com/ldxsuanfa/p/10869116.html
  • Copyright © 2020-2023  润新知