• Linux内核TSS的使用


    参见文章:http://blog.chinaunix.net/uid-22695386-id-272098.html

    linux2.4之前的内核有进程最大数的限制,受限制的原因是,每一个进程都有自己的TSS和LDT,而TSS(任务描述符)和LDT(私有描述符)必须放在GDT中,GDT最大只能存放8192个描述符,除掉系统用掉的12个描述符之外,最大进程数=(8192-12)/2, 总共4090个进程。从Linux2.4以后,全部进程使用同一个TSS,准确地说是,每个CPU一个TSS,在同一个CPU上的进程使用同一个TSS。TSS的定义在asm-i386/processor.h中,定义如下:

    extern struct tss_struct init_tss[NR_CPUS];

    在start_kernel()->trap_init()->cpu_init()初始化并加载TSS:

    /*
     * Per-CPU initialization (Linux 2.4 era, as quoted by the article):
     * set up and load this CPU's shared TSS and the kernel's LDT.
     * Called from start_kernel()->trap_init()->cpu_init().
     */
    void __init cpu_init (void)
    {
    int nr = smp_processor_id();    //id of the CPU we are running on

    struct tss_struct * t = &init_tss[nr]; //the TSS used by this CPU

    t->esp0 = current->thread.esp0;            //refresh TSS esp0 with the current task's kernel stack top
    set_tss_desc(nr,t);                        //install this TSS's descriptor into the GDT
    gdt_table[__TSS(nr)].b &= 0xfffffdff;      //clear the descriptor's busy bit so loading TR won't fault
    load_TR(nr);                                              //load the task register (TSS)
    load_LDT(&init_mm.context);                //load the kernel LDT

    }

    我们知道,任务切换(硬切换)需要用到TSS来保存全部寄存器(2.4以前使用jmp来实现切换),

    中断发生时也需要从TSS中读取ring0的esp0,那么,进程使用相同的TSS,任务切换怎么办?

    其实2.4以后不再使用硬切换,而是使用软切换,寄存器不再保存在TSS中了,而是保存在

    task->thread中,只用到TSS的esp0和IO许可位图。所以,在进程切换过程中,只需要更新TSS中

    的esp0、io bitmap,代码在sched.c中:

    schedule()->switch_to()->__switch_to(),

    /*
     * 2.4-era __switch_to (as quoted by the article). With software context
     * switching the general registers live in task->thread; only esp0 and
     * the IO permission bitmap in the per-CPU TSS are refreshed here.
     *
     * Fix: the original transcription read "ttss->esp0", but no "ttss"
     * exists — the declared pointer is "tss".
     */
    void fastcall __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
    {
    struct thread_struct *prev = &prev_p->thread,
         *next = &next_p->thread;
    struct tss_struct *tss = init_tss + smp_processor_id(); //TSS of the current CPU

    /*
    * Reload esp0, LDT and the page table pointer:
    */
    tss->esp0 = next->esp0; //update tss->esp0 with the next task's esp0

    //copy the next task's io_bitmap into tss->io_bitmap

    if (prev->ioperm || next->ioperm) {
       if (next->ioperm) {
        /*
        * 4 cachelines copy ... not good, but not that
        * bad either. Anyone got something better?
        * This only affects processes which use ioperm().
        * [Putting the TSSs into 4k-tlb mapped regions
        * and playing VM tricks to switch the IO bitmap
        * is not really acceptable.]
        */
        memcpy(tss->io_bitmap, next->io_bitmap,
         IO_BITMAP_BYTES);
        tss->bitmap = IO_BITMAP_OFFSET;
       } else
        /*
        * a bitmap offset pointing outside of the TSS limit
        * causes a nicely controllable SIGSEGV if a process
        * tries to use a port IO instruction. The first
        * sys_ioperm() call sets up the bitmap properly.
        */
        tss->bitmap = INVALID_IO_BITMAP_OFFSET;
    }
    }

    以及代码:

       1: /*
       2:  *    switch_to(x,y) should switch tasks from x to y.
       3:  *
       4:  * We fsave/fwait so that an exception goes off at the right time
       5:  * (as a call from the fsave or fwait in effect) rather than to
       6:  * the wrong process. Lazy FP saving no longer makes any sense
       7:  * with modern CPU's, and this simplifies a lot of things (SMP
       8:  * and UP become the same).
       9:  *
      10:  * NOTE! We used to use the x86 hardware context switching. The
      11:  * reason for not using it any more becomes apparent when you
      12:  * try to recover gracefully from saved state that is no longer
      13:  * valid (stale segment register values in particular). With the
      14:  * hardware task-switch, there is no way to fix up bad state in
      15:  * a reasonable manner.
      16:  *
      17:  * The fact that Intel documents the hardware task-switching to
      18:  * be slow is a fairly red herring - this code is not noticeably
      19:  * faster. However, there _is_ some room for improvement here,
      20:  * so the performance issues may eventually be a valid point.
      21:  * More important, however, is the fact that this allows us much
      22:  * more flexibility.
      23:  *
      24:  * The return value (in %ax) will be the "prev" task after
      25:  * the task-switch, and shows up in ret_from_fork in entry.S,
      26:  * for example.
      27:  */
      28: __notrace_funcgraph struct task_struct *
      29: __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
      30: {
      31:     struct thread_struct *prev = &prev_p->thread,
      32:                  *next = &next_p->thread;
      33:     int cpu = smp_processor_id();
      34:     struct tss_struct *tss = &per_cpu(init_tss, cpu);
      35:     fpu_switch_t fpu;
      36:  
      37:     /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
      38:  
      39:     fpu = switch_fpu_prepare(prev_p, next_p);
      40:  
      41:     /*
      42:      * Reload esp0.
      43:      */
      44:     load_sp0(tss, next);
      45:  
      46:     /*
      47:      * Save away %gs. No need to save %fs, as it was saved on the
      48:      * stack on entry.  No need to save %es and %ds, as those are
      49:      * always kernel segments while inside the kernel.  Doing this
      50:      * before setting the new TLS descriptors avoids the situation
      51:      * where we temporarily have non-reloadable segments in %fs
      52:      * and %gs.  This could be an issue if the NMI handler ever
      53:      * used %fs or %gs (it does not today), or if the kernel is
      54:      * running inside of a hypervisor layer.
      55:      */
      56:     lazy_save_gs(prev->gs);
      57:  
      58:     /*
      59:      * Load the per-thread Thread-Local Storage descriptor.
      60:      */
      61:     load_TLS(next, cpu);
      62:  
      63:     /*
      64:      * Restore IOPL if needed.  In normal use, the flags restore
      65:      * in the switch assembly will handle this.  But if the kernel
      66:      * is running virtualized at a non-zero CPL, the popf will
      67:      * not restore flags, so it must be done in a separate step.
      68:      */
      69:     if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
      70:         set_iopl_mask(next->iopl);
      71:  
      72:     /*
      73:      * Now maybe handle debug registers and/or IO bitmaps
      74:      */
      75:     if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
      76:              task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
      77:         __switch_to_xtra(prev_p, next_p, tss);
      78:  
      79:     /*
      80:      * Leave lazy mode, flushing any hypercalls made here.
      81:      * This must be done before restoring TLS segments so
      82:      * the GDT and LDT are properly updated, and must be
      83:      * done before math_state_restore, so the TS bit is up
      84:      * to date.
      85:      */
      86:     arch_end_context_switch(next_p);
      87:  
      88:     /*
      89:      * Restore %gs if needed (which is common)
      90:      */
      91:     if (prev->gs | next->gs)
      92:         lazy_load_gs(next->gs);
      93:  
      94:     switch_fpu_finish(next_p, fpu);
      95:  
      96:     percpu_write(current_task, next_p);
      97:  
      98:     return prev_p;
      99: }

    先分析一下comments

    /*
    *    switch_to(x,y) should switch tasks from x to y.
    *
    * We fsave/fwait so that an exception goes off at the right time
    * (as a call from the fsave or fwait in effect) rather than to
    * the wrong process. Lazy FP saving no longer makes any sense
    * with modern CPU's, and this simplifies a lot of things (SMP
    * and UP become the same).
    *
    * NOTE! We used to use the x86 hardware context switching. The
    * reason for not using it any more becomes apparent when you
    * try to recover gracefully from saved state that is no longer
    * valid (stale 【变味的,失效的】segment register values in particular). With the
    * hardware task-switch, there is no way to fix up bad state in
    * a reasonable manner.
    *
    * The fact that Intel documents the hardware task-switching to
    * be slow is a fairly red herring【题外话】 - this code is not noticeably
    * faster. However, there _is_ some room for improvement here,
    * so the performance issues may eventually be a valid point.
    * More important, however, is the fact that this allows us much
    * more flexibility.
    *
    * The return value (in %ax) will be the "prev" task after
    * the task-switch, and shows up in ret_from_fork in entry.S,
    * for example.
    */

    大致意思是,为了灵活起见,我们将Intel硬件任务切换变为软任务切换

    根据开头引用的文章,每个CPU上执行的进程都使用同一个TSS段,

        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);

    而且里面有效的信息只有esp0和io_bitmap成员。

    /*
         * Reload esp0.
         */
        load_sp0(tss, next);

    原来保存在TSS段中,属于每个进程的上下文信息都保存在下面的结构体(thread_struct)中:

        struct thread_struct *prev = &prev_p->thread,
                     *next = &next_p->thread;

       1: struct thread_struct {
       2:     /* Cached TLS descriptors: */
       3:     struct desc_struct    tls_array[GDT_ENTRY_TLS_ENTRIES];
       4:     unsigned long        sp0;
       5:     unsigned long        sp;
       6: #ifdef CONFIG_X86_32
       7:     unsigned long        sysenter_cs;
       8: #else
       9:     unsigned long        usersp;    /* Copy from PDA */
      10:     unsigned short        es;
      11:     unsigned short        ds;
      12:     unsigned short        fsindex;
      13:     unsigned short        gsindex;
      14: #endif
      15: #ifdef CONFIG_X86_32
      16:     unsigned long        ip;
      17: #endif
      18: #ifdef CONFIG_X86_64
      19:     unsigned long        fs;
      20: #endif
      21:     unsigned long        gs;
      22:     /* Save middle states of ptrace breakpoints */
      23:     struct perf_event    *ptrace_bps[HBP_NUM];
      24:     /* Debug status used for traps, single steps, etc... */
      25:     unsigned long           debugreg6;
      26:     /* Keep track of the exact dr7 value set by the user */
      27:     unsigned long           ptrace_dr7;
      28:     /* Fault info: */
      29:     unsigned long        cr2;
      30:     unsigned long        trap_no;
      31:     unsigned long        error_code;
      32:     /* floating point and extended processor state */
      33:     unsigned long        has_fpu;
      34:     struct fpu        fpu;
      35: #ifdef CONFIG_X86_32
      36:     /* Virtual 86 mode info */
      37:     struct vm86_struct __user *vm86_info;
      38:     unsigned long        screen_bitmap;
      39:     unsigned long        v86flags;
      40:     unsigned long        v86mask;
      41:     unsigned long        saved_sp0;
      42:     unsigned int        saved_fs;
      43:     unsigned int        saved_gs;
      44: #endif
      45:     /* IO permissions: */
      46:     unsigned long        *io_bitmap_ptr;
      47:     unsigned long        iopl;
      48:     /* Max allowed port in the bitmap, in bytes: */
      49:     unsigned        io_bitmap_max;
      50: };

    在Linux操作系统中,gs寄存器用于存放TLS的地址。(在Windows中,使用fs寄存器来存放TEB结构体的地址)。

    参见:http://www.linuxidc.com/Linux/2012-06/64079p2.htm

    Linux的glibc使用GS寄存器来访问TLS,也就是说,GS寄存器指示的段指向本线程的TEB(Windows的术语),也就是TLS,这么做有个好处,那就是可以高效的访问TLS里面存储的信息而不用一次次的调用系统调用,当然使用系统调用的方式也是可以的。之所以可以这么做,是因为Intel对各个寄存器的作用的规范规定的比较松散,因此你可以拿GS,FS等段寄存器来做几乎任何事,当然也就可以做TLS直接访问了,最终glibc在线程启动的时候首先将GS寄存器指向GDT的第6个段,完全使用段机制来支持针对TLS的寻址访问,后续的访问TLS信息就和访问用户态的信息一样高效了。

    下面代码,将当前的CPU中的gs寄存器的内容写回到prev结构体中。

    /*
         * Save away %gs. No need to save %fs, as it was saved on the
         * stack on entry.  No need to save %es and %ds, as those are
         * always kernel segments while inside the kernel.  Doing this
         * before setting the new TLS descriptors avoids the situation
         * where we temporarily have non-reloadable segments in %fs
         * and %gs.  This could be an issue if the NMI handler ever
         * used %fs or %gs (it does not today), or if the kernel is
         * running inside of a hypervisor layer.
         */
        lazy_save_gs(prev->gs);

    接下来

    /*
         * Load the per-thread Thread-Local Storage descriptor.
         */
        load_TLS(next, cpu);

    更新GDT表中表示TLS的相关表项。

       1: #define load_TLS(t, cpu)            native_load_tls(t, cpu)
       2:  
       3: static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
       4: {
       5:     struct desc_struct *gdt = get_cpu_gdt_table(cpu);
       6:     unsigned int i;
       7:  
       8:     for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
       9:         gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
      10: }

    首先获取到当前CPU的GDT表

    /* One page-aligned GDT per CPU, kept in the per-CPU data area. */
    struct gdt_page {
        struct desc_struct gdt[GDT_ENTRIES];
    } __attribute__((aligned(PAGE_SIZE)));

    DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);

    /* Return a pointer to the given CPU's GDT, looked up via per_cpu(). */
    static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
    {
        return per_cpu(gdt_page, cpu).gdt;
    }

    per_cpu机制,是保证每个CPU都有一份自己的关键数据结构

    参见:http://www.unixresources.net/linux/clf/linuxK/archive/00/00/47/91/479165.html

    在该函数中,为每个CPU分配一段专有数据区,并将.data.percpu中的数据拷贝到其中,
    每个CPU各有一份。由于数据从__per_cpu_start处转移到各CPU自己的专有数据区中了,
    因此存取其中的变量就不能再用原先的值了,比如存取per_cpu__runqueues
    就不能再用per_cpu__runqueues了,需要做一个偏移量的调整,
    即需要加上各CPU自己的专有数据区首地址相对于__per_cpu_start的偏移量。
    在这里也就是__per_cpu_offset[i],其中CPU i的专有数据区相对于
    __per_cpu_start的偏移量为__per_cpu_offset[i]。
    这样,就可以方便地计算专有数据区中各变量的新地址,比如对于per_cpu_runqueues,
    其新地址即变成per_cpu_runqueues+__per_cpu_offset[i]。

    load_TLS的深入到此结束,言归正传。


    /*
         * Now maybe handle debug registers and/or IO bitmaps
         */
        if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
                 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
            __switch_to_xtra(prev_p, next_p, tss);

    接下来处理调试寄存器,以及io位图。

       1: void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
       2:               struct tss_struct *tss)
       3: {
       4:     struct thread_struct *prev, *next;
       5:  
       6:     prev = &prev_p->thread;
       7:     next = &next_p->thread;
       8:  
       9:     if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
      10:         test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
      11:         unsigned long debugctl = get_debugctlmsr();
      12:  
      13:         debugctl &= ~DEBUGCTLMSR_BTF;
      14:         if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
      15:             debugctl |= DEBUGCTLMSR_BTF;
      16:  
      17:         update_debugctlmsr(debugctl);
      18:     }
      19:  
      20:     if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
      21:         test_tsk_thread_flag(next_p, TIF_NOTSC)) {
      22:         /* prev and next are different */
      23:         if (test_tsk_thread_flag(next_p, TIF_NOTSC))
      24:             hard_disable_TSC();
      25:         else
      26:             hard_enable_TSC();
      27:     }
      28:  
      29:     if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
      30:         /*
      31:          * Copy the relevant range of the IO bitmap.
      32:          * Normally this is 128 bytes or less:
      33:          */
      34:         memcpy(tss->io_bitmap, next->io_bitmap_ptr,
      35:                max(prev->io_bitmap_max, next->io_bitmap_max));
      36:     } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
      37:         /*
      38:          * Clear any possible leftover bits:
      39:          */
      40:         memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
      41:     }
      42:     propagate_user_return_notify(prev_p, next_p);
      43: }
  • 相关阅读:
    3delight and useBackground
    Maya 闪电
    jcCut1.1 在Maya里实现切割物体
    jcFeather 2.3.0 Demo
    javascript深度克隆的方法
    前端调用本地摄像头实现拍照(vue) Allen
    《暗时间》 读书笔记
    阅读笔记3流浪动物救助实践困境与路径优化
    阅读笔记1濒危动物网页的设计构思
    阅读笔记2电子宠物系统设计
  • 原文地址:https://www.cnblogs.com/long123king/p/3501853.html
Copyright © 2020-2023  润新知