• Mac操作系统XNU内核(八)系统调用过程代码简单分析


    (一)首先,系统调用有两种方式:

    •     0x80、0x81、0x82三个中断号;
    •     专门指令(至少分Intel架构和ARM架构),比如SYSENTER/SYSCALL

    (二)话分两头,先说中断向量方式

      这是中断向量定义的部分代码:

    INTERRUPT(0x7d)
    INTERRUPT(0x7e)
    USER_TRAP(0x7f, idt64_dtrace_ret) /* Required by dtrace "fasttrap" */
    
    USER_TRAP_SPC(0x80,idt64_unix_scall)
    USER_TRAP_SPC(0x81,idt64_mach_scall)
    USER_TRAP_SPC(0x82,idt64_mdep_scall)
    
    INTERRUPT(0x83)
    INTERRUPT(0x84)
    INTERRUPT(0x85)
    INTERRUPT(0x86)

      (BSD风格的系统调用,中断号就是0x80)

      触发中断以及后面的逻辑,都在汇编文件idt64.s中实现,下面简单看看:

    /*
     * System call handlers.
     * These are entered via a syscall interrupt. The system call number in %rax
     * is saved to the error code slot in the stack frame. We then branch to the
     * common state saving code.
     */
            
    #ifndef UNIX_INT
    #error NO UNIX INT!!!
    #endif
    Entry(idt64_unix_scall)
        swapgs                /* switch to kernel gs (cpu_data) */
        pushq    %rax            /* save system call number */
        PUSH_FUNCTION(HNDL_UNIX_SCALL)
        pushq    $(UNIX_INT)

      接下来执行PUSH_FUNCTION(HNDL_UNIX_SCALL),先展开PUSH_FUNCTION看看:

    #if 1
    #define PUSH_FUNCTION(func)              \
        sub    $8, %rsp            ;\
        push    %rax                ;\
        leaq    func(%rip), %rax        ;\
        movq    %rax, 8(%rsp)            ;\
        pop    %rax
    #else
    #define PUSH_FUNCTION(func) pushq func
    #endif

      系统调用号保存在寄存器RAX中,接下来看看HNDL_UNIX_SCALL:

    Entry(hndl_unix_scall)
    
            TIME_TRAP_UENTRY
    
        movq    %gs:CPU_ACTIVE_THREAD,%rcx    /* get current thread     */
        movq    TH_TASK(%rcx),%rbx        /* point to current task  */
        incl    TH_SYSCALLS_UNIX(%rcx)        /* increment call count   */
    
        /* Check for active vtimers in the current task */
        TASK_VTIMER_CHECK(%rbx,%rcx)
    
        sti
    
        CCALL1(unix_syscall, %r15)
        /*
         * always returns through thread_exception_return
         */

      其中最关键的一行是对unix_syscall的调用,看看unix_syscall函数的定义:

    /*
     * Function:    unix_syscall
     *
     * Inputs:    regs    - pointer to i386 save area
     *
     * Outputs:    none
     */
    void
    unix_syscall(x86_saved_state_t *state)
    {
        thread_t        thread;
        void            *vt;
        unsigned int        code;
        struct sysent        *callp;
    
        int            error;
        vm_offset_t        params;
        struct proc        *p;
        struct uthread        *uthread;
        x86_saved_state32_t    *regs;
        boolean_t        is_vfork;
    
        assert(is_saved_state32(state));
        regs = saved_state32(state);
    #if DEBUG
        if (regs->eax == 0x800)
            thread_exception_return();
    #endif
        thread = current_thread();
        uthread = get_bsdthread_info(thread);
    
        /* Get the approriate proc; may be different from task's for vfork() */
        is_vfork = uthread->uu_flag & UT_VFORK;
        if (__improbable(is_vfork != 0))
            p = current_proc();
        else 
            p = (struct proc *)get_bsdtask_info(current_task());
    
        /* Verify that we are not being called from a task without a proc */
        if (__improbable(p == NULL)) {
            regs->eax = EPERM;
            regs->efl |= EFL_CF;
            task_terminate_internal(current_task());
            thread_exception_return();
            /* NOTREACHED */
        }
    
        code = regs->eax & I386_SYSCALL_NUMBER_MASK;
        DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
                                  code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
        params = (vm_offset_t) (regs->uesp + sizeof (int));
    
        regs->efl &= ~(EFL_CF);
    
        callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
    
        if (__improbable(callp == sysent)) {
            code = fuword(params);
            params += sizeof(int);
            callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
        }

    .........

      通过寄存器中的数据得到code,再通过code取得数组sysent中的系统调用函数,交给callp;后面的代码冗长,这里就不全部贴出来咯。

      (关于sysent数组,改天详述)

      (三)再说系统调用专用指令方式(以Intel架构为例)

      SYSENTER用于32位,SYSCALL用于64位,只说SYSCALL吧,先看汇编:

    Entry(hi64_syscall)
    Entry(idt64_syscall)
    L_syscall_continue:
        swapgs                /* Kapow! get per-cpu data area */
        mov    %rsp, %gs:CPU_UBER_TMP    /* save user stack */
        mov    %gs:CPU_UBER_ISF, %rsp    /* switch stack to pcb */
    
        /*
         * Save values in the ISF frame in the PCB
         * to cons up the saved machine state.
         */
        movl    $(USER_DS), ISF64_SS(%rsp)    
        movl    $(SYSCALL_CS), ISF64_CS(%rsp)    /* cs - a pseudo-segment */
        mov    %r11, ISF64_RFLAGS(%rsp)    /* rflags */
        mov    %rcx, ISF64_RIP(%rsp)        /* rip */
        mov    %gs:CPU_UBER_TMP, %rcx
        mov    %rcx, ISF64_RSP(%rsp)        /* user stack */
        mov    %rax, ISF64_ERR(%rsp)        /* err/rax - syscall code */
        movq    $(T_SYSCALL), ISF64_TRAPNO(%rsp)    /* trapno */
        leaq    HNDL_SYSCALL(%rip), %r11;
        movq    %r11, ISF64_TRAPFN(%rsp)
        mov    ISF64_RFLAGS(%rsp), %r11    /* Avoid leak, restore R11 */
        jmp    L_dispatch_U64            /* this can only be 64-bit */

      主要看看HNDL_SYSCALL:

    /*
     * 64bit Tasks
     * System call entries via syscall only:
     *
     *    r15     x86_saved_state64_t
     *    rsp     kernel stack
     *
     *    both rsp and r15 are 16-byte aligned
     *    interrupts disabled
     *    direction flag cleared
     */
    
    Entry(hndl_syscall)
        TIME_TRAP_UENTRY
    
        movq    %gs:CPU_ACTIVE_THREAD,%rcx    /* get current thread     */
        movq    TH_TASK(%rcx),%rbx        /* point to current task  */
    
        /* Check for active vtimers in the current task */
        TASK_VTIMER_CHECK(%rbx,%rcx)
    
        /*
         * We can be here either for a mach, unix machdep or diag syscall,
         * as indicated by the syscall class:
         */
        movl    R64_RAX(%r15), %eax        /* syscall number/class */
        movl    %eax, %edx
        andl    $(SYSCALL_CLASS_MASK), %edx    /* syscall class */
        cmpl    $(SYSCALL_CLASS_MACH<<SYSCALL_CLASS_SHIFT), %edx
        je    EXT(hndl_mach_scall64)
        cmpl    $(SYSCALL_CLASS_UNIX<<SYSCALL_CLASS_SHIFT), %edx
        je    EXT(hndl_unix_scall64)
        cmpl    $(SYSCALL_CLASS_MDEP<<SYSCALL_CLASS_SHIFT), %edx
        je    EXT(hndl_mdep_scall64)
        cmpl    $(SYSCALL_CLASS_DIAG<<SYSCALL_CLASS_SHIFT), %edx
        je    EXT(hndl_diag_scall64)
    
        /* Syscall class unknown */
        sti
        CCALL3(i386_exception, $(EXC_SYSCALL), %rax, $1)
        /* no return */

      可以看到,这里根据保存下来的RAX中的系统调用类别位(用SYSCALL_CLASS_MASK取出)区分4种系统调用,BSD风格的系统调用(SYSCALL_CLASS_UNIX)只是其中1种,另外还有3种:mach syscall、machdep syscall、diag syscall;如果类别都不匹配,则调用i386_exception抛出EXC_SYSCALL异常;

      如果是BSD风格系统调用,那么就继续执行hndl_unix_scall64:

    Entry(hndl_unix_scall64)
        incl    TH_SYSCALLS_UNIX(%rcx)        /* increment call count   */
        sti
    
        CCALL1(unix_syscall64, %r15)
        /*
         * always returns through thread_exception_return
         */

      这里只有一个函数调用:unix_syscall64,接下来看看这个函数的定义:

    void
    unix_syscall64(x86_saved_state_t *state)
    {
        thread_t    thread;
        unsigned int    code;
        struct sysent    *callp;
        void        *uargp;
        int        args_in_regs;
        int        error;
        struct proc    *p;
        struct uthread    *uthread;
        x86_saved_state64_t *regs;
    
        assert(is_saved_state64(state));
        regs = saved_state64(state);
    #if    DEBUG
        if (regs->rax == 0x2000800)
            thread_exception_return();
    #endif
        thread = current_thread();
        uthread = get_bsdthread_info(thread);
    
        /* Get the approriate proc; may be different from task's for vfork() */
        if (__probable(!(uthread->uu_flag & UT_VFORK)))
            p = (struct proc *)get_bsdtask_info(current_task());
        else 
            p = current_proc();
    
        /* Verify that we are not being called from a task without a proc */
        if (__improbable(p == NULL)) {
            regs->rax = EPERM;
            regs->isf.rflags |= EFL_CF;
            task_terminate_internal(current_task());
            thread_exception_return();
            /* NOTREACHED */
        }
        args_in_regs = 6;
    
        code = regs->rax & SYSCALL_NUMBER_MASK;
        DEBUG_KPRINT_SYSCALL_UNIX(
            "unix_syscall64: code=%d(%s) rip=%llx\n",
            code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
        callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
        uargp = (void *)(&regs->rdi);
    
        if (__improbable(callp == sysent)) {
                /*
             * indirect system call... system call number
             * passed as 'arg0'
             */
                code = regs->rdi;
            callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
            uargp = (void *)(&regs->rsi);
            args_in_regs = 5;
        }

    ..........

      可以看到这里首先从x86_saved_state_t中取得系统调用号code,然后从数组sysent中得到系统调用函数,给callp;再后面是一些参数处理,和callp的执行。

      接下去就到了具体的系统调用函数。

      (大概介绍如上,有人拍砖吗?一起了解啊~)

  • 相关阅读:
    Android众说纷纭分辨率
    Android初识Helloworld
    Android从零开始
    PHP使用APC获取上传文件进度
    央行mlf,SLF,PSL,MLF,SLO
    python的lxml解析器
    美国风投行业50年数据揭示的10条VC投资秘密法则
    osx的10款文本编辑器
    nano命令,vi ed pico sed joe emacs jed ex
    修改centos7/osx的MAC地址
  • 原文地址:https://www.cnblogs.com/andypeker/p/4385802.html
Copyright © 2020-2023  润新知