深入理解Linux之进程初探

一. 关于fork调用

　　fork()调用创建一个新的进程，该进程几乎是当前进程的一个完全拷贝。由fork()创建的新进程被称为子进程。fork函数被调用一次但返回两次。两次返回的唯一区别是子进程中返回0值，而父进程中返回子进程ID。子进程是父进程的副本，它将获得父进程数据空间、堆、栈等资源的副本。注意，子进程持有的是上述存储空间的“副本”，这意味着父子进程间不共享这些存储空间。从逻辑上看，Linux将父进程的地址空间内容复制给子进程，因此，子进程拥有独立的地址空间（实际实现中采用写时复制（copy-on-write）技术，只有当某一方写入某个页时才真正复制该页）。

　　我们来看一个DEMO:

// fork_example.c
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Demonstrates that fork() gives the child its own copy of the parent's
 * stack and heap contents: both processes print the same two strings.
 *
 * Returns 0 on success and 1 when the heap allocation or fork() fails.
 */
int main(int argc, const char *argv[])
{
    pid_t pid;

    char stack_data[] = "stack_data";

    /* Sized from the literal itself so the terminating NUL is always included. */
    char *heap_data = malloc(sizeof "heap_data");
    if (heap_data == NULL) {
        printf("MALLOC FAILED.\n");
        return 1;
    }
    strcpy(heap_data, "heap_data");

    pid = fork();
    if (pid == 0) {
        /* fork() returns 0 in the child. */
        printf("CHILD PROCESS: %s, %s\n", stack_data, heap_data);
    } else if (pid > 0) {
        /* fork() returns the child's PID in the parent. */
        printf("PARENT PROCESS: %s, %s\n", stack_data, heap_data);
    } else {
        /* A negative value means no child was created. */
        printf("FORK FAILED.\n");
        free(heap_data);
        return 1;
    }

    /* Each process releases its own copy of the buffer. */
    free(heap_data);
    return 0;
}

　　运行的输出结果为：

CHILD PROCESS: stack_data, heap_data
PARENT PROCESS: stack_data, heap_data

　　可以看出，父进程和子进程的栈和堆的数据是相同的。这些数据在创建子进程时是通过拷贝产生的。

二. 关于execl调用

　　系统调用exec是用一个新的程序去替换当前进程正在执行的程序，而进程的PID保持不变。因此，可以这样认为，exec系统调用并没有创建新的进程，只是替换了原来进程上下文的内容。原进程的代码段，数据段，堆栈段被新程序的内容所代替。

　　我们来看一个例子：

// execl_example.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Replaces this process image with ./hello_world.
 *
 * execl() only returns when it fails, so reaching the printf below means
 * the program could not be executed; the process then exits with status 1.
 */
int main(int argc, const char *argv[])
{
    /*
     * After the path come the new program's argv[0], argv[1], ... and a
     * terminating null pointer. Because execl() is variadic the terminator
     * must be written as (char *)NULL, and argv[0] should name the program
     * rather than be NULL.
     */
    execl("./hello_world", "hello_world", (char *)NULL);

    /* We can only reach this code when there is an error in execl */
    printf("The execl must be failed!\n");

    return 1;
}

　　我们执行一个不存在的hello_world程序，看看输出结果：

The execl must be failed!

　　现在我们创建一个hello_world程序，该程序简单的打印一个Hello World.

// hello_world.c
#include <stdio.h>

/* Writes the greeting line to standard output and exits with status 0. */
int main(int argc, const char *argv[])
{
    fputs("Hello World!\n", stdout);
    return 0;
}

　　现在我们继续运行execl_example程序，这时输出为：

Hello World!

　　通过比较两次输出，我们发现：当execl成功时，原有程序的执行就会被打断，同一个进程转而从头执行新的程序，execl之后的代码不会再被执行。

三. 使用汇编进行系统调用

　　我们知道在Linux中，每个系统调用都对应一个系统调用号。这个系统调用号是在unistd.h中定义的。在我的机器上文件的位置是在：

/usr/src/linux-headers-2.6.28-11-generic/arch/x86/include/asm/unistd_32.h

　　如果找不到，可以尝试使用以下命令查找：

locate unistd.h | xargs grep -ri "__NR_fork"

　　下面是unistd.h的部分内容：

... ...
/* System call numbers: each __NR_<name> macro is the number placed in eax to select that call. */
#define __NR_restart_syscall      0
#define __NR_exit          1
#define __NR_fork          2
#define __NR_read          3
#define __NR_write          4
#define __NR_open          5
#define __NR_close          6
#define __NR_waitpid          7
#define __NR_creat          8
#define __NR_link          9
#define __NR_unlink         10
#define __NR_execve         11
... ...

　　使用汇编调用fork:

　　可以看到fork的系统调用号是2，我们现在使用汇编代码重新编写fork_example.c

#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Same demo as fork_example.c, but the fork system call is issued by hand
 * through the 32-bit x86 "int $0x80" gate instead of the C library wrapper.
 *
 * Returns 0 on success and 1 when the allocation or the system call fails.
 * NOTE(review): intended for a 32-bit x86 Linux build (e.g. gcc -m32).
 */
int main()
{
    pid_t pid;

    char stack_data[] = "stack_data";

    /* Sized from the literal itself so the terminating NUL is always included. */
    char *heap_data = malloc(sizeof "heap_data");
    if (heap_data == NULL) {
        printf("MALLOC FAILED.\n");
        return 1;
    }
    strcpy(heap_data, "heap_data");

    /*
     * Hand-written equivalent of: pid = fork();
     *
     * __asm__/__volatile__ are used instead of asm/volatile so the code also
     * builds in strict ISO modes. eax is overwritten by the sequence, so it is
     * listed as a clobber; "memory" stops the compiler from caching memory
     * values across the system call.
     */
    __asm__ __volatile__(
        "mov $0x2, %%eax\n\t" /* put fork's system call number (2) into eax */
        "int $0x80\n\t"       /* raise interrupt 0x80 to enter the kernel */
        "mov %%eax,%0\n\t"    /* store the result left in eax into pid */
        : "=m" (pid)
        :
        : "eax", "memory"
    );

    if (pid == 0) {
        printf("CHILD PROCESS: %s, %s\n", stack_data, heap_data);
    } else if (pid > 0) {
        printf("PARENT PROCESS: %s, %s\n", stack_data, heap_data);
    } else {
        /* A negative value in eax means the call failed. */
        printf("FORK FAILED.\n");
        free(heap_data);
        return 1;
    }

    /* Each process releases its own copy of the buffer. */
    free(heap_data);
    return 0;
}

　　运行输出结果是：

CHILD PROCESS: stack_data, heap_data
PARENT PROCESS: stack_data, heap_data

　　可以尝试将调用号替换一下，改成$0x3，得到的结果是：

FORK FAILED.

　　使用汇编调用execl:

　　我们再尝试一下使用汇编调用execl。execl是C库函数，最终是通过execve系统调用实现的；通过上面的观察我们可以看到execve的系统调用号是11。

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Same demo as execl_example.c, but the execve system call (number 11) is
 * issued by hand through the 32-bit x86 "int $0x80" gate.
 *
 * The call only returns on failure, in which case the message below is
 * printed and the process exits with status 1.
 * NOTE(review): intended for a 32-bit x86 Linux build (e.g. gcc -m32).
 */
int main(int argc, const char *argv[])
{
    /* Hand-written equivalent of: execl("./hello_world", NULL, NULL); */
    const char *program = "./hello_world";

    /*
     * program is only read, so it is an input operand (the original declared
     * it as an output). Every register the sequence overwrites is listed as a
     * clobber, and "memory" tells the compiler the kernel reads the string
     * through the pointer.
     * NOTE(review): a NULL argv/envp is accepted by Linux but is documented
     * as non-portable; confirm against the execve(2) manual page.
     */
    __asm__ __volatile__ (
        "mov %0,%%ebx\n\t"   /* argument 1: pointer to the program path */
        "mov $0,%%ecx\n\t"   /* argument 2 (argv) is NULL */
        "mov $0,%%edx\n\t"   /* argument 3 (envp) is NULL */
        "mov $0xb,%%eax\n\t" /* put execve's system call number (11) into eax */
        "int $0x80\n\t"      /* raise interrupt 0x80 to enter the kernel */
        :
        : "m" (program)
        : "eax", "ebx", "ecx", "edx", "memory"
    );

    /* We can only reach this code when there is an error in execl */
    printf("The execl must be failed!\n");

    return 1;
}

　　运行结果为：

Hello World!

　　如果将系统调用号改为0x3，输出结果为：

The execl must be failed!

四.系统调用过程详解

　　通过第三步的过程，我们了解到，系统调用在内核中的执行是依靠中断实现的。如果我们想进一步定位fork和execl的代码，我们需要先了解系统调用的详细过程。即回答以下两个问题：

　　1.中断是怎么工作的？

　　2.int 0x80中断是怎么工作的？

　　中断是怎么工作的

　　在Linux操作系统中，中断是通过中断描述符表工作的。中断描述符表（Interrupt Descriptor Table, IDT）是一个系统表，它与每一个中断或者异常向量相联系，每一个向量在表中有相应的中断或者异常处理程序的入口地址。内核在允许中断发生前，必须适当地初始化IDT。对于每个中断，都会有对应的中断处理程序。当产生一个中断时，Linux根据中断描述符表中对应的项找到中断处理程序的地址，然后调用相应的中断处理程序。中断描述符表在内存中的地址存储在idtr寄存器中。内核在启用中断前，必须初始化IDT，然后将IDT的地址装载到idtr中。

　　内核初始化的时候调用trap_init()函数和init_IRQ()函数初始化中断向量表。

　　int 0x80中断是怎么工作的

　　通过上面的分析，我们知道每个中断都有对应的处理程序。在系统调用的过程中，会有一个系统调用分派表（sys_call_table），每个表项存储了一个系统调用服务例程的地址。系统调用中断处理程序根据系统调用号找到对应的系统调用并执行。对于系统调用，eax中存储的是系统调用号，参数依次通过寄存器ebx、ecx、edx（参数更多时还有esi、edi、ebp）传递。系统调用号必须小于NR_syscalls，否则会被当作非法的系统调用处理。

　　在arch/x86/include/asm/irq_vectors.h中定义了

# define SYSCALL_VECTOR            0x80

　　现在我们查找trap_init函数，在arch/x86/kernel/traps.c中

/* Passes SYSCALL_VECTOR and the address of system_call to set_system_trap_gate(); presumably this installs system_call as that vector's handler (confirm against the helper's definition). */
set_system_trap_gate(SYSCALL_VECTOR, &system_call);

　　现在，查找system_call函数，在arch/x86/kernel/entry_32.S中：

/*
 * system_call: entry point for the system-call vector.
 * %eax carries the system call number on entry: it is saved, range-checked
 * against NR_syscalls and then used as the index into sys_call_table.
 * NOTE(review): this excerpt stops inside the exit path; the rest of the
 * routine is not shown here.
 */
ENTRY(system_call)
    RING0_INT_FRAME            # can't unwind into user space anyway
    ASM_CLAC
    pushl_cfi %eax            # save orig_eax
    SAVE_ALL
    GET_THREAD_INFO(%ebp)
                    # system call tracing in operation / emulation
    /* Any _TIF_WORK_SYSCALL_ENTRY bit set in the thread flags diverts to syscall_trace_entry. */
    testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
    jnz syscall_trace_entry
    /* Unsigned compare: numbers >= NR_syscalls branch to syscall_badsys. */
    cmpl $(NR_syscalls), %eax
    jae syscall_badsys
syscall_call:
    /* Indirect call through the table entry at sys_call_table + %eax * 4. */
    call *sys_call_table(,%eax,4)
    movl %eax,PT_EAX(%esp)        # store the return value
syscall_exit:
    LOCKDEP_SYS_EXIT
    DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
                    # setting need_resched or sigpending
                    # between sampling and the iret
    TRACE_IRQS_OFF
    /* Reload the thread flags; any _TIF_ALLWORK_MASK bit set diverts to syscall_exit_work. */
    movl TI_flags(%ebp), %ecx
    testl $_TIF_ALLWORK_MASK, %ecx    # current->work
    jne syscall_exit_work

　　在include/uapi/asm-generic/unistd.h中找到：

/* Pairs the system call number __NR_fork with the routine sys_fork (assumed from how the macro is used; confirm against the __SYSCALL definition). */
__SYSCALL(__NR_fork, sys_fork)

　　fork的系统调用号是2，对应的系统调用分派表中为sys_fork函数。在kernel/fork.c中找到如下代码：

#ifdef __ARCH_WANT_SYS_FORK
/*
 * fork system call taking no arguments.
 * With CONFIG_MMU it hands off to do_fork() with SIGCHLD as the only flag
 * value; without CONFIG_MMU it fails with -EINVAL.
 */
SYSCALL_DEFINE0(fork)
{
#ifndef CONFIG_MMU
    /* fork is not available on no-MMU configurations */
    return -EINVAL;
#else
    return do_fork(SIGCHLD, 0, 0, NULL, NULL);
#endif
}
#endif

四.do_fork源码分析

　　现在查找do_fork函数，也在kernel/fork.c中：

/*
 *  Ok, this is the main part of the fork routine.
 *
 * It copies the process and, if that succeeds, starts the new task; for
 * CLONE_VFORK it also waits on the vfork completion.
 *
 * clone_flags:    CLONE_* bits; the CSIGNAL bits are compared with SIGCHLD below.
 * stack_start, stack_size: forwarded unchanged to copy_process().
 * parent_tidptr:  user pointer that receives the new id when
 *                 CLONE_PARENT_SETTID is set.
 * child_tidptr:   forwarded unchanged to copy_process().
 *
 * Returns the value of task_pid_vnr() for the new task, -EINVAL for the
 * rejected flag combination, or the error carried by copy_process().
 */
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    long nr;

    /*
     * Do some argument and permission checking before allocating anything.
     */
    if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
        if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
            return -EINVAL;
    }

    /*
     * Decide whether anything is reported to the ptracer, and which event.
     * With CLONE_UNTRACED (the original note also mentions kernel-thread
     * callers) nothing is reported. Otherwise the event matching the kind
     * of fork is chosen, and dropped again if it is not enabled.
     */
    if (!(clone_flags & CLONE_UNTRACED)) {
        if (clone_flags & CLONE_VFORK)
            trace = PTRACE_EVENT_VFORK;
        else if ((clone_flags & CSIGNAL) != SIGCHLD)
            trace = PTRACE_EVENT_CLONE;
        else
            trace = PTRACE_EVENT_FORK;

        if (likely(!ptrace_event_enabled(current, trace)))
            trace = 0;
    }
　　
    // copy_process() builds the new task: per the original note it creates the
    // process descriptor and the other data structures the child needs.
    p = copy_process(clone_flags, stack_start, stack_size,
             child_tidptr, NULL, trace);
             
    /* Now wake up the new thread. */
    if (!IS_ERR(p)) {
        struct completion vfork;

        trace_sched_process_fork(current, p);

        nr = task_pid_vnr(p);

        if (clone_flags & CLONE_PARENT_SETTID)
            put_user(nr, parent_tidptr);

        /* The completion must be in place before the child is woken up. */
        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
            get_task_struct(p);
        }

        wake_up_new_task(p);

        /* The fork is complete and the child has been started; now tell the ptracer. */
        if (unlikely(trace))
            ptrace_event(trace, nr);

        if (clone_flags & CLONE_VFORK) {
            if (!wait_for_vfork_done(p, &vfork))
                ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
        }
    } else {
        /* copy_process() failed: turn the error pointer into the return value. */
        nr = PTR_ERR(p);
    }
    return nr;
}

　　可以看到do_fork调用了copy_process完成了绝大部分的工作。copy_process位于同一个文件当中：

/*
 * Creates a new process as a copy of the current one, but does not start
 * it running.
 *
 * It mainly copies the registers and the appropriate parts of the rest of
 * the process environment. The actual start-up is left to the caller.
 *
 * Returns the new task_struct on the path shown here.
 *
 * NOTE(review): this is an abridged excerpt. The bad_fork_* labels targeted
 * by the goto statements are not part of it, and several results (for
 * example retval from security_task_create()) are not checked here.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    unsigned long stack_size,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace)
{
    int retval;
    struct task_struct *p; // holds the new process descriptor
    
    /* The consistency and validity checks on the flag bits were removed from this excerpt. */
    
    // Per the original note, security_task_create() and security_task_alloc()
    // perform all the additional security checks.
    retval = security_task_create(clone_flags);
    // dup_task_struct() obtains a process descriptor for the child; it is analysed later.
    p = dup_task_struct(current);
    // Per the original note: initialises the task's ftrace_ret_stack, the stack used for function returns.
    ftrace_graph_init_task(p);
    get_seccomp_filter(p);
    // Initialises the mutex-related fields of the task (per the original note).
    rt_mutex_init_task(p);
    // Outer check: the owning user's process count is compared against
    // task_rlimit(p, RLIMIT_NPROC), the limit on how many processes that user may own.
    if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) {
        // Inner check: capable() tests whether the caller may operate on the given
        // resource; per the original note a return value of 0 means it may not.
        if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->real_cred->user != INIT_USER)
            goto bad_fork_free;
    }
    
    current->flags &= ~PF_NPROC_EXCEEDED; // clear PF_NPROC_EXCEEDED in the current task's flags
    copy_creds(p, clone_flags); // copies the credentials (presumably the permission and identity information)

    // Refuse when the number of threads has reached the system-wide maximum.
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;
    
    // Take a reference on the module of the task's exec domain.
    if (!try_module_get(task_thread_info(p)->exec_domain->module))
        goto bad_fork_cleanup_count;

    p->did_exec = 0;
    delayacct_tsk_init(p);    /* Must remain after dup_task_struct() */
    copy_flags(clone_flags, p); // update the flags member of the task_struct
    INIT_LIST_HEAD(&p->children); // initialise the list of children
    INIT_LIST_HEAD(&p->sibling); // initialise the sibling list node
    rcu_copy_process(p); // initialise the RCU-related fields
    p->vfork_done = NULL; 
    spin_lock_init(&p->alloc_lock); 

    init_sigpending(&p->pending);

    p->utime = p->stime = p->gtime = 0;
    p->utimescaled = p->stimescaled = 0;
    p->prev_cputime.utime = p->prev_cputime.stime = 0;
    seqlock_init(&p->vtime_seqlock);
    p->vtime_snap = 0;
    p->vtime_snap_whence = VTIME_SLEEPING;

    memset(&p->rss_stat, 0, sizeof(p->rss_stat));

    p->default_timer_slack_ns = current->timer_slack_ns;

    task_io_accounting_init(&p->ioac); // initialise the I/O accounting record in the descriptor
    acct_clear_integrals(p);

    posix_cpu_timers_init(p); // timer initialisation

    do_posix_clock_monotonic_gettime(&p->start_time);
    p->real_start_time = p->start_time;
    monotonic_to_bootbased(&p->real_start_time);
    p->io_context = NULL;
    p->audit_context = NULL;
    if (clone_flags & CLONE_THREAD)
        threadgroup_change_begin(current);
    cgroup_fork(p);
#ifdef CONFIG_NUMA
    p->mempolicy = mpol_dup(p->mempolicy);
    if (IS_ERR(p->mempolicy)) {
        retval = PTR_ERR(p->mempolicy);
        p->mempolicy = NULL;
        goto bad_fork_cleanup_cgroup;
    }
    mpol_fix_fork_child_flag(p);
#endif
    /* Reset the cpuset spread rotors and the mems_allowed sequence counter */
    p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
    p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
    seqcount_init(&p->mems_allowed_seq);
    /* Set up the interrupt-tracing state */ 
    p->irq_events = 0;
    p->hardirqs_enabled = 0;
    p->hardirq_enable_ip = 0;
    p->hardirq_enable_event = 0;
    p->hardirq_disable_ip = _THIS_IP_;
    p->hardirq_disable_event = 0;
    p->softirqs_enabled = 1;
    p->softirq_enable_ip = _THIS_IP_;
    p->softirq_enable_event = 0;
    p->softirq_disable_ip = 0;
    p->softirq_disable_event = 0;
    p->hardirq_context = 0;
    p->softirq_context = 0;
    /* Reset the lock-depth state */
    p->lockdep_depth = 0; /* no locks held yet */
    p->curr_chain_key = 0;
    p->lockdep_recursion = 0;

#ifdef CONFIG_DEBUG_MUTEXES
    p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_MEMCG
    p->memcg_batch.do_batch = 0;
    p->memcg_batch.memcg = NULL;
#endif

    sched_fork(p); // scheduler-related setup; per the original note this assigns the new task to a CPU

    perf_event_init_task(p);
    audit_alloc(p);
        
    /* Copy each part below as directed by clone_flags: either allocate it afresh or share the parent's. */
    copy_semundo(clone_flags, p);
    copy_files(clone_flags, p);
    copy_fs(clone_flags, p);
    copy_sighand(clone_flags, p);
    copy_signal(clone_flags, p);
    copy_mm(clone_flags, p);
    copy_namespaces(clone_flags, p);
    copy_io(clone_flags, p);
    copy_thread(clone_flags, stack_start, stack_size, p);

    /* Allocate a struct pid unless the caller passed in &init_struct_pid. */
    if (pid != &init_struct_pid) {
        retval = -ENOMEM;
        pid = alloc_pid(p->nsproxy->pid_ns);
        if (!pid)
            goto bad_fork_cleanup_io;
    }

    p->pid = pid_nr(pid);
    p->tgid = p->pid;
    // Inherit the TGID when the new task joins the creator's thread group.
    // For an ordinary process TGID equals PID;
    // for threads, every thread in one thread group has the same TGID,
    // which lets them all obtain the same PID from getpid().
    if (clone_flags & CLONE_THREAD)
        p->tgid = current->tgid;

    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
    uprobe_copy_process(p);
    /*
     * sigaltstack should be cleared when sharing the same VM
     */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        p->sas_ss_sp = p->sas_ss_size = 0;

    /*
     * Syscall tracing and stepping should be turned off in the
     * child regardless of CLONE_PTRACE.
     */
    user_disable_single_step(p);
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
    clear_all_latency_tracing(p);

    /* ok, now we should be set up.. */
    if (clone_flags & CLONE_THREAD)
        p->exit_signal = -1;
    else if (clone_flags & CLONE_PARENT)
        p->exit_signal = current->group_leader->exit_signal;
    else
        p->exit_signal = (clone_flags & CSIGNAL);

    p->pdeath_signal = 0;
    p->exit_state = 0;

    p->nr_dirtied = 0;
    p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
    p->dirty_paused_when = 0;

    /*
     * Ok, make it visible to the rest of the system.
     * We dont wake it up yet.
     */
    p->group_leader = p;
    INIT_LIST_HEAD(&p->thread_group);
    p->task_works = NULL;

    /* Need tasklist lock for parent etc handling! */
    write_lock_irq(&tasklist_lock);

    // With either of these flags the new task gets the same parent as the creator
    if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
        p->real_parent = current->real_parent;
        p->parent_exec_id = current->parent_exec_id;
    } else { 
    // otherwise the creator itself becomes the parent
        p->real_parent = current;
        p->parent_exec_id = current->self_exec_id;
    }

    spin_lock(&current->sighand->siglock);

    /*
     * Process group and session signals need to be delivered to just the
     * parent before the fork or both the parent and the child after the
     * fork. Restart if a signal comes in before we add the new process to
     * it's process group.
     * A fatal signal pending means that current will exit, so the new
     * thread can't slip out of an OOM kill (or normal SIGKILL).
    */
    recalc_sigpending();
    if (signal_pending(current)) {
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        retval = -ERESTARTNOINTR;
        goto bad_fork_free_pid;
    }
    
    // The new task joins the creator's thread group
    if (clone_flags & CLONE_THREAD) {
        current->signal->nr_threads++;
        atomic_inc(&current->signal->live);
        atomic_inc(&current->signal->sigcnt);
        p->group_leader = current->group_leader;
        list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
    }

    if (likely(p->pid)) {
        ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); // ptrace-related initialisation
        
        // p is the leader of its thread group
        if (thread_group_leader(p)) {
            if (is_child_reaper(pid)) {
                ns_of_pid(pid)->child_reaper = p;
                p->signal->flags |= SIGNAL_UNKILLABLE;
            }

            p->signal->leader_pid = pid;
            p->signal->tty = tty_kref_get(current->signal->tty);
            
            /* Attach the task to the creator's process-group and session PIDs */
            attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
            attach_pid(p, PIDTYPE_SID, task_session(current));
            
            list_add_tail(&p->sibling, &p->real_parent->children);
            list_add_tail_rcu(&p->tasks, &init_task.tasks); // link into the task list headed by init_task
            __this_cpu_inc(process_counts); // increment the per-CPU process_counts
        }
        attach_pid(p, PIDTYPE_PID, pid); // attach the task to its own PID
        nr_threads++; // one more thread
    }

    total_forks++; // increment the total_forks counter
    spin_unlock(&current->sighand->siglock);
    write_unlock_irq(&tasklist_lock);
    proc_fork_connector(p);
    cgroup_post_fork(p);
    if (clone_flags & CLONE_THREAD)
        threadgroup_change_end(current);
    perf_event_fork(p);

    trace_task_newtask(p, clone_flags);

    return p;
}

dup_task_struct也在fork.c文件中

/*
 * Allocates a task_struct and a thread_info for a new task, duplicates orig
 * into them and resets the per-task fields set below.
 *
 * Returns the new task_struct.
 *
 * NOTE(review): abridged excerpt. The results of the two allocations and of
 * arch_dup_task_struct() (err) are not checked here.
 */
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
    struct task_struct *tsk; // the new task_struct
    struct thread_info *ti; // the new thread_info
    unsigned long *stackend; 
    int node = tsk_fork_get_node(orig); 
    int err;

    tsk = alloc_task_struct_node(node); // allocate space for the task_struct

    ti = alloc_thread_info_node(tsk, node); // allocate space for the thread_info

    err = arch_dup_task_struct(tsk, orig); // architecture-specific duplication of orig into tsk (the original note mentions the floating-point state)
    
    tsk->stack = ti; // the task's stack field points at the thread_info allocation

    setup_thread_stack(tsk, orig);
    clear_user_return_notifier(tsk);
    clear_tsk_need_resched(tsk);
    stackend = end_of_stack(tsk);
    *stackend = STACK_END_MAGIC;    /* for overflow detection */

#ifdef CONFIG_CC_STACKPROTECTOR
    tsk->stack_canary = get_random_int(); // set the stack canary, a defence against stack-overflow attacks
#endif

    /*
     * One for us, one for whoever does the "release_task()" (usually
     * parent)
     */
    atomic_set(&tsk->usage, 2); // set the usage count of the descriptor
#ifdef CONFIG_BLK_DEV_IO_TRACE
    tsk->btrace_seq = 0;
#endif
    tsk->splice_pipe = NULL;
    tsk->task_frag.page = NULL;

    account_kernel_stack(ti, 1);

    return tsk;
}

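　　通过上面的代码，可以总结出fork的工作的基本流程是：sys_fork调用do_fork；do_fork调用copy_process创建子进程——其中先由dup_task_struct复制出新的进程描述符，再由一系列copy_*函数根据clone_flags复制或共享父进程的各项资源，并为子进程分配PID；copy_process成功返回后，do_fork调用wake_up_new_task唤醒子进程，最后把子进程的PID作为返回值交给父进程。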

五.do_execve的分析

execve对应的内核服务例程位于fs/exec.c中。

/*
 * Service routine behind sys_execve(): executes a program.
 * filename: path of the file to execute
 * argv:     the argument strings passed to the system call
 * envp:     the environment strings passed to the system call
 *
 * NOTE(review): the original header described a `regs` parameter that this
 * signature does not have. This is an abridged excerpt with most error
 * checks removed: retval is last assigned from PTR_ERR(file), so that is
 * the value returned on the path shown here.
 */
static int do_execve_common(const char *filename,
                struct user_arg_ptr argv,
                struct user_arg_ptr envp)
{
    struct linux_binprm *bprm;
    struct file *file;
    struct files_struct *displaced;
    bool clear_in_exec;
    int retval;
    const struct cred *cred = current_cred(); 

    unshare_files(&displaced); 
    // Allocate a zeroed linux_binprm; it is filled in below with data about the new executable.
    bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); 

    retval = prepare_bprm_creds(bprm);

    retval = check_unsafe_exec(bprm); 
    clear_in_exec = retval;
    current->in_execve = 1;

    file = open_exec(filename); // open the executable (the original note adds: and read it into memory)
    retval = PTR_ERR(file);

    sched_exec(); // per the original note: pick the least-loaded CPU for the new program and move the current task there

    bprm->file = file;
    bprm->filename = filename;
    bprm->interp = filename;

    bprm_mm_init(bprm);

    bprm->argc = count(argv, MAX_ARG_STRINGS);

    bprm->envc = count(envp, MAX_ARG_STRINGS);
    
    // prepare_binprm() fills in the linux_binprm; per the original note it, in order:
    // a. checks that the file is executable.
    // b. initialises the e_uid and e_gid fields of bprm.
    // c. fills the buf field of bprm with the first 128 bytes of the executable.
    prepare_binprm(bprm); 
    
    /* Copy the file path name, the command-line arguments and the environment strings into one or more newly allocated page frames */
    copy_strings_kernel(1, &bprm->filename, bprm);
    bprm->exec = bprm->p;
    copy_strings(bprm->envc, envp, bprm);
    copy_strings(bprm->argc, argv, bprm);
    
    // Per the original note: scans the formats list and tries each element's
    // load_binary method, passing it bprm. The scan stops as soon as one
    // load_binary method accepts the file's executable format.
    search_binary_handler(bprm);

    /* Success: release bprm and return the code obtained from the load_binary method of the file's executable format. */
    current->fs->in_exec = 0;
    current->in_execve = 0;
    acct_update_integrals(current);
    free_bprm(bprm);
    if (displaced)
        put_files_struct(displaced);
    return retval;
}

下面我们看看load_elf_binary函数，该函数位于fs/binfmt_elf.c中

/*
 * Loads the ELF executable described by bprm into the current process.
 *
 * In order: validates the ELF header held in bprm->buf, reads the program
 * headers, opens the PT_INTERP interpreter if there is one, maps every
 * PT_LOAD segment, sets up the bss/brk area, loads the interpreter, builds
 * the ELF tables and finally calls start_thread() with the entry point.
 *
 * Returns 0 after start_thread(); the error paths return whatever retval
 * holds when they jump to the cleanup labels.
 *
 * NOTE(review): abridged excerpt. The kmalloc() results for loc and
 * elf_phdata and the result of flush_old_exec() are not checked, the first
 * few `goto out` statements run before retval is assigned, and the out_ret
 * label is never targeted.
 */
static int load_elf_binary(struct linux_binprm *bprm)
{
    struct file *interpreter = NULL; /* to shut gcc up */
     unsigned long load_addr = 0, load_bias = 0;
    int load_addr_set = 0;
    char * elf_interpreter = NULL;
    unsigned long error;
    struct elf_phdr *elf_ppnt, *elf_phdata;
    unsigned long elf_bss, elf_brk;
    int retval, i;
    unsigned int size;
    unsigned long elf_entry;
    unsigned long interp_load_addr = 0;
    unsigned long start_code, end_code, start_data, end_data;
    unsigned long reloc_func_desc __maybe_unused = 0;
    int executable_stack = EXSTACK_DEFAULT;
    unsigned long def_flags = 0;
    struct pt_regs *regs = current_pt_regs();
    struct {
        struct elfhdr elf_ex;
        struct elfhdr interp_elf_ex;
    } *loc;

    loc = kmalloc(sizeof(*loc), GFP_KERNEL);
    
    /* Take the executable's header from bprm->buf. Per the original note the header describes the program's segments and the shared libraries it needs. */
    loc->elf_ex = *((struct elfhdr *)bprm->buf);

    /* Consistency checks */
    if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
        goto out;

    if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
        goto out;
    if (!elf_check_arch(&loc->elf_ex))
        goto out;
    if (!bprm->file->f_op || !bprm->file->f_op->mmap)
        goto out;

    /* Read in all of the program-header information */
    /* NOTE(review): the result of this comparison is discarded; the excerpt appears to have lost the surrounding `if (...) goto out;`. */
    loc->elf_ex.e_phentsize != sizeof(struct elf_phdr);
    if (loc->elf_ex.e_phnum < 1 || loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr))
        goto out;
    size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr);
    retval = -ENOMEM;
    elf_phdata = kmalloc(size, GFP_KERNEL);

    retval = kernel_read(bprm->file, loc->elf_ex.e_phoff, (char *)elf_phdata, size);
    if (retval != size) {
        if (retval >= 0)
            retval = -EIO;
        goto out_free_ph;
    }

    elf_ppnt = elf_phdata;
    elf_bss = 0;
    elf_brk = 0;

    start_code = ~0UL;
    end_code = 0;
    start_data = 0;
    end_data = 0;

    /* First pass over the program headers: find and open the PT_INTERP interpreter, if any. */
    for (i = 0; i < loc->elf_ex.e_phnum; i++) {
        if (elf_ppnt->p_type == PT_INTERP) {
            /* This is the program interpreter used for
             * shared libraries - for now assume that this
             * is an a.out format binary
             */
            retval = -ENOEXEC;
            if (elf_ppnt->p_filesz > PATH_MAX || 
                elf_ppnt->p_filesz < 2)
                goto out_free_ph;

            retval = -ENOMEM;
            elf_interpreter = kmalloc(elf_ppnt->p_filesz,
                          GFP_KERNEL);
            if (!elf_interpreter)
                goto out_free_ph;

            retval = kernel_read(bprm->file, elf_ppnt->p_offset,
                         elf_interpreter,
                         elf_ppnt->p_filesz);
            if (retval != elf_ppnt->p_filesz) {
                if (retval >= 0)
                    retval = -EIO;
                goto out_free_interp;
            }
            /* make sure path is NULL terminated */
            retval = -ENOEXEC;
            if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
                goto out_free_interp;

            interpreter = open_exec(elf_interpreter);
            retval = PTR_ERR(interpreter);
            if (IS_ERR(interpreter))
                goto out_free_interp;

            /*
             * If the binary is not readable then enforce
             * mm->dumpable = 0 regardless of the interpreter's
             * permissions.
             */
            would_dump(bprm, interpreter);

            retval = kernel_read(interpreter, 0, bprm->buf,
                         BINPRM_BUF_SIZE);
            if (retval != BINPRM_BUF_SIZE) {
                if (retval >= 0)
                    retval = -EIO;
                goto out_free_dentry;
            }

            /* Get the exec headers */
            loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
            break;
        }
        elf_ppnt++;
    }

    /* Second pass: a PT_GNU_STACK header decides whether the stack is executable. */
    elf_ppnt = elf_phdata;
    for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
        if (elf_ppnt->p_type == PT_GNU_STACK) {
            if (elf_ppnt->p_flags & PF_X)
                executable_stack = EXSTACK_ENABLE_X;
            else
                executable_stack = EXSTACK_DISABLE_X;
            break;
        }

    /* Some simple consistency checks for the interpreter */
    if (elf_interpreter) {
        retval = -ELIBBAD;
        /* Not an ELF interpreter */
        if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
            goto out_free_dentry;
        /* Verify the interpreter has a valid arch */
        if (!elf_check_arch(&loc->interp_elf_ex))
            goto out_free_dentry;
    }

    // Per the original note: releases almost all resources used by the previous computation.
    retval = flush_old_exec(bprm);

    /* OK, This is the point of no return */
    current->mm->def_flags = def_flags;

    /* Do this immediately, since STACK_TOP as used in setup_arg_pages
       may depend on the personality.  */
    SET_PERSONALITY(loc->elf_ex);
    if (elf_read_implies_exec(loc->elf_ex, executable_stack))
        current->personality |= READ_IMPLIES_EXEC;

    if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
        current->flags |= PF_RANDOMIZE;

    setup_new_exec(bprm);

    /* Do this so that we can load the interpreter, if need be.  We will
       change some of these later */
    current->mm->free_area_cache = current->mm->mmap_base;
    current->mm->cached_hole_size = 0;
    // Per the original note: allocates a new memory-region descriptor for the
    // process's user-mode stack and inserts that region into the address space.
    setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack);
    
    
    current->mm->start_stack = bprm->p;

    /* Now map the ELF image into the correct locations in memory */
    for(i = 0, elf_ppnt = elf_phdata;
        i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
        int elf_prot = 0, elf_flags;
        unsigned long k, vaddr;

        if (elf_ppnt->p_type != PT_LOAD)
            continue;

        if (unlikely (elf_brk > elf_bss)) {
            unsigned long nbyte;
                
            /* There was a PT_LOAD segment with p_memsz > p_filesz
               before this one. Map anonymous pages, if needed,
               and clear the area.  */
            retval = set_brk(elf_bss + load_bias,
                     elf_brk + load_bias);
            if (retval) {
                send_sig(SIGKILL, current, 0);
                goto out_free_dentry;
            }
            nbyte = ELF_PAGEOFFSET(elf_bss);
            if (nbyte) {
                nbyte = ELF_MIN_ALIGN - nbyte;
                if (nbyte > elf_brk - elf_bss)
                    nbyte = elf_brk - elf_bss;
                if (clear_user((void __user *)elf_bss +
                            load_bias, nbyte)) {
                    /*
                     * This bss-zeroing can fail if the ELF
                     * file specifies odd protections. So
                     * we don't check the return value
                     */
                }
            }
        }

        /* Translate the segment's PF_* flags into PROT_* mapping protections. */
        if (elf_ppnt->p_flags & PF_R)
            elf_prot |= PROT_READ;
        if (elf_ppnt->p_flags & PF_W)
            elf_prot |= PROT_WRITE;
        if (elf_ppnt->p_flags & PF_X)
            elf_prot |= PROT_EXEC;

        elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;

        vaddr = elf_ppnt->p_vaddr;
        if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
            elf_flags |= MAP_FIXED;
        } else if (loc->elf_ex.e_type == ET_DYN) {
            /* Try and get dynamic programs out of the way of the
             * default mmap base, as well as whatever program they
             * might try to exec.  This is because the brk will
             * follow the loader, and is not movable.  */
#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
            /* Memory randomization might have been switched off
             * in runtime via sysctl.
             * If that is the case, retain the original non-zero
             * load_bias value in order to establish proper
             * non-randomized mappings.
             */
            if (current->flags & PF_RANDOMIZE)
                load_bias = 0;
            else
                load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
#else
            load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
#endif
        }

        error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
                elf_prot, elf_flags, 0);
        if (BAD_ADDR(error)) {
            send_sig(SIGKILL, current, 0);
            retval = IS_ERR((void *)error) ?
                PTR_ERR((void*)error) : -EINVAL;
            goto out_free_dentry;
        }

        /* The first mapped PT_LOAD segment fixes load_addr (and, for ET_DYN, the final load_bias). */
        if (!load_addr_set) {
            load_addr_set = 1;
            load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
            if (loc->elf_ex.e_type == ET_DYN) {
                load_bias += error -
                             ELF_PAGESTART(load_bias + vaddr);
                load_addr += load_bias;
                reloc_func_desc = load_bias;
            }
        }
        k = elf_ppnt->p_vaddr;
        if (k < start_code)
            start_code = k;
        if (start_data < k)
            start_data = k;

        /*
         * Check to see if the section's size will overflow the
         * allowed task size. Note that p_filesz must always be
         * <= p_memsz so it is only necessary to check p_memsz.
         */
        if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
            elf_ppnt->p_memsz > TASK_SIZE ||
            TASK_SIZE - elf_ppnt->p_memsz < k) {
            /* set_brk can never work. Avoid overflows. */
            send_sig(SIGKILL, current, 0);
            retval = -EINVAL;
            goto out_free_dentry;
        }

        /* Track the highest file-backed end (bss start, code/data ends) and the highest in-memory end (brk). */
        k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;

        if (k > elf_bss)
            elf_bss = k;
        if ((elf_ppnt->p_flags & PF_X) && end_code < k)
            end_code = k;
        if (end_data < k)
            end_data = k;
        k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
        if (k > elf_brk)
            elf_brk = k;
    }

    /* Shift every recorded address by the load bias that was applied to the mappings. */
    loc->elf_ex.e_entry += load_bias;
    elf_bss += load_bias;
    elf_brk += load_bias;
    start_code += load_bias;
    end_code += load_bias;
    start_data += load_bias;
    end_data += load_bias;

    /* Calling set_brk effectively mmaps the pages that we need
     * for the bss and break sections.  We must do this before
     * mapping in the interpreter, to make sure it doesn't wind
     * up getting placed where the bss needs to go.
     */
    retval = set_brk(elf_bss, elf_brk);
    if (retval) {
        send_sig(SIGKILL, current, 0);
        goto out_free_dentry;
    }
    if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
        send_sig(SIGSEGV, current, 0);
        retval = -EFAULT; /* Nobody gets to see this, but.. */
        goto out_free_dentry;
    }
    // Call the function that loads the dynamic linker. When the dynamic linker
    // is an ELF executable, that function is load_elf_interp().
    if (elf_interpreter) {
        unsigned long interp_map_addr = 0;
        
        elf_entry = load_elf_interp(&loc->interp_elf_ex,
                        interpreter,
                        &interp_map_addr,
                        load_bias);
        if (!IS_ERR((void *)elf_entry)) {
            /*
             * load_elf_interp() returns relocation
             * adjustment
             */
            interp_load_addr = elf_entry;
            elf_entry += loc->interp_elf_ex.e_entry;
        }
        if (BAD_ADDR(elf_entry)) {
            force_sig(SIGSEGV, current);
            retval = IS_ERR((void *)elf_entry) ?
                    (int)elf_entry : -EINVAL;
            goto out_free_dentry;
        }
        reloc_func_desc = interp_load_addr;

        allow_write_access(interpreter);
        fput(interpreter);
        kfree(elf_interpreter);
    } else {
        /* No interpreter: execution starts at the executable's own entry point. */
        elf_entry = loc->elf_ex.e_entry;
        if (BAD_ADDR(elf_entry)) {
            force_sig(SIGSEGV, current);
            retval = -EINVAL;
            goto out_free_dentry;
        }
    }

    kfree(elf_phdata);
    // Per the original note: stores the address of this executable format's
    // linux_binfmt object in the binfmt field of the process descriptor.
    set_binfmt(&elf_format);

#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
    retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
    if (retval < 0) {
        send_sig(SIGKILL, current, 0);
        goto out;
    }
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */

    install_exec_creds(bprm);
    retval = create_elf_tables(bprm, &loc->elf_ex,
              load_addr, interp_load_addr);
    if (retval < 0) {
        send_sig(SIGKILL, current, 0);
        goto out;
    }
    /* N.B. passed_fileno might not be initialized? */
    current->mm->end_code = end_code;
    current->mm->start_code = start_code;
    current->mm->start_data = start_data;
    current->mm->end_data = end_data;
    current->mm->start_stack = bprm->p;

#ifdef arch_randomize_brk
    if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
        current->mm->brk = current->mm->start_brk =
            arch_randomize_brk(current->mm);
#ifdef CONFIG_COMPAT_BRK
        current->brk_randomized = 1;
#endif
    }
#endif

    if (current->personality & MMAP_PAGE_ZERO) {
        /* Why this, you ask???  Well SVr4 maps page 0 as read-only,
           and some applications "depend" upon this behavior.
           Since we do not have the power to recompile these, we
           emulate the SVr4 behavior. Sigh. */
        error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
                MAP_FIXED | MAP_PRIVATE, 0);
    }

#ifdef ELF_PLAT_INIT
    /*
     * The ABI may specify that certain registers be set up in special
     * ways (on i386 %edx is the address of a DT_FINI function, for
     * example.  In addition, it may also specify (eg, PowerPC64 ELF)
     * that the e_entry field is the address of the function descriptor
     * for the startup routine, rather than the address of the startup
     * routine itself.  This macro performs whatever initialization to
     * the regs structure is required as well as any relocations to the
     * function descriptor entries when executing dynamically links apps.
     */
    ELF_PLAT_INIT(regs, reloc_func_desc);
#endif

    start_thread(regs, elf_entry, bprm->p);
    retval = 0;
out:
    kfree(loc);
out_ret:
    return retval;

    /* error cleanup */
out_free_dentry:
    allow_write_access(interpreter);
    if (interpreter)
        fput(interpreter);
out_free_interp:
    kfree(elf_interpreter);
out_free_ph:
    kfree(elf_phdata);
    goto out;
}

相关阅读:
AtCoder Beginner Contest 205
Codeforces Round #725 (Div. 3)
Educational Codeforces Round 110 (Rated for Div. 2)【A
Codeforces Round #722 (Div. 2)
AtCoder Beginner Contest 203（Sponsored by Panasonic）
AISing Programming Contest 2021（AtCoder Beginner Contest 202）
PTA 520 钻石争霸赛 2021
Educational Codeforces Round 109 (Rated for Div. 2)【ABCD】
AtCoder Beginner Contest 200 E
Educational Codeforces Round 108 (Rated for Div. 2)【ABCD】
原文地址：https://www.cnblogs.com/sj20082663/p/3108587.html