进程管理
进程:处于执行期的程序。
线程:在进程中活动的对象
虚拟机制
虚拟处理器:多个进程分享一个处理器
虚拟内存:多个线程共享虚拟内存
一、进程描述符和任务结构
进程存放在双向循环链表中(队列),链表中的项为task_struct,称为进程描述符。在头文件<linux/sched.h>中。
struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; atomic_t usage; unsigned int flags; /* per process flags, defined below */ unsigned int ptrace; #ifdef CONFIG_SMP struct task_struct *wake_entry; int on_cpu; #endif int on_rq; int prio, static_prio, normal_prio; unsigned int rt_priority; const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; #ifdef CONFIG_PREEMPT_NOTIFIERS /* list of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif /* * fpu_counter contains the number of consecutive context switches * that the FPU is used. If this is over a threshold, the lazy fpu * saving becomes unlazy to save the trap. This is an unsigned char * so that after 256 times the counter wraps and the behavior turns * lazy again; this to deal with bursty apps that only use FPU for * a short time */ unsigned char fpu_counter; #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; cpumask_t cpus_allowed; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; char rcu_read_unlock_special; #if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU) int rcu_boosted; #endif /* #if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU) */ struct list_head rcu_node_entry; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST struct rt_mutex *rcu_boost_mutex; #endif /* #ifdef CONFIG_RCU_BOOST */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; #endif struct list_head tasks; #ifdef CONFIG_SMP struct plist_node pushable_tasks; #endif struct mm_struct *mm, *active_mm; #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #if defined(SPLIT_RSS_COUNTING) struct task_rss_stat rss_stat; #endif /* task state */ int exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ unsigned int group_stop; /* GROUP_STOP_*, siglock protected */ /* ??? */ unsigned int personality; unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ unsigned in_iowait:1; /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; pid_t pid; pid_t tgid; #ifdef CONFIG_CC_STACKPROTECTOR /* Canary value for the -fstack-protector gcc feature */ unsigned long stack_canary; #endif /* * pointers to (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ struct task_struct *real_parent; /* real parent process */ struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */ /* * children/sibling forms the list of my natural children */ struct list_head children; /* list of my children */ struct list_head sibling; /* linkage in my parent's children list */ struct task_struct *group_leader; /* threadgroup leader */ /* * ptraced is the list of tasks this task is using ptrace on. * This includes both natural children and PTRACE_ATTACH targets. * p->ptrace_entry is p's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; struct completion *vfork_done; /* for vfork() */ int __user *set_child_tid; /* CLONE_CHILD_SETTID */ int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING cputime_t prev_utime, prev_stime; #endif unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; /* monotonic time */ struct timespec real_start_time; /* boot based time */ /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; struct task_cputime cputime_expires; struct list_head cpu_timers[3]; /* process credentials */ const struct cred __rcu *real_cred; /* objective and real subjective task * credentials (COW) */ const struct cred __rcu *cred; /* effective (overridable) subjective task * credentials (COW) */ struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock it with task_lock()) - initialized normally by setup_new_exec */ /* file system info */ int link_count, total_link_count; #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ unsigned long last_switch_count; #endif /* CPU-specific state of this task */ struct thread_struct thread; /* filesystem information */ struct fs_struct *fs; /* open file information */ struct files_struct *files; /* namespaces */ struct nsproxy *nsproxy; /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; sigset_t blocked, real_blocked; sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL uid_t loginuid; unsigned int sessionid; #endif seccomp_t seccomp; /* Thread group tracking */ u32 parent_exec_id; u32 self_exec_id; /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, * mempolicy */ spinlock_t alloc_lock; #ifdef CONFIG_GENERIC_HARDIRQS /* IRQ handler threads */ struct irqaction *irqaction; #endif /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ struct plist_head pi_waiters; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; #endif #ifdef CONFIG_DEBUG_MUTEXES /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; unsigned long hardirq_enable_ip; unsigned long hardirq_disable_ip; unsigned int hardirq_enable_event; unsigned int hardirq_disable_event; int hardirqs_enabled; int hardirq_context; unsigned long softirq_disable_ip; unsigned long softirq_enable_ip; unsigned int softirq_disable_event; unsigned int softirq_enable_event; int softirqs_enabled; int softirq_context; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; gfp_t lockdep_reclaim_gfp; #endif /* journalling filesystem info */ void *journal_info; /* stacked block device info */ struct bio_list *bio_list; #ifdef CONFIG_BLOCK /* stack plugging */ struct blk_plug *plug; #endif /* VM state */ struct reclaim_state *reclaim_state; struct backing_dev_info *backing_dev_info; struct io_context *io_context; unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ struct task_io_accounting ioac; #if defined(CONFIG_TASK_XACCT) u64 acct_rss_mem1; /* accumulated rss usage */ u64 acct_vm_mem1; /* accumulated virtual memory usage */ cputime_t acct_timexpd; /* stime + utime since last update */ #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; /* Protected by alloc_lock */ int mems_allowed_change_disable; int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock */ struct list_head cg_list; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif #ifdef CONFIG_PERF_EVENTS struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; struct mutex perf_event_mutex; struct list_head perf_event_list; #endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; short pref_node_fork; #endif atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; /* * cache last used pipe for splice */ struct pipe_inode_info *splice_pipe; #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; #endif struct prop_local_single dirties; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ unsigned long timer_slack_ns; unsigned long default_timer_slack_ns; struct list_head *scm_work_list; #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack */ int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack *ret_stack; /* time stamp for last schedule */ unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun. */ atomic_t trace_overrun; /* Pause for the tracing */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* state flags for use by tracers */ unsigned long trace; /* bitmask and counter of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ struct memcg_batch_info { int do_batch; /* incremented when batch uncharge started */ struct mem_cgroup *memcg; /* target memcg of uncharge */ unsigned long nr_pages; /* uncharged usage */ unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ } memcg_batch; #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT atomic_t ptrace_bp_refcnt; #endif };
slab分配器分配task_struct结构,2.6后slab分配器动态生成task struct,只需要在栈底(向下增长的栈)或栈顶(向上增长的栈)创建一个新的结构struct thread_info。
在头文件<asm/thread_info.h>中,找了半天在arch/arm/include/asm/thread_info.h:
/* * low level task data that entry.S needs immediate access to. * __switch_to() assumes cpu_context follows immediately after cpu_domain. */ struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ __u32 cpu; /* cpu */ __u32 cpu_domain; /* cpu domain */ struct cpu_context_save cpu_context; /* cpu context */ __u32 syscall; /* syscall number */ __u8 used_cp[16]; /* thread used copro */ unsigned long tp_value; struct crunch_state crunchstate; union fp_state fpstate __attribute__((aligned(8))); union vfp_state vfpstate; #ifdef CONFIG_ARM_THUMBEE unsigned long thumbee_state; /* ThumbEE Handler Base register */ #endif struct restart_block restart_block; };
在内核栈的尾端,分配了threa_info,结构体task域存放指向该任务的实际task_struct指针。
二、进程描述符
PID是一个数,实际上是int类型。内核把各自的进程的PID放在各自的进程描述符中。
修改进程最大限制可以在文件<linux/threads.h>修改,或者修改/proc/sys/kernel/pid_max来提高上限
2.1 进程状态
TASK_RUNNING:进程使可运行的
TASK_INTERRUPTIBLE:进程正在睡眠
TASK_UNINTERRUPTIBLE:进程就算收到信号也不会唤醒
__TASK_TRACED:被其他进程跟踪的进程
__TASK_STOPPED:进程停止执行
2.2 进程状态设置
使用函数调整某个进程的状态
set_task_state(task, state); /* 将任务task的状态设置为state */
进程上下文:当一个程序执行了系统调用,或触发了某个异常,就陷入了内核空间。此时,内核代表进程执行并处于进程上下文中。
进程家族树:拥有同一个父进程的所有进程被称为兄弟。
每个task_struct都包含一个指向其父进程task_struct叫做parent的指针,还包含一个称为children的子进程链表。
/* 获取其父进程的进程描述符 */ struct task_struct *my_parent = current->parent; /* 一次访问子进程 */ struct task_struct *task; struct list_head *list; list_for_each(list, ¤t->children) { task = list_entry(list, struct task_struct, sibling); /* task现在是指向当前的某个子进程 */ } /* 访问init */ struct task_struct *task; for(task = current; task != &init_task; task = task->parent) ; /* 遍历系统中所有进程 */ list_entry(task->tasks.next, struct task_struct, tasks) list_entry(task->tasks.prev, struct task_struct, tasks) struct task_struct *task; for_each_process(task) { /* 它打印出每一个任务的名称和PID */ printk("%s[%d] ", task->comm, task->pid); } /* 注意:代价极大*/
三、进程创建
首先,fork()通过拷贝当前进程创建要给子进程。子进程与父进程的区别仅仅在于PID和PPID,和某些资源和统计量(如挂起信号)。
exec()负责读取可执行我呢见并将其载入地址空间开始运行。
写时拷贝:一种可以推迟甚至免除拷贝数据的技术。内核并不复制整个进程地址空间,而是父子进程共享同一个拷贝。
只有在要写入时,数据才会被复制。在此之前,都是只读方式共享。
3.1 fork()
Linux通过clone()系统调用实现fork()。do_fork()完成创建中的大部分工作,定义在kernel/fork.c中。
该函数调用copy_process()函数,然后让进程开始运行。
/* * Ok, this is the main fork-routine. * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ long do_fork(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr) { struct task_struct *p; int trace = 0; long nr; /* * Do some preliminary argument and permissions checking before we * actually start allocating stuff */ if (clone_flags & CLONE_NEWUSER) { if (clone_flags & CLONE_THREAD) return -EINVAL; /* hopefully this check will go away when userns support is * complete */ if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || !capable(CAP_SETGID)) return -EPERM; } /* * When called from kernel_thread, don't do user tracing stuff. */ if (likely(user_mode(regs))) trace = tracehook_prepare_clone(clone_flags); /* 1) 调用copy_process()函数,然后让进程开始执行 */ p = copy_process(clone_flags, stack_start, regs, stack_size, child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ if (!IS_ERR(p)) { struct completion vfork; trace_sched_process_fork(current, p); nr = task_pid_vnr(p); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); } audit_finish_fork(p); tracehook_report_clone(regs, clone_flags, nr, p); /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that * hasn't gotten to tracehook_report_clone() yet. Now we * clear it and set the child going. */ p->flags &= ~PF_STARTING; wake_up_new_task(p); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); tracehook_report_vfork_done(p, nr); } } else { nr = PTR_ERR(p); } return nr; } /* * This creates a new process as a copy of the old one, * but does not actually start it yet. * * It copies the registers, and all the appropriate * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, int trace) { int retval; struct task_struct *p; int cgroup_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); /* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings. */ if ((clone_flags & CLONE_PARENT) && current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); retval = security_task_create(clone_flags); if (retval) goto fork_out; /* 1) 调用dup_task_struct()为新进程创建一个内核栈、thread_info结构和task_struct,这些值 * 与当前进程的值相同。此时,子进程和父进程的描述符是完全相同的。 */ retval = -ENOMEM; p = dup_task_struct(current); if (!p) goto fork_out; ftrace_graph_init_task(p); rt_mutex_init_task(p); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif /* 第一个if检查用户的进程数量是否超过限制 * 第二个if检查用户是否有足够的权限操作 */ retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->real_cred->user != INIT_USER) goto bad_fork_free; } retval = copy_creds(p, clone_flags); if (retval < 0) goto bad_fork_free; /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. * 2) 检查并确保新创建的这个子进程后, * 当前用户所拥有的进程数目没有超出给它分配的资源限制 */ retval = -EAGAIN; if (nr_threads >= max_threads) goto bad_fork_cleanup_count; /* 确认exec_domain模块是否被加载 */ if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ /* 5) 调用copy_flags()以更新task_struct的flags成员, * 表明进程是否拥有超级用户权限的PF_SUPERPRIV标志被清0。 * 表明进程还没有调用exec()函数的PF_FORKNOEXEC标志被设备 */ copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); p->utime = cputime_zero; p->stime = cputime_zero; p->gtime = cputime_zero; p->utimescaled = cputime_zero; p->stimescaled = cputime_zero; #ifndef CONFIG_VIRT_CPU_ACCOUNTING p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif p->default_timer_slack_ns = current->timer_slack_ns; task_io_accounting_init(&p->ioac); acct_clear_integrals(p); posix_cpu_timers_init(p); do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) threadgroup_fork_read_lock(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; goto bad_fork_cleanup_cgroup; } mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW p->hardirqs_enabled = 1; #else p->hardirqs_enabled = 0; #endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; p->hardirq_disable_event = 0; p->softirqs_enabled = 1; p->softirq_enable_ip = _THIS_IP_; p->softirq_enable_event = 0; p->softirq_disable_ip = 0; p->softirq_disable_event = 0; p->hardirq_context = 0; p->softirq_context = 0; #endif #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; #endif #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif #ifdef CONFIG_CGROUP_MEM_RES_CTLR p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p); retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) goto bad_fork_cleanup_policy; /* 下面的操作中,根据flag中是否设置了相关标志 * 进行重新分配或者共享父进程的内容 */ /* copy all the process information */ if ((retval = copy_semundo(clone_flags, p))) goto bad_fork_cleanup_audit; if ((retval = copy_files(clone_flags, p))) goto bad_fork_cleanup_semundo; if ((retval = copy_fs(clone_flags, p))) goto bad_fork_cleanup_files; if ((retval = copy_sighand(clone_flags, p))) goto bad_fork_cleanup_fs; if ((retval = copy_signal(clone_flags, p))) goto bad_fork_cleanup_sighand; if ((retval = copy_mm(clone_flags, p))) goto bad_fork_cleanup_signal; if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; /* 更新子进程的内核栈和寄存器中的值 */ retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; /* 6)为新进程分配一个有效的PID */ if (pid != &init_struct_pid) { retval = -ENOMEM; pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; } p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif /* * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) p->sas_ss_sp = p->sas_ss_size = 0; /* * Syscall tracing and stepping should be turned off in the * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif clear_all_latency_tracing(p); /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; p->exit_state = 0; /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible * on the tasklist. */ cgroup_fork_callbacks(p); cgroup_callbacks_done = 1; /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; } spin_lock(¤t->sighand->siglock); /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to * it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). */ recalc_sigpending(); if (signal_pending(current)) { spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_free_pid; } if (clone_flags & CLONE_THREAD) { current->signal->nr_threads++; atomic_inc(¤t->signal->live); atomic_inc(¤t->signal->sigcnt); p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } /* 8)做一些扫尾工作,并返回一个指向子进程的指针 */ if (likely(p->pid)) { tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { if (is_child_reaper(pid)) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); __this_cpu_inc(process_counts); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; } total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) threadgroup_fork_read_unlock(current); perf_event_fork(p); return p; bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { task_lock(p); if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) atomic_dec(&p->mm->oom_disable_count); task_unlock(p); mmput(p->mm); } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif if (clone_flags & CLONE_THREAD) threadgroup_fork_read_unlock(current); cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: free_task(p); fork_out: return ERR_PTR(retval); } NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; int group_dead; profile_task_exit(tsk); WARN_ON(atomic_read(&tsk->fs_excl)); WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); /* * If do_exit is called because this processes oopsed, it's possible * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before * continuing. Amongst other possible reasons, this is to prevent * mm_release()->clear_child_tid() from writing to a user-controlled * kernel address. */ set_fs(USER_DS); tracehook_report_exit(&code); validate_creds_for_do_exit(tsk); /* * We're taking recursive faults here in do_exit. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { printk(KERN_ALERT "Fixing recursive fault but reboot is needed! "); /* * We can do this unlocked here. The futex code uses * this flag just to verify whether the pi state * cleanup has been done or not. In the worst case it * loops once more. We pretend that the cleanup was * done as there is no way to return. Either the * OWNER_DIED bit is set by now or we push the blocked * task into the wait for ever nirwana as well. */ tsk->flags |= PF_EXITPIDONE; set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } exit_irq_thread(); /* 1)将task_struct中的标志成员设置为PF_EXTING */ exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against * an exiting task cleaning up the robust pi futexes. */ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d ", current->comm, task_pid_nr(current), preempt_count()); acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk, tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); if (unlikely(tsk->audit_context)) audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); /* 4)释放进程占用的mm_struct,如果没有别的进程使用它们,就彻底释放它们 */ exit_mm(tsk); if (group_dead) acct_process(); trace_sched_process_exit(tsk); /* 5)如果进程派对等候IPC信号,则离开队列 */ exit_sem(tsk); /* 6)分别递减文件描述符、文件系统数据的引用计数 * 如果其中某个引用计数的数值降为零, * 那么就代表没有进程在使用相应的资源,可以释放 */ exit_files(tsk); exit_fs(tsk); check_stack_usage(); exit_thread(); /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. * * because of cgroup mode, must be called before cgroup_exit() */ /* 把存放task_struct的exit_code成员中的任务退出代码,置为由exit()提供的退出代码 * 或者去完成任何其他由内核机制规定的退出动作。 * 退出代码存放在这里供父进程随时检索 */ perf_event_exit_task(tsk); cgroup_exit(tsk, 1); if (group_dead) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); proc_exit_connector(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ ptrace_put_breakpoints(tsk); /* 8)向父进程发送信号,给子进程重新找养父,养父为线程组中的其他线程或者init进程, * 并把进程状态设成EXIT_ZOMBIE */ exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA task_lock(tsk); mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; task_unlock(tsk); #endif #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(tsk); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done * or not. In the worst case it loops once more. */ tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); validate_creds_for_do_exit(tsk); preempt_disable(); exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; /* 切换到新的进程。因为处于EXIT_ZOMBIE状态 * 进程不会再被调度,所以这是进程所执行的最后一段代码 */ schedule(); BUG(); /* Avoid "noreturn function does return". */ for (;;) cpu_relax(); /* For when BUG is null */ }
3.2 创建线程
和创建进程类似,使用clone()的时候需要一些参数标志来指明共享资源。
clone(CLONE_VM|CLONE|FS|CLONE_FILES|CLONE_SIGHAND, 0);
这些标志在<linux/sched.h>中定义的
CLONE_FILES 父子进程共享打开文件
CLONE_FS 父子进程共享文件系统信息
CLONE_IDLETASK 将PID设置为0(只供idle进程使用)
CLONE_NEWNS 为子进程创建新的命名空间
CLONE_PARENT 指定子进程与父进程拥有同一个父进程
CLONE_PTRACE 继续调试子进程
CLONE_SETTID 将TID回写至用户空间
CLONE_SETTLS 为子进程创建新的TLS
CLONE_SIGHAND 父子进程共享信号处理函数及被阻断的信号
CLONE_SYSVSEM 父子进程共享system V SEM_UNDO语义
CLONE_THREAD 父子进程放入相同的线程组
CLONE_VFORK 调用vfork(),所以父进程准备睡眠等待子进程将其唤醒
CLONE_UNTRACED 防止跟踪进程在子进程上强制执行CLONE_PTRACE
CLONE_STOP 以TASK_STOPPED状态开始进程
CLONE_SETTLS 为子进程创建新的TLS
CLONE_CHILD_CLEARTID 清除子进程TID
CLONE_CHILD_SETTID 设置子进程TID
CLONE_PARENT_SETTID 设置父进程TID
CLONE_VM 父子进程共享地址空间
3.3 内核线程
内核线程和普通线程的区别在于内核线程没有独立的地址空间,并且只运行在内核空间,同样可以被调度和抢占。
在装有Linux系统上的运行ps -ef就可以看到内核线程了,在<linux/kthread.h>中声明有接口。创建一个新的内核线程的方法:
struct task_struct *kthread_create(int (*threadfn)(void *data), void *data, const char namefmt[], ...) threadfn:运行的函数 data:给其传递的参数为data namefmt:接受可变参数列表,类似于printf()
新创建的线程处于不可运行状态,需要通过wake_up_process()明确的唤醒,才会主动运行。
两步一起的函数为:
struct task_struct *kthread_run(int (*threadfn)(void *data), void *data, const char namefmt[], ...)
内核线程启动后一直运行直到调用do_exit(),或者kthread_stop(),参数是创建时的task_struct结构地址。
int kthread_stop(struct task_struct *k)
四、进程终结
大部分任务依靠do_exit()来终结,定义于<kernel/exit.c>。
NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; int group_dead; profile_task_exit(tsk); WARN_ON(atomic_read(&tsk->fs_excl)); WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); /* * If do_exit is called because this processes oopsed, it's possible * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before * continuing. Amongst other possible reasons, this is to prevent * mm_release()->clear_child_tid() from writing to a user-controlled * kernel address. */ set_fs(USER_DS); tracehook_report_exit(&code); validate_creds_for_do_exit(tsk); /* * We're taking recursive faults here in do_exit. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { printk(KERN_ALERT "Fixing recursive fault but reboot is needed! "); /* * We can do this unlocked here. The futex code uses * this flag just to verify whether the pi state * cleanup has been done or not. In the worst case it * loops once more. We pretend that the cleanup was * done as there is no way to return. Either the * OWNER_DIED bit is set by now or we push the blocked * task into the wait for ever nirwana as well. */ tsk->flags |= PF_EXITPIDONE; set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } exit_irq_thread(); /* 1)设置进程标志为PF_EXITING */ exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against * an exiting task cleaning up the robust pi futexes. */ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d ", current->comm, task_pid_nr(current), preempt_count()); acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk, tsk->mm); /* 2)调用del_timer_sync()删除任一内核定时器 */ group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); if (unlikely(tsk->audit_context)) audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); /* 4)调用exit_mm()释放进程占用的mm_struct,如果没有别的进程使用它们 */ exit_mm(tsk); if (group_dead) acct_process(); trace_sched_process_exit(tsk); /* 5)如果进程派对等候IPC信号,它则离开队列 */ exit_sem(tsk); /* 6)以分别递减文件描述符、文件系统数据的引用计数。如果其中某个引用计数的数值降为零 * 那么就代表没有进程在使用相应的资源,此时可以释放 */ exit_files(tsk); exit_fs(tsk); check_stack_usage(); exit_thread(); /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. * * because of cgroup mode, must be called before cgroup_exit() */ perf_event_exit_task(tsk); cgroup_exit(tsk, 1); if (group_dead) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); proc_exit_connector(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ ptrace_put_breakpoints(tsk); /* 8)向父进程发送信号,给子进程重新找养父,养父为线程组中的其他线程或者位init线程 * 并把进程状态设为EXIT_ZOMBIE */ exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA task_lock(tsk); mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; task_unlock(tsk); #endif #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(tsk); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done * or not. In the worst case it loops once more. */ tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); validate_creds_for_do_exit(tsk); preempt_disable(); exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; /* 调用schedule切换到新的进程,处于ZOMBIE不会再被调度 */ schedule(); BUG(); /* Avoid "noreturn function does return". */ for (;;) cpu_relax(); /* For when BUG is null */ }
4.1 删除进程描述符
调用了do_exit()后,在父进程获得已结束的子进程信息后,或者通知内核它并不关注这些信息后,task_struct结构才被释放。
当最终释放进程描述符时,release_task()或被调用:
void release_task(struct task_struct * p) { struct task_struct *leader; int zap_leader; repeat: tracehook_prepare_release_task(p); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); atomic_dec(&__task_cred(p)->user->processes); rcu_read_unlock(); proc_flush_task(p); write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); /* 1)该函数调用_unhash_process(),后者调用detach_pid()从pidhash * 上删除该进程,同时也要从任务列表中删除该进程 * 2)释放目前僵死进程所使用的所有剩余资源,并进行最终统计和记录 */ __exit_signal(p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the * group leader's parent process. (if it wants notification.) */ /* 3)如果这个进程是线程组最后一个进程,并且领头进程已经死掉, * 那么release_task()就要通知僵死的头领进程的父进程 */ zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { BUG_ON(task_detached(leader)); do_notify_parent(leader, leader->exit_signal); /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. * * do_notify_parent() will have marked it self-reaping in * that case. */ zap_leader = task_detached(leader); /* * This maintains the invariant that release_task() * only runs on a task in EXIT_DEAD, just for sanity. */ if (zap_leader) leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); /* 4)释放进程内核栈和thread_info结构所占的页, * 并释放task_struct所占的slab高速缓存 */ release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); p = leader; if (unlikely(zap_leader)) goto repeat; }