Linux SMP Multi-Core Call Examples
On a multi-core system, the kernel comes up on a single core after boot and is thereafter scheduled across all cores. A kernel module's (driver's) init therefore runs exactly once, on one core. If we need to run code on every core, or on one specific core, we need the multi-core call interfaces provided by SMP.
SMP-related APIs
The multi-core call functions and their related data structures are declared in linux/smp.h.
Run func on every core; info is the argument passed to func.
void on_each_cpu(smp_call_func_t func, void *info, int wait);
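For illustration, a minimal sketch of a module init that runs a function on every core (the module and function names are hypothetical; func runs in IPI context with interrupts disabled, so it must be short and must not sleep):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/smp.h>

/* Runs on each core in IPI context: must not sleep. */
static void show_cpu(void *info)
{
    pr_info("smp demo: running on cpu %d\n", smp_processor_id());
}

static int __init smp_demo_init(void)
{
    /* wait = 1: block until every core has finished show_cpu().
     * on_each_cpu() also runs the function on the calling core. */
    on_each_cpu(show_cpu, NULL, 1);
    return 0;
}

static void __exit smp_demo_exit(void)
{
}

module_init(smp_demo_init);
module_exit(smp_demo_exit);
MODULE_LICENSE("GPL");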
Run func on every core in the given cpumask mask; info is the argument passed to func.
void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait);
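A sketch of restricting the call to a caller-built mask, reusing the hypothetical show_cpu() above (picking cpus 0 and 2 is an arbitrary choice for illustration):

#include <linux/cpumask.h>
#include <linux/gfp.h>

static void run_on_selected_cpus(void)
{
    cpumask_var_t mask;

    if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
        return;

    cpumask_set_cpu(0, mask);
    cpumask_set_cpu(2, mask);

    /* wait = true: return only after both cores have run show_cpu() */
    on_each_cpu_mask(mask, show_cpu, NULL, true);

    free_cpumask_var(mask);
}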
Run func on every core except the calling one (see the combined sketch after smp_call_function_single below).
void smp_call_function(smp_call_func_t func, void *info, int wait);
Run func on the cores in the given cpumask, excluding the calling core.
void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait);
Run func once, on any one of the cores in the given cpumask.
int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait);
Run func once on the core identified by cpuid.
int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait);
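A combined sketch of these three variants (the bump/hits names are hypothetical; error handling kept minimal):

#include <linux/smp.h>
#include <linux/atomic.h>

static atomic_t hits = ATOMIC_INIT(0);

static void bump(void *info)
{
    atomic_inc(&hits);
}

static void run_remote_calls(void)
{
    /* exactly one specific core; fails with -ENXIO if cpu 1 is offline */
    if (smp_call_function_single(1, bump, NULL, 1))
        pr_warn("cpu 1 not online\n");

    /* once, on any core of the mask (the current core is preferred
     * if it is in the mask) */
    smp_call_function_any(cpu_online_mask, bump, NULL, 1);

    /* every online core except the calling one */
    smp_call_function(bump, NULL, 1);
}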
The kernel also defines several global cpumask variables (the descriptions below are from the kernel source):
cpu_possible_mask - has bit 'cpu' set iff cpu is populatable
cpu_present_mask - has bit 'cpu' set iff cpu is populated
cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler
cpu_active_mask - has bit 'cpu' set iff cpu available to migration
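These masks can be printed and iterated with the standard cpumask helpers; a small sketch (the function name is hypothetical; the %*pbl/cpumask_pr_args() printk format assumes a reasonably recent kernel):

static void dump_cpu_masks(void)
{
    int cpu;

    pr_info("possible: %*pbl\n", cpumask_pr_args(cpu_possible_mask));
    pr_info("present:  %*pbl\n", cpumask_pr_args(cpu_present_mask));
    pr_info("online:   %*pbl\n", cpumask_pr_args(cpu_online_mask));

    /* shorthand for for_each_cpu(cpu, cpu_online_mask) */
    for_each_online_cpu(cpu)
        pr_info("cpu%d is online\n", cpu);
}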
A real-world user of these interfaces is RPS (Receive Packet Steering) in the network stack. rps_ipi_queued() checks whether the softnet_data chosen by RPS belongs to a remote core; if so, it links that sd into the local core's rps_ipi_list and raises the local NET_RX softirq:

static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
    struct softnet_data *mysd = &__get_cpu_var(softnet_data);

    if (sd != mysd) {
        sd->rps_ipi_next = mysd->rps_ipi_list;
        mysd->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
    }
#endif /* CONFIG_RPS */
    return 0;
}
In enqueue_to_backlog(), when the first packet lands on the input_pkt_queue of the core chosen by RPS, that core's NAPI must be scheduled, via an IPI if the core is remote:

/* Schedule NAPI for backlog device
 * We can use non atomic operation since we own the queue lock
 */
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
    /* backlog not scheduled yet */
    if (!rps_ipi_queued(sd))
        /* sd belongs to the local core: schedule NAPI directly.
         * Otherwise rps_ipi_queued() linked sd into the local
         * rps_ipi_list and raised the local softirq. */
        ____napi_schedule(sd, &sd->backlog);
}
The per-cpu softnet_data structure carries the fields involved: rps_ipi_list, rps_ipi_next, and the call_single_data csd used for the cross-core call:

/*
 * Incoming packets are placed on per-cpu queues
 */
struct softnet_data {
    struct Qdisc *output_queue;
    struct Qdisc **output_queue_tailp;
    struct list_head poll_list;
    struct sk_buff *completion_queue;
    struct sk_buff_head process_queue;

    /* stats */
    unsigned int processed;
    unsigned int time_squeeze;
    unsigned int cpu_collision;
    unsigned int received_rps;

#ifdef CONFIG_RPS
    struct softnet_data *rps_ipi_list;

    /* Elements below can be accessed between CPUs for RPS */
    struct call_single_data csd ____cacheline_aligned_in_smp;
    struct softnet_data *rps_ipi_next;
    unsigned int cpu;
    unsigned int input_queue_head;
    unsigned int input_queue_tail;
#endif
    unsigned dropped;
    struct sk_buff_head input_pkt_queue;
    struct napi_struct backlog;
};
rps_trigger_softirq() is the handler installed in csd.func; it runs in hardirq (IPI) context on the target core and schedules that core's backlog NAPI:

/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
    struct softnet_data *sd = data;

    ____napi_schedule(sd, &sd->backlog);
    sd->received_rps++;
}
The csd is wired up per core in net_dev_init():

/*
 * This is called single threaded during boot, so no need
 * to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
    int i, rc = -ENOMEM;

    BUG_ON(!dev_boot_phase);

    if (dev_proc_init())
        goto out;

    if (netdev_kobject_init())
        goto out;

    INIT_LIST_HEAD(&ptype_all);
    for (i = 0; i < PTYPE_HASH_SIZE; i++)
        INIT_LIST_HEAD(&ptype_base[i]);

    INIT_LIST_HEAD(&offload_base);

    if (register_pernet_subsys(&netdev_net_ops))
        goto out;

    /*
     * Initialise the packet receive queues.
     */
    for_each_possible_cpu(i) {
        struct work_struct *flush = per_cpu_ptr(&flush_works, i);
        struct softnet_data *sd = &per_cpu(softnet_data, i);

        INIT_WORK(flush, flush_backlog);
        skb_queue_head_init(&sd->input_pkt_queue);
        skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
        skb_queue_head_init(&sd->xfrm_backlog);
#endif
        INIT_LIST_HEAD(&sd->poll_list);
        sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
        sd->csd.func = rps_trigger_softirq;   /* IPI handler for RPS */
        sd->csd.info = sd;
        sd->cpu = i;
#endif
        init_gro_hash(&sd->backlog);
        sd->backlog.poll = process_backlog;
        sd->backlog.weight = weight_p;
    }

    dev_boot_phase = 0;

    /* The loopback device is special if any other network devices
     * is present in a network namespace the loopback device must
     * be present. Since we now dynamically allocate and free the
     * loopback device ensure this invariant is maintained by
     * keeping the loopback device as the first device on the
     * list of network devices. Ensuring the loopback devices
     * is the first device that appears and the last network device
     * that disappears.
     */
    if (register_pernet_device(&loopback_net_ops))
        goto out;

    if (register_pernet_device(&default_device_ops))
        goto out;

    open_softirq(NET_TX_SOFTIRQ, net_tx_action);
    open_softirq(NET_RX_SOFTIRQ, net_rx_action);

    rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
                                   NULL, dev_cpu_dead);
    WARN_ON(rc < 0);
    rc = 0;
out:
    return rc;
}
The IPI itself is sent with __smp_call_function_single() (in later kernels renamed smp_call_function_single_async()):

void __smp_call_function_single(int cpu, struct call_single_data *data, int wait)
{
    unsigned int this_cpu;
    unsigned long flags;

    this_cpu = get_cpu();

    /*
     * Can deadlock when called with interrupts disabled.
     * We allow cpu's that are not yet online though, as no one else can
     * send smp call function interrupt to this cpu and as such deadlocks
     * can't happen.
     */
    WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
                 && !oops_in_progress);

    if (cpu == this_cpu) {
        local_irq_save(flags);
        data->func(data->info);
        local_irq_restore(flags);
    } else {
        csd_lock(data);
        generic_exec_single(cpu, data, wait);
    }
    put_cpu();
}
The queued IPIs are sent from net_rps_action_and_irq_enable(); process_backlog() calls it early so remote cores start processing without waiting for net_rx_action() to finish:

/*
 * net_rps_action sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
    struct softnet_data *remsd = sd->rps_ipi_list;

    if (remsd) {
        sd->rps_ipi_list = NULL;

        local_irq_enable();

        /* Send pending IPI's to kick RPS processing on remote cpus. */
        while (remsd) {
            struct softnet_data *next = remsd->rps_ipi_next;

            if (cpu_online(remsd->cpu))
                __smp_call_function_single(remsd->cpu, &remsd->csd, 0);
            remsd = next;
        }
    } else
#endif
        local_irq_enable();
}

static int process_backlog(struct napi_struct *napi, int quota)
{
    int work = 0;
    struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
    /* Check if we have pending ipi, its better to send them now,
     * not waiting net_rx_action() end.
     */
    if (sd->rps_ipi_list) {
        local_irq_disable();
        net_rps_action_and_irq_enable(sd);
    }
#endif
    napi->weight = weight_p;
    local_irq_disable();
    while (work < quota) {
        struct sk_buff *skb;
        unsigned int qlen;

        while ((skb = __skb_dequeue(&sd->process_queue))) {
            local_irq_enable();
            __netif_receive_skb(skb);
            local_irq_disable();
            input_queue_head_incr(sd);
            if (++work >= quota) {
                local_irq_enable();
                return work;
            }
        }

        rps_lock(sd);
        qlen = skb_queue_len(&sd->input_pkt_queue);
        if (qlen)
            skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                       &sd->process_queue);

        if (qlen < quota - work) {
            /*
             * Inline a custom version of __napi_complete().
             * only current cpu owns and manipulates this napi,
             * and NAPI_STATE_SCHED is the only possible flag set on backlog.
             * we can use a plain write instead of clear_bit(),
             * and we dont need an smp_mb() memory barrier.
             */
            list_del(&napi->poll_list);
            napi->state = 0;

            quota = work + qlen;
        }
        rps_unlock(sd);
    }
    local_irq_enable();

    return work;
}
In newer kernels the send loop is factored out into net_rps_send_ipi() and uses smp_call_function_single_async():

static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
    while (remsd) {
        struct softnet_data *next = remsd->rps_ipi_next;

        if (cpu_online(remsd->cpu))
            smp_call_function_single_async(remsd->cpu, &remsd->csd);
        remsd = next;
    }
#endif
}
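The pattern RPS uses generalizes to other kernel code: embed a call_single_data, point its func at a handler, and fire it at a remote core with smp_call_function_single_async(). A minimal sketch (the my_* names are hypothetical; depending on kernel version the type is spelled struct call_single_data or call_single_data_t, and the csd must not be reused while a previous call is still in flight):

#include <linux/smp.h>

/* Runs in hardirq (IPI) context on the target core,
 * just like rps_trigger_softirq(): keep it short, never sleep. */
static void my_ipi_handler(void *info)
{
    pr_info("async IPI handled on cpu %d\n", smp_processor_id());
}

static struct call_single_data my_csd = {
    .func = my_ipi_handler,
    .info = NULL,
};

static void kick_remote_cpu(int cpu)
{
    if (cpu_online(cpu))
        /* returns immediately; my_ipi_handler() runs later on 'cpu' */
        smp_call_function_single_async(cpu, &my_csd);
}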