• Demystifying the NVMe ioctl path


    For an NVMe SSD we sometimes use the ioctl system call. What does the call path of such an ioctl actually look like?
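    Before tracing the kernel path, here is a minimal user-space sketch of the kind of call we are about to follow: it opens the controller character device (assumed here to be /dev/nvme0, run as root since the kernel checks CAP_SYS_ADMIN) and issues an Identify Controller admin command through NVME_IOCTL_ADMIN_CMD. The device name and the 4 KiB identify buffer are illustrative assumptions, not details from the original text.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/nvme_ioctl.h>

    int main(void)
    {
        struct nvme_admin_cmd cmd;            /* same layout as nvme_passthru_cmd */
        void *buf;
        int fd, err;

        fd = open("/dev/nvme0", O_RDONLY);    /* controller char device (assumed name) */
        if (fd < 0) {
            perror("open");
            return 1;
        }

        if (posix_memalign(&buf, 4096, 4096)) /* 4 KiB Identify data buffer */
            return 1;

        memset(&cmd, 0, sizeof(cmd));
        cmd.opcode   = 0x06;                  /* Identify */
        cmd.addr     = (uintptr_t)buf;
        cmd.data_len = 4096;
        cmd.cdw10    = 1;                     /* CNS = 1: Identify Controller */

        err = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
        if (err)
            fprintf(stderr, "ioctl failed or device returned status %d\n", err);
        else
            printf("model: %.40s\n", (char *)buf + 24);   /* MN field, bytes 24-63 */

        free(buf);
        close(fd);
        return 0;
    }

    The struct and ioctl numbers come from linux/nvme_ioctl.h, which is exactly what nvme_dev_ioctl below decodes.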

    First, when the NVMe controller character device is registered, its file operations are set up:

    static const struct file_operations nvme_dev_fops = {
        .owner        = THIS_MODULE,
        .open        = nvme_dev_open,
        .release    = nvme_dev_release,
        .unlocked_ioctl    = nvme_dev_ioctl,
        .compat_ioctl    = nvme_dev_ioctl,
    };

    Inside nvme_dev_ioctl, a switch statement enumerates the supported ioctl commands; the two we mainly care about are NVME_IOCTL_ADMIN_CMD and NVME_IOCTL_IO_CMD.

    static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
            unsigned long arg)
    {
        struct nvme_ctrl *ctrl = file->private_data;
        void __user *argp = (void __user *)arg;
    
        switch (cmd) {
        case NVME_IOCTL_ADMIN_CMD:
            return nvme_user_cmd(ctrl, NULL, argp);
        case NVME_IOCTL_IO_CMD:
            return nvme_dev_user_cmd(ctrl, argp);
        case NVME_IOCTL_RESET:
        dev_warn(ctrl->device, "resetting controller\n");
            return ctrl->ops->reset_ctrl(ctrl);
        case NVME_IOCTL_SUBSYS_RESET:
            return nvme_reset_subsystem(ctrl);
        case NVME_IOCTL_RESCAN:
            nvme_queue_scan(ctrl);
            return 0;
        default:
            return -ENOTTY;
        }
    }

    For SSD read/write commands the NVME_IOCTL_IO_CMD branch is taken; nvme_dev_user_cmd picks the controller's (single) namespace and then calls nvme_user_cmd, whose main job is to fill in the nvme_command c:

    static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
                struct nvme_passthru_cmd __user *ucmd)
    {
        struct nvme_passthru_cmd cmd;
        struct nvme_command c;
        unsigned timeout = 0;
        int status;
    
        if (!capable(CAP_SYS_ADMIN))
            return -EACCES;
        if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
            return -EFAULT;
        if (cmd.flags)
            return -EINVAL;
    
        memset(&c, 0, sizeof(c));
        c.common.opcode = cmd.opcode;
        c.common.flags = cmd.flags;
        c.common.nsid = cpu_to_le32(cmd.nsid);
        c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
        c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
        c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
        c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
        c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
        c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
        c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
        c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
    
        if (cmd.timeout_ms)
            timeout = msecs_to_jiffies(cmd.timeout_ms);
    
        status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
                (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
                &cmd.result, timeout);
        if (status >= 0) {
            if (put_user(cmd.result, &ucmd->result))
                return -EFAULT;
        }
    
        return status;
    }
    int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
            void __user *ubuffer, unsigned bufflen, u32 *result,
            unsigned timeout)
    {
        return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
                result, timeout);
    }
    
    int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
            void __user *ubuffer, unsigned bufflen,
            void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
            u32 *result, unsigned timeout)
    {
        bool write = nvme_is_write(cmd);
        struct nvme_ns *ns = q->queuedata;
        struct gendisk *disk = ns ? ns->disk : NULL;
        struct request *req;
        struct bio *bio = NULL;
        void *meta = NULL;
        int ret;
    
        req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
        if (IS_ERR(req))
            return PTR_ERR(req);
    
        req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
    
        if (ubuffer && bufflen) {
            ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
                    GFP_KERNEL);
            if (ret)
                goto out;
            bio = req->bio;
    
            if (!disk)
                goto submit;
            bio->bi_bdev = bdget_disk(disk, 0);
            if (!bio->bi_bdev) {
                ret = -ENODEV;
                goto out_unmap;
            }
    
            if (meta_buffer && meta_len) {
                struct bio_integrity_payload *bip;
    
                meta = kmalloc(meta_len, GFP_KERNEL);
                if (!meta) {
                    ret = -ENOMEM;
                    goto out_unmap;
                }
    
                if (write) {
                    if (copy_from_user(meta, meta_buffer,
                            meta_len)) {
                        ret = -EFAULT;
                        goto out_free_meta;
                    }
                }
    
                bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
                if (IS_ERR(bip)) {
                    ret = PTR_ERR(bip);
                    goto out_free_meta;
                }
    
                bip->bip_iter.bi_size = meta_len;
                bip->bip_iter.bi_sector = meta_seed;
    
                ret = bio_integrity_add_page(bio, virt_to_page(meta),
                        meta_len, offset_in_page(meta));
                if (ret != meta_len) {
                    ret = -ENOMEM;
                    goto out_free_meta;
                }
            }
        }
     submit:
        blk_execute_rq(req->q, disk, req, 0);
        if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
            ret = -EINTR;
        else
            ret = nvme_req(req)->status;
        if (result)
            *result = le32_to_cpu(nvme_req(req)->result.u32);
        if (meta && !ret && !write) {
            if (copy_to_user(meta_buffer, meta, meta_len))
                ret = -EFAULT;
        }
     out_free_meta:
        kfree(meta);
     out_unmap:
        if (bio) {
            if (disk && bio->bi_bdev)
                bdput(bio->bi_bdev);
            blk_rq_unmap_user(bio);
        }
     out:
        blk_mq_free_request(req);
        return ret;
    }

    __nvme_submit_user_cmd mainly allocates a request with nvme_alloc_request; for commands that transfer data it also maps the user buffer and sets up the request's bio. Finally the request is submitted through blk_execute_rq.
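    Putting nvme_user_cmd and __nvme_submit_user_cmd together, a passthrough read issued from user space fills the same fields that the kernel copies into nvme_command above. The sketch below is illustrative only; it assumes namespace 1 on a device opened beforehand (for example /dev/nvme0n1) and 512-byte logical blocks.

    #include <string.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/nvme_ioctl.h>

    /*
     * Illustrative helper: read 8 LBAs starting at LBA 0 from namespace 1.
     * cdw10/cdw11 carry the starting LBA, cdw12 the zero-based block count --
     * exactly the dwords nvme_user_cmd copies into c.common.cdw10[] above.
     */
    static int nvme_pt_read(int fd, void *buf, uint32_t data_len)
    {
        struct nvme_passthru_cmd cmd;

        memset(&cmd, 0, sizeof(cmd));
        cmd.opcode   = 0x02;                 /* NVMe Read */
        cmd.nsid     = 1;
        cmd.addr     = (uintptr_t)buf;
        cmd.data_len = data_len;             /* 8 * 512 bytes for this example */
        cmd.cdw10    = 0;                    /* starting LBA, low 32 bits  */
        cmd.cdw11    = 0;                    /* starting LBA, high 32 bits */
        cmd.cdw12    = 8 - 1;                /* NLB is zero-based          */

        return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
    }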

    With this call, the code path crosses from the NVMe driver into the generic block layer.

    blk_execute_rq is found in block/blk-exec.c:

    void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
               struct request *rq, int at_head)
    {
        DECLARE_COMPLETION_ONSTACK(wait);
        unsigned long hang_check;
    
        rq->end_io_data = &wait;
        blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
    
        /* Prevent hang_check timer from firing at us during very long I/O */
        hang_check = sysctl_hung_task_timeout_secs;
        if (hang_check)
            while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
        else
            wait_for_completion_io(&wait);
    }

    The function has two parts: first it calls blk_execute_rq_nowait to send the request down, and then it waits for the request to complete via wait_for_completion_io (we will come back to this completion at the end).

    blk_execute_rq_nowait inserts the request into the queue and returns without waiting. Because NVMe uses the multi-queue (blk-mq) path, the if (q->mq_ops) branch is taken and blk_mq_sched_insert_request is called.

    void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
                   struct request *rq, int at_head,
                   rq_end_io_fn *done)
    {
        int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
    
        WARN_ON(irqs_disabled());
        WARN_ON(!blk_rq_is_passthrough(rq));
    
        rq->rq_disk = bd_disk;
        rq->end_io = done;
    
        /*
         * don't check dying flag for MQ because the request won't
         * be reused after dying flag is set
         */
        if (q->mq_ops) {
            blk_mq_sched_insert_request(rq, at_head, true, false, false);
            return;
        }
    
        spin_lock_irq(q->queue_lock);
    
        if (unlikely(blk_queue_dying(q))) {
            rq->rq_flags |= RQF_QUIET;
            __blk_end_request_all(rq, -ENXIO);
            spin_unlock_irq(q->queue_lock);
            return;
        }
    
        __elv_add_request(q, rq, where);
        __blk_run_queue(q);
        spin_unlock_irq(q->queue_lock);
    }

    Control then reaches blk_mq_sched_insert_request. Our request is not a flush and is not bypass-inserted, and blk-mq here uses the mq-deadline scheduler, so we enter the if (e && e->type->ops.mq.insert_requests) branch:

    void blk_mq_sched_insert_request(struct request *rq, bool at_head,
                     bool run_queue, bool async, bool can_block)
    {
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
    
        if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
            blk_mq_sched_insert_flush(hctx, rq, can_block);
            return;
        }
    
        if (e && blk_mq_sched_bypass_insert(hctx, rq))
            goto run;
    
        if (e && e->type->ops.mq.insert_requests) {
            LIST_HEAD(list);
    
            list_add(&rq->queuelist, &list);
            e->type->ops.mq.insert_requests(hctx, &list, at_head);
        } else {
            spin_lock(&ctx->lock);
            __blk_mq_insert_request(hctx, rq, at_head);
            spin_unlock(&ctx->lock);
        }
    
    run:
        if (run_queue)
            blk_mq_run_hw_queue(hctx, async);
    }

    The deadline scheduler written for blk-mq lives in block/mq-deadline.c; it defines a set of elevator operations, including insert_requests.

    static struct elevator_type mq_deadline = {
        .ops.mq = {
            .insert_requests    = dd_insert_requests,
            .dispatch_request    = dd_dispatch_request,
            .next_request        = elv_rb_latter_request,
            .former_request        = elv_rb_former_request,
            .bio_merge        = dd_bio_merge,
            .request_merge        = dd_request_merge,
            .requests_merged    = dd_merged_requests,
            .request_merged        = dd_request_merged,
            .has_work        = dd_has_work,
            .init_sched        = dd_init_queue,
            .exit_sched        = dd_exit_queue,
        },
    
        .uses_mq    = true,
    #ifdef CONFIG_BLK_DEBUG_FS
        .queue_debugfs_attrs = deadline_queue_debugfs_attrs,
    #endif
        .elevator_attrs = deadline_attrs,
        .elevator_name = "mq-deadline",
        .elevator_owner = THIS_MODULE,
    };

    So dd_insert_requests is called. It moves each request from the list into the scheduler's own queues, holding the deadline data lock (dd->lock) around the insertion.

    static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
                       struct list_head *list, bool at_head)
    {
        struct request_queue *q = hctx->queue;
        struct deadline_data *dd = q->elevator->elevator_data;
    
        spin_lock(&dd->lock);
        while (!list_empty(list)) {
            struct request *rq;
    
            rq = list_first_entry(list, struct request, queuelist);
            list_del_init(&rq->queuelist);
            dd_insert_request(hctx, rq, at_head);
        }
        spin_unlock(&dd->lock);
    }

    Each request on the list is then inserted in turn; in this call path the list holds exactly one request. Because the request is a passthrough request (and at_head is false), dd_insert_request simply appends it to the tail of dd->dispatch with list_add_tail.

    static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                      bool at_head)
    {
        struct request_queue *q = hctx->queue;
        struct deadline_data *dd = q->elevator->elevator_data;
        const int data_dir = rq_data_dir(rq);
    
        if (blk_mq_sched_try_insert_merge(q, rq))
            return;
    
        blk_mq_sched_request_inserted(rq);
    
        if (at_head || blk_rq_is_passthrough(rq)) {
            if (at_head)
                list_add(&rq->queuelist, &dd->dispatch);
            else
                list_add_tail(&rq->queuelist, &dd->dispatch);
        } else {
            deadline_add_rq_rb(dd, rq);
    
            if (rq_mergeable(rq)) {
                elv_rqhash_add(q, rq);
                if (!q->last_merge)
                    q->last_merge = rq;
            }
    
            /*
             * set expire time and add to fifo list
             */
            rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
            list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
        }
    }

    After the request has been queued, control returns to blk_mq_sched_insert_request, which calls blk_mq_run_hw_queue; this function and the ones it calls eventually end up in the dispatch path, blk_mq_sched_dispatch_requests:

    void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
    {
        __blk_mq_delay_run_hw_queue(hctx, async, 0);
    }
    
    static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
                        unsigned long msecs)
    {
        if (unlikely(blk_mq_hctx_stopped(hctx) ||
                 !blk_mq_hw_queue_mapped(hctx)))
            return;
    
        if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
            int cpu = get_cpu();
            if (cpumask_test_cpu(cpu, hctx->cpumask)) {
                __blk_mq_run_hw_queue(hctx);
                put_cpu();
                return;
            }
    
            put_cpu();
        }
    
        kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
                         &hctx->run_work,
                         msecs_to_jiffies(msecs));
    }
    
    static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
    {
        int srcu_idx;
    
        WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
            cpu_online(hctx->next_cpu));
    
        if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
            rcu_read_lock();
            blk_mq_sched_dispatch_requests(hctx);
            rcu_read_unlock();
        } else {
            might_sleep();
    
            srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
            blk_mq_sched_dispatch_requests(hctx);
            srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
        }
    }

    blk_mq_sched_dispatch_requests first checks whether there are leftover entries on hctx->dispatch from a previous run, and then calls the mq-deadline scheduler's dispatch callback to pull requests out of the scheduler:

    void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
    {
        struct request_queue *q = hctx->queue;
        struct elevator_queue *e = q->elevator;
        const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
        bool did_work = false;
        LIST_HEAD(rq_list);
    
        if (unlikely(blk_mq_hctx_stopped(hctx)))
            return;
    
        hctx->run++;
    
        /*
         * If we have previous entries on our dispatch list, grab them first for
         * more fair dispatch.
         */
        if (!list_empty_careful(&hctx->dispatch)) {
            spin_lock(&hctx->lock);
            if (!list_empty(&hctx->dispatch))
                list_splice_init(&hctx->dispatch, &rq_list);
            spin_unlock(&hctx->lock);
        }
    
        /*
         * Only ask the scheduler for requests, if we didn't have residual
         * requests from the dispatch list. This is to avoid the case where
         * we only ever dispatch a fraction of the requests available because
         * of low device queue depth. Once we pull requests out of the IO
         * scheduler, we can no longer merge or sort them. So it's best to
         * leave them there for as long as we can. Mark the hw queue as
         * needing a restart in that case.
         */
        if (!list_empty(&rq_list)) {
            blk_mq_sched_mark_restart_hctx(hctx);
            did_work = blk_mq_dispatch_rq_list(q, &rq_list);
        } else if (!has_sched_dispatch) {
            blk_mq_flush_busy_ctxs(hctx, &rq_list);
            blk_mq_dispatch_rq_list(q, &rq_list);
        }
    
        /*
         * We want to dispatch from the scheduler if we had no work left
         * on the dispatch list, OR if we did have work but weren't able
         * to make progress.
         */
        if (!did_work && has_sched_dispatch) {
            do {
                struct request *rq;
    
                rq = e->type->ops.mq.dispatch_request(hctx);
                if (!rq)
                    break;
                list_add(&rq->queuelist, &rq_list);
            } while (blk_mq_dispatch_rq_list(q, &rq_list));
        }
    }

    The dispatch function defined by mq-deadline is dd_dispatch_request. It picks a request according to the deadline scheduling rules and returns it to the caller; for our passthrough request it is simply taken straight off dd->dispatch at the top of __dd_dispatch_request:

    static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
    {
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
        struct request *rq;
        bool reads, writes;
        int data_dir;
    
        if (!list_empty(&dd->dispatch)) {
            rq = list_first_entry(&dd->dispatch, struct request, queuelist);
            list_del_init(&rq->queuelist);
            goto done;
        }
    
        reads = !list_empty(&dd->fifo_list[READ]);
        writes = !list_empty(&dd->fifo_list[WRITE]);
    
        /*
         * batches are currently reads XOR writes
         */
        if (dd->next_rq[WRITE])
            rq = dd->next_rq[WRITE];
        else
            rq = dd->next_rq[READ];
    
        if (rq && dd->batching < dd->fifo_batch)
            /* we have a next request are still entitled to batch */
            goto dispatch_request;
    
        /*
         * at this point we are not running a batch. select the appropriate
         * data direction (read / write)
         */
    
        if (reads) {
            BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
    
            if (writes && (dd->starved++ >= dd->writes_starved))
                goto dispatch_writes;
    
            data_dir = READ;
    
            goto dispatch_find_request;
        }
    
        /*
         * there are either no reads or writes have been starved
         */
    
        if (writes) {
    dispatch_writes:
            BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
    
            dd->starved = 0;
    
            data_dir = WRITE;
    
            goto dispatch_find_request;
        }
    
        return NULL;
    
    dispatch_find_request:
        /*
         * we are not running a batch, find best request for selected data_dir
         */
        if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
            /*
             * A deadline has expired, the last request was in the other
             * direction, or we have run out of higher-sectored requests.
             * Start again from the request with the earliest expiry time.
             */
            rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
        } else {
            /*
             * The last req was the same dir and we have a next request in
             * sort order. No expired requests so continue on from here.
             */
            rq = dd->next_rq[data_dir];
        }
    
        dd->batching = 0;
    
    dispatch_request:
        /*
         * rq is the selected appropriate request.
         */
        dd->batching++;
        deadline_move_request(dd, rq);
    done:
        rq->rq_flags |= RQF_STARTED;
        return rq;
    }
    
    static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
    {
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
        struct request *rq;
    
        spin_lock(&dd->lock);
        rq = __dd_dispatch_request(hctx);
        spin_unlock(&dd->lock);
    
        return rq;
    }

    Next comes the block-layer function blk_mq_dispatch_rq_list, which is where the block layer hands the request back to the driver: it invokes the queue_rq operation that the nvme driver registered in its blk_mq_ops.

    bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
    {
        struct blk_mq_hw_ctx *hctx;
        struct request *rq;
        int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
    
        if (list_empty(list))
            return false;
    
        /*
         * Now process all the entries, sending them to the driver.
         */
        errors = queued = 0;
        do {
            struct blk_mq_queue_data bd;
    
            rq = list_first_entry(list, struct request, queuelist);
            if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
                if (!queued && reorder_tags_to_front(list))
                    continue;
    
                /*
                 * The initial allocation attempt failed, so we need to
                 * rerun the hardware queue when a tag is freed.
                 */
                if (!blk_mq_dispatch_wait_add(hctx))
                    break;
    
                /*
                 * It's possible that a tag was freed in the window
                 * between the allocation failure and adding the
                 * hardware queue to the wait queue.
                 */
                if (!blk_mq_get_driver_tag(rq, &hctx, false))
                    break;
            }
    
            list_del_init(&rq->queuelist);
    
            bd.rq = rq;
    
            /*
             * Flag last if we have no more requests, or if we have more
             * but can't assign a driver tag to it.
             */
            if (list_empty(list))
                bd.last = true;
            else {
                struct request *nxt;
    
                nxt = list_first_entry(list, struct request, queuelist);
                bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
            }
    
            ret = q->mq_ops->queue_rq(hctx, &bd);
            switch (ret) {
            case BLK_MQ_RQ_QUEUE_OK:
                queued++;
                break;
            case BLK_MQ_RQ_QUEUE_BUSY:
                blk_mq_put_driver_tag_hctx(hctx, rq);
                list_add(&rq->queuelist, list);
                __blk_mq_requeue_request(rq);
                break;
            default:
                pr_err("blk-mq: bad return on queue: %d
    ", ret);
            case BLK_MQ_RQ_QUEUE_ERROR:
                errors++;
                blk_mq_end_request(rq, -EIO);
                break;
            }
    
            if (ret == BLK_MQ_RQ_QUEUE_BUSY)
                break;
        } while (!list_empty(list));
    
        hctx->dispatched[queued_to_index(queued)]++;
    
        /*
         * Any items that need requeuing? Stuff them into hctx->dispatch,
         * that is where we will continue on next queue run.
         */
        if (!list_empty(list)) {
            /*
             * If an I/O scheduler has been configured and we got a driver
             * tag for the next request already, free it again.
             */
            rq = list_first_entry(list, struct request, queuelist);
            blk_mq_put_driver_tag(rq);
    
            spin_lock(&hctx->lock);
            list_splice_init(list, &hctx->dispatch);
            spin_unlock(&hctx->lock);
    
            /*
             * If SCHED_RESTART was set by the caller of this function and
             * it is no longer set that means that it was cleared by another
             * thread and hence that a queue rerun is needed.
             *
             * If TAG_WAITING is set that means that an I/O scheduler has
             * been configured and another thread is waiting for a driver
             * tag. To guarantee fairness, do not rerun this hardware queue
             * but let the other thread grab the driver tag.
             *
             * If no I/O scheduler has been configured it is possible that
             * the hardware queue got stopped and restarted before requests
             * were pushed back onto the dispatch list. Rerun the queue to
             * avoid starvation. Notes:
             * - blk_mq_run_hw_queue() checks whether or not a queue has
             *   been stopped before rerunning a queue.
             * - Some but not all block drivers stop a queue before
             *   returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq
             *   and dm-rq.
             */
            if (!blk_mq_sched_needs_restart(hctx) &&
                !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
                blk_mq_run_hw_queue(hctx, true);
        }
    
        return (queued + errors) != 0;
    }

    In the nvme driver, queue_rq is nvme_queue_rq. It sets up the NVMe command (nvme_setup_cmd), initializes the iod and maps the data, submits the command with __nvme_submit_cmd, and finally calls nvme_process_cq. The same nvme_process_cq is also called from the interrupt handler once the SSD controller has completed the command.

    static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
                 const struct blk_mq_queue_data *bd)
    {
        struct nvme_ns *ns = hctx->queue->queuedata;
        struct nvme_queue *nvmeq = hctx->driver_data;
        struct nvme_dev *dev = nvmeq->dev;
        struct request *req = bd->rq;
        struct nvme_command cmnd;
        int ret = BLK_MQ_RQ_QUEUE_OK;
    
        /*
         * If formated with metadata, require the block layer provide a buffer
         * unless this namespace is formated such that the metadata can be
         * stripped/generated by the controller with PRACT=1.
         */
        if (ns && ns->ms && !blk_integrity_rq(req)) {
            if (!(ns->pi_type && ns->ms == 8) &&
                !blk_rq_is_passthrough(req)) {
                blk_mq_end_request(req, -EFAULT);
                return BLK_MQ_RQ_QUEUE_OK;
            }
        }
    
        ret = nvme_setup_cmd(ns, req, &cmnd);
        if (ret != BLK_MQ_RQ_QUEUE_OK)
            return ret;
    
        ret = nvme_init_iod(req, dev);
        if (ret != BLK_MQ_RQ_QUEUE_OK)
            goto out_free_cmd;
    
        if (blk_rq_nr_phys_segments(req))
            ret = nvme_map_data(dev, req, &cmnd);
    
        if (ret != BLK_MQ_RQ_QUEUE_OK)
            goto out_cleanup_iod;
    
        blk_mq_start_request(req);
    
        spin_lock_irq(&nvmeq->q_lock);
        if (unlikely(nvmeq->cq_vector < 0)) {
            ret = BLK_MQ_RQ_QUEUE_ERROR;
            spin_unlock_irq(&nvmeq->q_lock);
            goto out_cleanup_iod;
        }
        __nvme_submit_cmd(nvmeq, &cmnd);
        nvme_process_cq(nvmeq);
        spin_unlock_irq(&nvmeq->q_lock);
        return BLK_MQ_RQ_QUEUE_OK;
    out_cleanup_iod:
        nvme_free_iod(dev, req);
    out_free_cmd:
        nvme_cleanup_cmd(req);
        return ret;
    }

    Now look at __nvme_submit_cmd: it copies the command into the submission queue (SQ) and then writes the new tail to the SQ doorbell, telling the SSD controller that a new command has arrived.

    static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
                            struct nvme_command *cmd)
    {
        u16 tail = nvmeq->sq_tail;
    
        if (nvmeq->sq_cmds_io)
            memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
        else
            memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
    
        if (++tail == nvmeq->q_depth)
            tail = 0;
        if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
                              nvmeq->dbbuf_sq_ei))
            writel(tail, nvmeq->q_db);
        nvmeq->sq_tail = tail;
    }

    At this point the path from the ioctl down to writing the SQ is complete; from here on it is up to the SSD controller to fetch and execute the command.

    After the SSD controller finishes the command, it writes a completion entry into the completion queue (CQ) and raises an interrupt to the host; the interrupt handler in turn calls nvme_process_cq.

    The main flow here is: starting from cq_head, walk the CQ; for every valid entry (validity is determined by the phase (P) bit in the CQ entry, see the NVMe specification), process it and call nvme_end_request to finish the corresponding I/O request.
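    The phase-bit check referred to here is a small helper; in kernels of this vintage it looks roughly like the following (bit 0 of the CQE status field is the P bit, and an entry is new only while that bit matches the queue's current cq_phase):

    static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head,
            u16 phase)
    {
        return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase;
    }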

    static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
    {
        u16 head, phase;
    
        head = nvmeq->cq_head;
        phase = nvmeq->cq_phase;
    
        while (nvme_cqe_valid(nvmeq, head, phase)) {
            struct nvme_completion cqe = nvmeq->cqes[head];
            struct request *req;
    
            if (++head == nvmeq->q_depth) {
                head = 0;
                phase = !phase;
            }
    
            if (tag && *tag == cqe.command_id)
                *tag = -1;
    
            if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
                dev_warn(nvmeq->dev->ctrl.device,
                    "invalid id %d completed on queue %d
    ",
                    cqe.command_id, le16_to_cpu(cqe.sq_id));
                continue;
            }
    
            /*
             * AEN requests are special as they don't time out and can
             * survive any kind of queue freeze and often don't respond to
             * aborts.  We don't even bother to allocate a struct request
             * for them but rather special case them here.
             */
            if (unlikely(nvmeq->qid == 0 &&
                    cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) {
                nvme_complete_async_event(&nvmeq->dev->ctrl,
                        cqe.status, &cqe.result);
                continue;
            }
    
            req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
            nvme_end_request(req, cqe.status, cqe.result);
        }
    
        if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
            return;
    
        if (likely(nvmeq->cq_vector >= 0))
            if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
                                  nvmeq->dbbuf_cq_ei))
                writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
        nvmeq->cq_head = head;
        nvmeq->cq_phase = phase;
    
        nvmeq->cqe_seen = 1;
    }
    
    static void nvme_process_cq(struct nvme_queue *nvmeq)
    {
        __nvme_process_cq(nvmeq, NULL);
    }

    Next comes nvme_end_request, which calls blk_mq_complete_request, then __blk_mq_complete_request, and finally the softirq_done_fn registered by the nvme driver.
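    For reference (and as an assumption about this particular kernel version), nvme_end_request is roughly the following inline helper in nvme.h: it stashes the NVMe status and result in the nvme_request and then hands the request to blk_mq_complete_request, which leads to the code below.

    static inline void nvme_end_request(struct request *req, __le16 status,
            union nvme_result result)
    {
        struct nvme_request *rq = nvme_req(req);

        rq->status = le16_to_cpu(status) >> 1;
        rq->result = result;
        blk_mq_complete_request(req);
    }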

    static void __blk_mq_complete_request(struct request *rq)
    {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        bool shared = false;
        int cpu;
    
        if (rq->internal_tag != -1)
            blk_mq_sched_completed_request(rq);
        if (rq->rq_flags & RQF_STATS) {
            blk_mq_poll_stats_start(rq->q);
            blk_stat_add(rq);
        }
    
        if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
            rq->q->softirq_done_fn(rq);
            return;
        }
    
        cpu = get_cpu();
        if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
            shared = cpus_share_cache(cpu, ctx->cpu);
    
        if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
            rq->csd.func = __blk_mq_complete_request_remote;
            rq->csd.info = rq;
            rq->csd.flags = 0;
            smp_call_function_single_async(ctx->cpu, &rq->csd);
        } else {
            rq->q->softirq_done_fn(rq);
        }
        put_cpu();
    }

    That callback (its registration is omitted here; it is set up when the nvme device and its queues are initialized) is nvme_pci_complete_rq:

    static void nvme_pci_complete_rq(struct request *req)
    {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    
        nvme_unmap_data(iod->nvmeq->dev, req);
        nvme_complete_rq(req);
    }
    void nvme_complete_rq(struct request *req)
    {
        if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
            nvme_req(req)->retries++;
            blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
            return;
        }
    
        blk_mq_end_request(req, nvme_error_status(req));
    }
    void blk_mq_end_request(struct request *rq, int error)
    {
        if (blk_update_request(rq, error, blk_rq_bytes(rq)))
            BUG();
        __blk_mq_end_request(rq, error);
    }
    inline void __blk_mq_end_request(struct request *rq, int error)
    {
        blk_account_io_done(rq);
    
        if (rq->end_io) {
            wbt_done(rq->q->rq_wb, &rq->issue_stat);
            rq->end_io(rq, error);
        } else {
            if (unlikely(blk_bidi_rq(rq)))
                blk_mq_free_request(rq->next_rq);
            blk_mq_free_request(rq);
        }
    }

    As you can see, __blk_mq_end_request calls the request's end_io callback. Where was that set? Look back at blk_execute_rq and blk_execute_rq_nowait: the request's end_io is blk_end_sync_rq.

    Its only job is to wake up the wait_for_completion_io(&wait) back in blk_execute_rq:

    static void blk_end_sync_rq(struct request *rq, int error)
    {
        struct completion *waiting = rq->end_io_data;
    
        rq->end_io_data = NULL;
    
        /*
         * complete last, if this is a stack request the process (and thus
         * the rq pointer) could be invalid right after this complete()
         */
        complete(waiting);
    }

    At this point the whole ioctl path is essentially complete; all that is left is to return the result back up to user space.
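    Seen from user space, that result is simply the return value of ioctl() plus the result field that put_user wrote back in nvme_user_cmd. A small, hypothetical helper to interpret them might look like this; the three-way convention (negative errno, positive NVMe status code, zero for success) follows from nvme_user_cmd above.

    #include <stdio.h>
    #include <linux/types.h>

    /* Hypothetical helper: interpret the outcome of an NVMe passthrough ioctl. */
    static void report_passthru_status(int err, __u32 result)
    {
        if (err < 0)
            perror("ioctl");                                    /* kernel-level failure, errno set */
        else if (err > 0)
            fprintf(stderr, "NVMe status code: 0x%x\n", err);   /* device completed with an error  */
        else
            printf("CQE dword 0 (cmd.result): 0x%x\n", result); /* success */
    }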

  • Original post: https://www.cnblogs.com/mmmmmmmelody/p/10500263.html