• Linux3.10.0块IO子系统流程(3)-- SCSI策略例程


    很长时间以来,Linux块设备使用了一种称为“蓄流/泄流”(plugging/unplugging)的技术来改进吞吐率。简单而言,这种工作方式类似浴盆排水系统的塞子:当IO被提交时,它先被储存在一个队列中,稍后的某个时间,我们才允许IO从队列派发出去。之所以这么做,是为了让IO尽可能地进行合并和排序。

      1 static void scsi_request_fn(struct request_queue *q)    // SCSI strategy routine: peeks requests off q and dispatches each one's command to the low-level driver
      2 {    // expects q->queue_lock to be held on entry (blk_peek_request requires it); the exit path re-takes it before returning
      3     struct scsi_device *sdev = q->queuedata;
      4     struct Scsi_Host *shost;
      5     struct scsi_cmnd *cmd;
      6     struct request *req;
      7     if(!get_device(&sdev->sdev_gendev))
      8         /* We must be tearing the block queue down already */
      9         return;
     10     /*
     11      * To start with, we keep looping until the queue is empty, or until
     12      * the host is no longer able to accept any more requests.
     13      */
     14     shost = sdev->host;
     15     for (;;) {
     16         int rtn;
     17         /*
     18          * get next queueable request.  We do this early to make sure
     19          * that the request is fully prepared even if we cannot
     20          * accept it.
     21          */
     22         req = blk_peek_request(q);    // fetch the next queueable request; leave the loop if there is none or if the SCSI device cannot accept a request right now
     23         if (!req || !scsi_dev_queue_ready(q, sdev))
     24             break;
     25         /* If the device is offline, log an error and hand the request to scsi_kill_request(); the continue below treats every following request the same way */
     26         if (unlikely(!scsi_device_online(sdev))) {
     27             sdev_printk(KERN_ERR, sdev,
     28                     "rejecting I/O to offline device\n");
     29             scsi_kill_request(req, q);
     30             continue;
     31         }
     32         /*
     33          * Remove the request from the request list.
     34          * blk_start_request() is called unless the queue is tagged and blk_queue_start_tag() returned 0 for this request. Per the article, blk_start_request() takes the request off the queue and starts its timeout timer.
     35          */
     36         if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
     37             blk_start_request(req);
     38         sdev->device_busy++;
     39         spin_unlock(q->queue_lock);
     40         /* Take the SCSI command descriptor from the request's special field; per the article it was set up by the queue's prep_rq_fn callback, which blk_peek_request() invoked earlier */
     41         cmd = req->special;
     42         if (unlikely(cmd == NULL)) {
     43             printk(KERN_CRIT "impossible request in %s.\n"
     44                      "please mail a stack trace to "
     45                      "linux-scsi@vger.kernel.org\n",
     46                      __func__);
     47             blk_dump_rq_flags(req, "foo");
     48             BUG();
     49         }
     50         spin_lock(shost->host_lock);
     51         /*
     52          * We hit this when the driver is using a host wide
     53          * tag map. For device level tag maps the queue_depth check
     54          * in the device ready fn would prevent us from trying
     55          * to allocate a tag. Since the map is a shared host resource
     56          * we add the dev to the starved list so it eventually gets
     57          * a run when a tag is freed.
     58          */
     59         if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
     60             if (list_empty(&sdev->starved_entry))
     61                 list_add_tail(&sdev->starved_entry,
     62                           &shost->starved_list);
     63             goto not_ready;
     64         }
     65         if (!scsi_target_queue_ready(shost, sdev))
     66             goto not_ready;
     67         if (!scsi_host_queue_ready(q, shost, sdev))
     68             goto not_ready;
     69         scsi_target(sdev)->target_busy++;
     70         shost->host_busy++;
     71         /*
     72          * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
     73          *        take the lock again.
     74          */
     75         spin_unlock_irq(shost->host_lock);
     76         /*
     77          * Finally, initialize any error handling parameters, and set up the timers for timeouts.
     78          * (scsi_init_cmd_errh() is the call that does the error-handling initialisation.)
     79          */
     80         scsi_init_cmd_errh(cmd);
     81         /*
     82          * Dispatch the command to the low-level driver.
     83          * A non-zero return value ends the loop through out_delay.
     84          */
     85         rtn = scsi_dispatch_cmd(cmd);
     86         spin_lock_irq(q->queue_lock);
     87         if (rtn)
     88             goto out_delay;
     89     }
     90     goto out;
     91 
     92 not_ready:
     93     spin_unlock_irq(shost->host_lock);
     94     /*
     95      * lock q, handle tag, requeue req, and decrement device_busy. We
     96      * must return with queue_lock held.
     97      *
     98      * Decrementing device_busy without checking it is OK, as all such
     99      * cases (host limits or settings) should run the queue at some
    100      * later time.
    101      */
    102     spin_lock_irq(q->queue_lock);
    103     blk_requeue_request(q, req);
    104     sdev->device_busy--;
    105 out_delay:
    106     if (sdev->device_busy == 0)
    107         blk_delay_queue(q, SCSI_QUEUE_DELAY);
    108 out:
    109     /* must be careful here...if we trigger the ->remove() function
    110      * we cannot be holding the q lock */
    111     spin_unlock_irq(q->queue_lock);
    112     put_device(&sdev->sdev_gendev);
    113     spin_lock_irq(q->queue_lock);
    114 }

    blk_peek_request从请求队列“顶部”取得下一个请求。函数的实现是一个大循环,每次调用__elv_next_request从电梯队列中取出一个请求进行处理。

      1 /**
      2 * blk_peek_request - peek at the top of a request queue
      3 * @q: request queue to peek at
      4 *
      5 * Description:
      6 *     Return the request at the top of @q.  The returned request
      7 *     should be started using blk_start_request() before LLD starts
      8 *     processing it.
      9 *
     10 * Return:
     11 *     Pointer to the request at the top of @q if available.  Null
     12 *     otherwise.
     13 *
     14 * Context:
     15 *     queue_lock must be held.
     16 */
     17 struct request *blk_peek_request(struct request_queue *q)
     18 {
     19     struct request *rq;
     20     int ret;
     21 
     22     while ((rq = __elv_next_request(q)) != NULL) {
     23 
     24         rq = blk_pm_peek_request(q, rq);
     25         if (!rq)
     26             break;
     27         /* REQ_STARTED clear means the driver has not been shown this request yet (a new request, or, per the
     28           * note below, possibly a requeued one).  In that case elv_activate_rq() is called if the request is REQ_SORTED.
     29           */
     30         if (!(rq->cmd_flags & REQ_STARTED)) {
     31             /*
     32              * This is the first time the device driver
     33              * sees this request (possibly after
     34              * requeueing).  Notify IO scheduler.
     35              */
     36             if (rq->cmd_flags & REQ_SORTED)
     37                 elv_activate_rq(q, rq);
     38 
     39             /*
     40              * just mark as started even if we don't start
     41              * it, a request that has been delayed should
     42              * not be passed by new incoming requests
     43              */
     44             rq->cmd_flags |= REQ_STARTED;
     45             trace_block_rq_issue(q, rq);
     46         }
     47         /* Cooperate with the IO scheduler: record the request's end sector and clear boundary_rq */
     48         if (!q->boundary_rq || q->boundary_rq == rq) {
     49             q->end_sector = rq_end_sector(rq);
     50             q->boundary_rq = NULL;
     51         }
     52 
     53         /* If the request has REQ_DONTPREP set, no command needs to be prepared: leave the loop and return this request to the caller */
     54         if (rq->cmd_flags & REQ_DONTPREP)
     55             break;
     56 
     57         /*
     58           * A non-zero dma_drain_size on the queue indicates an "excess DMA" problem; for a request that carries data,
     59           * one extra segment is reserved so that a drain buffer can later be appended after the scatter-gather list.
     60           */
     61         if (q->dma_drain_size && blk_rq_bytes(rq)) {
     62             /*
     63              * make sure space for the drain appears we
     64              * know we can do this because max_hw_segments
     65              * has been adjusted to be one fewer than the
     66              * device can handle
     67              */
     68             rq->nr_phys_segments++;
     69         }
     70         /*
     71           * If no prep_rq_fn callback is defined, return the request as it is.
     72           * Otherwise call it to prepare the command for the request; three return values are handled:
     73           *     BLKPREP_OK:    preparation succeeded, the request is returned
     74           *     BLKPREP_DEFER: the request cannot be processed for now; NULL is returned and the request is retried later
     75           *     BLKPREP_KILL:  the request cannot be processed at all; it is ended with -EIO and the loop moves on to the next request
     76           */
     77         if (!q->prep_rq_fn)
     78             break;
     79 
     80         ret = q->prep_rq_fn(q, rq);
     81         if (ret == BLKPREP_OK) {
     82             break;
     83         } else if (ret == BLKPREP_DEFER) {
     84             /*
     85              * the request may have been (partially) prepped.
     86              * we need to keep this request in the front to
     87              * avoid resource deadlock.  REQ_STARTED will
     88              * prevent other fs requests from passing this one.
     89              */
     90             if (q->dma_drain_size && blk_rq_bytes(rq) &&
     91                 !(rq->cmd_flags & REQ_DONTPREP)) {
     92                 /*
     93                  * remove the space for the drain we added
     94                  * so that we don't add it again
     95                  */
     96                 --rq->nr_phys_segments;
     97             }
     98 
     99             rq = NULL;
    100             break;
    101         } else if (ret == BLKPREP_KILL) {
    102             rq->cmd_flags |= REQ_QUIET;
    103             /*
    104              * Mark this request as started so we don't trigger
    105              * any debug logic in the end I/O path.
    106              */
    107             blk_start_request(rq);
    108             __blk_end_request_all(rq, -EIO);
    109         } else {
    110             printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
    111             break;
    112         }
    113     }
    114 
    115     return rq;
    116 }
    请求队列中的prep_rq_fn回调函数实现了从请求构造SCSI命令的方法。prep_rq_fn回调函数主要有两个任务:
    1. 构造命令描述块
    2. 如果需要的话,为数据传输准备聚散列表
    命令描述块和聚散列表都被封装到SCSI命令描述符中。我们知道,请求至少有两个来源:
    1. 来自上层bio
    2. 来自SCSI公共服务层
    在刚找到SCSI设备、为其初始化请求队列时,这个回调函数被设置为scsi_prep_fn。
     
     1 struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)    /* builds the request queue for sdev; returns NULL if the allocation fails */
     2 {
     3     struct request_queue *q;
     4 
     5     q = __scsi_alloc_queue(sdev->host, scsi_request_fn);    /* scsi_request_fn is handed over as the queue's request function */
     6     if (!q)
     7         return NULL;
     8 
     9     blk_queue_prep_rq(q, scsi_prep_fn);    /* initial prep_rq_fn: scsi_prep_fn */
    10     blk_queue_softirq_done(q, scsi_softirq_done);    /* registers scsi_softirq_done through the softirq_done setter */
    11     blk_queue_rq_timed_out(q, scsi_times_out);    /* registers scsi_times_out through the rq_timed_out setter */
    12     blk_queue_lld_busy(q, scsi_lld_busy);    /* registers scsi_lld_busy through the lld_busy setter */
    13     return q;
    14 }
    15 
    16 /**
    17 * blk_queue_prep_rq - set a prepare_request function for queue
    18 * @q:        queue
    19 * @pfn:    prepare_request function
    20 *
    21 * It's possible for a queue to register a prepare_request callback which
    22 * is invoked before the request is handed to the request_fn. The goal of
    23 * the function is to prepare a request for I/O, it can be used to build a
    24 * cdb from the request data for instance.
    25 *
    26 */
    27 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
    28 {
    29     q->prep_rq_fn = pfn;    /* plain assignment: overwrites whatever callback was installed before */
    30 }
    以上代码完成了prep_rq_fn回调的初始化。
    如果SCSI设备被高层驱动绑定,这个回调函数会被修改。例如,在sd驱动的探测过程中(见下面的sd_probe_async),它被设置成sd_prep_fn。
     
     1 static void sd_probe_async(void *data, async_cookie_t cookie)    // sets up the gendisk of the scsi_disk passed in data, installs the sd prep/unprep callbacks and adds the disk
     2 {
     3     struct scsi_disk *sdkp = data;
     4     struct scsi_device *sdp;
     5     struct gendisk *gd;
     6     u32 index;
     7     struct device *dev;
     8 
     9     sdp = sdkp->device;
    10     gd = sdkp->disk;
    11     index = sdkp->index;
    12     dev = &sdp->sdev_gendev;
    13 
    14     gd->major = sd_major((index & 0xf0) >> 4);    // index bits 4-7 are passed to sd_major() to pick the major
    15     gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);    // low 4 bits of index shifted up by 4, OR'ed with index bits 8-19 left in place
    16     gd->minors = SD_MINORS;
    17 
    18     gd->fops = &sd_fops;
    19     gd->private_data = &sdkp->driver;
    20     gd->queue = sdkp->device->request_queue;
    21 
    22     /* defaults, until the device tells us otherwise */
    23     sdp->sector_size = 512;
    24     sdkp->capacity = 0;
    25     sdkp->media_present = 1;
    26     sdkp->write_prot = 0;
    27     sdkp->cache_override = 0;
    28     sdkp->WCE = 0;
    29     sdkp->RCD = 0;
    30     sdkp->ATO = 0;
    31     sdkp->first_scan = 1;
    32     sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS;
    33 
    34     sd_revalidate_disk(gd);    // first revalidation, done before the callbacks are switched and before add_disk()
    35 
    36     blk_queue_prep_rq(sdp->request_queue, sd_prep_fn);    // from here on the queue's prep_rq_fn is sd_prep_fn
    37     blk_queue_unprep_rq(sdp->request_queue, sd_unprep_fn);
    38 
    39     gd->driverfs_dev = &sdp->sdev_gendev;
    40     gd->flags = GENHD_FL_EXT_DEVT;
    41     if (sdp->removable) {
    42         gd->flags |= GENHD_FL_REMOVABLE;
    43         gd->events |= DISK_EVENT_MEDIA_CHANGE;
    44     }
    45 
    46     add_disk(gd);
    47     if (sdkp->capacity)
    48         sd_dif_config_host(sdkp);
    49 
    50     sd_revalidate_disk(gd);    // revalidated a second time, after add_disk()
    51 
    52     sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
    53           sdp->removable ? "removable " : "");
    54     blk_pm_runtime_init(sdp->request_queue, dev);
    55     scsi_autopm_put_device(sdp);
    56     put_device(&sdkp->dev);    // drops a reference on sdkp->dev; the matching get is not visible in this function
    57 }
    以上是高层驱动(sd)重新设置回调的代码。

    在前一种情况下(回调为scsi_prep_fn),SCSI设备只能处理来自SCSI公共服务层的请求;在后一种情况下(回调为sd_prep_fn),SCSI设备不仅能处理来自SCSI公共服务层的请求,还能够处理来自上层的bio请求,分析见下一节。

     
     
  • 相关阅读:
    kafka-->storm-->mongodb
    zuul filter
    使用Spring Cloud Feign
    kafka客户端发布record(消息)
    kafka java api消费者
    kafka java api生产者
    kafka安装和使用
    多线程分析
    springboot入门
    centos7上svn安装
  • 原文地址:https://www.cnblogs.com/luxiaodai/p/9266309.html
Copyright © 2020-2023  润新知