• linux内核源码阅读之facebook硬盘加速flashcache之六



    其实到目前为止,如果对读流程已经能轻松地看懂了,那么写流程不需要太多脑细胞。我觉得再写下去没有太大的必要了,后面想想为了保持flashcache完整性,还是写出来吧。接着到写流程:
    1530static void
    1531flashcache_write(struct cache_c *dmc, struct bio *bio)
    1532{
    1533     int index;
    1534     int res;
    1535     struct cacheblock *cacheblk;
    1536     int queued;
    1537    
    1538     spin_lock_irq(&dmc->cache_spin_lock);
    1539     res = flashcache_lookup(dmc, bio, &index);
    1540     /*
    1541     * If cache hit and !BUSY, simply redirty page.
    1542     * If cache hit and BUSY, must wait for IO in prog to complete.
    1543     * If cache miss and found a block to recycle, we need to
    1544     * (a) invalidate any partial hits,
    1545     * (b) write to cache.
    1546     */
    1547     if (res != -1) {
    1548          /* Cache Hit */
    1549          cacheblk = &dmc->cache[index];         
    1550          if ((cacheblk->cache_state & VALID) &&
    1551              (cacheblk->dbn == bio->bi_sector)) {
    1552               /* Cache Hit */
    1553               flashcache_write_hit(dmc, bio, index);
    1554          } else {
    1555               /* Cache Miss, found block to recycle */
    1556               flashcache_write_miss(dmc, bio, index);
    1557          }
    1558          return;
    1559     }
    1560     /*
    1561     * No room in the set. We cannot write to the cache and have to
    1562     * send the request to disk. Before we do that, we must check
    1563     * for potential invalidations !
    1564     */
    1565     queued = flashcache_inval_blocks(dmc, bio);
    1566     spin_unlock_irq(&dmc->cache_spin_lock);
    1567     if (queued) {
    1568          if (unlikely(queued < 0))
    1569               flashcache_bio_endio(bio, -EIO);
    1570          return;
    1571     }
    1572     /* Start uncached IO */
    1573     flashcache_start_uncached_io(dmc, bio);
    1574     flashcache_clean_set(dmc, hash_block(dmc, bio->bi_sector));
    1575}

    第1539行查找是否命中,这里有几种情况:
    1)命中且cache空闲,直接写cache块并设置DIRTY标志
    2)命中且cache忙,等待上一个请求完成
    3)不命中并且找到可用的cache块,invalid有交集的cache块,然后再写到cache
    4)没有可用cache块,invalid有交集的cache块,然后直接写到磁盘
    第4种情况在第1573行直接写到磁盘,最后调用的还是dm_io_async_bvec。
    再看第1种情况,进入到命中处理分支:
    1468static void
    1469flashcache_write_hit(struct cache_c *dmc, struct bio *bio, int index)
    1470{
    1471     struct cacheblock *cacheblk;
    1472     struct pending_job *pjob;
    1473     struct kcached_job *job;
    1474
    1475     cacheblk = &dmc->cache[index];
    1476     if (!(cacheblk->cache_state & BLOCK_IO_INPROG) && (cacheblk->head == NULL)) {
    1477          if (cacheblk->cache_state & DIRTY)
    1478               dmc->dirty_write_hits++;
    1479          dmc->write_hits++;
    1480          cacheblk->cache_state |= CACHEWRITEINPROG;
    1481          spin_unlock_irq(&dmc->cache_spin_lock);
    1482          job = new_kcached_job(dmc, bio, index);
    1483          if (unlikely(sysctl_flashcache_error_inject & WRITE_HIT_JOB_ALLOC_FAIL)) {
    1484               if (job)
    1485                    flashcache_free_cache_job(job);
    1486               job = NULL;
    1487               sysctl_flashcache_error_inject &= ~WRITE_HIT_JOB_ALLOC_FAIL;
    1488          }
    1489          if (unlikely(job == NULL)) {
    1490               /* 
    1491               * We have a write hit, and can't allocate a job.
    1492               * Since we dropped the spinlock, we have to drain any 
    1493               * pending jobs.
    1494               */
    1495               DMERR("flashcache: Write (hit) failed ! Can't allocate memory for cache IO, block %lu", 
    1496                     cacheblk->dbn);
    1497               flashcache_bio_endio(bio, -EIO);
    1498               spin_lock_irq(&dmc->cache_spin_lock);
    1499               flashcache_free_pending_jobs(dmc, cacheblk, -EIO);
    1500               cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
    1501               spin_unlock_irq(&dmc->cache_spin_lock);
    1502          } else {
    1503               job->action = WRITECACHE; /* Write data to the source device */
    1504               DPRINTK("Queue job for %llu", bio->bi_sector);
    1505               atomic_inc(&dmc->nr_jobs);
    1506               dmc->ssd_writes++;
    1507               dm_io_async_bvec(1, &job->cache, WRITE, 
    1508                         bio->bi_io_vec + bio->bi_idx,
    1509                         flashcache_io_callback, job);
    1510               flashcache_unplug_device(dmc->cache_dev->bdev);
    1511               flashcache_clean_set(dmc, index / dmc->assoc);
    1512          }
    1513     } else {
    1514          pjob = flashcache_alloc_pending_job(dmc);
    1515          if (unlikely(sysctl_flashcache_error_inject & WRITE_HIT_PENDING_JOB_ALLOC_FAIL)) {
    1516               if (pjob) {
    1517                    flashcache_free_pending_job(pjob);
    1518                    pjob = NULL;
    1519               }
    1520               sysctl_flashcache_error_inject &= ~WRITE_HIT_PENDING_JOB_ALLOC_FAIL;
    1521          }
    1522          if (unlikely(pjob == NULL))
    1523               flashcache_bio_endio(bio, -EIO);
    1524          else
    1525               flashcache_enq_pending(dmc, bio, index, WRITECACHE, pjob);
    1526          spin_unlock_irq(&dmc->cache_spin_lock);
    1527     }
    1528}

    在1475行获得cache块,在1476行判断是否空闲,在有IO处理或者有pending_job挂着的时候都视为忙。如果cache块空闲,则进入if分支,接下来又是套路了,创建kcached_job,成功的话就在1507行下发写请求。然后接着看写返回时做了哪些处理?进入写回调函数之前,要记住这里设置了两个标志,一个是1480行cache块的CACHEWRITEINPROG,另一个是1503行kcached_job的WRITECACHE,带着这两个标志进入到写回调函数flashcache_io_callback,并直接找到需要的地方:
    188     case WRITECACHE:
    189          DPRINTK("flashcache_io_callback: WRITECACHE %d",
    190               index);
    191          spin_lock_irqsave(&dmc->cache_spin_lock, flags);
    192          if (unlikely(sysctl_flashcache_error_inject & WRITECACHE_ERROR)) {
    193               job->error = error = -EIO;
    194               sysctl_flashcache_error_inject &= ~WRITECACHE_ERROR;
    195          }
    196          VERIFY(cacheblk->cache_state & CACHEWRITEINPROG);
    197          if (likely(error == 0)) {
    198#ifdef FLASHCACHE_DO_CHECKSUMS
    199               dmc->checksum_store++;
    200               spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
    201               flashcache_store_checksum(job);
    202               /* 
    203               * We need to update the metadata on a DIRTY->DIRTY as well 
    204               * since we save the checksums.
    205               */
    206               push_md_io(job);
    207               schedule_work(&_kcached_wq);
    208               return;
    209#else
    210               spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
    211               /* Only do cache metadata update on a non-DIRTY->DIRTY transition */
    212               if ((cacheblk->cache_state & DIRTY) == 0) {
    213                    push_md_io(job);
    214                    schedule_work(&_kcached_wq);
    215                    return;
    216               }
    217#endif
    218          } else {
    219               dmc->ssd_write_errors++;               
    220               spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
    221          }
    222          flashcache_bio_endio(bio, error);
    223          break;

    写到缓存成功的话,暂不管cache块的校验值,会来到210行,判断原来的cache块是否为脏,如果为脏那就什么事情都不用做了。因为如果cache块本来就是脏的,那新来的IO可以直接覆盖到cache块上去。反之如果原来cache块是干净的,那么这个时候要把cache块已经变脏这一状态记录到SSD上,于是进入了第213行开始写cache块管理信息。到了这里似乎数据已经写到缓存中,IO可以返回了,但是到了第215行为什么直接return呢?这里涉及到数据一致性的问题。其实只要cache块管理结构还没有写到缓存中,这个写请求就不能算完成。试想如果在这里调用了第222行flashcache_bio_endio把IO返回了,会有什么样的后果?其实大多数情况下是没有什么问题的,但如果在这个时候系统掉电或者宕机了,这时候缓存中记录的cache块状态是干净的,但又已经向上层返回说IO写成功了,那么最后这一次写的数据就丢失了。当然对于大部分用户来说,这一点数据算什么?但对于像银行这样的系统,当你把辛苦了十年的积蓄存到自动取款机,这时自动取款机告诉你存成功了,但不幸的是后台刚好发生了我们上面描述的问题。结果你再查的时候没有你刚才存进去的钱,但你的钱确确实实被取款机收进去了,这时你会有怎样的感受?这里只是举个例子,说明数据一致性在某些应用中是非常重要的。当然现实中绝大多数银行是不会有这样的问题的,银行可以通过日志查出来,系统也有热备,也是带UPS保护的。
    如果原来的cache块为脏的情况就以第222行flashcache_bio_endio结束了。
    如果不为脏,那么调用213行将cache块管理结构写到缓存。
    272void
    273push_md_io(struct kcached_job *job)
    274{
    275     push(&_md_io_jobs, job);     
    276}

    这里只是简单放到队列中,具体处理的是第214行唤醒的工作队列。该工作队列对应的处理函数是:
    303     process_jobs(&_md_io_jobs, flashcache_md_write);

    这个函数怎么这么面熟呢?因为在第一小节里已经介绍过了,此处不再展开。
    这里小结一下写命中并且原cache块为干净的数据流程:
    1)写命中调用dm_io_async_bvec写缓存
    2)写缓存完成回调函数flashcache_io_callback,判断原cache块为干净,需要写cache块管理结构
    3)由工作队列_kcached_wq调用flashcache_md_write写cache块管理结构,最终由flashcache_md_write_kickoff调用dm_io_async_bvec将cache块管理结构写到缓存
    4)写缓存完成之后调用flashcache_md_write_callback
    5)由工作队列_kcached_wq调用flashcache_md_write_done处理
    6)在flashcache_md_write_done中判断job类型为WRITECACHE,最后调用flashcache_bio_endio返回
    至此,这个IO才完成使命。
    接下来讲第3种情况,这种情况就非常简单了。
    1411static void
    1412flashcache_write_miss(struct cache_c *dmc, struct bio *bio, int index)
    1413{
    1414     struct cacheblock *cacheblk;
    1415     struct kcached_job *job;
    1416     int queued;
    1417
    1418     cacheblk = &dmc->cache[index];
    1419     queued = flashcache_inval_blocks(dmc, bio);
    1420     if (queued) {
    1421          if (unlikely(queued < 0))
    1422               flashcache_bio_endio(bio, -EIO);
    1423          spin_unlock_irq(&dmc->cache_spin_lock);
    1424          return;
    1425     }
    1426     if (cacheblk->cache_state & VALID)
    1427          dmc->wr_replace++;
    1428     else
    1429          dmc->cached_blocks++;
    1430     cacheblk->cache_state = VALID | CACHEWRITEINPROG;
    1431     cacheblk->dbn = bio->bi_sector;
    1432     spin_unlock_irq(&dmc->cache_spin_lock);
    1433     job = new_kcached_job(dmc, bio, index);
    1434     if (unlikely(sysctl_flashcache_error_inject & WRITE_MISS_JOB_ALLOC_FAIL)) {
    1435          if (job)
    1436               flashcache_free_cache_job(job);
    1437          job = NULL;
    1438          sysctl_flashcache_error_inject &= ~WRITE_MISS_JOB_ALLOC_FAIL;
    1439     }
    1440     if (unlikely(job == NULL)) {
    1441          /* 
    1442          * We have a write miss, and can't allocate a job.
    1443          * Since we dropped the spinlock, we have to drain any 
    1444          * pending jobs.
    1445          */
    1446          DMERR("flashcache: Write (miss) failed ! Can't allocate memory for cache IO, block %lu", 
    1447                cacheblk->dbn);
    1448          flashcache_bio_endio(bio, -EIO);
    1449          spin_lock_irq(&dmc->cache_spin_lock);
    1450          dmc->cached_blocks--;
    1451          cacheblk->cache_state &= ~VALID;
    1452          cacheblk->cache_state |= INVALID;
    1453          flashcache_free_pending_jobs(dmc, cacheblk, -EIO);
    1454          cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
    1455          spin_unlock_irq(&dmc->cache_spin_lock);
    1456     } else {
    1457          job->action = WRITECACHE; 
    1458          atomic_inc(&dmc->nr_jobs);
    1459          dmc->ssd_writes++;
    1460          dm_io_async_bvec(1, &job->cache, WRITE, 
    1461                    bio->bi_io_vec + bio->bi_idx,
    1462                    flashcache_io_callback, job);
    1463          flashcache_unplug_device(dmc->cache_dev->bdev);
    1464          flashcache_clean_set(dmc, index / dmc->assoc);
    1465     }
    1466}

    大多数函数都已经是老朋友了。第1430行给cache块设置了VALID标志,表示块中存有有效数据,第1431行把cache块对应的磁盘扇区号dbn设置为bio->bi_sector。接着到第1460行下发写缓存请求,写缓存之后的处理与写命中的一样,就不再继续跟进了。
    下一节讲缓存超水位线写回磁盘。
  • 相关阅读:
    线性代数07.Ax=0:主变量,特解
    线性代数06.列空间和零空间
    线性代数05.转置、置换、向量空间
    线性代数04.A的LU分解
    线性代数03.矩阵的乘法和逆
    线性代数02.矩阵消元
    KEIL中三种编译模式以及对变量空间的影响
    Python之常用模块(三)random模块和序列化
    Python之常用模块(二)time模块与datetime模块
    Python之常用模块(一)自定义模块
  • 原文地址:https://www.cnblogs.com/pangblog/p/3329090.html
Copyright © 2020-2023  润新知