其实到目前为止,如果对读流程已经能轻松地看懂了,那么写流程不需要太多脑细胞。我觉得再写下去没有太大的必要了,后面想想为了保持flashcache完整性,还是写出来吧。接着到写流程:
/*
 * flashcache_write: entry point for a write bio.
 *
 * Takes dmc->cache_spin_lock and calls flashcache_lookup().
 *  - Lookup returned an index (res != -1): the block counts as a hit only
 *    when it is VALID and its dbn equals bio->bi_sector, in which case
 *    flashcache_write_hit() is called; otherwise flashcache_write_miss()
 *    is called to recycle the block. This function then returns without
 *    unlocking: both callees release cache_spin_lock themselves.
 *  - Lookup returned -1: flashcache_inval_blocks() runs under the lock,
 *    then the lock is dropped. A non-zero result ends processing here
 *    (a negative one first completes the bio with -EIO). A zero result
 *    sends the bio through flashcache_start_uncached_io() and then calls
 *    flashcache_clean_set() on hash_block(dmc, bio->bi_sector).
 *
 * The leading numbers in the listing are the original source line numbers
 * that the surrounding text refers to.
 */
1530static void 1531flashcache_write(struct cache_c *dmc, struct bio *bio) 1532{ 1533 int index; 1534 int res; 1535 struct cacheblock *cacheblk; 1536 int queued; 1537 1538 spin_lock_irq(&dmc->cache_spin_lock); 1539 res = flashcache_lookup(dmc, bio, &index); 1540 /* 1541 * If cache hit and !BUSY, simply redirty page. 1542 * If cache hit and BUSY, must wait for IO in prog to complete. 1543 * If cache miss and found a block to recycle, we need to 1544 * (a) invalidate any partial hits, 1545 * (b) write to cache. 1546 */ 1547 if (res != -1) { 1548 /* Cache Hit */ 1549 cacheblk = &dmc->cache[index]; 1550 if ((cacheblk->cache_state & VALID) && 1551 (cacheblk->dbn == bio->bi_sector)) { 1552 /* Cache Hit */ 1553 flashcache_write_hit(dmc, bio, index); 1554 } else { 1555 /* Cache Miss, found block to recycle */ 1556 flashcache_write_miss(dmc, bio, index); 1557 } 1558 return; 1559 } 1560 /* 1561 * No room in the set. We cannot write to the cache and have to 1562 * send the request to disk. Before we do that, we must check 1563 * for potential invalidations ! 1564 */ 1565 queued = flashcache_inval_blocks(dmc, bio); 1566 spin_unlock_irq(&dmc->cache_spin_lock); 1567 if (queued) { 1568 if (unlikely(queued < 0)) 1569 flashcache_bio_endio(bio, -EIO); 1570 return; 1571 } 1572 /* Start uncached IO */ 1573 flashcache_start_uncached_io(dmc, bio); 1574 flashcache_clean_set(dmc, hash_block(dmc, bio->bi_sector)); 1575}
第1539行查找是否命中,这里有几种情况:
1)命中且cache空闲,直接写cache块并设置DIRTY标志
2)命中且cache忙,等待上一个请求完成
3)不命中并且找到可用的cache块,invalid有交集的cache块,然后再写到cache
4)没有可用cache块,invalid有交集的cache块,然后直接写到磁盘
第4种情况在第1573行直接写到磁盘,最后调用的还是dm_io_async_bvec。
再看第1种情况,进入到命中处理分支:
/*
 * flashcache_write_hit: handle a write that hit cache block `index`.
 *
 * Entered with dmc->cache_spin_lock held (flashcache_write takes it before
 * calling); every path below releases it before returning.
 *
 *  - Block idle (no BLOCK_IO_INPROG bit set and cacheblk->head == NULL):
 *    bump write_hits (and dirty_write_hits when the block is already DIRTY),
 *    set CACHEWRITEINPROG, drop the lock and allocate a kcached_job.
 *      * Allocation failed (or a failure was injected through
 *        sysctl_flashcache_error_inject): log, complete the bio with -EIO,
 *        re-take the lock, fail the block's pending jobs with -EIO, clear
 *        the BLOCK_IO_INPROG bits and unlock.
 *        NOTE(review): this relies on CACHEWRITEINPROG being part of the
 *        BLOCK_IO_INPROG mask -- confirm against the flag definitions.
 *      * Allocation succeeded: mark the job WRITECACHE, count it in nr_jobs
 *        and ssd_writes, issue the async WRITE to job->cache via
 *        dm_io_async_bvec() with flashcache_io_callback as completion,
 *        unplug the cache device and call flashcache_clean_set() for set
 *        index / dmc->assoc.
 *  - Block busy: allocate a pending job while still holding the lock; on
 *    failure complete the bio with -EIO, otherwise enqueue the bio on the
 *    block with action WRITECACHE; then unlock.
 *
 * The leading numbers in the listing are the original source line numbers
 * that the surrounding text refers to.
 */
1468static void 1469flashcache_write_hit(struct cache_c *dmc, struct bio *bio, int index) 1470{ 1471 struct cacheblock *cacheblk; 1472 struct pending_job *pjob; 1473 struct kcached_job *job; 1474 1475 cacheblk = &dmc->cache[index]; 1476 if (!(cacheblk->cache_state & BLOCK_IO_INPROG) && (cacheblk->head == NULL)) { 1477 if (cacheblk->cache_state & DIRTY) 1478 dmc->dirty_write_hits++; 1479 dmc->write_hits++; 1480 cacheblk->cache_state |= CACHEWRITEINPROG; 1481 spin_unlock_irq(&dmc->cache_spin_lock); 1482 job = new_kcached_job(dmc, bio, index); 1483 if (unlikely(sysctl_flashcache_error_inject & WRITE_HIT_JOB_ALLOC_FAIL)) { 1484 if (job) 1485 flashcache_free_cache_job(job); 1486 job = NULL; 1487 sysctl_flashcache_error_inject &= ~WRITE_HIT_JOB_ALLOC_FAIL; 1488 } 1489 if (unlikely(job == NULL)) { 1490 /* 1491 * We have a write hit, and can't allocate a job. 1492 * Since we dropped the spinlock, we have to drain any 1493 * pending jobs. 1494 */ 1495 DMERR("flashcache: Write (hit) failed ! Can't allocate memory for cache IO, block %lu", 1496 cacheblk->dbn); 1497 flashcache_bio_endio(bio, -EIO); 1498 spin_lock_irq(&dmc->cache_spin_lock); 1499 flashcache_free_pending_jobs(dmc, cacheblk, -EIO); 1500 cacheblk->cache_state &= ~(BLOCK_IO_INPROG); 1501 spin_unlock_irq(&dmc->cache_spin_lock); 1502 } else { 1503 job->action = WRITECACHE; /* Write data to the source device */ 1504 DPRINTK("Queue job for %llu", bio->bi_sector); 1505 atomic_inc(&dmc->nr_jobs); 1506 dmc->ssd_writes++; 1507 dm_io_async_bvec(1, &job->cache, WRITE, 1508 bio->bi_io_vec + bio->bi_idx, 1509 flashcache_io_callback, job); 1510 flashcache_unplug_device(dmc->cache_dev->bdev); 1511 flashcache_clean_set(dmc, index / dmc->assoc); 1512 } 1513 } else { 1514 pjob = flashcache_alloc_pending_job(dmc); 1515 if (unlikely(sysctl_flashcache_error_inject & WRITE_HIT_PENDING_JOB_ALLOC_FAIL)) { 1516 if (pjob) { 1517 flashcache_free_pending_job(pjob); 1518 pjob = NULL; 1519 } 1520 sysctl_flashcache_error_inject &= 
~WRITE_HIT_PENDING_JOB_ALLOC_FAIL; 1521 } 1522 if (unlikely(pjob == NULL)) 1523 flashcache_bio_endio(bio, -EIO); 1524 else 1525 flashcache_enq_pending(dmc, bio, index, WRITECACHE, pjob); 1526 spin_unlock_irq(&dmc->cache_spin_lock); 1527 } 1528}
在1475行获得cache块,在1476行判断是否空闲,在有IO处理或者有pending_job挂着的时候都视为忙。如果cache块空闲,则进入if分支,接下来又是套路了,创建kcached_job,成功的话就在1507行下发写请求。然后接着看写返回时做了哪些处理?进入写回调函数之前,要记住这里设置了两个标志,一个是1480行cache块的CACHEWRITEINPROG,另一个是1503行kcached_job的WRITECACHE,带着这两个标志进入到写回调函数flashcache_io_callback,并直接找到需要的地方:
188 case WRITECACHE: 189 DPRINTK("flashcache_io_callback: WRITECACHE %d", 190 index); 191 spin_lock_irqsave(&dmc->cache_spin_lock, flags); 192 if (unlikely(sysctl_flashcache_error_inject & WRITECACHE_ERROR)) { 193 job->error = error = -EIO; 194 sysctl_flashcache_error_inject &= ~WRITECACHE_ERROR; 195 } 196 VERIFY(cacheblk->cache_state & CACHEWRITEINPROG); 197 if (likely(error == 0)) { 198#ifdef FLASHCACHE_DO_CHECKSUMS 199 dmc->checksum_store++; 200 spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); 201 flashcache_store_checksum(job); 202 /* 203 * We need to update the metadata on a DIRTY->DIRTY as well 204 * since we save the checksums. 205 */ 206 push_md_io(job); 207 schedule_work(&_kcached_wq); 208 return; 209#else 210 spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); 211 /* Only do cache metadata update on a non-DIRTY->DIRTY transition */ 212 if ((cacheblk->cache_state & DIRTY) == 0) { 213 push_md_io(job); 214 schedule_work(&_kcached_wq); 215 return; 216 } 217#endif 218 } else { 219 dmc->ssd_write_errors++; 220 spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); 221 } 222 flashcache_bio_endio(bio, error); 223 break;
写到缓存成功的话,暂不管cache块的校验值,会来到210行,判断原来的cache块是否为脏,如果为脏那就不需要再更新管理信息了。因为如果cache块本来就是脏,那新来的IO可以直接覆盖到cache块上去。反之如果原来cache块是干净的,那么这个时候要把cache块已经变脏记录到SSD上,于是进入了第213行开始写cache块管理信息。到了这里似乎cache块已经写到缓存中,IO可以返回了,但是到了第215行为什么直接return呢?这里涉及到数据一致性的问题。其实只要cache块管理结构还没有写到缓存中,这个写请求就不能算完成。试想如果在这里调用了第222行flashcache_bio_endio把IO返回了,会有什么样的后果?其实大多数情况下是没有什么问题的,但如果在这个时候系统掉电或者宕机了,这时候缓存中记录的cache块状态是干净的,但又已经跟上层返回说IO已经写成功了,那么最后这一次写的数据就丢失了。当然对于大部分用户来说,这一点数据算什么?但对于像银行这样的系统,当你把辛苦了十年的积蓄存到自动取款机,这时自动取款机告诉你存成功了,但不幸的是后台刚好发生了我们上面描述的问题。结果你再查的时候没有你刚才存进去的钱,但你的钱确确实实被取款机收进去了,这时你会有怎样的感受?这里只是举个例子,说明数据一致性在某些应用中是非常重要的,当然现实中绝大多数银行是不会有这样的问题的,银行可以有日志查出来,系统也有热备,也是带UPS保护的。
如果原来的cache块为脏的情况就以第222行flashcache_bio_endio结束了。
如果不为脏,那么调用213行将cache块管理结构写到缓存。
/*
 * push_md_io: queue `job` on the _md_io_jobs list via push().
 * Nothing else happens here; the caller is expected to kick the work
 * queue afterwards (the WRITECACHE completion path calls schedule_work()
 * right after this).
 */
272void 273push_md_io(struct kcached_job *job) 274{ 275 push(&_md_io_jobs, job); 276}
这里只是简单放到队列中,具体处理的是第214行唤醒的工作队列。该工作队列对应的处理函数是:
303 process_jobs(&_md_io_jobs, flashcache_md_write);
这个函数怎么这么面熟呢?因为flashcache_md_write在第一小节里已经介绍过了,这里不再重复。
这里小结一下写命中并且原cache块为干净的数据流程:
1)写命中调用dm_io_async_bvec写缓存
2)写缓存完成回调函数flashcache_io_callback,判断原cache块为干净,需要写cache块管理结构
3)由工作队列_kcached_wq调用flashcache_md_write写cache块管理结构,最终由flashcache_md_write_kickoff调用dm_io_async_bvec将cache块管理结构写到缓存
4)写缓存完成之后调用flashcache_md_write_callback
5)由工作队列_kcached_wq调用flashcache_md_write_done处理
6)在flashcache_md_write_done中判断job类型为WRITECACHE,最后调用flashcache_bio_endio返回
至此,这个IO才完成使命。
接下来讲第3种情况,这种情况就非常简单了。
/*
 * flashcache_write_miss: handle a write that missed but for which
 * flashcache_lookup() found block `index` to recycle.
 *
 * Entered with dmc->cache_spin_lock held (flashcache_write takes it before
 * calling); every path below releases it before returning.
 *
 *  1. flashcache_inval_blocks() is called first. A non-zero result ends
 *     processing here: a negative value completes the bio with -EIO, then
 *     the lock is dropped and the function returns.
 *  2. Otherwise account for the block (wr_replace++ when it was VALID,
 *     cached_blocks++ when it was not), overwrite its state with
 *     VALID | CACHEWRITEINPROG, record dbn = bio->bi_sector, and drop
 *     the lock.
 *  3. Allocate a kcached_job.
 *      * Allocation failed (or a failure was injected through
 *        sysctl_flashcache_error_inject): log, complete the bio with -EIO,
 *        re-take the lock, decrement cached_blocks, flip the block from
 *        VALID to INVALID, fail its pending jobs with -EIO, clear the
 *        BLOCK_IO_INPROG bits and unlock.
 *      * Allocation succeeded: mark the job WRITECACHE, count it in nr_jobs
 *        and ssd_writes, issue the async WRITE to job->cache via
 *        dm_io_async_bvec() with flashcache_io_callback as completion,
 *        unplug the cache device and call flashcache_clean_set() for set
 *        index / dmc->assoc.
 *
 * The leading numbers in the listing are the original source line numbers
 * that the surrounding text refers to.
 */
1411static void 1412flashcache_write_miss(struct cache_c *dmc, struct bio *bio, int index) 1413{ 1414 struct cacheblock *cacheblk; 1415 struct kcached_job *job; 1416 int queued; 1417 1418 cacheblk = &dmc->cache[index]; 1419 queued = flashcache_inval_blocks(dmc, bio); 1420 if (queued) { 1421 if (unlikely(queued < 0)) 1422 flashcache_bio_endio(bio, -EIO); 1423 spin_unlock_irq(&dmc->cache_spin_lock); 1424 return; 1425 } 1426 if (cacheblk->cache_state & VALID) 1427 dmc->wr_replace++; 1428 else 1429 dmc->cached_blocks++; 1430 cacheblk->cache_state = VALID | CACHEWRITEINPROG; 1431 cacheblk->dbn = bio->bi_sector; 1432 spin_unlock_irq(&dmc->cache_spin_lock); 1433 job = new_kcached_job(dmc, bio, index); 1434 if (unlikely(sysctl_flashcache_error_inject & WRITE_MISS_JOB_ALLOC_FAIL)) { 1435 if (job) 1436 flashcache_free_cache_job(job); 1437 job = NULL; 1438 sysctl_flashcache_error_inject &= ~WRITE_MISS_JOB_ALLOC_FAIL; 1439 } 1440 if (unlikely(job == NULL)) { 1441 /* 1442 * We have a write miss, and can't allocate a job. 1443 * Since we dropped the spinlock, we have to drain any 1444 * pending jobs. 1445 */ 1446 DMERR("flashcache: Write (miss) failed ! Can't allocate memory for cache IO, block %lu", 1447 cacheblk->dbn); 1448 flashcache_bio_endio(bio, -EIO); 1449 spin_lock_irq(&dmc->cache_spin_lock); 1450 dmc->cached_blocks--; 1451 cacheblk->cache_state &= ~VALID; 1452 cacheblk->cache_state |= INVALID; 1453 flashcache_free_pending_jobs(dmc, cacheblk, -EIO); 1454 cacheblk->cache_state &= ~(BLOCK_IO_INPROG); 1455 spin_unlock_irq(&dmc->cache_spin_lock); 1456 } else { 1457 job->action = WRITECACHE; 1458 atomic_inc(&dmc->nr_jobs); 1459 dmc->ssd_writes++; 1460 dm_io_async_bvec(1, &job->cache, WRITE, 1461 bio->bi_io_vec + bio->bi_idx, 1462 flashcache_io_callback, job); 1463 flashcache_unplug_device(dmc->cache_dev->bdev); 1464 flashcache_clean_set(dmc, index / dmc->assoc); 1465 } 1466}
大多数函数都已经是老朋友了。第1430行给cache块设置了VALID标志,表示块中有有效数据,第1431行把bio->bi_sector记录为cache块对应的磁盘扇区号。接着到第1460行下发写缓存请求,写缓存完成后的处理与写命中的情况一样,就不再继续跟进了。
下一节讲缓存超水位线写回磁盘。