• linux内核源码阅读之facebook硬盘加速flashcache之五



    正常流程到flashcache_map的1623行或1625行,按顺序先看读流程:
    1221static void
    1222flashcache_read(struct cache_c *dmc, struct bio *bio)
    1223{
    1224	int index;
    1225	int res;
    1226	struct cacheblock *cacheblk;
    1227	int queued;
    1228
    1229	DPRINTK("Got a %s for %llu  %u bytes)",
    1230	        (bio_rw(bio) == READ ? "READ":"READA"), 
    1231		bio->bi_sector, bio->bi_size);
    1232
    1233	spin_lock_irq(&dmc->cache_spin_lock);
    1234	res = flashcache_lookup(dmc, bio, &index);
    1235	/* 
    1236	 * Handle Cache Hit case first.
    1237	 * We need to handle 2 cases, BUSY and !BUSY. If BUSY, we enqueue the
    1238	 * bio for later.
    1239	 */
    1240	if (res > 0) {
    1241		cacheblk = &dmc->cache[index];
    1242		if ((cacheblk->cache_state & VALID) && 
    1243		    (cacheblk->dbn == bio->bi_sector)) {
    1244			flashcache_read_hit(dmc, bio, index);
    1245			return;
    1246		}
    1247	}
    1248	/*
    1249	 * In all cases except for a cache hit (and VALID), test for potential 
    1250	 * invalidations that we need to do.
    1251	 */
    1252	queued = flashcache_inval_blocks(dmc, bio);
    1253	if (queued) {
    1254		if (unlikely(queued < 0))
    1255			flashcache_bio_endio(bio, -EIO);
    1256		spin_unlock_irq(&dmc->cache_spin_lock);
    1257		return;
    1258	}
    1259	if (res == -1 || flashcache_uncacheable(dmc)) {
    1260		/* No room or non-cacheable */
    1261		spin_unlock_irq(&dmc->cache_spin_lock);
    1262		DPRINTK("Cache read: Block %llu(%lu):%s",
    1263			bio->bi_sector, bio->bi_size, "CACHE MISS & NO ROOM");
    1264		if (res == -1)
    1265			flashcache_clean_set(dmc, hash_block(dmc, bio->bi_sector));
    1266		/* Start uncached IO */
    1267		flashcache_start_uncached_io(dmc, bio);
    1268		return;
    1269	}
    1270	/* 
    1271	 * (res == INVALID) Cache Miss 
    1272	 * And we found cache blocks to replace
    1273	 * Claim the cache blocks before giving up the spinlock
    1274	 */
    1275	if (dmc->cache[index].cache_state & VALID)
    1276		dmc->replace++;
    1277	else
    1278		dmc->cached_blocks++;
    1279	dmc->cache[index].cache_state = VALID | DISKREADINPROG;
    1280	dmc->cache[index].dbn = bio->bi_sector;
    1281	spin_unlock_irq(&dmc->cache_spin_lock);
    1282
    1283	DPRINTK("Cache read: Block %llu(%lu), index = %d:%s",
    1284		bio->bi_sector, bio->bi_size, index, "CACHE MISS & REPLACE");
    1285	flashcache_read_miss(dmc, bio, index);
    1286}

    我非常喜欢flashcache这种小资的感觉,每个函数都比较短,大部分都没有超过100行的。不像neil大哥写的md代码的函数动辄几百行、上千行,总是望啊望啊望不到边。当然不是说我不喜欢neil的代码,实际上他的代码是非常非常好的,因为md代码已经有十多年的历史了,大的框架仍然没有太大的改变,仍能保持那么优雅已经是十分难得了。最新版本的md还加了许多新功能,像bad block和replacement机制都是非常实用的。所以对于一名优秀的软件工程师来说,重要的并不在于写了多少行代码,而在于编写的软件运行在多少台机器上,为用户创造了多少价值。
    第1234行是查找bio是否命中,flashcache_lookup函数我们在之前的文章里已经分析过了。第1244行是命中的情况,我们跟进看看:
    1119static void
    1120flashcache_read_hit(struct cache_c *dmc, struct bio* bio, int index)
    1121{
    1122	struct cacheblock *cacheblk;
    1123	struct pending_job *pjob;
    1124
    1125	cacheblk = &dmc->cache[index];
    1126	if (!(cacheblk->cache_state & BLOCK_IO_INPROG) && (cacheblk->head == NULL)) {
    1127		struct kcached_job *job;
    1128			
    1129		cacheblk->cache_state |= CACHEREADINPROG;
    1130		dmc->read_hits++;
    1131		spin_unlock_irq(&dmc->cache_spin_lock);
    1132		DPRINTK("Cache read: Block %llu(%lu), index = %d:%s",
    1133			bio->bi_sector, bio->bi_size, index, "CACHE HIT");
    1134		job = new_kcached_job(dmc, bio, index);
    1135		if (unlikely(sysctl_flashcache_error_inject & READ_HIT_JOB_ALLOC_FAIL)) {
    1136			if (job)
    1137				flashcache_free_cache_job(job);
    1138			job = NULL;
    1139			sysctl_flashcache_error_inject &= ~READ_HIT_JOB_ALLOC_FAIL;
    1140		}
    1141		if (unlikely(job == NULL)) {
    1142			/* 
    1143			 * We have a read hit, and can't allocate a job.
    1144			 * Since we dropped the spinlock, we have to drain any 
    1145			 * pending jobs.
    1146			 */
    1147			DMERR("flashcache: Read (hit) failed ! Can't allocate memory for cache IO, block %lu", 
    1148			      cacheblk->dbn);
    1149			flashcache_bio_endio(bio, -EIO);
    1150			spin_lock_irq(&dmc->cache_spin_lock);
    1151			flashcache_free_pending_jobs(dmc, cacheblk, -EIO);
    1152			cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
    1153			spin_unlock_irq(&dmc->cache_spin_lock);
    1154		} else {
    1155			job->action = READCACHE; /* Fetch data from cache */
    1156			atomic_inc(&dmc->nr_jobs);
    1157			dmc->ssd_reads++;
    1158			dm_io_async_bvec(1, &job->cache, READ,
    1159					 bio->bi_io_vec + bio->bi_idx,
    1160					 flashcache_io_callback, job);
    1161			flashcache_unplug_device(dmc->cache_dev->bdev);
    1162		}
    1163	} else {
    1164		pjob = flashcache_alloc_pending_job(dmc);
    1165		if (unlikely(sysctl_flashcache_error_inject & READ_HIT_PENDING_JOB_ALLOC_FAIL)) {
    1166			if (pjob) {
    1167				flashcache_free_pending_job(pjob);
    1168				pjob = NULL;
    1169			}
    1170			sysctl_flashcache_error_inject &= ~READ_HIT_PENDING_JOB_ALLOC_FAIL;
    1171		}
    1172		if (pjob == NULL)
    1173			flashcache_bio_endio(bio, -EIO);
    1174		else
    1175			flashcache_enq_pending(dmc, bio, index, READCACHE, pjob);
    1176		spin_unlock_irq(&dmc->cache_spin_lock);
    1177	}
    1178}

    首先获取这个cache块管理结构,第1126行判断cache块不忙的情况,进入1129行设置状态为从cache读,第1134行创建一个kcached_job,在1141行申请kcached_job失败时就对bio返回失败。申请成功到1155行将kcached_job的action设置为READCACHE,再调用dm_io_async_bvec下发请求,当请求回来时就会调用这里设置的回调函数flashcache_io_callback。再继续看读SSD返回后是怎么处理的,我们只看该函数对READCACHE的处理:
    151	case READCACHE:
    152		DPRINTK("flashcache_io_callback: READCACHE %d",
    153			index);
    154		spin_lock_irqsave(&dmc->cache_spin_lock, flags);
    155		if (unlikely(sysctl_flashcache_error_inject & READCACHE_ERROR)) {
    156			job->error = error = -EIO;
    157			sysctl_flashcache_error_inject &= ~READCACHE_ERROR;
    158		}
    159		VERIFY(cacheblk->cache_state & CACHEREADINPROG);
    160		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
    161		if (unlikely(error))
    162			dmc->ssd_read_errors++;
    163#ifdef FLASHCACHE_DO_CHECKSUMS
    164		if (likely(error == 0)) {
    165			if (flashcache_validate_checksum(job)) {
    166				DMERR("flashcache_io_callback: Checksum mismatch at disk offset %lu", 
    167				      job->disk.sector);
    168				error = -EIO;
    169			}
    170		}
    171#endif
    172		flashcache_bio_endio(bio, error);
    173		break;		       
    

    实际上真正有意义的就是第172行,将请求done回去了。这样我们就完成了一次读命中的处理。
    读命中处理还有一种情况就是第1163行cache块忙的情况,这个时候就申请一个pending_job,挂到cache块上,等该cache块上的前一个请求回来的时候再调度。

    继续回到flashcache_read,看不命中的情况。
    到第1259行flashcache_uncacheable函数是管理命令相关的,加了黑名单之后会跳过cache,直接下发到磁盘。
    到1264行res是flashcache_lookup返回的,为-1就表示获取不到可用的cache块,这时就调用flashcache_clean_set清除一下脏块。但获取不到cache块并不是说请求就结束了,还得下发到磁盘,1267行flashcache_start_uncached_io将请求直接下发到磁盘。
    第1275行到1278行是统计信息,根据这些信息可以知道flashcache的运行状况,用于flashcache的性能优化。
    接着1279行设置cache块的状态,1280行设置cache块对应磁盘上的扇区,最后调用flashcache_read_miss下发请求:
    1180static void
    1181flashcache_read_miss(struct cache_c *dmc, struct bio* bio,
    1182               int index)
    1183{
    1184     struct kcached_job *job;
    1185     struct cacheblock *cacheblk = &dmc->cache[index];
    1186
    1187     job = new_kcached_job(dmc, bio, index);
    1188     if (unlikely(sysctl_flashcache_error_inject & READ_MISS_JOB_ALLOC_FAIL)) {
    1189          if (job)
    1190               flashcache_free_cache_job(job);
    1191          job = NULL;
    1192          sysctl_flashcache_error_inject &= ~READ_MISS_JOB_ALLOC_FAIL;
    1193     }
    1194     if (unlikely(job == NULL)) {
    1195          /* 
    1196          * We have a read miss, and can't allocate a job.
    1197          * Since we dropped the spinlock, we have to drain any 
    1198          * pending jobs.
    1199          */
    1200          DMERR("flashcache: Read (miss) failed ! Can't allocate memory for cache IO, block %lu", 
    1201                cacheblk->dbn);
    1202          flashcache_bio_endio(bio, -EIO);
    1203          spin_lock_irq(&dmc->cache_spin_lock);
    1204          dmc->cached_blocks--;
    1205          cacheblk->cache_state &= ~VALID;
    1206          cacheblk->cache_state |= INVALID;
    1207          flashcache_free_pending_jobs(dmc, cacheblk, -EIO);
    1208          cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
    1209          spin_unlock_irq(&dmc->cache_spin_lock);
    1210     } else {
    1211          job->action = READDISK; /* Fetch data from the source device */
    1212          atomic_inc(&dmc->nr_jobs);
    1213          dmc->disk_reads++;
    1214          dm_io_async_bvec(1, &job->disk, READ,
    1215                    bio->bi_io_vec + bio->bi_idx,
    1216                    flashcache_io_callback, job);
    1217          flashcache_clean_set(dmc, index / dmc->assoc);
    1218     }
    1219}

    在第1187行申请了一个kcached_job,申请成功就到1211行,设置job->action=READDISK,调用dm_io_async_bvec直接从磁盘读取数据。接着调用flashcache_clean_set检查一下水位线。再看这里读磁盘的回调函数flashcache_io_callback,按理说读完磁盘就可以直接向上层返回数据,但这里还要把数据缓存起来之后再返回。
    113void 
    114flashcache_io_callback(unsigned long error, void *context)
    115{
    116     struct kcached_job *job = (struct kcached_job *) context;
    117     struct cache_c *dmc = job->dmc;
    118     struct bio *bio;
    119     unsigned long flags;
    120     int index = job->index;
    121     struct cacheblock *cacheblk = &dmc->cache[index];
    122
    123     VERIFY(index != -1);          
    124     bio = job->bio;
    125     VERIFY(bio != NULL);
    126     if (error)
    127          DMERR("flashcache_io_callback: io error %ld block %lu action %d", 
    128                error, job->disk.sector, job->action);
    129     job->error = error;
    130     switch (job->action) {
    131     case READDISK:
    132          DPRINTK("flashcache_io_callback: READDISK  %d",
    133               index);
    134          spin_lock_irqsave(&dmc->cache_spin_lock, flags);
    135          if (unlikely(sysctl_flashcache_error_inject & READDISK_ERROR)) {
    136               job->error = error = -EIO;
    137               sysctl_flashcache_error_inject &= ~READDISK_ERROR;
    138          }
    139          VERIFY(cacheblk->cache_state & DISKREADINPROG);
    140          spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
    141          if (likely(error == 0)) {
    142               /* Kick off the write to the cache */
    143               job->action = READFILL;
    144               flashcache_enqueue_readfill(dmc, job);
    145               return;
    146          } else {
    147               dmc->disk_read_errors++;               
    148               flashcache_bio_endio(bio, error);
    149          }
    150          break;
              
    174     case READFILL:
    175          DPRINTK("flashcache_io_callback: READFILL %d",
    176               index);
    177          spin_lock_irqsave(&dmc->cache_spin_lock, flags);
    178          if (unlikely(sysctl_flashcache_error_inject & READFILL_ERROR)) {
    179               job->error = error = -EIO;
    180               sysctl_flashcache_error_inject &= ~READFILL_ERROR;
    181          }
    182          if (unlikely(error))
    183               dmc->ssd_write_errors++;
    184          VERIFY(cacheblk->cache_state & DISKREADINPROG);
    185          spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
    186          flashcache_bio_endio(bio, error);
    187          break;

    归纳一下读不命中的流程:
    1)创建一个kcached_job,直接下发到磁盘
    2)读磁盘返回到flashcache_io_callback,进入131行的READDISK分支,在143~144行下发READFILL,将读出来的数据写到缓存中
    3)写缓存成功并返回到flashcache_io_callback,进入174行的READFILL分支,在186行将数据返回给上层
    到这里已经将读流程简单过了一遍,下一个小节介绍写流程。
  • 相关阅读:
    FZU.Software Engineering1816 ·The Second Assignment of the Team
    18软工实践-第五次作业-结对作业2
    福大软工1816 · 第四次作业
    软件工程实践第三次作业——结对作业(一)
    软工第二次作业——个人项目
    福大软工1816 · 团队现场编程实战(抽奖系统)
    Alpha 冲刺 (3/10)
    Alpha 冲刺 (2/10)
    Alpha 冲刺 (1/10)
    福大软工 · 第七次作业
  • 原文地址:https://www.cnblogs.com/keanuyaoo/p/3327665.html
Copyright © 2020-2023  润新知