正常流程到flashcache_map的1623行或1625行,按顺序先看读流程:
1221static void 1222flashcache_read(struct cache_c *dmc, struct bio *bio) 1223{ 1224 int index; 1225 int res; 1226 struct cacheblock *cacheblk; 1227 int queued; 1228 1229 DPRINTK("Got a %s for %llu %u bytes)", 1230 (bio_rw(bio) == READ ? "READ":"READA"), 1231 bio->bi_sector, bio->bi_size); 1232 1233 spin_lock_irq(&dmc->cache_spin_lock); 1234 res = flashcache_lookup(dmc, bio, &index); 1235 /* 1236 * Handle Cache Hit case first. 1237 * We need to handle 2 cases, BUSY and !BUSY. If BUSY, we enqueue the 1238 * bio for later. 1239 */ 1240 if (res > 0) { 1241 cacheblk = &dmc->cache[index]; 1242 if ((cacheblk->cache_state & VALID) && 1243 (cacheblk->dbn == bio->bi_sector)) { 1244 flashcache_read_hit(dmc, bio, index); 1245 return; 1246 } 1247 } 1248 /* 1249 * In all cases except for a cache hit (and VALID), test for potential 1250 * invalidations that we need to do. 1251 */ 1252 queued = flashcache_inval_blocks(dmc, bio); 1253 if (queued) { 1254 if (unlikely(queued < 0)) 1255 flashcache_bio_endio(bio, -EIO); 1256 spin_unlock_irq(&dmc->cache_spin_lock); 1257 return; 1258 } 1259 if (res == -1 || flashcache_uncacheable(dmc)) { 1260 /* No room or non-cacheable */ 1261 spin_unlock_irq(&dmc->cache_spin_lock); 1262 DPRINTK("Cache read: Block %llu(%lu):%s", 1263 bio->bi_sector, bio->bi_size, "CACHE MISS & NO ROOM"); 1264 if (res == -1) 1265 flashcache_clean_set(dmc, hash_block(dmc, bio->bi_sector)); 1266 /* Start uncached IO */ 1267 flashcache_start_uncached_io(dmc, bio); 1268 return; 1269 } 1270 /* 1271 * (res == INVALID) Cache Miss 1272 * And we found cache blocks to replace 1273 * Claim the cache blocks before giving up the spinlock 1274 */ 1275 if (dmc->cache[index].cache_state & VALID) 1276 dmc->replace++; 1277 else 1278 dmc->cached_blocks++; 1279 dmc->cache[index].cache_state = VALID | DISKREADINPROG; 1280 dmc->cache[index].dbn = bio->bi_sector; 1281 spin_unlock_irq(&dmc->cache_spin_lock); 1282 1283 DPRINTK("Cache read: Block %llu(%lu), index = %d:%s", 1284 bio->bi_sector, 
bio->bi_size, index, "CACHE MISS & REPLACE"); 1285 flashcache_read_miss(dmc, bio, index); 1286}
我非常喜欢flashcache这种小资的感觉,每个函数都比较短,大部分都没有超过100行的。不像neil大哥写的md代码的函数动辄几百行,上千行,总是望啊望啊望不到边。当然不是说我不喜欢neil的代码,实际上他的代码是非常非常好的,因为md代码已经有十多年的历史了,大的框架仍然没有太大的改变,仍能保持那么优雅已经是十分难得了。最新版本的md还加了许多新功能,像bad block和replacement机制都是非常实用的。所以对于一名优秀的软件工程师来说,并不在于写了多少行代码,而是编写的软件运行在多少台机器上,为用户创造了多少价值。
第1234行是查找bio是否命中,flashcache_lookup函数我们在之前的文章里已经分析过了。第1244行是命中的情况,我们跟进看看:
1119static void 1120flashcache_read_hit(struct cache_c *dmc, struct bio* bio, int index) 1121{ 1122 struct cacheblock *cacheblk; 1123 struct pending_job *pjob; 1124 1125 cacheblk = &dmc->cache[index]; 1126 if (!(cacheblk->cache_state & BLOCK_IO_INPROG) && (cacheblk->head == NULL)) { 1127 struct kcached_job *job; 1128 1129 cacheblk->cache_state |= CACHEREADINPROG; 1130 dmc->read_hits++; 1131 spin_unlock_irq(&dmc->cache_spin_lock); 1132 DPRINTK("Cache read: Block %llu(%lu), index = %d:%s", 1133 bio->bi_sector, bio->bi_size, index, "CACHE HIT"); 1134 job = new_kcached_job(dmc, bio, index); 1135 if (unlikely(sysctl_flashcache_error_inject & READ_HIT_JOB_ALLOC_FAIL)) { 1136 if (job) 1137 flashcache_free_cache_job(job); 1138 job = NULL; 1139 sysctl_flashcache_error_inject &= ~READ_HIT_JOB_ALLOC_FAIL; 1140 } 1141 if (unlikely(job == NULL)) { 1142 /* 1143 * We have a read hit, and can't allocate a job. 1144 * Since we dropped the spinlock, we have to drain any 1145 * pending jobs. 1146 */ 1147 DMERR("flashcache: Read (hit) failed ! 
Can't allocate memory for cache IO, block %lu", 1148 cacheblk->dbn); 1149 flashcache_bio_endio(bio, -EIO); 1150 spin_lock_irq(&dmc->cache_spin_lock); 1151 flashcache_free_pending_jobs(dmc, cacheblk, -EIO); 1152 cacheblk->cache_state &= ~(BLOCK_IO_INPROG); 1153 spin_unlock_irq(&dmc->cache_spin_lock); 1154 } else { 1155 job->action = READCACHE; /* Fetch data from cache */ 1156 atomic_inc(&dmc->nr_jobs); 1157 dmc->ssd_reads++; 1158 dm_io_async_bvec(1, &job->cache, READ, 1159 bio->bi_io_vec + bio->bi_idx, 1160 flashcache_io_callback, job); 1161 flashcache_unplug_device(dmc->cache_dev->bdev); 1162 } 1163 } else { 1164 pjob = flashcache_alloc_pending_job(dmc); 1165 if (unlikely(sysctl_flashcache_error_inject & READ_HIT_PENDING_JOB_ALLOC_FAIL)) { 1166 if (pjob) { 1167 flashcache_free_pending_job(pjob); 1168 pjob = NULL; 1169 } 1170 sysctl_flashcache_error_inject &= ~READ_HIT_PENDING_JOB_ALLOC_FAIL; 1171 } 1172 if (pjob == NULL) 1173 flashcache_bio_endio(bio, -EIO); 1174 else 1175 flashcache_enq_pending(dmc, bio, index, READCACHE, pjob); 1176 spin_unlock_irq(&dmc->cache_spin_lock); 1177 } 1178}
首先获取这个cache块管理结构,第1126行判断cache块不忙的情况,进入1129行设置状态为从cache读,第1134行创建一个kcached_job,在1141行申请kcached_job失败时就对bio返回失败。申请成功到1155行将job->action设置为READCACHE,再调用dm_io_async_bvec下发请求,当请求回来时就会调用这里设置的回调函数flashcache_io_callback。再继续看读SSD返回之后是怎么处理的,我们只看该函数对READCACHE的处理:
151 case READCACHE: 152 DPRINTK("flashcache_io_callback: READCACHE %d", 153 index); 154 spin_lock_irqsave(&dmc->cache_spin_lock, flags); 155 if (unlikely(sysctl_flashcache_error_inject & READCACHE_ERROR)) { 156 job->error = error = -EIO; 157 sysctl_flashcache_error_inject &= ~READCACHE_ERROR; 158 } 159 VERIFY(cacheblk->cache_state & CACHEREADINPROG); 160 spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); 161 if (unlikely(error)) 162 dmc->ssd_read_errors++; 163#ifdef FLASHCACHE_DO_CHECKSUMS 164 if (likely(error == 0)) { 165 if (flashcache_validate_checksum(job)) { 166 DMERR("flashcache_io_callback: Checksum mismatch at disk offset %lu", 167 job->disk.sector); 168 error = -EIO; 169 } 170 } 171#endif 172 flashcache_bio_endio(bio, error); 173 break;
实际上真正有意义的就是第172行,将请求done回去了。这样我们就完成了一次读命中的处理。
读命中处理还有一种情况就是第1163行cache块忙的情况,这个时候就申请一个pending_job,挂到cache块上,等该cache块上正在处理的请求回来的时候再调度。
继续回到flashcache_read,看不命中的情况。
到第1259行flashcache_uncacheable函数是管理命令相关的,加了黑名单之后会跳过cache,直接下发到磁盘。
到1264行res是flashcache_lookup返回的,为-1就表示获取不到可用的cache块,这时就调用flashcache_clean_set清除一下脏块。但获取不到cache块并不是说请求就结束了,还得下发到磁盘,1267行flashcache_start_uncached_io将请求直接下发到磁盘。
第1275行到1278行是统计信息,根据这些信息可以知道flashcache的运行状况,用于flashcache的性能优化。
接着1279行设置cache块的状态,1280行设置cache块对应磁盘上的扇区,最后调用flashcache_read_miss下发请求:
1180static void 1181flashcache_read_miss(struct cache_c *dmc, struct bio* bio, 1182 int index) 1183{ 1184 struct kcached_job *job; 1185 struct cacheblock *cacheblk = &dmc->cache[index]; 1186 1187 job = new_kcached_job(dmc, bio, index); 1188 if (unlikely(sysctl_flashcache_error_inject & READ_MISS_JOB_ALLOC_FAIL)) { 1189 if (job) 1190 flashcache_free_cache_job(job); 1191 job = NULL; 1192 sysctl_flashcache_error_inject &= ~READ_MISS_JOB_ALLOC_FAIL; 1193 } 1194 if (unlikely(job == NULL)) { 1195 /* 1196 * We have a read miss, and can't allocate a job. 1197 * Since we dropped the spinlock, we have to drain any 1198 * pending jobs. 1199 */ 1200 DMERR("flashcache: Read (miss) failed ! Can't allocate memory for cache IO, block %lu", 1201 cacheblk->dbn); 1202 flashcache_bio_endio(bio, -EIO); 1203 spin_lock_irq(&dmc->cache_spin_lock); 1204 dmc->cached_blocks--; 1205 cacheblk->cache_state &= ~VALID; 1206 cacheblk->cache_state |= INVALID; 1207 flashcache_free_pending_jobs(dmc, cacheblk, -EIO); 1208 cacheblk->cache_state &= ~(BLOCK_IO_INPROG); 1209 spin_unlock_irq(&dmc->cache_spin_lock); 1210 } else { 1211 job->action = READDISK; /* Fetch data from the source device */ 1212 atomic_inc(&dmc->nr_jobs); 1213 dmc->disk_reads++; 1214 dm_io_async_bvec(1, &job->disk, READ, 1215 bio->bi_io_vec + bio->bi_idx, 1216 flashcache_io_callback, job); 1217 flashcache_clean_set(dmc, index / dmc->assoc); 1218 } 1219}
在第1187行申请了一个kcached_job,申请成功就到1211行,设置job->action=READDISK,调用dm_io_async_bvec直接从磁盘读取数据。接着调用flashcache_clean_set检查一下水位线。再看这里读磁盘的回调函数flashcache_io_callback,按理说读完磁盘就可以直接向上层返回数据,但这里还要把数据缓存起来之后再返回。
113void 114flashcache_io_callback(unsigned long error, void *context) 115{ 116 struct kcached_job *job = (struct kcached_job *) context; 117 struct cache_c *dmc = job->dmc; 118 struct bio *bio; 119 unsigned long flags; 120 int index = job->index; 121 struct cacheblock *cacheblk = &dmc->cache[index]; 122 123 VERIFY(index != -1); 124 bio = job->bio; 125 VERIFY(bio != NULL); 126 if (error) 127 DMERR("flashcache_io_callback: io error %ld block %lu action %d", 128 error, job->disk.sector, job->action); 129 job->error = error; 130 switch (job->action) { 131 case READDISK: 132 DPRINTK("flashcache_io_callback: READDISK %d", 133 index); 134 spin_lock_irqsave(&dmc->cache_spin_lock, flags); 135 if (unlikely(sysctl_flashcache_error_inject & READDISK_ERROR)) { 136 job->error = error = -EIO; 137 sysctl_flashcache_error_inject &= ~READDISK_ERROR; 138 } 139 VERIFY(cacheblk->cache_state & DISKREADINPROG); 140 spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); 141 if (likely(error == 0)) { 142 /* Kick off the write to the cache */ 143 job->action = READFILL; 144 flashcache_enqueue_readfill(dmc, job); 145 return; 146 } else { 147 dmc->disk_read_errors++; 148 flashcache_bio_endio(bio, error); 149 } 150 break; 174 case READFILL: 175 DPRINTK("flashcache_io_callback: READFILL %d", 176 index); 177 spin_lock_irqsave(&dmc->cache_spin_lock, flags); 178 if (unlikely(sysctl_flashcache_error_inject & READFILL_ERROR)) { 179 job->error = error = -EIO; 180 sysctl_flashcache_error_inject &= ~READFILL_ERROR; 181 } 182 if (unlikely(error)) 183 dmc->ssd_write_errors++; 184 VERIFY(cacheblk->cache_state & DISKREADINPROG); 185 spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); 186 flashcache_bio_endio(bio, error); 187 break;
归纳一下读不命中的流程:
1)创建一个kcached_job,直接下发到磁盘
2)读磁盘返回到flashcache_io_callback,进入131行的READDISK分支,在143~144行把job->action改为READFILL并下发,将读出来的数据写到缓存中
3)写缓存完成后返回到flashcache_io_callback,进入174行的READFILL分支,在186行调用flashcache_bio_endio将数据返回给上层
到这里已经将读流程简单过了一遍,下一个小节介绍写流程。