• LevelDB Source Code Reading (3): The Put Operation


    In the earlier post on installing and using leveldb on Linux, we wrote the following test program; its source and output are shown below:

    #include <iostream>
    #include <string>
    #include <assert.h>    
    #include "leveldb/db.h"    
    
    using namespace std;
    
    int main(void) 
    {       
    
        leveldb::DB      *db;    
        leveldb::Options  options;    
        options.create_if_missing = true;    
    
        // open
        leveldb::Status status = leveldb::DB::Open(options,"/tmp/testdb", &db);    
        assert(status.ok());    
    
        string key = "name";    
        string value = "chenqi";    
    
        // write
        status = db->Put(leveldb::WriteOptions(), key, value);    
        assert(status.ok());    
    
        // read
        status = db->Get(leveldb::ReadOptions(), key, &value);    
        assert(status.ok());    
    
        cout<<value<<endl;    
    
        // delete
        status = db->Delete(leveldb::WriteOptions(), key);    
        assert(status.ok());        
    
        status = db->Get(leveldb::ReadOptions(),key, &value);    
        if(!status.ok()) {
            cerr<<key<<"    "<<status.ToString()<<endl;
        } else {
            cout<<key<<"==="<<value<<endl;    
        }   
    
        // close 
        delete db;    
    
        return 0;    
    }

    Output:

    chenqi
    name    NotFound:

    LevelDB's write path starts at DBImpl::Put and DBImpl::Delete in db/db_impl.cc. Both methods are thin wrappers around the DBImpl::Write interface: each packages the single operation into a WriteBatch and hands it to DBImpl::Write. In other words, internally leveldb treats even a standalone write as a batch write containing one operation. The source is as follows:

    // Convenience methods
    Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
      return DB::Put(o, key, val);
    }
    
    Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
      return DB::Delete(options, key);
    }
    
    // Default implementations of convenience methods that subclasses of DB
    // can call if they wish
    Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
      WriteBatch batch;
      batch.Put(key, value);
      return Write(opt, &batch);
    }
    
    Status DB::Delete(const WriteOptions& opt, const Slice& key) {
      WriteBatch batch;
      batch.Delete(key);
      return Write(opt, &batch);
    }
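
    Because a single Put or Delete is internally just a batch of one, a caller can equally well build a multi-operation WriteBatch and apply it atomically through one DB::Write call. A minimal sketch against the public API (the keys here are made up for illustration):

    #include <assert.h>
    #include "leveldb/db.h"
    #include "leveldb/write_batch.h"

    // Three operations travel through DBImpl::Write as a single batch
    // instead of three separate one-operation batches.
    void BatchedUpdate(leveldb::DB* db) {
      leveldb::WriteBatch batch;
      batch.Delete("old-key");
      batch.Put("name", "chenqi");
      batch.Put("city", "shanghai");
      leveldb::Status s = db->Write(leveldb::WriteOptions(), &batch);
      assert(s.ok());
    }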

    DBImpl::Put and DBImpl::Delete ultimately call DBImpl::Write to carry out the write. The source of DBImpl::Write is as follows:

    Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
      Writer w(&mutex_);
      w.batch = my_batch;
      w.sync = options.sync;
      w.done = false;
    
      MutexLock l(&mutex_);
      writers_.push_back(&w);
      while (!w.done && &w != writers_.front()) {
        w.cv.Wait();
      }
      if (w.done) {
        return w.status;
      }
    
      // May temporarily unlock and wait.
      Status status = MakeRoomForWrite(my_batch == NULL);
      uint64_t last_sequence = versions_->LastSequence();
      Writer* last_writer = &w;
      if (status.ok() && my_batch != NULL) {  // NULL batch is for compactions
        WriteBatch* updates = BuildBatchGroup(&last_writer);
        WriteBatchInternal::SetSequence(updates, last_sequence + 1);
        last_sequence += WriteBatchInternal::Count(updates);
    
        // Add to log and apply to memtable.  We can release the lock
        // during this phase since &w is currently responsible for logging
        // and protects against concurrent loggers and concurrent writes
        // into mem_.
        {
          mutex_.Unlock();
          status = log_->AddRecord(WriteBatchInternal::Contents(updates));
          bool sync_error = false;
          if (status.ok() && options.sync) {
            status = logfile_->Sync();
            if (!status.ok()) {
              sync_error = true;
            }
          }
          if (status.ok()) {
            status = WriteBatchInternal::InsertInto(updates, mem_);
          }
          mutex_.Lock();
          if (sync_error) {
            // The state of the log file is indeterminate: the log record we
            // just added may or may not show up when the DB is re-opened.
            // So we force the DB into a mode where all future writes fail.
            RecordBackgroundError(status);
          }
        }
        if (updates == tmp_batch_) tmp_batch_->Clear();
    
        versions_->SetLastSequence(last_sequence);
      }
    
      while (true) {
        Writer* ready = writers_.front();
        writers_.pop_front();
        if (ready != &w) {
          ready->status = status;
          ready->done = true;
          ready->cv.Signal();
        }
        if (ready == last_writer) break;
      }
    
      // Notify new head of write queue
      if (!writers_.empty()) {
        writers_.front()->cv.Signal();
      }
    
      return status;
    }

    Let's go through it piece by piece.

      Writer w(&mutex_);  // a Writer can be thought of as one queued write task
      w.batch = my_batch;
      w.sync = options.sync;
      w.done = false;

      MutexLock l(&mutex_);  // locks in its constructor, unlocks in its destructor
      writers_.push_back(&w);  // enqueue w onto the writers_ queue
      // producer-consumer model
      while (!w.done && &w != writers_.front()) {
        w.cv.Wait();  // the thread may be woken up more than once
      }
      // Writes may be merged and handled by another writer, so this write may
      // already be finished by the time we get here; if so, return directly.
      if (w.done) {
        return w.status;
      }

    After MutexLock l acquires the mutex, the call to w.cv.Wait() releases the lock while waiting and re-acquires it when signaled. The effect is that threads submitting writes push_back onto the queue one after another, but only the thread at the front of the queue is allowed to proceed. The goal is to merge as many pending write requests as possible (those with a compatible sync setting) into one large batch for efficiency.
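
    The same pattern can be seen in isolation. Below is a hedged sketch of such a group-commit queue (Task and GroupCommitQueue are illustrative names, not leveldb types), with the same wait condition and wake-up protocol as DBImpl::Write but without the actual logging work:

    #include <deque>
    #include <mutex>
    #include <condition_variable>

    // Illustrative stand-in for leveldb's Writer: one queued task plus the
    // condition variable used to wake it.
    struct Task {
      bool done = false;
      std::condition_variable cv;
    };

    class GroupCommitQueue {
     public:
      void Submit(Task* t) {
        std::unique_lock<std::mutex> l(mu_);
        queue_.push_back(t);
        // Same wait condition as DBImpl::Write: block until this task is at
        // the front, or an earlier leader has already completed it.
        t->cv.wait(l, [&] { return t->done || queue_.front() == t; });
        if (t->done) return;  // finished as part of an earlier group

        // This thread is now the group leader: it would merge the queued
        // tasks, do the combined work once, then mark the followers done.
        Task* last = queue_.back();
        while (true) {
          Task* ready = queue_.front();
          queue_.pop_front();
          if (ready != t) {
            ready->done = true;
            ready->cv.notify_one();
          }
          if (ready == last) break;
        }
        // Wake the new head of the queue.  (Always empty here because this
        // sketch holds the lock throughout; DBImpl::Write drops it during
        // I/O, so new writers can queue up behind the leader.)
        if (!queue_.empty()) queue_.front()->cv.notify_one();
      }

     private:
      std::mutex mu_;
      std::deque<Task*> queue_;
    };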

    Status status = MakeRoomForWrite(my_batch == NULL);

    Next, MakeRoomForWrite checks whether the write may proceed. If a background compaction has failed, it returns the error. If the number of level-0 files has reached kL0_SlowdownWritesTrigger (default 8), each write is delayed by 1ms. If the number of level-0 files has reached kL0_StopWritesTrigger (default 12), writes block until the background compaction finishes. If the memtable is not full, it returns immediately and the write may proceed. If the memtable is full and the immutable memtable still exists, it blocks waiting for the compaction to finish; otherwise it turns the memtable into the immutable memtable, creates a fresh memtable, and returns writable. The source is as follows:

    // REQUIRES: mutex_ is held
    // REQUIRES: this thread is currently at the front of the writer queue
    Status DBImpl::MakeRoomForWrite(bool force) {
      mutex_.AssertHeld();
      assert(!writers_.empty());
      bool allow_delay = !force;
      Status s;
      while (true) {
        if (!bg_error_.ok()) {
          // Yield previous error
          s = bg_error_;  // background compaction failed
          break;
        } else if (
            allow_delay &&
            versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
          // We are getting close to hitting a hard limit on the number of
          // L0 files.  Rather than delaying a single write by several
          // seconds when we hit the hard limit, start delaying each
          // individual write by 1ms to reduce latency variance.  Also,
          // this delay hands over some CPU to the compaction thread in
          // case it is sharing the same core as the writer.
          mutex_.Unlock();
          env_->SleepForMicroseconds(1000);
          allow_delay = false;  // Do not delay a single write more than once
          mutex_.Lock();
        } else if (!force &&
                   (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
          // There is room in the current memtable, so the write can proceed
          break;
        } else if (imm_ != NULL) {
          // We have filled up the current memtable, but the previous
          // one is still being compacted, so we wait.
          Log(options_.info_log, "Current memtable full; waiting...\n");
          bg_cv_.Wait();
        } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
          // There are too many level-0 files.
          Log(options_.info_log, "Too many L0 files; waiting...\n");
          bg_cv_.Wait();
        } else {
          // Attempt to switch to a new memtable and trigger compaction of old
          assert(versions_->PrevLogNumber() == 0);
          uint64_t new_log_number = versions_->NewFileNumber();
          WritableFile* lfile = NULL;
          s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
          if (!s.ok()) {
            // Avoid chewing through file number space in a tight loop.
            versions_->ReuseFileNumber(new_log_number);
            break;
          }
          delete log_;
          delete logfile_;
          logfile_ = lfile;
          logfile_number_ = new_log_number;
          log_ = new log::Writer(lfile);
          imm_ = mem_;
          has_imm_.Release_Store(imm_);
          mem_ = new MemTable(internal_comparator_);
          mem_->Ref();
          force = false;   // Do not force another compaction if have room
          MaybeScheduleCompaction();  // see if a background compaction should be kicked off
        }
      }
      return s;
    }

    A couple of optimizations here are worth noting:

    1. When the number of level-0 tables approaches the threshold, sleep for 1ms.

    A write the filesystem reports as complete has not necessarily reached the disk. The file count approaching the threshold means the next compaction is imminent; if too much data has piled up in the filesystem buffers by then, the disk suddenly goes to full load. A compaction may also already be running, in which case the sleep yields CPU cycles to the more important task.

    2. The log has the highest priority and is always written, but immutable memtables are flushed one at a time.

    That concludes MakeRoomForWrite. Back in DBImpl::Write, the write itself happens in one pass: first append to the log file, then apply to the memtable.

      uint64_t last_sequence = versions_->LastSequence();
      Writer* last_writer = &w;
      if (status.ok() && my_batch != NULL) {  // NULL batch is for compactions
        WriteBatch* updates = BuildBatchGroup(&last_writer);
        WriteBatchInternal::SetSequence(updates, last_sequence + 1);  // set the batch's starting sequence
        last_sequence += WriteBatchInternal::Count(updates);  // the last sequence after this write succeeds
    
        // Add to log and apply to memtable.  We can release the lock
        // during this phase since &w is currently responsible for logging
        // and protects against concurrent loggers and concurrent writes
        // into mem_.
        {
          mutex_.Unlock();
          status = log_->AddRecord(WriteBatchInternal::Contents(updates));  // append to the log file
          bool sync_error = false;
          if (status.ok() && options.sync) {
            status = logfile_->Sync();
            if (!status.ok()) {
              sync_error = true;
            }
          }
          if (status.ok()) {
            status = WriteBatchInternal::InsertInto(updates, mem_);  // apply to the memtable
          }
          mutex_.Lock();
          if (sync_error) {
            // The state of the log file is indeterminate: the log record we
            // just added may or may not show up when the DB is re-opened.
            // So we force the DB into a mode where all future writes fail.
            RecordBackgroundError(status);
          }
        }
        if (updates == tmp_batch_) tmp_batch_->Clear();
    
        versions_->SetLastSequence(last_sequence);  // publish the new latest sequence
      }
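
    To see how each record in the batch gets its own sequence number, look at how InsertInto applies the batch: it walks the records with a handler whose sequence counter increments per record. If, say, last_sequence was 100 and the merged group holds 3 records, they are applied at sequences 101, 102 and 103, and SetLastSequence(103) publishes the result. The relevant part of db/write_batch.cc (lightly abridged):

    class MemTableInserter : public WriteBatch::Handler {
     public:
      SequenceNumber sequence_;
      MemTable* mem_;

      virtual void Put(const Slice& key, const Slice& value) {
        mem_->Add(sequence_, kTypeValue, key, value);
        sequence_++;  // each record consumes the next sequence number
      }
      virtual void Delete(const Slice& key) {
        mem_->Add(sequence_, kTypeDeletion, key, Slice());
        sequence_++;
      }
    };

    Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* memtable) {
      MemTableInserter inserter;
      inserter.sequence_ = WriteBatchInternal::Sequence(b);  // the header value set above
      inserter.mem_ = memtable;
      return b->Iterate(&inserter);
    }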

    BuildBatchGroup is called here to pull as many pending write tasks as possible off the waiting queue. Its source is as follows:

    // REQUIRES: Writer list must be non-empty
    // REQUIRES: First writer must have a non-NULL batch
    WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
      assert(!writers_.empty());
      Writer* first = writers_.front();
      WriteBatch* result = first->batch;
      assert(result != NULL);
    
      size_t size = WriteBatchInternal::ByteSize(first->batch);
    
      // Allow the group to grow up to a maximum size, but if the
      // original write is small, limit the growth so we do not slow
      // down the small write too much.
      size_t max_size = 1 << 20;
      if (size <= (128<<10)) {
        max_size = size + (128<<10);
      }
    
      *last_writer = first;
      std::deque<Writer*>::iterator iter = writers_.begin();
      ++iter;  // Advance past "first"
      for (; iter != writers_.end(); ++iter) {
        Writer* w = *iter;
        if (w->sync && !first->sync) {
          // Do not include a sync write into a batch handled by a non-sync write.
          break;
        }
    
        if (w->batch != NULL) {
          size += WriteBatchInternal::ByteSize(w->batch);
          if (size > max_size) {
            // Do not make batch too big
            break;
          }
    
          // Append to *result.  The merged writes are accumulated in the
          // member variable tmp_batch_ so that the caller's own batch is
          // not disturbed.
          if (result == first->batch) {
            // Switch to temporary batch instead of disturbing caller's batch
            result = tmp_batch_;
            assert(WriteBatchInternal::Count(result) == 0);
            WriteBatchInternal::Append(result, first->batch);
          }
          WriteBatchInternal::Append(result, w->batch);
        }
        *last_writer = w;
      }
      return result;
    }

    The WriteBatchInternal helpers called here are implemented in db/write_batch.cc.
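
    For reference, the serialized layout these helpers manipulate is documented at the top of db/write_batch.cc: a 12-byte header (8-byte sequence, 4-byte count) followed by the records:

    // WriteBatch::rep_ :=
    //    sequence: fixed64
    //    count: fixed32
    //    data: record[count]
    // record :=
    //    kTypeValue varstring varstring         |
    //    kTypeDeletion varstring
    // varstring :=
    //    len: varint32
    //    data: uint8[len]

    SetSequence and Count, seen in the DBImpl::Write excerpt above, simply write and read this header:

    void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
      EncodeFixed64(&b->rep_[0], seq);
    }

    int WriteBatchInternal::Count(const WriteBatch* b) {
      return DecodeFixed32(b->rep_.data() + 8);
    }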

    Returning to DBImpl::Write one last time for the final block: walk the queue from the head, mark every task covered by this group as done, and if other tasks remain in the queue, wake the new head.

      while (true) {
        Writer* ready = writers_.front();
        writers_.pop_front();
        if (ready != &w) {
          ready->status = status;
          ready->done = true;
          ready->cv.Signal();
        }
        if (ready == last_writer) break;
      }
    
      // Notify new head of write queue
      if (!writers_.empty()) {
        writers_.front()->cv.Signal();
      }
    
      return status;
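
    Group commit is invisible to callers: several threads can simply call Put concurrently, and whichever writer reaches the front of the queue folds the others' updates into its batch. A minimal usage sketch (the thread count and keys are illustrative):

    #include <string>
    #include <thread>
    #include <vector>
    #include <assert.h>
    #include "leveldb/db.h"

    // Each thread issues an independent Put; writes that queue up behind
    // the same front writer are merged into a single log record.
    void ConcurrentPuts(leveldb::DB* db) {
      std::vector<std::thread> threads;
      for (int i = 0; i < 4; i++) {
        threads.emplace_back([db, i] {
          leveldb::Status s = db->Put(leveldb::WriteOptions(),
                                      "key" + std::to_string(i),
                                      "value" + std::to_string(i));
          assert(s.ok());
        });
      }
      for (std::thread& t : threads) t.join();
    }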

