leveldb源碼學習之 DBImpl 類(恢復日誌到memtable:Recover)

推薦結合 leveldb-handbook 閱讀源碼

數據庫每次啓動時,都會有一個recover的過程,簡要地來說,就是利用Manifest信息重新構建一個最新的version。

過程如下:

  1. 利用Current文件讀取最近使用的manifest文件;
  2. 創建一個空的version,並利用manifest文件中的session record依次作apply操作,還原出一個最新的version,注意manifest的第一條session record是一個version的快照,後續的session record記錄的都是增量的變化;
  3. 將非current文件指向的其他過期的manifest文件刪除;
  4. 將新建的version作爲當前數據庫的version;

注意,隨着leveldb運行時間的增長,一個manifest中包含的session record會越來越多,故leveldb在每次啓動時都會重新創建一個manifest文件,並在第一條session record中記錄當前version的快照狀態。其他過期的manifest文件會在下次啓動的recover流程中進行刪除。leveldb通過這種方式來控制manifest文件的大小,但如果數據庫一直沒有重啓,manifest還是會一直增長。

Current

由於每次啓動都會新建一個Manifest文件,leveldb當中可能會同時存在多個manifest文件,因此需要一個額外的current文件來指示當前系統使用的到底是哪個manifest文件。

該文件中只有一個內容,即當前使用的manifest文件的文件名。

 

Recover 恢復

// Restores the database state when it is opened.
//
// Steps, in order: lock the database directory, create the database if the
// CURRENT file is absent (and options allow it), rebuild the version state
// through versions_->Recover(), verify that every live file is present in the
// directory, then replay the relevant log files in ascending number order.
//
// edit / save_manifest are forwarded to RecoverLogFile(); save_manifest is
// also forwarded to versions_->Recover().
// Returns the first non-OK status encountered, or Status::OK().
// Requires: mutex_ is held.
Status DBImpl::Recover(VersionEdit* edit, bool* save_manifest) {
  mutex_.AssertHeld();

  // The result of CreateDir is deliberately not inspected: the directory may
  // already have been created earlier.
  env_->CreateDir(dbname_);
  assert(db_lock_ == nullptr);

  // Acquire the LOCK file that lives under the database path.
  Status status = env_->LockFile(LockFileName(dbname_), &db_lock_);
  if (!status.ok()) {
    return status;
  }

  // Presence of the CURRENT file decides whether the database already exists.
  const bool has_current = env_->FileExists(CurrentFileName(dbname_));
  if (has_current) {
    if (options_.error_if_exists) {
      return Status::InvalidArgument(dbname_,
                                     "exists (error_if_exists is true)");
    }
  } else {
    if (!options_.create_if_missing) {
      return Status::InvalidArgument(
          dbname_, "does not exist (create_if_missing is false)");
    }
    // Nothing on disk yet: create a fresh database.
    status = NewDB();
    if (!status.ok()) {
      return status;
    }
  }

  // Rebuild the version state from the manifest.
  status = versions_->Recover(save_manifest);
  if (!status.ok()) {
    return status;
  }

  // Logs newer than the ones named in the descriptor must be replayed too:
  // the previous incarnation may have added log files without registering
  // them in the descriptor.
  //
  // PrevLogNumber() is no longer used, but it is still honoured so that a
  // database produced by an older version of leveldb can be recovered.
  const uint64_t min_log = versions_->LogNumber();
  const uint64_t prev_log = versions_->PrevLogNumber();

  std::vector<std::string> filenames;
  status = env_->GetChildren(dbname_, &filenames);
  if (!status.ok()) {
    return status;
  }

  // Start from the full set of live file numbers and cross off each one that
  // is found in the directory; whatever remains is missing.
  std::set<uint64_t> expected;
  versions_->AddLiveFiles(&expected);

  std::vector<uint64_t> log_numbers;
  for (const std::string& child : filenames) {
    uint64_t number;
    FileType type;
    if (!ParseFileName(child, &number, &type)) {
      continue;
    }
    expected.erase(number);
    const bool needs_replay =
        (type == kLogFile) && ((number >= min_log) || (number == prev_log));
    if (needs_replay) {
      log_numbers.push_back(number);
    }
  }

  // Files that the manifest references but the directory lacks: corruption.
  if (!expected.empty()) {
    char buf[50];
    snprintf(buf, sizeof(buf), "%d missing files; e.g.",
             static_cast<int>(expected.size()));
    return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
  }

  // Replay the logs in the order in which they were generated.
  std::sort(log_numbers.begin(), log_numbers.end());

  SequenceNumber max_sequence(0);
  const size_t log_count = log_numbers.size();
  for (size_t idx = 0; idx < log_count; ++idx) {
    const bool is_last_log = (idx + 1 == log_count);
    status = RecoverLogFile(log_numbers[idx], is_last_log, save_manifest, edit,
                            &max_sequence);
    if (!status.ok()) {
      return status;
    }

    // The previous incarnation may not have written any MANIFEST records
    // after allocating this log number, so bump the file number allocation
    // counter in VersionSet by hand.
    versions_->MarkFileNumberUsed(log_numbers[idx]);
  }

  // Raise the last sequence number if the replayed logs went past it.
  if (versions_->LastSequence() < max_sequence) {
    versions_->SetLastSequence(max_sequence);
  }

  return Status::OK();
}

按照log時間順序恢復,這裏用到log的Reader類,可參考:leveldb源碼學習之日誌 log 讀取

// Replays the log file numbered log_number into a memtable, writing the
// memtable out through WriteLevel0Table() whenever its approximate memory
// usage exceeds options_.write_buffer_size.
//
// log_number:    number of the log file to replay.
// last_log:      true when this is the final log being recovered; only then
//                may the log file be reused for new writes (see below).
// save_manifest: set to true whenever a level-0 table is written here.
// edit:          passed to WriteLevel0Table() for each table written.
// max_sequence:  raised to the largest sequence number seen in the log.
//
// Returns the first error encountered (subject to MaybeIgnoreError() and the
// paranoid_checks setting), otherwise an OK status.
// Requires: mutex_ is held.
Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
                              bool* save_manifest, VersionEdit* edit,
                              SequenceNumber* max_sequence) {
  // Receives corruption reports from log::Reader: logs every report, and
  // stores the first error into *status when status is non-null.
  struct LogReporter : public log::Reader::Reporter {
    Env* env;
    Logger* info_log;
    const char* fname;
    Status* status;  // null if options_.paranoid_checks==false
    void Corruption(size_t bytes, const Status& s) override {
      Log(info_log, "%s%s: dropping %d bytes; %s",
          (this->status == nullptr ? "(ignoring error) " : ""), fname,
          static_cast<int>(bytes), s.ToString().c_str());
      // Keep only the first error; later reports do not overwrite it.
      if (this->status != nullptr && this->status->ok()) *this->status = s;
    }
  };
 
  mutex_.AssertHeld();
 
  // Open the log file
  std::string fname = LogFileName(dbname_, log_number);
  SequentialFile* file;
  Status status = env_->NewSequentialFile(fname, &file);
  if (!status.ok()) {
    MaybeIgnoreError(&status);
    return status;
  }
 
  // Create the log reader.
  LogReporter reporter;
  reporter.env = env_;
  reporter.info_log = options_.info_log;
  reporter.fname = fname.c_str();
  reporter.status = (options_.paranoid_checks ? &status : nullptr);
  // log::Reader is deliberately asked to verify checksums even when
  // paranoid_checks==false, so that a corruption causes the whole commit to
  // be skipped instead of propagating bad information.
  log::Reader reader(file, &reporter, true /*checksum*/, 0 /*initial_offset*/);
  Log(options_.info_log, "Recovering log #%llu",
      (unsigned long long)log_number);
 
  // Read all the records and add to a memtable
  std::string scratch;
  Slice record;
  WriteBatch batch;
  int compactions = 0;  // number of level-0 tables written during this replay
  MemTable* mem = nullptr;
  // Read one log record per iteration; stop once status becomes non-OK.
  while (reader.ReadRecord(&record, &scratch) && status.ok()) {
    // NOTE(review): 12 is presumably the minimum size of a serialized
    // WriteBatch (its header) - confirm against the WriteBatch format.
    if (record.size() < 12) {
      reporter.Corruption(record.size(),
                          Status::Corruption("log record too small"));
      continue;
    }
    WriteBatchInternal::SetContents(&batch, record);
 
    // Allocate the memtable lazily, on the first valid record.
    if (mem == nullptr) {
      mem = new MemTable(internal_comparator_);
      mem->Ref();
    }
    // Apply the batch's entries to the memtable.
    status = WriteBatchInternal::InsertInto(&batch, mem);
    MaybeIgnoreError(&status);
    if (!status.ok()) {
      break;
    }
    // Sequence number of the last entry in this batch; track the maximum.
    const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
                                    WriteBatchInternal::Count(&batch) - 1;
    if (last_seq > *max_sequence) {
      *max_sequence = last_seq;
    }
 
    // Memory usage has gone past the configured threshold.
    if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
      compactions++;
      *save_manifest = true;
      status = WriteLevel0Table(mem, edit, nullptr); // minor compaction: write the current memtable out as a level-0 sstable
      mem->Unref();
      mem = nullptr;
      if (!status.ok()) {
        // Reflect errors immediately so that conditions like full
        // file-systems cause the DB::Open() to fail.
        break;
      }
    }
  }
 
  delete file;
 
  // Decide whether the last log file can be reused for new writes.
  if (status.ok() && options_.reuse_logs && last_log && compactions == 0) {
    assert(logfile_ == nullptr);
    assert(log_ == nullptr);
    assert(mem_ == nullptr);
    uint64_t lfile_size;
    // No memtable was written out while replaying fname, so keep using this
    // log file: new modifications are appended after the existing contents.
    if (env_->GetFileSize(fname, &lfile_size).ok() &&
        env_->NewAppendableFile(fname, &logfile_).ok()) {
      Log(options_.info_log, "Reusing old log %s \n", fname.c_str());
      log_ = new log::Writer(logfile_, lfile_size);
      logfile_number_ = log_number;
      if (mem != nullptr) {
        // Hand the recovered memtable (and its reference) over to mem_.
        mem_ = mem;
        mem = nullptr;
      } else {
        // mem can be nullptr if the log exists but was empty.
        mem_ = new MemTable(internal_comparator_);
        mem_->Ref();
      }
    }
  }
 
  // The recovered memtable was not handed over for reuse, so write it out as
  // a level-0 table here (only if no error occurred) and drop our reference.
  if (mem != nullptr) {
    if (status.ok()) {
      *save_manifest = true;
      status = WriteLevel0Table(mem, edit, nullptr);
    }
    mem->Unref();
  }
 
  return status;
}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章