推薦結合 leveldb-handbook 閱讀源碼
數據庫每次啓動時,都會有一個recover的過程,簡要地來說,就是利用Manifest信息重新構建一個最新的version。
過程如下:
- 利用Current文件讀取最近使用的manifest文件;
- 創建一個空的version,並利用manifest文件中的session record依次作apply操作,還原出一個最新的version,注意manifest的第一條session record是一個version的快照,後續的session record記錄的都是增量的變化;
- 將非current文件指向的其他過期的manifest文件刪除;
- 將新建的version作爲當前數據庫的version;
注意,隨着leveldb運行時間的增長,一個manifest中包含的session record會越來越多,故leveldb在每次啓動時都會重新創建一個manifest文件,並在第一條session record中記錄當前version的快照狀態。其他過期的manifest文件會在下次啓動的recover流程中進行刪除。leveldb通過這種方式來控制manifest文件的大小;但如果數據庫一直沒有重啓,manifest文件還是會持續增長。
Current
由於每次啓動都會新建一個Manifest文件,leveldb當中可能同時存在多個manifest文件,因此需要一個額外的current文件來指示當前系統使用的到底是哪個manifest文件。
該文件中只有一個內容,即當前使用的manifest文件的文件名。
Recover 恢復
// Restores the database state from disk when the DB is opened.
//
// Sequence of work:
//   1. Take the lock file for the database directory.
//   2. If CURRENT is absent, create a fresh DB (when create_if_missing is
//      set) or fail; if it is present and error_if_exists is set, fail.
//   3. Rebuild the current version from the manifest via
//      versions_->Recover().
//   4. Check that every live file reported by the version set is present
//      in the directory listing.
//   5. Replay the relevant log files in ascending number order.
//
// edit:          forwarded to RecoverLogFile() for every replayed log.
// save_manifest: forwarded to versions_->Recover() and RecoverLogFile().
//
// Returns the first non-OK status encountered, otherwise Status::OK().
// mutex_ must be held by the caller.
Status DBImpl::Recover(VersionEdit* edit, bool* save_manifest) {
  mutex_.AssertHeld();

  // The result of CreateDir is deliberately dropped: the directory may
  // already exist from an earlier run.
  env_->CreateDir(dbname_);
  assert(db_lock_ == nullptr);

  // Acquire the lock file (LOCK under the database path).
  Status result = env_->LockFile(LockFileName(dbname_), &db_lock_);
  if (!result.ok()) {
    return result;
  }

  // The presence of CURRENT decides whether a database already lives here.
  const bool current_exists = env_->FileExists(CurrentFileName(dbname_));
  if (current_exists) {
    if (options_.error_if_exists) {
      return Status::InvalidArgument(dbname_,
                                     "exists (error_if_exists is true)");
    }
  } else if (!options_.create_if_missing) {
    return Status::InvalidArgument(
        dbname_, "does not exist (create_if_missing is false)");
  } else {
    // Nothing on disk yet and we are allowed to create it.
    result = NewDB();
    if (!result.ok()) {
      return result;
    }
  }

  // Rebuild the version from the manifest file.
  result = versions_->Recover(save_manifest);
  if (!result.ok()) {
    return result;
  }

  SequenceNumber highest_sequence(0);

  // Recover from all log files newer than the ones named in the
  // descriptor (the previous incarnation may have added log files
  // without registering them in the descriptor).
  //
  // PrevLogNumber() is no longer used, but it is still honoured in case
  // the database was produced by an older version of leveldb.
  const uint64_t oldest_live_log = versions_->LogNumber();
  const uint64_t legacy_prev_log = versions_->PrevLogNumber();

  std::vector<std::string> children;
  result = env_->GetChildren(dbname_, &children);
  if (!result.ok()) {
    return result;
  }

  // Start from the full set of live files and strike out each one that is
  // found in the directory; whatever remains is missing.
  std::set<uint64_t> missing;
  versions_->AddLiveFiles(&missing);

  // Collect the log files that need to be replayed.
  std::vector<uint64_t> log_numbers;
  for (const std::string& child : children) {
    uint64_t file_number;
    FileType file_type;
    if (!ParseFileName(child, &file_number, &file_type)) {
      continue;
    }
    missing.erase(file_number);

    const bool replay_this_log =
        file_type == kLogFile &&
        (file_number >= oldest_live_log || file_number == legacy_prev_log);
    if (replay_this_log) {
      log_numbers.push_back(file_number);
    }
  }

  // Files referenced by the manifest but absent from the directory.
  if (!missing.empty()) {
    char msg[50];
    snprintf(msg, sizeof(msg), "%d missing files; e.g.",
             static_cast<int>(missing.size()));
    return Status::Corruption(msg, TableFileName(dbname_, *missing.begin()));
  }

  // Replay the logs in the order in which they were generated.
  std::sort(log_numbers.begin(), log_numbers.end());
  const size_t log_count = log_numbers.size();
  for (size_t idx = 0; idx < log_count; ++idx) {
    const uint64_t log_number = log_numbers[idx];
    const bool is_last_log = (idx + 1 == log_count);

    result = RecoverLogFile(log_number, is_last_log, save_manifest, edit,
                            &highest_sequence);
    if (!result.ok()) {
      return result;
    }

    // The previous incarnation may not have written any MANIFEST records
    // after allocating this log number, so bump the file number allocation
    // counter in the VersionSet by hand.
    versions_->MarkFileNumberUsed(log_number);
  }

  if (versions_->LastSequence() < highest_sequence) {
    versions_->SetLastSequence(highest_sequence);
  }

  return Status::OK();
}
按照log時間順序恢復,這裏用到log的Reader類,可參考:leveldb源碼學習之日誌 log 讀取
// Replays a single log file into a memtable, writing the memtable out as a
// level-0 table whenever it grows past options_.write_buffer_size.
//
// log_number:    number of the log file to replay (path built with
//                LogFileName(dbname_, log_number)).
// last_log:      true when this is the final log being recovered; only then
//                (and only with options_.reuse_logs) may the file be kept
//                open and appended to for new writes.
// save_manifest: set to true whenever this function writes a level-0 table.
// edit:          passed to WriteLevel0Table() for every table written.
// max_sequence:  raised to the largest sequence number found in the
//                replayed batches.
//
// Returns the status of the replay. mutex_ must be held by the caller.
Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
bool* save_manifest, VersionEdit* edit,
SequenceNumber* max_sequence) {
// Receives corruption reports from log::Reader: every report is written to
// the info log, and the first one is also stored in *status when status is
// non-null.
struct LogReporter : public log::Reader::Reporter {
Env* env;
Logger* info_log;
const char* fname;
Status* status; // null if options_.paranoid_checks==false
void Corruption(size_t bytes, const Status& s) override {
Log(info_log, "%s%s: dropping %d bytes; %s",
(this->status == nullptr ? "(ignoring error) " : ""), fname,
static_cast<int>(bytes), s.ToString().c_str());
// Keep only the first error; later reports do not overwrite it.
if (this->status != nullptr && this->status->ok()) *this->status = s;
}
};
mutex_.AssertHeld();
// Open the log file
std::string fname = LogFileName(dbname_, log_number);
SequentialFile* file;
Status status = env_->NewSequentialFile(fname, &file);
if (!status.ok()) {
// NOTE(review): MaybeIgnoreError is defined elsewhere; presumably it clears
// the error unless paranoid checks are enabled -- confirm.
MaybeIgnoreError(&status);
return status;
}
// Create the log reader.
LogReporter reporter;
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.fname = fname.c_str();
// With paranoid_checks on, the reporter writes into the local `status`,
// which the read loop below checks on every iteration.
reporter.status = (options_.paranoid_checks ? &status : nullptr);
// log::Reader is deliberately asked to verify checksums even when
// paranoid_checks==false, so that a corruption causes the whole commit to
// be skipped instead of propagating bad information.
log::Reader reader(file, &reporter, true /*checksum*/, 0 /*initial_offset*/);
Log(options_.info_log, "Recovering log #%llu",
(unsigned long long)log_number);
// Read all the records and add to a memtable
std::string scratch;
Slice record;
WriteBatch batch;
// Number of memtables written to level 0 inside the read loop.
int compactions = 0;
// Created lazily on the first valid record; this function holds one
// reference on it until it is handed to mem_ or released.
MemTable* mem = nullptr;
// Read one log record per iteration.
while (reader.ReadRecord(&record, &scratch) && status.ok()) {
// NOTE(review): 12 is presumably the size of the WriteBatch header --
// confirm against WriteBatchInternal.
if (record.size() < 12) {
reporter.Corruption(record.size(),
Status::Corruption("log record too small"));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
if (mem == nullptr) {
mem = new MemTable(internal_comparator_);
mem->Ref();
}
// Insert the batch's entries into the memtable.
status = WriteBatchInternal::InsertInto(&batch, mem);
MaybeIgnoreError(&status);
if (!status.ok()) {
break;
}
// Track the sequence number of the last entry in this batch.
const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
WriteBatchInternal::Count(&batch) - 1;
if (last_seq > *max_sequence) {
*max_sequence = last_seq;
}
// Memory usage has exceeded the configured write buffer size.
if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
compactions++;
*save_manifest = true;
status = WriteLevel0Table(mem, edit, nullptr); // minor compaction: write the current memtable out as a level-0 sstable
mem->Unref();
mem = nullptr;
if (!status.ok()) {
// Reflect errors immediately so that conditions like full
// file-systems cause the DB::Open() to fail.
break;
}
}
}
delete file;
// See whether the last log file should keep being used for new writes.
if (status.ok() && options_.reuse_logs && last_log && compactions == 0) {
assert(logfile_ == nullptr);
assert(log_ == nullptr);
assert(mem_ == nullptr);
uint64_t lfile_size;
// No memtable was flushed while replaying fname, so the file is reopened
// for appending and new writes continue after its existing contents.
if (env_->GetFileSize(fname, &lfile_size).ok() &&
env_->NewAppendableFile(fname, &logfile_).ok()) {
Log(options_.info_log, "Reusing old log %s \n", fname.c_str());
log_ = new log::Writer(logfile_, lfile_size);
logfile_number_ = log_number;
if (mem != nullptr) {
// Hand the recovered memtable (and the reference held on it) to mem_.
mem_ = mem;
mem = nullptr;
} else {
// mem can be null when the log file exists but is empty.
mem_ = new MemTable(internal_comparator_);
mem_->Ref();
}
}
}
// The recovered memtable was not handed over to mem_; write it out as a
// level-0 table (only if no error occurred) and drop our reference.
if (mem != nullptr) {
if (status.ok()) {
*save_manifest = true;
status = WriteLevel0Table(mem, edit, nullptr);
}
mem->Unref();
}
return status;
}