我們知道rockdb作爲kv存儲,採用的WAL方式來寫日誌,即預寫日誌,每次要對數據操作之前,先寫日誌保存起來,然後在進行相應操作。這樣當發生某些意外而導致還未寫到磁盤中的數據丟失時,我們可以採用log文件來進行恢復。通過讀取磁盤中的內容和已知的WAL日誌,就可以恢復到最新的狀態。而memtable和未寫入磁盤的immemtable則從log文件中讀出來,重做memtable和immemtable到sstable中即可。
整個恢復流程粗略如下流程圖:
在DBImpl::Recover()函數中,NEWDB()是創建新的生成全新的manifest和current文件,然後獲取wal文件目錄,如果目錄爲空,則返回,否則開始recoverlogfile()操作。恢復的是memtable及immtable中還未持久化到SSTable中的數據。
接下來的日誌操作就是恢復內存中的數據了,重做日誌操作即是將日誌中記錄的操作讀取出來,然後再將讀取到的操作重新寫入到rocksdb中,如果緩存大小大於寫緩存尺寸,就寫入到保存在sstable中,RecoverFogFiles()函數中實現,具體流程如下:
logfilename()得到log文件名,然後env_->NewSequentialFile打開日誌文件,如果打開狀態成功,得到info_log狀態,在sstble0中插入該記錄。
writeleve0tableforrecovery()函數比較簡單,具體過程如下:
開始先給文件解鎖,Buildtable()建立sstable,如果創建成功,則調用LogAndNotifyTableFileCreation()寫入到logger類,最後如果當前memtable文件size大於0,則調用edit->AddFile寫入到manifast文件。
Status DBImpl::Recover()部分代碼如下
s = env_->LockFile(LockFileName(dbname_), &db_lock_); //文件加鎖
if (!s.ok()) { return s; }
s = env_->FileExists(CurrentFileName(dbname_));
if (s.IsNotFound()) {
if (db_options_.create_if_missing) {
s = NewDB(); //生成全新的manifest和current文件
is_new_db = true;
} else {
return Status::InvalidArgument(
dbname_, "does not exist (create_if_missing is false)");
}
} else if (s.ok()) {
if (db_options_.error_if_exists) {
return Status::InvalidArgument(
dbname_, "exists (error_if_exists is true)");
}
} else {
// Unexpected error reading file
assert(s.IsIOError());
return s;
}
// Check for the IDENTITY file and create it if not there
s = env_->FileExists(IdentityFileName(dbname_));
if (s.IsNotFound()) {
s = SetIdentityFile(env_, dbname_);
if (!s.ok()) {
return s;
}
} else if (!s.ok()) {
assert(s.IsIOError());
return s;
}
}
Status s = versions_->Recover(column_families, read_only); // 恢復當前version信息
if (db_options_.paranoid_checks && s.ok()) {
s = CheckConsistency(); //檢查文件一致性
}
if (s.ok()) {
SequenceNumber max_sequence(kMaxSequenceNumber);
default_cf_handle_ = new ColumnFamilyHandleImpl(
versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
single_column_family_mode_ =
versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
// Recover from all newer log files than the ones named in the
// descriptor (new log files may have been added by the previous
// incarnation without registering them in the descriptor).
//
// Note that prev_log_number() is no longer used, but we pay
// attention to it in case we are recovering a database
// produced by an older version of rocksdb.
const uint64_t min_log = versions_->MinLogNumber();
const uint64_t prev_log = versions_->prev_log_number();
std::vector<std::string> filenames;
s = env_->GetChildren(db_options_.wal_dir, &filenames); // 獲取wal文件目錄
std::vector<uint64_t> logs;
for (size_t i = 0; i < filenames.size(); i++) {
uint64_t number;
FileType type;
if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) {
if (is_new_db) {
return Status::Corruption(
"While creating a new Db, wal_dir contains "
"existing log file: ",
filenames[i]);
} else if ((number >= min_log) || (number == prev_log)) {
logs.push_back(number); //存儲當前已有的日誌文件
}
}
}
if (logs.size() > 0 && error_if_log_file_exist) { //如果文件有錯誤
return Status::Corruption(""
"The db was opened in readonly mode with error_if_log_file_exist"
"flag but a log file already exists");
}
if (!logs.empty()) { //如果文件沒有缺失
// Recover in the order in which the logs were generated
std::sort(logs.begin(), logs.end()); //排序日誌文件
s = RecoverLogFiles(logs, &max_sequence, read_only); //重做日誌操作
if (!s.ok()) {
// Clear memtables if recovery failed
for (auto cfd : *versions_->GetColumnFamilySet()) {
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
kMaxSequenceNumber);
}
}
}
SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence());
}
// Initial value
max_total_in_memory_state_ = 0;
for (auto cfd : *versions_->GetColumnFamilySet()) {
auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
mutable_cf_options->max_write_buffer_number; }
return s;
}
DBImpl::RecoverLogFiles部分代碼
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
SequenceNumber* max_sequence, bool read_only) {
SequenceNumber* max_sequence, bool read_only) {
int job_id = next_job_id_.fetch_add(1);
{
auto stream = event_logger_.Log();
stream << "job" << job_id << "event"
<< "recovery_started";
stream << "log_files"; //輸出log文件通知開始recorvery
stream.StartArray();
for (auto log_number : log_numbers) {
stream << log_number; //分配log number
}
stream.EndArray();
}
bool continue_replay_log = true;
for (auto log_number : log_numbers) {
versions_->MarkFileNumberUsedDuringRecovery(log_number, env_); //用log number來mark 文件number
// Open the log file
std::string fname = LogFileName(db_options_.wal_dir, log_number); //打開log file
unique_ptr<SequentialFileReader> file_reader;
{
unique_ptr<SequentialFile> file;
status = env_->NewSequentialFile(fname, &file, env_options_);
if (!status.ok()) {
MaybeIgnoreError(&status);
if (!status.ok()) {
return status;
} else {
// Fail with one log file, but that's ok. //某個log文件打開失敗了,繼續操作
// Try next one.
continue;
}
}
file_reader.reset(new SequentialFileReader(std::move(file)));
}
// Create the log reader.
LogReporter reporter;
reporter.env = env_;
reporter.info_log = db_options_.info_log.get(); //得到info_log
reporter.fname = fname.c_str();
if (!db_options_.paranoid_checks ||
db_options_.wal_recovery_mode ==
WALRecoveryMode::kSkipAnyCorruptedRecords) {
reporter.status = nullptr;
} else {
reporter.status = &status;
}
// We intentially make log::Reader do checksumming even if
// paranoid_checks==false so that corruptions cause entire commits
// to be skipped instead of propagating bad information (like overly
// large sequence numbers).
log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
0 /*initial_offset*/);
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
"Recovering log #%" PRIu64 " mode %d skip-recovery %d", log_number,
db_options_.wal_recovery_mode, !continue_replay_log);
// Determine if we should tolerate incomplete records at the tail end of the
// log
bool report_eof_inconsistency;
if (db_options_.wal_recovery_mode ==
WALRecoveryMode::kAbsoluteConsistency) { //記錄一致性
// in clean shutdown we don't expect any error in the log files
report_eof_inconsistency = true;
} else {
// for other modes ignore only incomplete records in the last log file
// which is presumably due to write in progress during restart
report_eof_inconsistency = false;
// TODO krad: Evaluate if we need to move to a more strict mode where we
// restrict the inconsistency to only the last log
}
// Read all the records and add to a memtable
std::string scratch;
Slice record;
WriteBatch batch;
if (!continue_replay_log) {
uint64_t bytes;
if (env_->GetFileSize(fname, &bytes).ok()) {
auto info_log = db_options_.info_log.get(); //得到info_log
Log(InfoLogLevel::WARN_LEVEL, info_log, "%s: dropping %d bytes",
fname.c_str(), static_cast<int>(bytes));
}
}
while (continue_replay_log &&
reader.ReadRecord(&record, &scratch, report_eof_inconsistency) &&
status.ok()) {
if (record.size() < 12) { //size不對
reporter.Corruption(record.size(),
Status::Corruption("log record too small"));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
// If column family was not found, it might mean that the WAL write
// batch references to the column family that was dropped after the
// insert. We don't want to fail the whole write batch in that case --
// we just ignore the update.
// That's why we set ignore missing column families to true
status = WriteBatchInternal::InsertInto( //插入
&batch, column_family_memtables_.get(), true, log_number);
MaybeIgnoreError(&status);
if (!status.ok()) {
// We are treating this as a failure while reading since we read valid
// blocks that do not form coherent data
reporter.Corruption(record.size(), status);
continue;
}
const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
WriteBatchInternal::Count(&batch) - 1;
if ((*max_sequence == kMaxSequenceNumber) || (last_seq > *max_sequence)) {
*max_sequence = last_seq;
}
if (!read_only) {
// we can do this because this is called before client has access to the
// DB and there is only a single thread operating on DB
ColumnFamilyData* cfd;
while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) {
cfd->Unref();
// If this asserts, it means that InsertInto failed in
// filtering updates to already-flushed column families
assert(cfd->GetLogNumber() <= log_number);
auto iter = version_edits.find(cfd->GetID());
assert(iter != version_edits.end());
VersionEdit* edit = &iter->second;
status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
if (!status.ok()) {
// Reflect errors immediately so that conditions like full
// file-systems cause the DB::Open() to fail.
return status;
}
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
*max_sequence); //創建一個新的memtable文件
}
}
}
if (!status.ok()) {
if (db_options_.wal_recovery_mode ==
WALRecoveryMode::kSkipAnyCorruptedRecords) { //忽視任何error
// We should ignore all errors unconditionally
status = Status::OK();
} else if (db_options_.wal_recovery_mode ==
WALRecoveryMode::kPointInTimeRecovery) {
// We should ignore the error but not continue replaying
status = Status::OK();
continue_replay_log = false;
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
"Point in time recovered to log #%" PRIu64 " seq #%" PRIu64,
log_number, *max_sequence);
} else {
assert(db_options_.wal_recovery_mode ==
WALRecoveryMode::kTolerateCorruptedTailRecords
|| db_options_.wal_recovery_mode ==
WALRecoveryMode::kAbsoluteConsistency);
return status;
}
}
flush_scheduler_.Clear();
if ((*max_sequence != kMaxSequenceNumber) &&
(versions_->LastSequence() < *max_sequence)) {
versions_->SetLastSequence(*max_sequence); //置爲最大的sequence
}
}
if (!read_only) {
// no need to refcount since client still doesn't have access
// to the DB and can not drop column families while we iterate
auto max_log_number = log_numbers.back();
for (auto cfd : *versions_->GetColumnFamilySet()) {
auto iter = version_edits.find(cfd->GetID());
assert(iter != version_edits.end());
VersionEdit* edit = &iter->second;
if (cfd->GetLogNumber() > max_log_number) {
// Column family cfd has already flushed the data
// from all logs. Memtable has to be empty because
// we filter the updates based on log_number
// (in WriteBatch::InsertInto)
assert(cfd->mem()->GetFirstSequenceNumber() == 0);
assert(edit->NumEntries() == 0);
continue;
}
// flush the final memtable (if non-empty)
if (cfd->mem()->GetFirstSequenceNumber() != 0) {
status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit); //寫成SSTable
if (!status.ok()) {
// Recovery failed
break;
}
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), //新建新的memtable
*max_sequence);
}
// write MANIFEST with update
// writing log_number in the manifest means that any log file
// with number strongly less than (log_number + 1) is already
// recovered and should be ignored on next reincarnation.
// Since we already recovered max_log_number, we want all logs
// with numbers `<= max_log_number` (includes this one) to be ignored
edit->SetLogNumber(max_log_number + 1); //更新lognumber
// we must mark the next log number as used, even though it's
// not actually used. that is because VersionSet assumes
// VersionSet::next_file_number_ always to be strictly greater than any
// log number
versions_->MarkFileNumberUsedDuringRecovery(max_log_number + 1); //標記下個log number
status = versions_->LogAndApply(
cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_);
if (!status.ok()) {
// Recovery failed
break;
}
}
}
event_logger_.Log() << "job" << job_id << "event"
<< "recovery_finished";
return status;
}mutex_.AssertHeld();
const uint64_t start_micros = env_->NowMicros(); //開始時間
FileMetaData meta;
meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
auto pending_outputs_inserted_elem =
CaptureCurrentFileNumberInPendingOutputs();
ReadOptions ro;
ro.total_order_seek = true;
Arena arena;
Status s;
TableProperties table_properties;
{
ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
"[%s] [WriteLevel0TableForRecovery]"
" Level-0 table #%" PRIu64 ": started",
cfd->GetName().c_str(), meta.fd.GetNumber());
bool paranoid_file_checks =
cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
{
mutex_.Unlock();
TableFileCreationInfo info;
s = BuildTable( //新建sstable
dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(),
iter.get(), &meta, cfd->internal_comparator(),
cfd->int_tbl_prop_collector_factories(), cfd->GetID(),
snapshots_.GetAll(), GetCompressionFlush(*cfd->ioptions()),
cfd->ioptions()->compression_opts, paranoid_file_checks,
cfd->internal_stats(), Env::IO_HIGH, &info.table_properties);
LogFlush(db_options_.info_log);
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
"[%s] [WriteLevel0TableForRecovery]"
" Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(),
s.ToString().c_str());
// output to event logger
if (s.ok()) { //寫入到logger類
info.db_name = dbname_;
info.cf_name = cfd->GetName();
info.file_path = TableFileName(db_options_.db_paths,
meta.fd.GetNumber(),
meta.fd.GetPathId());
info.file_size = meta.fd.GetFileSize();
info.job_id = job_id;
EventHelpers::LogAndNotifyTableFileCreation(
&event_logger_, db_options_.listeners, meta.fd, info);
}
mutex_.Lock();
}
}
ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
// Note that if file_size is zero, the file has been deleted and
// should not be added to the manifest.
int level = 0;
if (s.ok() && meta.fd.GetFileSize() > 0) { //文件大小大於0,表示存在,寫入到manifest
edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
meta.fd.GetFileSize(), meta.smallest, meta.largest,
meta.smallest_seqno, meta.largest_seqno,
meta.marked_for_compaction, meta.priv_meta);
}
InternalStats::CompactionStats stats(1);
stats.micros = env_->NowMicros() - start_micros;
stats.bytes_written = meta.fd.GetFileSize();
stats.num_output_files = 1;
cfd->internal_stats()->AddCompactionStats(level, stats);
cfd->internal_stats()->AddCFStats(
InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize());
RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
return s;
}
Status DBImpl::FlushMemTableToOutputFile(
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
bool* made_progress, JobContext* job_context, LogBuffer* log_buffer) {
mutex_.AssertHeld();
assert(cfd->imm()->NumNotFlushed() != 0);
assert(cfd->imm()->IsFlushPending());
FlushJob flush_job(dbname_, cfd, db_options_, mutable_cf_options,
env_options_, versions_.get(), &mutex_, &shutting_down_,
snapshots_.GetAll(), job_context, log_buffer,
directories_.GetDbDir(), directories_.GetDataDir(0U),
GetCompressionFlush(*cfd->ioptions()), stats_,
&event_logger_);
FileMetaData file_meta;
// Within flush_job.Run, rocksdb may call event listener to notify
// file creation and deletion.
//
// Note that flush_job.Run will unlock and lock the db_mutex,
// and EventListener callback will be called when the db_mutex
// is unlocked by the current thread.
Status s = flush_job.Run(&file_meta);
if (s.ok()) {
InstallSuperVersionAndScheduleWorkWrapper(cfd, job_context,
mutable_cf_options);
if (made_progress) {
*made_progress = 1;
}
VersionStorageInfo::LevelSummaryStorage tmp;
LogToBuffer(log_buffer, "[%s] Level summary: %s\n", cfd->GetName().c_str(),
cfd->current()->storage_info()->LevelSummary(&tmp));
}
if (!s.ok() && !s.IsShutdownInProgress() && db_options_.paranoid_checks &&
bg_error_.ok()) {
// if a bad error happened (not ShutdownInProgress) and paranoid_checks is
// true, mark DB read-only
bg_error_ = s;
}
RecordFlushIOStats();
#ifndef ROCKSDB_LITE
if (s.ok()) {
// may temporarily unlock and lock the mutex.
NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options,
job_context->job_id, flush_job.GetTableProperties());
}
#endif // ROCKSDB_LITE
file_meta.FreePrivateMetadata();
return s;
}