/*
* Beansdb - A high available distributed key-value storage system:
*
* http://beansdb.googlecode.com
*
* Copyright 2010 Douban Inc. All rights reserved.
*
* Use and distribution licensed under the BSD license. See
* the LICENSE file for full text.
*
* Authors:
* Davies Liu <[email protected]>
*
*/
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <math.h>
#include <time.h>
#include "bitcask.h"
#include "htree.h"
#include "record.h"
/* Upper bound on the number of data files (buckets) one Bitcask may hold. */
#define MAX_BUCKET_COUNT 256

/* Size limits, all expressed in bytes. */
const uint32_t MAX_RECORD_SIZE = UINT32_C(50) << 20;   /* 50 MiB: largest value bc_set() accepts */
const uint32_t MAX_BUCKET_SIZE = UINT32_C(2) << 30;    /* 2 GiB: bc_flush() starts a new bucket past this */
const uint32_t WRITE_BUFFER_SIZE = UINT32_C(4) << 20;  /* 4 MiB: bc_flush() stops doubling the buffer here */

/* printf-style patterns for per-bucket file names: (directory, bucket number). */
const char DATA_FILE[] = "%s/%03d.data";
const char HINT_FILE[] = "%s/%03d.hint.qlz";
const char NEW_DATA_FILE[] = "%s/%03d.data.new";
const char NEW_HINT_FILE[] = "%s/%03d.hint.new.qlz";
/*
 * In-memory state of one open Bitcask store. A store is a directory holding
 * numbered data files ("buckets"); new records are only ever appended to the
 * bucket numbered `curr`.
 */
struct bitcask_t {
char* path; // directory that holds the data and hint files
int depth; // depth passed to ht_new() for every tree of this store
HTree* tree; // index over the records of every bucket, so much larger than curr_tree
int curr; // number of the active bucket; lower-numbered buckets are complete data files
HTree* curr_tree; // there is exactly one: the index of the active bucket's records
// write_buffer stages records for the active data file; bc_flush() appends it to that file.
char *write_buffer; // the staging buffer itself
int wbuf_size; // allocated size of write_buffer, in bytes
int wbuf_start_pos; // the buffer is smaller than the file, so this records the file offset at which the buffer begins,
// i.e. the current end of the active data file
int wbuf_curr_pos; // number of valid bytes currently held in write_buffer
/*
Locating a record from an Item:
pos = item->pos & 0xffffff00 is the record's offset within its data file, and
wbuf_start_pos is the buffer's offset within the active data file, so
bc->write_buffer + pos - bc->wbuf_start_pos is the record's address inside
write_buffer (only meaningful when the record belongs to the active bucket
and has not been flushed yet).
*/
pthread_mutex_t flush_lock; // held for the whole of bc_flush()
pthread_mutex_t buffer_lock; // taken around accesses to write_buffer and the wbuf_* fields
pthread_mutex_t write_lock; // held by bc_set(), bc_close() and the index update in bc_optimize()
};
/*
 * Open the Bitcask store in directory `path`, creating the directory if it
 * does not exist. A store holds at most MAX_BUCKET_COUNT data files; each
 * file is called a bucket.
 *
 * Steps:
 *   1. Allocate and initialise the handle.
 *   2. Walk buckets 0, 1, 2, ... until a data file is missing and load each
 *      one into bc->tree, from its hint file when that is allowed and from
 *      the data file otherwise.
 *   3. Store the number of data files found in bc->curr; that is the number
 *      of the bucket new records will be appended to.
 *
 * path   - directory of the store; must not be NULL and must be short enough
 *          for the generated file names to fit the 255-byte path buffers.
 * depth  - forwarded to ht_new(); values above 4 are rejected.
 * before - 0 loads everything (a missing hint file is rebuilt by
 *          scanDataFile()). Otherwise it is a cut-off time: the hint file is
 *          used only when it, or its data file, was last modified before the
 *          cut-off; in every other case the data file is read with
 *          scanDataFileBefore().
 *
 * Returns the new handle, or NULL when the arguments are invalid, the
 * directory cannot be created, the path is too long, or memory runs out.
 */
Bitcask* bc_open(const char *path, int depth, time_t before)
{
    char datapath[255], hintpath[255];
    int i = 0;

    if (path == NULL || depth > 4) return NULL;

    /* HINT_FILE is the longer of the two patterns used below and "%03d"
     * always prints three characters for bucket numbers under 256, so this
     * single check guarantees that no later snprintf() truncates. */
    int need = snprintf(hintpath, sizeof(hintpath), HINT_FILE, path, 0);
    if (need < 0 || (size_t)need >= sizeof(hintpath)) {
        fprintf(stderr, "path %s is too long\n", path);
        return NULL;
    }

    if (0 != access(path, F_OK) && 0 != mkdir(path, 0750)) {
        fprintf(stderr, "mkdir %s failed\n", path);
        return NULL;
    }

    Bitcask *bc = calloc(1, sizeof(Bitcask));
    if (bc == NULL) {
        fprintf(stderr, "out of memory while opening %s\n", path);
        return NULL;
    }
    bc->path = strdup(path);
    bc->depth = depth;
    bc->tree = ht_new(depth);
    bc->curr_tree = ht_new(depth);
    bc->wbuf_size = 1024 * 4;
    bc->write_buffer = malloc(bc->wbuf_size);
    if (bc->path == NULL || bc->write_buffer == NULL) {
        fprintf(stderr, "out of memory while opening %s\n", path);
        if (bc->tree != NULL) ht_destroy(bc->tree);
        if (bc->curr_tree != NULL) ht_destroy(bc->curr_tree);
        free(bc->write_buffer);
        free(bc->path);
        free(bc);
        return NULL;
    }
    pthread_mutex_init(&bc->buffer_lock, NULL);
    pthread_mutex_init(&bc->write_lock, NULL);
    pthread_mutex_init(&bc->flush_lock, NULL);

    for (i = 0; i < MAX_BUCKET_COUNT; i++) {
        /* The first bucket without a data file ends the scan. */
        snprintf(datapath, sizeof(datapath), DATA_FILE, path, i);
        FILE *f = fopen(datapath, "rb");
        if (NULL == f) break;
        fclose(f);

        snprintf(hintpath, sizeof(hintpath), HINT_FILE, path, i);
        struct stat st;
        if (before == 0) {
            /* Normal start-up: build the tree from the hint file when there
             * is one, otherwise scan the data file (which is also given the
             * hint path so a hint file can be produced). */
            if (0 == lstat(hintpath, &st)) {
                scanHintFile(bc->tree, i, hintpath, NULL);
            } else {
                scanDataFile(bc->tree, i, datapath, hintpath);
            }
        } else {
            /* The second lstat() overwrites st, so the last comparison is
             * against the data file's modification time. */
            if (0 == lstat(hintpath, &st) &&
                (st.st_mtime < before ||
                 (0 == lstat(datapath, &st) && st.st_mtime < before))) {
                scanHintFile(bc->tree, i, hintpath, NULL);
            } else {
                scanDataFileBefore(bc->tree, i, datapath, before);
            }
        }
    }
    bc->curr = i;
    return bc;
}
/*
 * Close a Bitcask store and release everything it owns.
 *
 * bc_close() is not thread safe: stop every other thread that uses `bc`
 * before calling it. `bc` must not be used afterwards. A NULL `bc` is ignored.
 *
 * Steps:
 *   1. Flush write_buffer to the active data file.
 *   2. Write the hint file for the active bucket from bc->curr_tree.
 *   3. Destroy bc->tree.
 *   4. Destroy the mutexes and free the remaining members and the handle.
 */
void bc_close(Bitcask *bc)
{
    if (bc == NULL) return;

    pthread_mutex_lock(&bc->write_lock);
    /* 1 */
    bc_flush(bc, 0);
    /* 2 */
    if (NULL != bc->curr_tree) {
        char buf[255];
        int len = snprintf(buf, sizeof(buf), HINT_FILE, bc->path, bc->curr);
        if (len < 0 || (size_t)len >= sizeof(buf)) {
            /* No usable hint path: skip the hint file and release the tree
             * here, because build_hint() never received it. */
            fprintf(stderr, "hint path for %s is too long\n", bc->path);
            ht_destroy(bc->curr_tree);
        } else {
            /* The tree is not touched again after this call.
             * NOTE(review): presumably build_hint() releases it - confirm. */
            build_hint(bc->curr_tree, buf);
        }
        bc->curr_tree = NULL;
    }
    bc->curr = 0;
    /* 3 */
    ht_destroy(bc->tree);
    /* 4: a mutex must be unlocked before it is destroyed. */
    pthread_mutex_unlock(&bc->write_lock);
    pthread_mutex_destroy(&bc->write_lock);
    pthread_mutex_destroy(&bc->buffer_lock);
    pthread_mutex_destroy(&bc->flush_lock);
    free(bc->path);
    free(bc->write_buffer);
    free(bc);
}
/*
 * ht_visit() callback: copy the position of `it` into the tree passed as
 * `args`, provided that tree still places the key in the same bucket.
 *
 * it   - item carrying the (possibly new) position of a record.
 * args - the HTree to update, passed as void*.
 *
 * When the low byte of the two positions differs, the key lives in a
 * different data file according to `args`; that tree is taken as
 * authoritative and the entry is left alone.
 */
void update_items(Item *it, void *args)
{
    HTree *index = (HTree*) args;
    Item *current = ht_get(index, it->name);

    if (NULL == current) {
        fprintf(stderr, "Bug, item missed after optimized\n");
        return;
    }

    const bool moved = (current->pos != it->pos);
    const bool same_bucket = ((current->pos & 0xff) == (it->pos & 0xff));
    if (moved && same_bucket) {
        /* Keep the stored hash and version; only the position changes. */
        ht_add(index, current->name, it->pos, current->hash, current->ver);
    }
    free(current);
}
/*
 * Compact the completed data files of a store.
 *
 * Over time keys are overwritten or deleted, leaving dead records in older
 * data files. To reclaim the space, every bucket below bc->curr is handed to
 * optimizeDataFile() together with bc->tree and `limit`.
 *
 *   1. Visit each completed bucket, i.e. each data file before the active one.
 *   2. Call optimizeDataFile() (defined in record.c). A NULL result means
 *      nothing was rewritten for that bucket.
 *      NOTE(review): per the original notes, it decides from the number of
 *      dead records whether a rewrite is worthwhile and keeps only records
 *      still referenced by bc->tree - confirm in record.c.
 *   3. Otherwise the returned tree holds the records' new positions; walk it
 *      with update_items() to bring bc->tree up to date, under write_lock.
 *   4. Write the bucket's hint file from the returned tree.
 */
void bc_optimize(Bitcask *bc, int limit)
{
    char data_path[255], hint_path[255];

    for (int bucket = 0; bucket < bc->curr; bucket++) {
        sprintf(data_path, DATA_FILE, bc->path, bucket);
        sprintf(hint_path, HINT_FILE, bc->path, bucket);

        HTree *rewritten = optimizeDataFile(bc->tree, bucket, data_path, hint_path, limit);
        if (rewritten != NULL) {
            pthread_mutex_lock(&bc->write_lock);
            ht_visit(rewritten, update_items, bc->tree);
            pthread_mutex_unlock(&bc->write_lock);

            build_hint(rewritten, hint_path);
        }
    }
}
/*
 * Look up `key` and return its record, or NULL when it cannot be produced.
 *
 * A value can live in one of three places:
 *   a. a completed data file,
 *   b. the active data file (the part already flushed),
 *   c. write_buffer (not flushed yet).
 *
 * Steps:
 *   1. Find the key's Item in bc->tree; a missing item or a negative version
 *      yields NULL.
 *   2. Split item->pos into the bucket number (low 8 bits) and the offset
 *      inside that bucket's file (the remaining bits).
 *   3. If the bucket is the active one and the offset is at or beyond
 *      wbuf_start_pos, decode the record straight from write_buffer (case c).
 *      Otherwise, or if decoding fails, read it from the bucket's data file
 *      (cases a and b) and check that the stored key matches.
 *   4. When no record could be produced, remove the key from bc->tree.
 *
 * The returned record is handed to the caller; bc_set() releases such a
 * record with free_record().
 */
DataRecord* bc_get(Bitcask *bc, const char* key)
{
    /* 1 */
    Item *entry = ht_get(bc->tree, key);
    if (entry == NULL) {
        return NULL;
    }
    if (entry->ver < 0) {
        free(entry);
        return NULL;
    }

    /* 2 */
    const int bucket = entry->pos & 0xff;
    const uint32_t offset = entry->pos & 0xffffff00;
    if (bucket > bc->curr) {
        fprintf(stderr, "BUG: invalid bucket %d > %d\n", bucket, bc->curr);
        ht_remove(bc->tree, key);
        free(entry);
        return NULL;
    }

    DataRecord *rec = NULL;
    FILE *fp = NULL;
    char data_path[255];

    /* 3, case c: the record may still be sitting in write_buffer. */
    if (bucket == bc->curr) {
        pthread_mutex_lock(&bc->buffer_lock);
        /* bc->curr is compared again now that buffer_lock is held. */
        if (bucket == bc->curr && offset >= bc->wbuf_start_pos) {
            int in_buf = offset - bc->wbuf_start_pos;
            rec = decode_record(bc->write_buffer + in_buf, bc->wbuf_curr_pos - in_buf);
        }
        pthread_mutex_unlock(&bc->buffer_lock);
        if (rec != NULL) {
            free(entry);
            return rec;
        }
    }

    /* 3, cases a and b: read the record from the bucket's data file. */
    sprintf(data_path, DATA_FILE, bc->path, bucket);
    fp = fopen(data_path, "rb");
    if (fp != NULL) {
        if (0 != fseek(fp, offset, SEEK_SET)) {
            fprintf(stderr, "IOError: seek file %d to %d failed\n", bucket, offset);
        } else {
            rec = read_record(fp, true);
            if (rec == NULL) {
                fprintf(stderr, "Bug: get %s failed in %s %d %d\n", key, bc->path, bucket, offset);
            } else if (strcmp(key, rec->key) != 0) {
                /* The index pointed at some other key's record. */
                fprintf(stderr, "Bug: record %s is not expected %s\n", rec->key, key);
                free_record(rec);
                rec = NULL;
            }
        }
    }

    /* 4 */
    if (rec == NULL) {
        ht_remove(bc->tree, key);
    }
    if (fp != NULL) {
        fclose(fp);
    }
    free(entry);
    return rec;
}
// Arguments handed to build_thread(); the thread frees both `path` and the struct itself.
struct build_thread_args {
HTree *tree; // tree passed to build_hint()
char *path; // heap-allocated hint-file path passed to build_hint()
};
//創建hint文件的線程入口函數
void* build_thread(void *param)
{
struct build_thread_args *args = (struct build_thread_args*) param;
build_hint(args->tree, args->path);
free(args->path);
free(param);
return NULL;
}
/*
 * Append the contents of write_buffer to the active data file.
 *
 * limit - nothing is written unless the buffer holds more than limit * 1024
 *         bytes; pass 0 to flush whatever is pending.
 *
 * Because a data file has a maximum size, a flush may also close the active
 * bucket and start the next one.
 *
 * Steps:
 *   1. Open the active data file for appending and check that its current
 *      end matches wbuf_start_pos.
 *   2. Write the buffer to the file.
 *   3. If only part of it was written, move the remainder to the front.
 *   4. Advance the two positions; an emptied buffer that is still below
 *      WRITE_BUFFER_SIZE is replaced by one twice as large.
 *   5. If another full buffer would no longer fit below MAX_BUCKET_SIZE:
 *      5.1 write out whatever is still buffered,
 *      5.2 hand the bucket's tree to a detached thread that writes its hint
 *          file (done inline if the thread cannot be started),
 *      5.3 move on to bucket curr + 1 with a fresh tree.
 *
 * Unrecoverable conditions (bucket limit reached, over-long path, data file
 * cannot be opened, unexpected file size, failed write) terminate the
 * process with exit(1).
 */
void bc_flush(Bitcask *bc, int limit)
{
    if (bc->curr >= MAX_BUCKET_COUNT) {
        fprintf(stderr, "reach max bucket count\n");
        exit(1);
    }
    pthread_mutex_lock(&bc->flush_lock);
    if (bc->wbuf_curr_pos > limit * 1024) {
        /* 1 */
        char buf[255], hintpath[255];
        int dlen = snprintf(buf, sizeof(buf), DATA_FILE, bc->path, bc->curr);
        int hlen = snprintf(hintpath, sizeof(hintpath), HINT_FILE, bc->path, bc->curr);
        if (dlen < 0 || (size_t)dlen >= sizeof(buf) ||
            hlen < 0 || (size_t)hlen >= sizeof(hintpath)) {
            fprintf(stderr, "path %s is too long\n", bc->path);
            exit(1);
        }
        FILE *f = fopen(buf, "ab");
        if (f == NULL) {
            fprintf(stderr, "open file %s for flushing failed.\n", buf);
            exit(1);
        }
        /* The file must end exactly where the buffer is supposed to begin. */
        int last_pos = ftell(f);
        if (last_pos != bc->wbuf_start_pos) {
            fprintf(stderr, "last pos not match: %d != %d\n", last_pos, bc->wbuf_start_pos);
            exit(1);
        }
        /* 2 */
        int n = fwrite(bc->write_buffer, 1, bc->wbuf_curr_pos, f);
        pthread_mutex_lock(&bc->buffer_lock);
        /* 3 */
        if (n < bc->wbuf_curr_pos) {
            memmove(bc->write_buffer, bc->write_buffer + n, bc->wbuf_curr_pos - n);
        }
        /* 4 */
        bc->wbuf_start_pos += n;
        bc->wbuf_curr_pos -= n;
        if (bc->wbuf_curr_pos == 0 && bc->wbuf_size < WRITE_BUFFER_SIZE) {
            /* Allocate the larger buffer first so that a failed allocation
             * leaves the existing buffer in place. */
            char *bigger = malloc((size_t)bc->wbuf_size * 2);
            if (bigger != NULL) {
                free(bc->write_buffer);
                bc->write_buffer = bigger;
                bc->wbuf_size *= 2;
            }
        }
        /* 5: the sum is formed in 64 bits because it can exceed INT_MAX,
         * and signed int overflow is undefined. */
        if ((uint64_t)bc->wbuf_start_pos + (uint64_t)bc->wbuf_size > MAX_BUCKET_SIZE) {
            /* 5.1 */
            if (bc->wbuf_curr_pos > 0) {
                if (fwrite(bc->write_buffer, 1, bc->wbuf_curr_pos, f) < (size_t)bc->wbuf_curr_pos) {
                    fprintf(stderr, "write to %s failed\n", buf);
                    exit(1);
                }
            }
            /* 5.2 */
            bool handed_off = false;
            struct build_thread_args *args = malloc(sizeof(*args));
            char *path_copy = strdup(hintpath);
            if (args != NULL && path_copy != NULL) {
                args->tree = bc->curr_tree;
                args->path = path_copy;
                pthread_t build_ptid;
                if (0 == pthread_create(&build_ptid, NULL, build_thread, args)) {
                    /* Nobody joins this thread, so detach it to have its
                     * resources reclaimed when it finishes. The thread now
                     * owns args and path_copy. */
                    pthread_detach(build_ptid);
                    handed_off = true;
                }
            }
            if (!handed_off) {
                /* No thread available: write the hint file right here. */
                free(args);
                free(path_copy);
                build_hint(bc->curr_tree, hintpath);
            }
            /* 5.3 */
            bc->curr ++;
            bc->curr_tree = ht_new(bc->depth);
            bc->wbuf_start_pos = 0;
            bc->wbuf_curr_pos = 0;
        }
        pthread_mutex_unlock(&bc->buffer_lock);
        /* fclose() performs the final write of stdio's own buffer. */
        if (0 != fclose(f)) {
            fprintf(stderr, "close %s failed\n", buf);
        }
    }
    pthread_mutex_unlock(&bc->flush_lock);
}
/*
 * Store, replace, delete or synchronise one key. This is the central write
 * operation, and it is also how replicas are brought in sync.
 *
 * Version rules:
 *   a. every update increments the version;
 *   b. a delete turns the version into -(|old version| + 1).
 * Example: two nodes both add a key and then delete it, so both hold
 * version -2. Node 1 goes down and node 2 writes the key again, giving
 * version 3. When they sync, node 1 offers -2 against 3, sees that it is
 * behind and catches up.
 *
 * key     - NUL-terminated key.
 * value   - value bytes; not copied or freed beyond this call.
 * vlen    - length of value; must not exceed MAX_RECORD_SIZE. A negative
 *           length is rejected too, because the comparison is unsigned.
 * flag    - stored with the record unchanged.
 * version - 0: ordinary write, the version is derived from the stored one;
 *           -1: delete; any other value: explicit version from a sync. A
 *           negative version together with a non-empty value is rejected.
 *
 * Steps:
 *   1. Fetch the version currently stored for the key (0 if none).
 *   2. Work out which kind of operation this is and the version to store.
 *   3. 3.1 If the stored record already has the same flag and value, only
 *          the version in the trees is updated (and only for version != 0).
 *      3.2 Otherwise a new record is appended to write_buffer - deletes
 *          included - and both trees are pointed at it. The superseded
 *          record stays in its data file until bc_optimize() drops it.
 *
 * Returns true when the value was stored or was already present, false on
 * invalid arguments, a delete of a missing key, a stale sync version, an
 * encoding failure or memory exhaustion.
 */
bool bc_set(Bitcask *bc, const char* key, char* value, int vlen, int flag, int version)
{
    if ((version < 0 && vlen > 0) || vlen > MAX_RECORD_SIZE) {
        fprintf(stderr, "invalid set cmd \n");
        return false;
    }
    bool suc = false;
    pthread_mutex_lock(&bc->write_lock);

    /* 1 */
    int oldv = 0, ver = version;
    Item *it = ht_get(bc->tree, key);
    if (it != NULL) {
        oldv = it->ver;
    }

    /* 2 */
    if (version == 0 && oldv > 0) { // replace
        ver = oldv + 1;
    } else if (version == 0 && oldv <= 0) { // add
        // NOTE(review): the original author questioned this; going from
        // deleted back to present might call for -oldv + 1 instead of 1.
        ver = 1;
    } else if (version < 0 && oldv <= 0) { // delete, not exist
        // NOTE(review): the original author questioned whether this case
        // should report failure.
        goto SET_FAIL;
    } else if (version == -1) { // delete
        ver = - abs(oldv) - 1;
    } else if (abs(version) <= abs(oldv)) { // sync, stale
        // e.g. version  5 against stored 8: the offer is not the newest;
        //      version -5 against stored 8: no longer the item it meant to delete.
        goto SET_FAIL;
    } else { // sync, newer
        // e.g. version  8 against stored  5: update
        //      version  8 against stored -5: insert
        //      version -8 against stored  5: delete
        ver = version;
    }

    uint16_t hash = gen_hash(value, vlen);
    if (ver < 0) hash = 0; // deleted items carry hash 0

    /* 3.1: same hash, so the stored value may be identical. */
    if (NULL != it && hash == it->hash) {
        DataRecord *old = bc_get(bc, key);
        if (old != NULL && old->flag == flag && vlen == old->vsz
            && memcmp(value, old->value, vlen) == 0) {
            if (version != 0) {
                ht_add(bc->tree, key, it->pos, it->hash, ver);
                /* Parenthesised on purpose: == binds tighter than &. */
                if ((it->pos & 0xff) == bc->curr) {
                    if (bc->curr_tree == NULL) {
                        fprintf(stderr, "BUG: curr_tree should not be NULL\n");
                    } else {
                        ht_add(bc->curr_tree, key, it->pos, it->hash, ver);
                    }
                }
            }
            suc = true;
            free_record(old);
            goto SET_FAIL;
        }
        /* Hash matched but the content did not: release the old record
         * before falling through to the normal write path. */
        if (old != NULL) free_record(old);
    }

    /* 3.2: append a new record, even for a delete. */
    int klen = strlen(key);
    DataRecord *r = malloc(sizeof(DataRecord) + klen);
    if (r == NULL) {
        fprintf(stderr, "out of memory in bc_set\n");
        goto SET_FAIL;
    }
    r->ksz = klen;
    memcpy(r->key, key, klen);
    r->vsz = vlen;
    r->value = value;
    r->free_value = false; // value belongs to the caller
    r->flag = flag;
    r->version = ver;
    r->tstamp = time(NULL);

    int rlen;
    char *rbuf = encode_record(r, &rlen);
    /* The low 8 bits of a position hold the bucket number, so an encoded
     * record must be a multiple of 256 bytes long. */
    if (rbuf == NULL || (rlen & 0xff) != 0) {
        fprintf(stderr, "encode_record() failed with %d\n", rlen);
        if (rbuf != NULL) free(rbuf);
        free_record(r);
        goto SET_FAIL;
    }

    pthread_mutex_lock(&bc->buffer_lock);
    /* Make room by flushing if the record does not fit. bc_flush() takes
     * buffer_lock itself, so it has to be released around the call. */
    if (bc->wbuf_curr_pos + rlen > bc->wbuf_size) {
        pthread_mutex_unlock(&bc->buffer_lock);
        bc_flush(bc, 0);
        pthread_mutex_lock(&bc->buffer_lock);
    }
    /* The record may be larger than the whole buffer: grow until it fits. */
    while (bc->wbuf_curr_pos + rlen > bc->wbuf_size) {
        char *grown = realloc(bc->write_buffer, (size_t)bc->wbuf_size * 2);
        if (grown == NULL) {
            pthread_mutex_unlock(&bc->buffer_lock);
            fprintf(stderr, "out of memory while growing write buffer\n");
            free(rbuf);
            free_record(r);
            goto SET_FAIL;
        }
        bc->write_buffer = grown;
        bc->wbuf_size *= 2;
    }
    memcpy(bc->write_buffer + bc->wbuf_curr_pos, rbuf, rlen);
    /* Position = offset in the active data file | bucket number. */
    int pos = (bc->wbuf_start_pos + bc->wbuf_curr_pos) | bc->curr;
    bc->wbuf_curr_pos += rlen;
    pthread_mutex_unlock(&bc->buffer_lock);

    ht_add(bc->tree, key, pos, hash, ver);
    ht_add(bc->curr_tree, key, pos, hash, ver);
    suc = true;
    free(rbuf);
    free_record(r);

SET_FAIL:
    pthread_mutex_unlock(&bc->write_lock);
    if (it != NULL) free(it);
    return suc;
}
/*
 * Delete `key` by calling bc_set() with an empty value and version -1.
 * Returns whatever bc_set() returns.
 */
bool bc_delete(Bitcask *bc, const char* key)
{
    const int delete_version = -1;
    bool deleted = bc_set(bc, key, "", 0, 0, delete_version);
    return deleted;
}
/*
 * Forward to ht_get_hash() on the store's main tree.
 * `pos` and `count` are passed through unchanged; the result is returned as is.
 */
uint16_t bc_get_hash(Bitcask *bc, const char * pos, int *count)
{
    HTree *index = bc->tree;
    uint16_t digest = ht_get_hash(index, pos, count);
    return digest;
}
/*
 * Forward to ht_list() on the store's main tree.
 * `pos` and `prefix` are passed through unchanged; the result is returned as is.
 */
char* bc_list(Bitcask *bc, const char* pos, const char* prefix)
{
    HTree *index = bc->tree;
    char *listing = ht_list(index, pos, prefix);
    return listing;
}
/*
 * Return the count that ht_get_hash() reports for position "@" of the main
 * tree.
 *
 * curr - optional output. When it is non-NULL and the store has a
 *        curr_tree, it receives the same count taken from curr_tree;
 *        otherwise it is left untouched.
 */
uint32_t bc_count(Bitcask *bc, uint32_t* curr)
{
    uint32_t total = 0;

    ht_get_hash(bc->tree, "@", &total);
    if (curr == NULL || bc->curr_tree == NULL) {
        return total;
    }
    ht_get_hash(bc->curr_tree, "@", curr);
    return total;
}
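/*
 * Source note: "BeansDB source code analysis - bitcask.c".
 * Web-page residue from the article this listing was copied from:
 * "Post a comment / All comments / Nobody has commented yet - want to be
 * the first? Type in the comment box above and click publish."
 */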