BeansDB源碼剖析——bitcask.c

/*
*  Beansdb - A high available distributed key-value storage system:
*
*      http://beansdb.googlecode.com
*
*  Copyright 2010 Douban Inc.  All rights reserved.
*
*  Use and distribution licensed under the BSD license.  See
*  the LICENSE file for full text.
*
*  Authors:
*      Davies Liu <[email protected]>
*
*/

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <math.h>
#include <time.h>

#include "bitcask.h"
#include "htree.h"
#include "record.h"

/* Maximum number of data files (buckets) per bitcask; bc_flush() aborts once bc->curr reaches it. */
#define MAX_BUCKET_COUNT 256

/* Largest value length accepted by bc_set(). */
const uint32_t MAX_RECORD_SIZE = 50 * 1024 * 1024; // 50M
/* Size at which bc_flush() closes the active data file and starts the next bucket. */
const uint32_t MAX_BUCKET_SIZE = (uint32_t)1024 * 1024 * 1024 * 2; // 2G
/* bc_flush() keeps doubling the write buffer while it is smaller than this. */
const uint32_t WRITE_BUFFER_SIZE = 1024 * 1024 * 4; // 4M

/* printf-style file name patterns; arguments are (directory, bucket index). */
const char DATA_FILE[] = "%s/%03d.data";
const char HINT_FILE[] = "%s/%03d.hint.qlz";
/* The two NEW_* patterns are not referenced in the visible part of this file. */
const char NEW_DATA_FILE[] = "%s/%03d.data.new";
const char NEW_HINT_FILE[] = "%s/%03d.hint.new.qlz";

/*
 * In-memory state of one bitcask store: the directory it lives in, the
 * hash-tree indexes, the write buffer in front of the active data file and
 * the locks protecting them.
 */
struct bitcask_t {
	char*  path;
	int    depth;
	HTree* tree; // index over the records of every data file (all buckets up to curr), so far larger than curr_tree
	int    curr; // index of the current (active) bucket; the buckets before it are already written to their data files
	HTree* curr_tree; // there is only one curr_tree: the index of the records in the currently active data file
	// write_buffer acts as a buffer in front of the active file; it is flushed when it is full
	char   *write_buffer; //write_buffer
	int    wbuf_size; // allocated size of write_buffer
	int    wbuf_start_pos; // write_buffer is smaller than the file, so this records the file offset where write_buffer begins,
	// i.e. the current end of the file
	int    wbuf_curr_pos; // number of valid bytes currently held in write_buffer
	/*
	Combined with an item's pos this gives the following:
	for an item, pos = item->pos & 0xffffff00 is the record's offset within its data file,
	and wbuf_start_pos is the offset of write_buffer within the file, so
	bc->write_buffer + pos - bc->wbuf_start_pos is where the record sits inside write_buffer
	(when it is there at all, i.e. when it belongs to the last bucket and has not been flushed yet)
	*/
	pthread_mutex_t flush_lock;
	pthread_mutex_t buffer_lock;
	pthread_mutex_t write_lock;
};

/*
 * Open the bitcask stored in directory `path`, creating the directory if it
 * does not exist.
 *
 * A bitcask holds at most MAX_BUCKET_COUNT data files; each file is called a
 * bucket. Steps:
 *   1. allocate and initialise the Bitcask;
 *   2. for bucket 0, 1, 2, ... as long as its data file can be opened, load its
 *      entries into bc->tree - from the hint file when it is usable, from the
 *      data file otherwise;
 *   3. set bc->curr to the index of the first bucket whose data file is missing.
 *
 * path   - directory of the store; NULL is rejected.
 * depth  - passed to ht_new(); values above 4 are rejected.
 * before - 0 loads everything (and scanDataFile() is given the hint path when
 *          no hint file exists). Otherwise the hint file is used only when it,
 *          or the data file, was last modified before `before`; in every other
 *          case the data file is scanned with scanDataFileBefore(..., before).
 *
 * Returns the new Bitcask, or NULL on invalid arguments, mkdir failure,
 * allocation failure, or when `path` is too long for the file-name buffers.
 */
Bitcask* bc_open(const char *path, int depth, time_t before)
{
	if (path == NULL || depth > 4) return NULL;
	if (0 != access(path, F_OK) && 0 != mkdir(path, 0750)){
		fprintf(stderr, "mkdir %s failed\n", path);
		return NULL;
	}

	// The bucket index always prints as three digits, so the length of a file
	// name depends only on `path`: checking the longest pattern once is enough
	// to guarantee that no snprintf() below can truncate.
	char datapath[255], hintpath[255];
	int need = snprintf(hintpath, sizeof(hintpath), HINT_FILE, path, MAX_BUCKET_COUNT - 1);
	if (need < 0 || (size_t)need >= sizeof(hintpath)) {
		fprintf(stderr, "path %s is too long\n", path);
		return NULL;
	}

	Bitcask* bc = (Bitcask*)calloc(1, sizeof(Bitcask));
	if (bc == NULL) {
		fprintf(stderr, "bc_open: out of memory\n");
		return NULL;
	}
	bc->path = strdup(path);
	bc->depth = depth;
	bc->tree = ht_new(depth);
	bc->curr_tree = ht_new(depth);
	bc->wbuf_size = 1024 * 4;
	bc->write_buffer = malloc(bc->wbuf_size);
	if (bc->path == NULL || bc->tree == NULL || bc->curr_tree == NULL
		|| bc->write_buffer == NULL) {
		fprintf(stderr, "bc_open: out of memory\n");
		goto OPEN_FAIL;
	}
	pthread_mutex_init(&bc->buffer_lock, NULL);
	pthread_mutex_init(&bc->write_lock, NULL);
	pthread_mutex_init(&bc->flush_lock, NULL);

	int i=0;
	for (i=0; i<MAX_BUCKET_COUNT; i++) {
		// The first bucket whose data file cannot be opened ends the scan.
		snprintf(datapath, sizeof(datapath), DATA_FILE, path, i);
		FILE* f = fopen(datapath, "rb");
		if (NULL == f) break;
		fclose(f);

		snprintf(hintpath, sizeof(hintpath), HINT_FILE, path, i);
		struct stat st;
		if (before == 0){
			// Normal start-up: build the tree from the hint file when there is one ...
			if (0 == lstat(hintpath, &st)){
				scanHintFile(bc->tree, i, hintpath, NULL);
			}else{
				// ... otherwise scan the data file, handing over the hint path as well.
				scanDataFile(bc->tree, i, datapath, hintpath);
			}
		}else{
			// Note: the second lstat() deliberately overwrites `st` with the
			// data file's metadata before its mtime is compared.
			if (0 == lstat(hintpath, &st) &&
				(st.st_mtime < before || (0 == lstat(datapath, &st) && st.st_mtime < before))){
				scanHintFile(bc->tree, i, hintpath, NULL);
			}else{
				scanDataFileBefore(bc->tree, i, datapath, before);
			}
		}
	}
	bc->curr = i;
	//    ht_optimize(bc->tree);

	return bc;

OPEN_FAIL:
	// Only reached before the mutexes are initialised.
	if (bc->tree != NULL) ht_destroy(bc->tree);
	if (bc->curr_tree != NULL) ht_destroy(bc->curr_tree);
	free(bc->write_buffer);
	free(bc->path);
	free(bc);
	return NULL;
}

/*
 * bc_close() is not thread safe, should stop other threads before call it.
 *
 * Shuts the bitcask down and releases it:
 *   1. flush write_buffer into the active data file;
 *   2. build the hint file of the active bucket from bc->curr_tree;
 *   3. destroy bc->tree;
 *   4. release the locks, the buffers and the Bitcask itself.
 * `bc` must not be used after this call. A NULL `bc` is ignored.
 */
void bc_close(Bitcask *bc)
{
	if (bc == NULL) return;

	pthread_mutex_lock(&bc->write_lock);

	//1
	bc_flush(bc, 0);

	//2
	if (NULL != bc->curr_tree) {
		char buf[255];
		int n = snprintf(buf, sizeof(buf), HINT_FILE, bc->path, bc->curr);
		if (n < 0 || (size_t)n >= sizeof(buf)) {
			fprintf(stderr, "hint path of bucket %d is too long, hint file not built\n", bc->curr);
			ht_destroy(bc->curr_tree);
		} else {
			// NOTE(review): the tree is not destroyed here after build_hint(),
			// matching the original code - presumably build_hint() takes
			// ownership of it; confirm against record.c.
			build_hint(bc->curr_tree, buf);
		}
		bc->curr_tree = NULL;
	}

	//3
	ht_destroy(bc->tree);

	//4
	// A mutex has to be unlocked before it may be destroyed.
	pthread_mutex_unlock(&bc->write_lock);
	pthread_mutex_destroy(&bc->write_lock);
	pthread_mutex_destroy(&bc->buffer_lock);
	pthread_mutex_destroy(&bc->flush_lock);
	free(bc->path);
	free(bc->write_buffer);
	free(bc);
}

/*
 * ht_visit() callback used by bc_optimize(): copy the position of `it` into
 * the tree passed through `args`.
 *
 * The entry for it->name is looked up in the target tree. Its position is
 * replaced by it->pos only when both positions carry the same bucket number
 * (low 8 bits). A different bucket number means the target tree currently
 * points at a record in another data file, and that entry is left untouched.
 */
void update_items(Item *it, void *args)
{
	HTree *target = (HTree*) args;
	Item *existing = ht_get(target, it->name);
	if (existing == NULL) {
		fprintf(stderr, "Bug, item missed after optimized\n");
		return;
	}

	const bool moved = (it->pos != existing->pos);
	const bool same_bucket = ((it->pos & 0xff) == (existing->pos & 0xff));
	if (moved && same_bucket) {
		// keep hash and version of the existing entry, only the position changes
		ht_add(target, existing->name, it->pos, existing->hash, existing->ver);
	}

	free(existing);
}

/*
 * Compact the data files of the buckets before the active one.
 *
 * After the store has been running for a while, entries in bc->tree have been
 * added, replaced or deleted, so older data files hold records that are no
 * longer needed. To reclaim space, files with many such records are rewritten
 * so that only the live records remain.
 *   1. Visit every bucket before bc->curr, i.e. every closed data file.
 *   2. Call optimizeDataFile() (record.c), which gets bc->tree, the bucket
 *      number, both file names and `limit`, and decides whether the file is
 *      worth rewriting.
 *   3. A NULL result means nothing was rewritten and the bucket is skipped.
 *   4. Otherwise records may have moved inside the data file; the returned
 *      tree holds the new positions, which are copied back into bc->tree
 *      through update_items() while write_lock is held.
 *   5. The hint file of the bucket is rebuilt from the returned tree.
 *
 * limit - forwarded unchanged to optimizeDataFile().
 */
void bc_optimize(Bitcask *bc, int limit)
{
	int i;

	//1
	for (i=0; i < bc->curr; i++) {
		char data[255], hint[255];
		int dlen = snprintf(data, sizeof(data), DATA_FILE, bc->path, i);
		int hlen = snprintf(hint, sizeof(hint), HINT_FILE, bc->path, i);
		if (dlen < 0 || (size_t)dlen >= sizeof(data)
			|| hlen < 0 || (size_t)hlen >= sizeof(hint)) {
			// A truncated name would make us work on the wrong file.
			fprintf(stderr, "path of bucket %d is too long, skip optimizing\n", i);
			continue;
		}

		//2,3
		HTree *cur_tree = optimizeDataFile(bc->tree, i, data, hint, limit);
		if (NULL == cur_tree) continue;

		//4
		pthread_mutex_lock(&bc->write_lock);
		ht_visit(cur_tree, update_items, bc->tree);
		pthread_mutex_unlock(&bc->write_lock);

		//5
		build_hint(cur_tree, hint);
	}
}

/*
 * Look up `key` and return its DataRecord, or NULL when the key is unknown,
 * marked deleted (negative version) or cannot be read.
 *
 * A value can live in one of three places:
 *   a. a closed data file,
 *   b. the active data file (the part that has already been flushed),
 *   c. bc->write_buffer (not flushed yet).
 * Steps:
 *   1. find the Item of the key in bc->tree;
 *   2. split item->pos into the bucket number (low 8 bits) and the offset
 *      inside that bucket's data file (remaining bits);
 *   3. 3.1 the record is in c: decode it straight from write_buffer at
 *          offset - wbuf_start_pos;
 *      3.2 the record is in a or b: read it from the data file and verify
 *          that the stored key matches;
 *   4. when no record could be obtained through step 3.2, remove the key
 *      from bc->tree.
 *
 * The record is obtained from decode_record()/read_record(); elsewhere in
 * this file such records are released with free_record().
 */
DataRecord* bc_get(Bitcask *bc, const char* key)
{
	//1
	Item *item = ht_get(bc->tree, key);
	if (NULL == item) return NULL;
	// a negative version marks a deleted item
	if (item->ver < 0){
		free(item);
		return NULL;
	}

	//2
	int bucket = item->pos & 0xff;
	uint32_t pos = item->pos & 0xffffff00;
	if (bucket > bc->curr) {
		fprintf(stderr, "BUG: invalid bucket %d > %d\n", bucket, bc->curr);
		ht_remove(bc->tree, key);
		free(item);
		return NULL;
	}

	DataRecord* r = NULL;
	FILE *f = NULL;

	//3.1
	if (bucket == bc->curr) {
		pthread_mutex_lock(&bc->buffer_lock);
		// bc->curr is checked again under the lock: bc_flush() may have moved
		// on to the next bucket in the meantime.
		if (bucket == bc->curr && pos >= (uint32_t)bc->wbuf_start_pos){
			uint32_t p = pos - (uint32_t)bc->wbuf_start_pos;
			// Decode only when the offset really lies inside the buffered
			// bytes, so decode_record() never sees a non-positive size.
			if (p < (uint32_t)bc->wbuf_curr_pos) {
				r = decode_record(bc->write_buffer + p, bc->wbuf_curr_pos - (int)p);
			}
		}
		pthread_mutex_unlock(&bc->buffer_lock);

		if (r != NULL){
			free(item);
			return r;
		}
	}

	//3.2
	// Either the record is in an older bucket, or it is in the active bucket
	// but has already been flushed: read it from the data file.
	char data[255];
	int len = snprintf(data, sizeof(data), DATA_FILE, bc->path, bucket);
	if (len < 0 || (size_t)len >= sizeof(data)) {
		// Handled like a data file that cannot be opened.
		fprintf(stderr, "data file path of bucket %d is too long\n", bucket);
		goto GET_END;
	}
	f = fopen(data, "rb");
	if (NULL == f){
		goto GET_END;
	}

	if (0 != fseek(f, (long)pos, SEEK_SET)){
		fprintf(stderr, "IOError: seek file %d to %u failed\n", bucket, (unsigned)pos);
		goto GET_END;
	}

	r = read_record(f, true);
	if (NULL == r){
		fprintf(stderr, "Bug: get %s failed in %s %d %u\n", key, bc->path, bucket, (unsigned)pos);
	}else{
		// check key
		if (strcmp(key, r->key) != 0){
			fprintf(stderr, "Bug: record %s is not expected %s\n", r->key, key);
			free_record(r);
			r = NULL;
		}
	}

GET_END:
	//4
	if (NULL == r)
		ht_remove(bc->tree, key);
	if (f != NULL) fclose(f);
	free(item);
	return r;
}

/* Argument bundle handed to build_thread(); allocated by bc_flush(). */
struct build_thread_args {
	HTree *tree; // tree passed to build_hint()
	char *path;  // hint file name; a heap copy that build_thread() frees
};

//創建hint文件的線程入口函數
void* build_thread(void *param)
{
	struct build_thread_args *args = (struct build_thread_args*) param;
	build_hint(args->tree, args->path);
	free(args->path);
	free(param);
	return NULL;
}

/*
 * Write the contents of write_buffer to the active data file.
 *
 * Nothing happens unless more than `limit` KiB are buffered (limit 0 flushes
 * any non-empty buffer). Because data files are limited in size, a flush may
 * also close the active bucket and start the next one.
 *   1. open the active data file for appending and check that its size equals
 *      wbuf_start_pos;
 *   2. write the buffer and push it out of the stdio buffer, so that readers
 *      which go to the file for offsets below wbuf_start_pos find the data;
 *   3. if only part of the buffer was written, move the rest to the front;
 *   4. update the two positions; an empty buffer smaller than
 *      WRITE_BUFFER_SIZE is replaced by one twice as large;
 *   5. if the file plus one more buffer would exceed MAX_BUCKET_SIZE:
 *      5.1 write whatever is still buffered,
 *      5.2 build the hint file of this bucket in a separate thread,
 *      5.3 switch to bucket curr+1 with a fresh curr_tree.
 *
 * Unrecoverable I/O problems terminate the process with exit(1), as does
 * reaching MAX_BUCKET_COUNT buckets.
 *
 * NOTE(review): step 2 reads write_buffer while holding only flush_lock;
 * whether writers can run concurrently depends on the callers.
 */
void bc_flush(Bitcask *bc, int limit)
{
	if (bc->curr >= MAX_BUCKET_COUNT) {
		fprintf(stderr, "reach max bucket count\n");
		exit(1);
	}

	pthread_mutex_lock(&bc->flush_lock);
	if (bc->wbuf_curr_pos > limit * 1024) {
		//1
		char buf[255];
		int len = snprintf(buf, sizeof(buf), DATA_FILE, bc->path, bc->curr);
		if (len < 0 || (size_t)len >= sizeof(buf)) {
			fprintf(stderr, "data file path of bucket %d is too long\n", bc->curr);
			exit(1);
		}
		FILE *f = fopen(buf, "ab");
		if (f == NULL) {
			fprintf(stderr, "open file %s for flushing failed.\n", buf);
			exit(1);
		}
		// check file size
		// The initial position of a stream opened for appending is
		// implementation-defined, so seek to the end before asking.
		if (0 != fseek(f, 0, SEEK_END)) {
			fprintf(stderr, "seek to end of %s failed\n", buf);
			exit(1);
		}
		int last_pos = (int)ftell(f);
		if (last_pos != bc->wbuf_start_pos) {
			fprintf(stderr, "last pos not match: %d != %d\n", last_pos, bc->wbuf_start_pos);
			exit(1);
		}

		//2
		int n = (int)fwrite(bc->write_buffer, 1, bc->wbuf_curr_pos, f);
		if (0 != fflush(f)) {
			// Best effort, like the partial-write handling below.
			fprintf(stderr, "flush %s failed\n", buf);
		}

		pthread_mutex_lock(&bc->buffer_lock);
		//3
		if (n < bc->wbuf_curr_pos) {
			memmove(bc->write_buffer, bc->write_buffer + n, bc->wbuf_curr_pos - n);
		}

		//4
		bc->wbuf_start_pos += n;
		bc->wbuf_curr_pos -= n;
		if (bc->wbuf_curr_pos == 0 && bc->wbuf_size < WRITE_BUFFER_SIZE) {
			// Allocate the bigger buffer first and keep the old one when the
			// allocation fails.
			char *bigger = malloc((size_t)bc->wbuf_size * 2);
			if (bigger != NULL) {
				free(bc->write_buffer);
				bc->write_buffer = bigger;
				bc->wbuf_size *= 2;
			}
		}

		//5
		// The sum is formed in 64 bits: MAX_BUCKET_SIZE is 2^31, which an
		// int sum can only exceed by overflowing.
		if ((uint64_t)bc->wbuf_start_pos + (uint64_t)bc->wbuf_size > MAX_BUCKET_SIZE) {
			//5.1
			if (bc->wbuf_curr_pos > 0) {
				size_t rest = fwrite(bc->write_buffer, 1, bc->wbuf_curr_pos, f);
				if (rest < (size_t)bc->wbuf_curr_pos || 0 != fflush(f)){
					fprintf(stderr, "write to %s failed\n", buf);
					exit(1);
				}
			}

			//5.2
			char hintpath[255];
			int hlen = snprintf(hintpath, sizeof(hintpath), HINT_FILE, bc->path, bc->curr);
			if (hlen < 0 || (size_t)hlen >= sizeof(hintpath)) {
				// No usable file name: drop the tree without writing a hint.
				fprintf(stderr, "hint path of bucket %d is too long, hint file not built\n", bc->curr);
				ht_destroy(bc->curr_tree);
			} else {
				struct build_thread_args *args = malloc(sizeof(*args));
				char *path_copy = (args != NULL) ? strdup(hintpath) : NULL;
				int spawned = 0;
				if (args != NULL && path_copy != NULL) {
					args->tree = bc->curr_tree;
					args->path = path_copy;
					pthread_t build_ptid;
					if (0 == pthread_create(&build_ptid, NULL, build_thread, args)) {
						// Nobody joins this thread; detach it so that its
						// resources are released when it finishes.
						pthread_detach(build_ptid);
						spawned = 1;
					}
				}
				if (!spawned) {
					// Fall back to building the hint file right here.
					build_hint(bc->curr_tree, hintpath);
					free(path_copy);
					free(args);
				}
			}

			//5.3
			// next bucket
			bc->curr ++;
			bc->curr_tree = ht_new(bc->depth);
			bc->wbuf_start_pos = 0;
			bc->wbuf_curr_pos = 0;
		}
		pthread_mutex_unlock(&bc->buffer_lock);

		fclose(f);
	}
	pthread_mutex_unlock(&bc->flush_lock);
}

/*
 * Store, replace, delete or sync one key. This is the core write operation
 * and also the mechanism replicas use to sync with each other.
 *
 * Versioning rules:
 *   a. every update increases the version by one;
 *   b. a delete stores the negated version one step further from zero, so
 *      the absolute value keeps growing across deletes and re-adds.
 * Example: nodes 1 and 2 both add a key and then delete it (version -2).
 * Node 1 goes down, node 2 writes the key again (version 3). When they sync,
 * node 1 offers -2, node 2 offers 3, and node 1 knows it is behind.
 *
 * Steps:
 *   1. read the version currently stored for the key (oldv, 0 if absent);
 *   2. derive the operation and the new version from `version` and oldv:
 *        version == 0, oldv > 0   replace, ver = oldv + 1
 *        version == 0, oldv <= 0  add,     ver = -oldv + 1
 *        version <  0, oldv <= 0  delete of a missing key: fails
 *        version == -1            delete,  ver = -|oldv| - 1
 *        |version| <= |oldv|      stale sync: fails
 *        otherwise                sync,    ver = version
 *   3. update the trees and the data file:
 *      3.1 if the stored value is identical, only the version in the trees
 *          is updated (and only for a sync, i.e. version != 0);
 *      3.2 otherwise a new DataRecord is appended to write_buffer - also for
 *          deletes - and both trees are pointed at it. Superseded records
 *          stay in their data file until bc_optimize() removes them.
 *
 * value/vlen - the value; vlen must not exceed MAX_RECORD_SIZE, and must be 0
 *              when version is negative.
 * Returns true on success, false when the request is invalid, rejected by the
 * version rules, or a resource could not be obtained.
 */
bool bc_set(Bitcask *bc, const char* key, char* value, int vlen, int flag, int version)
{
	if ((version < 0 && vlen > 0) || vlen < 0 || (uint32_t)vlen > MAX_RECORD_SIZE){
		fprintf(stderr, "invalid set cmd \n");
		return false;
	}

	bool suc = false;
	pthread_mutex_lock(&bc->write_lock);

	//1
	int oldv = 0, ver = version;
	Item *it = ht_get(bc->tree, key);
	if (it != NULL) {
		oldv = it->ver;
	}

	//2
	if (version == 0 && oldv > 0){ // replace
		ver = oldv + 1;
	} else if (version == 0 && oldv <= 0){ // add
		// Continue counting from the deleted version instead of restarting at
		// 1; otherwise a re-added key looks older than its own tombstone.
		ver = -oldv + 1;
	} else if (version < 0 && oldv <= 0) { // delete, not exist
		goto SET_FAIL;
	} else if (version == -1) { // delete
		ver = - abs(oldv) - 1;
	} else if (abs(version) <= abs(oldv)) { // sync
		// e.g. version  5 against stored 8: the incoming data is not newer
		//      version -5 against stored 8: not the item it wants to delete
		goto SET_FAIL;
	} else { // sync
		// e.g. version  8 against stored  5: update
		//      version  8 against stored -5: insert
		//      version -8 against stored  5: delete
		ver = version;
	}

	uint16_t hash = gen_hash(value, vlen);
	// a deleted item carries hash 0
	if (ver < 0) hash = 0;

	//3.1
	if (NULL != it && hash == it->hash) {
		DataRecord *old = bc_get(bc, key);
		if (old != NULL) {
			bool same = old->flag == flag && vlen == old->vsz
				&& memcmp(value, old->value, vlen) == 0;
			// release the record whether or not it matched
			free_record(old);
			if (same) {
				if (version != 0){
					ht_add(bc->tree, key, it->pos, it->hash, ver);
					// parentheses matter: == binds tighter than &
					if ((it->pos & 0xff) == bc->curr){
						if (bc->curr_tree == NULL) {
							fprintf(stderr, "BUG: curr_tree should not be NULL\n");
						}else{
							ht_add(bc->curr_tree, key, it->pos, it->hash, ver);
						}
					}
				}
				suc = true;
				goto SET_FAIL;
			}
		}
	}

	//3.2
	// The key is new or its value differs: append a record to the active
	// data file, even when this is a delete.
	int klen = strlen(key);
	// one spare byte so that the key copy is NUL-terminated
	DataRecord *r = malloc(sizeof(DataRecord) + klen + 1);
	if (r == NULL) {
		fprintf(stderr, "bc_set: out of memory\n");
		goto SET_FAIL;
	}
	r->ksz = klen;
	memcpy(r->key, key, klen);
	r->key[klen] = '\0';
	r->vsz = vlen;
	r->value = value;
	r->free_value = false; // value belongs to the caller
	r->flag = flag;
	r->version = ver;
	r->tstamp = time(NULL);

	int rlen;
	char *rbuf = encode_record(r, &rlen);
	// positions keep the bucket number in their low 8 bits, so an encoded
	// record must be a multiple of 256 bytes long
	if (rbuf == NULL || (rlen & 0xff) != 0){
		fprintf(stderr, "encode_record() failed with %d\n", rlen);
		free(rbuf);
		free_record(r);
		goto SET_FAIL;
	}

	pthread_mutex_lock(&bc->buffer_lock);
	// No room left for this record: flush first. buffer_lock is released
	// around the call because bc_flush() takes it itself.
	if (bc->wbuf_curr_pos + rlen > bc->wbuf_size) {
		pthread_mutex_unlock(&bc->buffer_lock);
		bc_flush(bc, 0);
		pthread_mutex_lock(&bc->buffer_lock);
	}
	// record maybe larger than buffer
	if (bc->wbuf_curr_pos + rlen > bc->wbuf_size) {
		int new_size = bc->wbuf_size;
		while (bc->wbuf_curr_pos + rlen > new_size) {
			new_size *= 2;
		}
		char *grown = realloc(bc->write_buffer, new_size);
		if (grown == NULL) {
			// the old buffer is still valid and stays in place
			pthread_mutex_unlock(&bc->buffer_lock);
			fprintf(stderr, "bc_set: out of memory\n");
			free(rbuf);
			free_record(r);
			goto SET_FAIL;
		}
		bc->write_buffer = grown;
		bc->wbuf_size = new_size;
	}
	memcpy(bc->write_buffer + bc->wbuf_curr_pos, rbuf, rlen);

	// file offset in the upper bits, bucket number in the low 8 bits
	int pos = (bc->wbuf_start_pos + bc->wbuf_curr_pos) | bc->curr;
	bc->wbuf_curr_pos += rlen;
	pthread_mutex_unlock(&bc->buffer_lock);

	// point both trees at the new record
	ht_add(bc->tree, key, pos, hash, ver);
	ht_add(bc->curr_tree, key, pos, hash, ver);
	suc = true;
	free(rbuf);
	free_record(r);

SET_FAIL:
	// common exit for success and failure alike
	pthread_mutex_unlock(&bc->write_lock);
	free(it);
	return suc;
}

/*
 * Delete `key`: a bc_set() call with an empty value, flag 0 and version -1.
 * Returns whatever bc_set() returns.
 */
bool bc_delete(Bitcask *bc, const char* key)
{
	bool deleted = bc_set(bc, key, "", 0, 0, -1);
	return deleted;
}

/*
 * Forward to ht_get_hash() on the full index (bc->tree) with `pos` and
 * `count` unchanged, and return its result.
 */
uint16_t bc_get_hash(Bitcask *bc, const char * pos, int *count)
{
	HTree *index = bc->tree;
	uint16_t digest = ht_get_hash(index, pos, count);
	return digest;
}

/*
 * Forward to ht_list() on the full index (bc->tree) with `pos` and `prefix`
 * unchanged, and return its result.
 */
char* bc_list(Bitcask *bc, const char* pos, const char* prefix)
{
	HTree *index = bc->tree;
	char *listing = ht_list(index, pos, prefix);
	return listing;
}

/*
 * Return the count that ht_get_hash() reports for position "@" of the full
 * index (bc->tree).
 *
 * curr - optional output. When it is non-NULL and the bitcask has a
 *        curr_tree, it receives the same count taken from curr_tree;
 *        otherwise it is left untouched.
 *
 * The counts are collected in int variables because bc_get_hash() passes an
 * int* to ht_get_hash(); handing it a uint32_t* directly is an incompatible
 * pointer type.
 */
uint32_t   bc_count(Bitcask *bc, uint32_t* curr)
{
	int total = 0;
	ht_get_hash(bc->tree, "@", &total);
	if (NULL != curr && NULL != bc->curr_tree) {
		int active = 0;
		ht_get_hash(bc->curr_tree, "@", &active);
		*curr = (uint32_t)active;
	}
	return (uint32_t)total;
}
BeansDB源碼剖析——bitcask.c

華爲機試題之表達式求值

Ubuntu安裝Google Chrome，報NSS version的錯誤

兩個有序數組元素之和的最小K個值

Problem 1802 —— 火車調度

levelDB學習筆記——Version

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結