// DataRecord differs from Item: an Item keeps only the key, whereas a record
// keeps both the key and the value. (Original author's note: in memory only a
// PADDING-sized chunk is held.)
//
// The fields from the anonymous union through vsz are copied to and from data
// files as one contiguous header, starting at the union (see the uses of
// `sizeof(DataRecord) - sizeof(char*)` in this file); `value` is memory-only.
typedef struct data_record {
char *value;
union {
bool free_value; // whether value must be free()d; "need_free" would be a better name
uint32_t crc; // shares storage with free_value: writing one overwrites the other
};
int32_t tstamp; // timestamp
int32_t flag; // combination of the const int flags defined at the top of record.c
int32_t version;
uint32_t ksz; // key size in bytes
uint32_t vsz; // value size in bytes
char key[0]; // key bytes follow the struct
} DataRecord;
/*
* Beansdb - A high available distributed key-value storage system:
*
* http://beansdb.googlecode.com
*
* Copyright 2010 Douban Inc. All rights reserved.
*
* Use and distribution licensed under the BSD license. See
* the LICENSE file for full text.
*
* Authors:
* Davies Liu <[email protected]>
*
*/
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "record.h"
#include "crc32.c"
#include "quicklz.h"
//#include "fnv1a.h"
// Records in a data file are aligned to PADDING bytes. This keeps the low
// 8 bits of every position free so they can hold the bucket index.
const int PADDING = 256;
// Set in DataRecord.flag when the value was compressed by compress_record().
const int32_t COMPRESS_FLAG = 0x00010000;
// Records carrying this flag are never compressed by compress_record().
// NOTE(review): presumably means the client already compressed the value -- confirm.
const int32_t CLIENT_COMPRESS_FLAG = 0x00000010;
const float COMPRESS_RATIO_LIMIT = 0.7;// compression is kept only if compressed size <= this fraction of the input
// compress_record() first compresses at most this many bytes as a trial.
const int TRY_COMPRESS_SIZE = 1024 * 10;
/*
 * Compute a 32-bit hash of a buffer.
 *
 * The hash is seeded with len * 97. Buffers of at most 1024 bytes are hashed
 * in full with fnv1a(); longer buffers contribute only their first and last
 * 512 bytes, mixed with a multiplication by 97 in between.
 *
 * buf: data to hash
 * len: number of bytes in buf
 */
uint32_t gen_hash(char *buf, int len)
{
    uint32_t h = len * 97;

    if (len > 1024) {
        /* Large buffer: sample the head and the tail only. */
        h += fnv1a(buf, 512);
        h *= 97;
        h += fnv1a(buf + len - 512, 512);
        return h;
    }

    /* Small buffer: hash every byte. */
    h += fnv1a(buf, len);
    return h;
}
// One entry of a hint file. The fixed fields are followed directly by the
// NUL-terminated key, which begins in `name` and extends past the struct.
typedef struct hint_record {
uint32_t ksize:8; // key length in bytes (without the terminating NUL)
uint32_t pos:24; // Item.pos >> 8, i.e. the position without its low 8 bits
int32_t version;
uint16_t hash;
char name[2]; // first bytes of the key; also pads the struct for alignment
} HintRecord;
// Size of HintRecord.name; subtracted from sizeof(HintRecord) when computing
// the real length of an entry (header + key + NUL).
const int NAME_IN_RECORD = 2;
// Growable byte buffer used while collecting hint entries:
//
//   |    written    |    writable    |
//   ----------------------------------
//   buf            curr            size
//
// It holds a sequence of (HintRecord + key) entries, each one built from an Item.
struct param {
int size; // capacity of buf in bytes
int curr; // number of bytes already written
char* buf;
};
/*
 * Append one hint entry describing `it` to the buffer in `param`.
 * Passed to ht_visit() as the per-item callback by build_hint().
 *
 * it:    item to serialise (name, pos, ver and hash are read)
 * param: a `struct param *`; its buffer is grown as needed
 *
 * The item is skipped (with a message on stderr) when its key does not fit in
 * the 8-bit ksize field or when the buffer cannot be grown. On a failed grow
 * the existing buffer and its contents are left untouched.
 */
void collect_items(Item* it, void* param)
{
    struct param *p = (struct param *)param;
    if (p == NULL || p->buf == NULL) {
        return;
    }

    size_t klen = strlen(it->name);
    if (klen > 255) {
        /* ksize is an 8-bit field; a longer key would be stored truncated. */
        fprintf(stderr, "key too long for hint record: %s\n", it->name);
        return;
    }

    /* -NAME_IN_RECORD: the name bytes declared inside HintRecord are part of
     * the key; +1: room for the key's terminating '\0'. */
    int length = (int)(sizeof(HintRecord) + klen + 1 - NAME_IN_RECORD);

    /* Not enough room: keep doubling until the entry fits. */
    if (p->size - p->curr < length) {
        int new_size = p->size > 0 ? p->size : length;
        while (new_size - p->curr < length) {
            if (new_size >= (1 << 30)) {
                fprintf(stderr, "hint buffer too large, skip %s\n", it->name);
                return;
            }
            new_size *= 2;
        }
        /* Keep the old pointer until realloc is known to have succeeded. */
        char *grown = realloc(p->buf, new_size);
        if (grown == NULL) {
            fprintf(stderr, "realloc(%d) failed, skip %s\n", new_size, it->name);
            return;
        }
        p->buf = grown;
        p->size = new_size;
    }

    /* Build the entry in place at the current write offset. */
    HintRecord *r = (HintRecord*)(p->buf + p->curr);
    r->ksize = klen;
    /* it->pos: low 8 bits are the file id, high 24 bits the position in the file. */
    r->pos = it->pos >> 8;
    r->version = it->ver;
    r->hash = it->hash;
    memcpy(r->name, it->name, klen + 1);
    p->curr += length;
}
/*
 * Write `size` bytes from `buf` to `path`, going through "<path>.tmp".
 *
 * The data is written to the temporary file first and only moved over `path`
 * with rename() once it has been completely written and closed, so an
 * existing `path` is never removed before its replacement is ready.
 * On any failure a message is printed to stderr, the temporary file is
 * removed and `path` is left as it was.
 *
 * buf:  data to write
 * size: number of bytes to write (a negative size is treated as a failure)
 * path: destination file name
 */
void write_file(char *buf, int size, const char* path)
{
    /* sizeof(".tmp") includes the terminating NUL. */
    size_t tmp_len = strlen(path) + sizeof(".tmp");
    char *tmp = malloc(tmp_len);
    if (tmp == NULL) {
        fprintf(stderr, "open %s.tmp failed\n", path);
        return;
    }
    snprintf(tmp, tmp_len, "%s.tmp", path);

    FILE *hf = fopen(tmp, "wb");
    if (NULL == hf) {
        fprintf(stderr, "open %s failed\n", tmp);
        free(tmp);
        return;
    }

    size_t n = size > 0 ? fwrite(buf, 1, (size_t)size, hf) : 0;
    /* fclose flushes buffered data, so its result is part of the write. */
    int close_failed = (fclose(hf) != 0);

    if (size >= 0 && n == (size_t)size && !close_failed) {
        /* rename() replaces an existing destination in one step. */
        if (rename(tmp, path) != 0) {
            fprintf(stderr, "rename %s to %s failed\n", tmp, path);
            remove(tmp);
        }
    } else {
        fprintf(stderr, "write to %s failed \n", tmp);
        remove(tmp);
    }
    free(tmp);
}
/*
 * Dump the items of `tree` into the hint file `hintpath`.
 * `tree` is always destroyed by this call, including on failure.
 *
 * Steps:
 *   1. collect every item of the tree into a buffer (items with ver < 0 are
 *      collected as well), then destroy the tree;
 *   2. if `hintpath` ends in ".qlz", compress the buffer with QuickLZ;
 *   3. write the buffer to `hintpath` via write_file().
 *
 * If memory cannot be allocated, a message is printed and no file is written.
 */
void build_hint(HTree* tree, const char* hintpath)
{
    struct param p;
    p.size = 1024 * 1024;
    p.curr = 0;
    p.buf = malloc(p.size);
    if (p.buf == NULL) {
        fprintf(stderr, "build hint %s failed: out of memory\n", hintpath);
        ht_destroy(tree);
        return;
    }

    /* 1: gather all items, then release the tree. */
    ht_visit(tree, collect_items, &p);
    ht_destroy(tree);

    /* 2: a ".qlz" suffix means the hint file is stored compressed. */
    size_t path_len = strlen(hintpath);
    if (path_len >= 4 && strcmp(hintpath + path_len - 4, ".qlz") == 0) {
        char* wbuf = malloc(QLZ_SCRATCH_COMPRESS);
        char* dst = malloc(p.size + 400);
        if (wbuf == NULL || dst == NULL) {
            fprintf(stderr, "build hint %s failed: out of memory\n", hintpath);
            free(wbuf);
            free(dst);
            free(p.buf);
            return;
        }
        /* Compress p.curr bytes of p.buf into dst. */
        int dst_size = qlz_compress(p.buf, dst, p.curr, wbuf);
        free(p.buf);
        p.curr = dst_size;
        p.buf = dst;
        free(wbuf);
    }

    /* 3 */
    write_file(p.buf, p.curr, hintpath);
    free(p.buf);
}
/*
 * Load a hint file and add every entry it contains to `tree`.
 *
 * tree:     tree receiving the entries (the BitCask tree, per the original note)
 * bucket:   number of this hint file; its low 8 bits become the low 8 bits
 *           of each item position
 * path:     hint file to read; a ".qlz" suffix means QuickLZ-compressed
 * new_path: if not NULL, the (decompressed) content is also written to this
 *           file, recompressed when new_path ends in ".qlz"
 *
 * Steps:
 *   1. open the file and map its whole content with mmap;
 *   2. decompress it if needed;
 *   3. walk the HintRecord entries and add each one to the tree.
 *
 * A compressed file that fails to decompress is unlinked and the process
 * exits with status 1.
 */
void scanHintFile(HTree* tree, int bucket, const char* path, const char* new_path)
{
    int fd = open(path, O_RDONLY);
    if (fd == -1) {
        fprintf(stderr, "open %s failed\n", path);
        return;
    }

    struct stat sb;
    if (fstat(fd, &sb) == -1 || sb.st_size == 0) {
        close(fd);
        return;
    }

    /* 1 */
    char *addr = (char*) mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (addr == MAP_FAILED) {
        fprintf(stderr, "mmap failed %s\n", path);
        close(fd);
        return;
    }

    /* 2 */
    char *start = addr, *end = addr + sb.st_size;
    size_t path_len = strlen(path);
    if (path_len >= 4 && strcmp(path + path_len - 4, ".qlz") == 0) {
        char wbuf[QLZ_SCRATCH_DECOMPRESS];
        int size = qlz_size_decompressed(addr);
        start = malloc(size > 0 ? size : 1);
        if (start == NULL) {
            fprintf(stderr, "malloc(%d)\n", size);
            munmap(addr, sb.st_size);
            close(fd);
            return;
        }
        int vsize = qlz_decompress(addr, start, wbuf);
        if (vsize < size) {
            fprintf(stderr, "decompress %s failed: %d < %d, remove it\n", path, vsize, size);
            unlink(path);
            exit(1);
        }
        end = start + vsize;
    }

    /* Optionally save a copy of the decompressed content. */
    if (new_path != NULL) {
        size_t new_len = strlen(new_path);
        if (new_len >= 4 && strcmp(new_path + new_len - 4, ".qlz") == 0) {
            char* wbuf = malloc(QLZ_SCRATCH_COMPRESS);
            /* Size the output from the data being compressed, not from the
             * size of the source file. */
            char* dst = malloc((size_t)(end - start) + 400);
            if (wbuf != NULL && dst != NULL) {
                int dst_size = qlz_compress(start, dst, end - start, wbuf);
                write_file(dst, dst_size, new_path);
            } else {
                fprintf(stderr, "write to %s failed: out of memory\n", new_path);
            }
            free(dst);
            free(wbuf);
        } else {
            write_file(start, end - start, new_path);
        }
    }

    /* 3 */
    const size_t header = sizeof(HintRecord) - NAME_IN_RECORD;
    char *p = start;
    while (p < end) {
        size_t left = (size_t)(end - p);
        /* The fixed header must be complete before ksize can be read. */
        if (left < header) {
            fprintf(stderr, "scan %s: unexpected end, need %ld byte\n", path, (long)(header - left));
            break;
        }
        HintRecord *r = (HintRecord*) p;
        size_t entry = header + r->ksize + 1;
        if (left < entry) {
            fprintf(stderr, "scan %s: unexpected end, need %ld byte\n", path, (long)(entry - left));
            break;
        }
        p += entry;

        /* Cast first: the 24-bit field would otherwise be shifted as a signed int. */
        uint32_t pos = ((uint32_t)r->pos << 8) | (bucket & 0xff);
        /* Same test as strlen(name) == ksize, but it never reads past the entry. */
        if (r->name[r->ksize] == '\0' && memchr(r->name, '\0', r->ksize) == NULL) {
            ht_add(tree, r->name, pos, r->hash, r->version);
        } else {
            fprintf(stderr, "scan %s: key length not match %d\n", path, r->ksize);
        }
    }

    munmap(addr, sb.st_size);
    if (start != addr) {
        free(start);
    }
    close(fd);
}
/*
 * Return the value of record `r`.
 *
 * When the value is stored inline in the record's own allocation (directly
 * after the NUL-terminated key), a freshly malloc'ed copy of vsz bytes is
 * returned. Otherwise r->value itself is returned.
 *
 * Returns NULL if the copy cannot be allocated.
 */
char* record_value(DataRecord *r)
{
    char *res = r->value;
    if (res == r->key + r->ksz + 1) {
        /* The value lives inside the record: hand out a separate copy.
         * Ask for at least one byte so an empty value does not yield NULL. */
        res = malloc(r->vsz > 0 ? r->vsz : 1);
        if (res == NULL) {
            fprintf(stderr, "malloc(%u)\n", (unsigned)r->vsz);
            return NULL;
        }
        memcpy(res, r->value, r->vsz);
    }
    return res;
}
/*
 * Release a record. A NULL record is ignored.
 * The value buffer is freed only when it is set and flagged with free_value;
 * the record itself is always freed.
 */
void free_record(DataRecord *r)
{
    if (r != NULL) {
        char *val = r->value;
        /* free_value is only consulted when a value pointer is present. */
        if (val != NULL && r->free_value) {
            free(val);
        }
        free(r);
    }
}
/*
 * Try to compress the value of `r` in place with QuickLZ.
 *
 * Nothing happens when the encoded record would fit in one PADDING block, or
 * when the record already carries COMPRESS_FLAG or CLIENT_COMPRESS_FLAG.
 * For values larger than TRY_COMPRESS_SIZE only a prefix is compressed first;
 * the whole value is compressed only if that trial reaches
 * COMPRESS_RATIO_LIMIT. The record is updated (new value buffer, vsz,
 * COMPRESS_FLAG, free_value = true) only when the complete value was
 * compressed and the result is within the ratio limit; otherwise `r` is left
 * unchanged. Allocation failure also leaves `r` unchanged.
 */
void compress_record(DataRecord *r)
{
    int ksz = r->ksz, vsz = r->vsz;
    int n = sizeof(DataRecord) - sizeof(char*) + ksz + vsz;

    /* Only records larger than one PADDING block that are not compressed yet. */
    if (n <= PADDING || (r->flag & (COMPRESS_FLAG|CLIENT_COMPRESS_FLAG)) != 0) {
        return;
    }

    char *wbuf = malloc(QLZ_SCRATCH_COMPRESS);
    char *v = malloc(vsz + 400);
    if (wbuf == NULL || v == NULL) {
        /* Release whichever of the two allocations did succeed. */
        free(wbuf);
        free(v);
        return;
    }

    /* Trial run on a prefix; redo on the whole value if it compresses well. */
    int try_size = vsz > TRY_COMPRESS_SIZE ? TRY_COMPRESS_SIZE : vsz;
    int vsize = qlz_compress(r->value, v, try_size, wbuf);
    if (try_size < vsz && vsize < try_size * COMPRESS_RATIO_LIMIT) {
        try_size = vsz;
        vsize = qlz_compress(r->value, v, try_size, wbuf);
    }
    free(wbuf);

    /* Give up if the ratio is too poor or only the prefix was compressed. */
    if (vsize > try_size * COMPRESS_RATIO_LIMIT || try_size < vsz) {
        free(v);
        return;
    }

    /* Success: swap the compressed buffer into the record. */
    if (r->free_value) {
        free(r->value);
    }
    r->value = v;
    r->free_value = true; /* the new buffer is owned by the record */
    r->vsz = vsize;
    r->flag |= COMPRESS_FLAG;
}
/*
 * Decompress the value of `r` in place if COMPRESS_FLAG is set.
 *
 * Returns `r` (unchanged when it was not compressed). On success the record
 * gets a newly allocated value buffer, vsz is set to the decompressed size,
 * COMPRESS_FLAG is cleared and free_value is set.
 *
 * On any error (size mismatch, allocation failure, short decompression) the
 * record is freed with free_record() and NULL is returned, so the caller must
 * not use `r` afterwards. A NULL `r` yields NULL.
 */
DataRecord* decompress_record(DataRecord *r)
{
    if (r == NULL) {
        return NULL;
    }
    if (!(r->flag & COMPRESS_FLAG)) {
        return r;
    }

    char scratch[QLZ_SCRATCH_DECOMPRESS];
    char *v = NULL;
    int size = 0;
    int ret = 0;

    /* The size recorded in the compressed stream must match the stored size. */
    int csize = qlz_size_compressed(r->value);
    if (csize != r->vsz) {
        fprintf(stderr, "broken compressed data: %d != %d, flag=%x\n", csize, r->vsz, r->flag);
        goto DECOMP_END;
    }

    /* Size the data should have once decompressed. */
    size = qlz_size_decompressed(r->value);
    v = malloc(size);
    if (v == NULL) {
        fprintf(stderr, "malloc(%d)\n", size);
        goto DECOMP_END;
    }

    ret = qlz_decompress(r->value, v, scratch);
    if (ret < size) {
        /* Fewer bytes than announced: the data is damaged. */
        fprintf(stderr, "decompress %s failed: %d < %d\n", r->key, ret, size);
        goto DECOMP_END;
    }

    /* Swap the decompressed buffer into the record. */
    if (r->free_value) {
        free(r->value);
    }
    r->value = v;
    r->free_value = true;
    r->vsz = size;
    r->flag &= ~COMPRESS_FLAG;
    return r;

DECOMP_END:
    /* The record is unusable: release it and the scratch output buffer. */
    free(v);
    free_record(r);
    return NULL;
}
DataRecord* decode_record(char* buf, int size)
{
DataRecord *r = (DataRecord *) (buf - sizeof(char*));
int ksz = r->ksz, vsz = r->vsz;
if (ksz < 0 || ksz > 200 || vsz < 0 || vsz > 100 * 1024 * 1024){
fprintf(stderr, "invalid ksz=: %d, vsz=%d\n", ksz, vsz);
return NULL;
}
int need = sizeof(DataRecord) - sizeof(char*) + ksz + vsz;
if (size < need) {
fprintf(stderr, "not enough data in buffer: %d < %d\n", size, need);
return NULL;
}
// CRC check ?
DataRecord *r2 = (DataRecord *) malloc(need + 1 + sizeof(char*));
memcpy(r2, r, sizeof(DataRecord) + ksz);
r2->key[ksz] = 0; // c str
r2->free_value = false;
r2->value = r2->key + ksz + 1;
memcpy(r2->value, r->key + ksz, vsz);
return decompress_record(r2);
}
/*
 * Read one DataRecord from the current position of `f`.
 *
 * f:      data file, positioned at the start of a record
 * decomp: when true the value is decompressed before returning
 *
 * Steps:
 *   1. read the record in two stages:
 *      1.1 read one PADDING block, the smallest space a record occupies on disk;
 *      1.2 work out whether that block already contains the whole value, and
 *          read the remainder from the file if it does not;
 *   2. verify the CRC;
 *   3. decompress if requested.
 *
 * Returns the record, to be released with free_record(), or NULL on a read
 * error, an invalid header, a CRC mismatch, an allocation failure or a
 * decompression failure. On allocation failure the bytes that could not be
 * read are skipped so that the stream still advances.
 */
DataRecord* read_record(FILE *f, bool decomp)
{
    // 1
    /* PADDING bytes of file data plus the in-memory value pointer in front. */
    DataRecord *r = (DataRecord*) malloc(PADDING + sizeof(char*));
    if (r == NULL) {
        fprintf(stderr, "malloc(%d)\n", (int)(PADDING + sizeof(char*)));
        /* Skip the block so callers looping on ftell() make progress. */
        fseek(f, PADDING, SEEK_CUR);
        return NULL;
    }
    r->value = NULL;

    // 1.1
    if (fread(&r->crc, 1, PADDING, f) != PADDING) { /* end of file or empty file */
        fprintf(stderr, "read record faied\n");
        goto READ_END;
    }
    int ksz = r->ksz, vsz = r->vsz;
    if (ksz < 0 || ksz > 200 || vsz < 0 || vsz > 100 * 1024 * 1024){
        fprintf(stderr, "invalid ksz=: %d, vsz=%d\n", ksz, vsz);
        goto READ_END;
    }
    /* Save the stored CRC now: free_value shares its storage and is written below. */
    uint32_t crc_old = r->crc;

    // 1.2
    /* Bytes of the block left over after the on-disk header and the key. */
    int read_size = PADDING - (sizeof(DataRecord) - sizeof(char*)) - ksz;
    if (vsz < read_size) {
        /* The whole value is inside the block just read. Shift it one byte
         * to make room for the NUL that terminates the key. */
        r->value = r->key + ksz + 1;
        r->free_value = false;
        memmove(r->value, r->key + ksz, vsz);
    } else {
        /* The value continues in the file beyond the first block. */
        int need = vsz - read_size;
        r->value = malloc(vsz);
        if (r->value == NULL) {
            fprintf(stderr, "malloc(%d)\n", vsz);
            fseek(f, need, SEEK_CUR);
            goto READ_END; /* value is NULL, so only r itself is freed */
        }
        r->free_value = true;
        /* First the part already in memory ... */
        memcpy(r->value, r->key + ksz, read_size);
        int ret = 0;
        /* ... then the rest from the file. */
        if (need > 0 && need != (ret = (int)fread(r->value + read_size, 1, need, f))) {
            r->key[ksz] = 0; // c str
            fprintf(stderr, "read record %s faied: %d < %d @%ld\n", r->key, ret, need, ftell(f));
            goto READ_END;
        }
    }
    r->key[ksz] = 0; // c str

    // 2
    /* The CRC covers the header after the crc field, the key, and the value. */
    uint32_t crc = crc32(0, (char*)(&r->tstamp),
                         sizeof(DataRecord) - sizeof(char*) - sizeof(uint32_t) + ksz);
    crc = crc32(crc, r->value, vsz);
    if (crc != crc_old){
        /* Report the saved CRC: r->crc has been overwritten through free_value. */
        fprintf(stderr, "%s @%ld crc32 check failed %u != %u\n", r->key, ftell(f),
                (unsigned)crc, (unsigned)crc_old);
        goto READ_END;
    }

    // 3
    if (decomp) {
        r = decompress_record(r);
    }
    return r;

READ_END:
    free_record(r);
    return NULL;
}
/*
 * Serialise record `r` into a newly allocated buffer ready to be written to
 * a data file.
 *
 * Unlike compress_record(), which only touches the value, this produces the
 * whole on-disk record: header (with CRC), key and value. The value is
 * compressed first via compress_record(), so `r` may be modified.
 *
 * r:    record to encode
 * size: receives the buffer length, always a multiple of PADDING so that the
 *       low 8 bits of every record position are zero
 *
 * Returns the buffer (to be free()d by the caller). The bytes after the
 * record up to the PADDING boundary are zero. On allocation failure NULL is
 * returned and *size is set to 0.
 */
char* encode_record(DataRecord *r, int *size)
{
    compress_record(r);

    int ksz = r->ksz, vsz = r->vsz;
    int hs = sizeof(char*); // over header: the in-memory value pointer is not stored
    int header = sizeof(DataRecord) - hs;
    int n = header + ksz + vsz;
    int m = n;
    /* Round up to a multiple of PADDING. */
    if (n % PADDING != 0) {
        m += PADDING - (n % PADDING);
    }

    /* calloc so the padding bytes written to disk are zero, not heap leftovers. */
    char *buf = calloc(1, m);
    if (buf == NULL) {
        fprintf(stderr, "malloc(%d)\n", m);
        *size = 0;
        return NULL;
    }

    memcpy(buf, (const char *)&r->crc, header);
    memcpy(buf + header, r->key, ksz);
    memcpy(buf + header + ksz, r->value, vsz);

    /* The CRC covers everything after the crc field itself. */
    uint32_t crc = crc32(0, buf + sizeof(uint32_t), n - sizeof(uint32_t));
    memcpy(buf, &crc, sizeof crc);

    *size = m;
    return buf;
}
/*
 * Write record `r` at the current position of `f` (the caller has already
 * positioned the stream).
 *
 * Returns 0 on success, -1 if the record could not be encoded or was not
 * written completely.
 */
int write_record(FILE *f, DataRecord *r)
{
    int size = 0;
    char *data = encode_record(r, &size);
    if (data == NULL) {
        fprintf(stderr, "encode record failed\n");
        return -1;
    }

    size_t written = fwrite(data, 1, (size_t)size, f);
    free(data);
    if (written < (size_t)size) {
        fprintf(stderr, "write %d byte failed\n", size);
        return -1;
    }
    return 0;
}
/*
 * Walk every DataRecord of a data file and register it in `tree`, then build
 * the matching hint file.
 *
 * Per the original author's notes, this is used from bc_open when the hint
 * file for a data file is missing: here the data file determines the tree,
 * whereas optimizeDataFile lets the tree determine the data file.
 *
 * tree:     tree to update; records with version > 0 are added, all others
 *           are removed from it by key
 * bucket:   number of the data file (0..255); stored in the low 8 bits of
 *           each item position. Out-of-range values make the call a no-op.
 * path:     data file to scan
 * hintpath: hint file to create from the records found
 *
 * Steps:
 *   1. open the data file and create a private tree for the hint data;
 *   2. read the records one by one and add them to the trees;
 *   3. build the hint file from the private tree (which build_hint destroys).
 */
void scanDataFile(HTree* tree, int bucket, const char* path, const char* hintpath)
{
    if (bucket < 0 || bucket > 255) {
        return;
    }

    /* 1 */
    FILE *data = fopen(path, "rb");
    if (data == NULL) {
        fprintf(stderr, "open %s failed\n", path);
        return;
    }
    fprintf(stderr, "scan datafile %s \n", path);

    /* Tree describing just this data file; it becomes the hint file. */
    HTree *file_tree = ht_new(0);

    fseek(data, 0, SEEK_END);
    const uint32_t file_size = ftell(data);
    fseek(data, 0, SEEK_SET);

    /* 2 */
    uint32_t offset = 0;
    while (offset < file_size) {
        DataRecord *rec = read_record(data, true);
        if (rec != NULL) {
            /* offset supplies the upper 24 bits of Item.pos, bucket the low 8. */
            const uint32_t item_pos = offset | bucket;
            const uint16_t hash = gen_hash(rec->value, rec->vsz);

            if (rec->version <= 0) {
                ht_remove(tree, rec->key);
            } else {
                ht_add(tree, rec->key, item_pos, hash, rec->version);
            }
            ht_add(file_tree, rec->key, item_pos, hash, rec->version);
            free_record(rec);
        }

        /* Records are aligned to PADDING bytes: skip to the next boundary. */
        offset = ftell(data);
        const uint32_t tail = offset % PADDING;
        if (tail != 0) {
            const int left = PADDING - tail;
            fseek(data, left, SEEK_CUR);
            offset += left;
        }
    }
    fclose(data);

    /* 3 */
    build_hint(file_tree, hintpath);
}
/*
 * Like scanDataFile, but only considers records written before `before`
 * and does not build a hint file.
 *
 * tree:   tree to update; records with version > 0 are added, all others are
 *         removed from it by key
 * bucket: number of the data file (0..255); out-of-range values make the
 *         call a no-op
 * path:   data file to scan
 * before: scanning stops at the first record whose tstamp is >= this time
 */
void scanDataFileBefore(HTree* tree, int bucket, const char* path, time_t before)
{
    if (bucket < 0 || bucket > 255) return;

    FILE *df = fopen(path, "rb");
    if (NULL == df){
        fprintf(stderr, "open %s failed\n", path);
        return;
    }
    fprintf(stderr, "scan datafile %s before %ld\n", path, (long)before);

    fseek(df, 0, SEEK_END);
    uint32_t total = ftell(df);
    fseek(df, 0, SEEK_SET);

    uint32_t pos = 0;
    while (pos < total) {
        DataRecord *r = read_record(df, true);
        if (r != NULL) {
            /* This record was written at or after the cut-off: stop here. */
            if (r->tstamp >= before ){
                free_record(r); /* release it before leaving the loop */
                break;
            }
            if (r->version > 0){
                uint16_t hash = gen_hash(r->value, r->vsz);
                ht_add(tree, r->key, pos | bucket, hash, r->version);
            }else{
                ht_remove(tree, r->key);
            }
            free_record(r);
        }

        /* Records are aligned to PADDING bytes: skip to the next boundary. */
        pos = ftell(df);
        if (pos % PADDING != 0){
            int left = PADDING - (pos % PADDING);
            fseek(df, left, SEEK_CUR);
            pos += left;
        }
    }
    fclose(df);
}
/*
 * Count how many entries of a hint file are no longer current in `tree`.
 *
 * An entry counts as deleted when its key is missing from the tree, when the
 * tree's item has ver <= 0, or when the tree's item points to a different
 * position. The last case covers a key that was deleted and inserted again:
 * two data files then hold data for it, but the tree refers to only one.
 *
 * tree:   tree holding the current items
 * bucket: number of the hint file, combined with each entry's position
 * path:   hint file; a ".qlz" suffix means QuickLZ-compressed
 * total:  receives the number of entries examined
 *
 * Steps:
 *   1. open and map the hint file, decompressing it if needed;
 *   2. walk the HintRecord entries;
 *   3. compare each one with the tree.
 *
 * Returns the number of deleted entries; 0 (with *total == 0) when the file
 * cannot be read. A compressed file that fails to decompress is unlinked.
 */
static int count_deleted_record(HTree* tree, int bucket, const char* path, int *total)
{
    *total = 0;

    /* 1 */
    int fd = open(path, O_RDONLY);
    if (fd == -1) {
        fprintf(stderr, "open %s failed\n", path);
        return 0;
    }

    struct stat sb;
    if (fstat(fd, &sb) == -1 || sb.st_size == 0){
        close(fd);
        return 0;
    }

    char *addr = (char*) mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (addr == MAP_FAILED){
        fprintf(stderr, "mmap failed %s\n", path);
        close(fd);
        return 0;
    }

    char *start = addr, *end = addr + sb.st_size;
    int deleted = 0;
    int usable = 1;

    /* Decompress when the file is stored compressed. */
    size_t path_len = strlen(path);
    if (path_len >= 4 && strcmp(path + path_len - 4, ".qlz") == 0) {
        char wbuf[QLZ_SCRATCH_DECOMPRESS];
        int size = qlz_size_decompressed(addr);
        start = malloc(size > 0 ? size : 1);
        if (start == NULL) {
            fprintf(stderr, "malloc(%d)\n", size);
            usable = 0;
        } else {
            int vsize = qlz_decompress(addr, start, wbuf);
            if (vsize < size) {
                fprintf(stderr, "decompress %s failed: %d < %d, remove it\n", path, vsize, size);
                unlink(path);
                usable = 0;
            } else {
                end = start + vsize;
            }
        }
    }

    /* 2, 3 */
    if (usable) {
        const size_t header = sizeof(HintRecord) - NAME_IN_RECORD;
        char *p = start;
        while (p < end) {
            size_t left = (size_t)(end - p);
            /* The fixed header must be complete before ksize can be read. */
            if (left < header) {
                fprintf(stderr, "scan %s: unexpected end, need %ld byte\n", path, (long)(header - left));
                break;
            }
            HintRecord *r = (HintRecord*) p;
            size_t entry = header + r->ksize + 1;
            if (left < entry) {
                fprintf(stderr, "scan %s: unexpected end, need %ld byte\n", path, (long)(entry - left));
                break;
            }
            p += entry;

            (*total) ++;
            Item *it = ht_get(tree, r->name);
            /* Cast first: the 24-bit field would otherwise be shifted as a signed int. */
            if (it == NULL || it->pos != (((uint32_t)r->pos << 8) | bucket) || it->ver <= 0) {
                deleted ++;
            }
            free(it);
        }
    }

    /* Single exit: every path releases the mapping, the buffer and the fd. */
    munmap(addr, sb.st_size);
    if (start != addr) {
        free(start);
    }
    close(fd);
    return deleted;
}
/*
 * Compact a data file by dropping the records that are no longer current.
 *
 * The hint file is used first to estimate how many records are stale; the
 * data file is rewritten unless that count is both at most 10% of all
 * entries and at most `limit`. Records that survive are written to
 * "<path>.tmp", which then replaces `path`.
 *
 * tree:     tree with the current items; here the tree decides what stays in
 *           the data file (the reverse of scanDataFile)
 * bucket:   number of the data file, the low 8 bits of item positions
 * path:     data file to compact
 * hintpath: hint file of that data file; removed once the compacted file
 *           has been written
 * limit:    absolute threshold on the number of stale records
 *
 * A record is kept only if the tree has an item for its key, that item points
 * at exactly this position in this file, and its ver is > 0.
 *
 * Returns a new tree describing the compacted file (items at their new
 * positions), or NULL when the file was skipped or an error occurred. On
 * error the temporary file is removed and the original data file is kept.
 */
HTree* optimizeDataFile(HTree* tree, int bucket, const char* path, const char* hintpath, int limit)
{
    /* 1: decide whether compaction is worthwhile. The hint file reflects the
     * old state and is compared with the current tree. */
    int all = 0;
    int deleted = count_deleted_record(tree, bucket, hintpath, &all);
    if (deleted <= all * 0.1 && deleted <= limit) {
        fprintf(stderr, "only %d records deleted in %d, skip %s\n", deleted, all, path);
        return NULL;
    }

    char tmp[255];
    int tmp_len = snprintf(tmp, sizeof tmp, "%s.tmp", path);
    if (tmp_len < 0 || (size_t)tmp_len >= sizeof tmp) {
        fprintf(stderr, "open %s.tmp failed: path too long\n", path);
        return NULL;
    }

    FILE *df = fopen(path, "rb");
    if (NULL==df){
        fprintf(stderr, "open %s failed\n", path);
        return NULL;
    }
    FILE *new_df = fopen(tmp, "wb");
    if (NULL==new_df){
        fprintf(stderr, "open %s failed\n", tmp);
        fclose(df);
        return NULL;
    }

    /* 2: copy the records that are still current. */
    HTree *cur_tree = ht_new(0);
    fseek(df, 0, SEEK_END);
    uint32_t total = ftell(df);
    fseek(df, 0, SEEK_SET);
    uint32_t pos = 0;
    deleted = 0;
    while (pos < total) {
        DataRecord *r = read_record(df, false);
        if (r != NULL) {
            Item *it = ht_get(tree, r->key);
            /* Keep the record only if the tree points at this very position. */
            if (it && it->pos == (pos | bucket) && it->ver > 0) {
                r->version = it->ver;
                uint32_t new_pos = ftell(new_df);
                uint16_t hash = it->hash;
                /* The record moves to a new position in the compacted file. */
                ht_add(cur_tree, r->key, new_pos | bucket, hash, it->ver);
                if (write_record(new_df, r) != 0) {
                    free(it);
                    free_record(r);
                    ht_destroy(cur_tree);
                    fclose(df);
                    fclose(new_df);
                    remove(tmp);
                    return NULL;
                }
            }else{
                deleted ++;
            }
            free(it);
            free_record(r);
        }
        /* Records are aligned to PADDING bytes: skip to the next boundary. */
        pos = ftell(df);
        if (pos % PADDING != 0){
            int left = PADDING - (pos % PADDING);
            fseek(df, left, SEEK_CUR);
            pos += left;
        }
    }
    uint32_t deleted_bytes = ftell(df) - ftell(new_df);
    fclose(df);
    /* fclose flushes the output: a failure means the new file is incomplete. */
    if (fclose(new_df) != 0) {
        fprintf(stderr, "write to %s failed \n", tmp);
        remove(tmp);
        ht_destroy(cur_tree);
        return NULL;
    }

    /* 3: drop the stale hint file, then move the new data file into place.
     * rename() replaces `path` directly, so the old data file is never
     * removed before its replacement exists. */
    unlink(hintpath);
    if (rename(tmp, path) != 0) {
        fprintf(stderr, "rename %s to %s failed\n", tmp, path);
        remove(tmp);
        ht_destroy(cur_tree);
        return NULL;
    }
    fprintf(stderr, "optimize %s complete, %d records deleted, %d bytes came back\n",
            path, deleted, deleted_bytes);
    return cur_tree;
}
/*
 * Iterate over the records of a data file, handing each one to `visitor`.
 *
 * path:    data file to read
 * visitor: callback invoked as visitor(record, arg1, arg2); a true return
 *          value stops the iteration
 * arg1,
 * arg2:    passed through to the callback unchanged
 * decomp:  whether records are decompressed before being passed on
 *
 * Records are not freed here.
 * NOTE(review): presumably the visitor takes ownership of each record -- confirm.
 */
void visit_record(const char* path, RecordVisitor visitor, void *arg1, void *arg2, bool decomp)
{
    FILE *data = fopen(path, "rb");
    if (data == NULL) {
        fprintf(stderr, "open %s failed\n", path);
        return;
    }
    fprintf(stderr, "scan datafile %s \n", path);

    fseek(data, 0, SEEK_END);
    const uint32_t file_size = ftell(data);
    fseek(data, 0, SEEK_SET);

    for (uint32_t offset = 0; offset < file_size; ) {
        DataRecord *rec = read_record(data, decomp);
        /* A true result from the visitor ends the walk early. */
        if (rec != NULL && visitor(rec, arg1, arg2)) {
            break;
        }

        /* Records are aligned to PADDING bytes: skip to the next boundary. */
        offset = ftell(data);
        const uint32_t tail = offset % PADDING;
        if (tail != 0) {
            const int left = PADDING - tail;
            fseek(data, left, SEEK_CUR);
            offset += left;
        }
    }
    fclose(data);
}