Berkeley DB

Berkeley DB 是一個嵌入式數據庫,它適合於管理海量的(256TB)、簡單的數據。BDB是以鍵值對(key/value)來存儲和管理數據的。鍵可以重複,數據值可以是任意類型的。BDB的底層是用B+樹或者其他算法實現的。我用的jar包是B+樹實現的版本。

Berkeley DB是用Environment對象來管理數據庫,一個Environment可以管理多個database。每個database都存儲鍵值對,而序列化到磁盤上是通過catalog實現的。BDB的操作是在內存和磁盤上的,最終BDB的存取結果集在程序中使用是通過容器實現的(數據庫在程序中的視圖)。

所以BDB的使用分爲5步:

  • 創建Environment
    // Step 1: open the Environment.
    private Environment environment;
    // The environment is configured through an EnvironmentConfig instance;
    // here both the allow-create and transactional flags are switched on.
    EnvironmentConfig environmentConfig = new EnvironmentConfig();
    environmentConfig.setAllowCreate(true);
    environmentConfig.setTransactional(true);
    // homeDirectory is the directory in which the database files are kept.
    File environmentHome = new File(homeDirectory);
    environment = new Environment(environmentHome, environmentConfig);
  • 打開database和catalog
    protected StoredClassCatalog catalog;// class catalog handle
    protected Database database;// database holding the actual records
    private static final String CLASS_CATALOG="java_class_catalog";// name of the catalog database
    protected Database catalogDatabase;// database in which the catalog is stored

    // Open the data database.
    DatabaseConfig dbConfig=new DatabaseConfig();// configuration of the data database
    dbConfig.setTransactional(true);
    // Fixed: the flag must be set on dbConfig; the original line referenced
    // catalogDBConfig, which is not declared until the catalog section below.
    dbConfig.setAllowCreate(true);
    dbConfig.setSortedDuplicates(false);// do not store duplicate keys
    database=environment.openDatabase(null, "URL", dbConfig);// database holding the actual data

    // Open the class catalog.
    DatabaseConfig catalogDBConfig=new DatabaseConfig();// configuration of the catalog database
    catalogDBConfig.setTransactional(true);
    catalogDBConfig.setAllowCreate(true);
    catalogDatabase=environment.openDatabase(null, CLASS_CATALOG, catalogDBConfig);
    // StoredClassCatalog keeps the catalog inside catalogDatabase and returns
    // the object through which it is used.
    catalog=new StoredClassCatalog(catalogDatabase);
  • 存儲序列化類型綁定
    //鍵綁定
    EntryBinding<Integer> keyBinding=new SerialBinding<Integer>(catalog,Integer.class);
    //值綁定
    SerialBinding<Url> valueBinding=new SerialBinding<Url>(catalog,Url.class);
  • 存儲結果容器訪問
    // Container (view) through which the stored records are accessed.
    // Fixed: the collection class is StoredMap, not "StoreMap".
    StoredMap<Integer,Url> urlMap;
    // The final argument 'true' makes the view writable.
    urlMap=new StoredMap<Integer,Url>(database,keyBinding,valueBinding,true);
  • 關閉資源
     // Release the handles in order: the data database, then the class
     // catalog, and finally the environment.
     database.close();
     catalog.close();
     environment.close();

我在爬蟲項目中用BDB作爲內存數據庫來保存未訪問的URL。由於StoredMap的元素存儲無序,而StoredSortedMap是給元素排序後存儲,兩者都不是隊列的順序。故本例中以整數爲主鍵存儲,並記錄整數的值來模擬隊列的頭和尾。StoredMap實現了Map接口,可以使用其所有函數,例如:get()、remove()、put()等。

java代碼如下:

//BDBFrontier.java
import java.io.File;
import java.io.FileNotFoundException;

import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

/**
 * Base class that owns the Berkeley DB handles used by a frontier: one
 * {@link Environment}, the class-catalog database with its
 * {@link StoredClassCatalog}, and the data database named "URL".
 *
 * <p>Subclasses build their own view on top of {@link #database} and
 * {@link #catalog} and implement the key/value primitives.
 */
public abstract class BDBFrontier{
    /**
     * Opens the environment, the class catalog and the data database.
     *
     * <p>If any step fails, the handles that were already opened are released
     * before the exception propagates, so a failed construction does not
     * leave the environment directory held open.
     *
     * @param homeDirectory directory in which the environment files are stored
     * @throws DatabaseException if the environment or a database cannot be opened
     * @throws FileNotFoundException declared for callers; not thrown directly by this body
     */
    public BDBFrontier(String homeDirectory) throws DatabaseException,FileNotFoundException{
        System.out.println("Opening environment in: "+homeDirectory);
        boolean fullyOpened=false;
        try{
            //Open Environment
            EnvironmentConfig environmentConfig=new EnvironmentConfig();
            environmentConfig.setTransactional(true);
            environmentConfig.setAllowCreate(true);
            environment=new Environment(new File(homeDirectory),environmentConfig);

            //Open Catalog
            DatabaseConfig catalogDBConfig=new DatabaseConfig();
            catalogDBConfig.setTransactional(true);
            catalogDBConfig.setAllowCreate(true);
            catalogDatabase=environment.openDatabase(null, CLASS_CATALOG, catalogDBConfig);
            catalog=new StoredClassCatalog(catalogDatabase);

            //Open Database
            DatabaseConfig dbConfig=new DatabaseConfig();
            dbConfig.setTransactional(true);
            dbConfig.setAllowCreate(true);
            database=environment.openDatabase(null, "URL", dbConfig);

            fullyOpened=true;
        }finally{
            if(!fullyOpened){
                releaseQuietly();
            }
        }
    }

    /**
     * Closes the data database, the class catalog and the environment, in
     * that order. Each later handle is closed even if closing an earlier one
     * throws, so one failure cannot leak the remaining handles.
     *
     * @throws DatabaseException if closing any handle fails
     */
    public void close() throws DatabaseException{
        try{
            database.close();
        }finally{
            try{
                // NOTE(review): closing the catalog is expected to close its
                // backing catalogDatabase as well; confirm against the Javadoc
                // of the JE version in use.
                catalog.close();
            }finally{
                environment.close();
            }
        }
    }

    /**
     * Best-effort release of whatever the constructor managed to open.
     * Failures are deliberately ignored so that the original exception from
     * the constructor is the one the caller sees.
     */
    private void releaseQuietly(){
        try{
            if(database!=null){
                database.close();
            }
        }catch(Exception ignored){
            // already failing; keep the original exception
        }
        try{
            if(catalog!=null){
                catalog.close();
            }else if(catalogDatabase!=null){
                // catalog wrapper was never created; close its database directly
                catalogDatabase.close();
            }
        }catch(Exception ignored){
            // already failing; keep the original exception
        }
        try{
            if(environment!=null){
                environment.close();
            }
        }catch(Exception ignored){
            // already failing; keep the original exception
        }
    }

    /** Stores {@code value} under {@code key}; the return value is defined by the subclass. */
    protected abstract Object put(Object key,Object value);
    /** Looks up the value stored under {@code key}. */
    protected abstract Object get(Object key);
    /** Removes the entry stored under {@code key}. */
    protected abstract Object delete(Object key);

    /** The Berkeley DB environment that owns both databases. */
    private Environment environment;
    /** Class catalog used by the serial bindings of subclasses. */
    protected StoredClassCatalog catalog;
    /** Database holding the actual records (opened under the name "URL"). */
    protected Database database;

    /** Name of the database in which the class catalog is stored. */
    private static final String CLASS_CATALOG="java_class_catalog";
    /** Database backing {@link #catalog}. */
    protected Database catalogDatabase;
}

//Frontier.java
import java.io.FileNotFoundException;
import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.collections.StoredMap;
import com.sleepycat.je.DatabaseException;

/**
 * FIFO queue of not-yet-visited URLs stored in Berkeley DB.
 *
 * <p>The stored map has no queue ordering of its own, so entries are keyed by
 * consecutive integers: {@code head} is the key of the next entry to hand out
 * and {@code tail} is the key the next enqueued entry will receive.
 */
public class Frontier extends BDBFrontier implements UrlFrontier{

    /** Environment directory used by {@link #main} when no argument is given. */
    private static final String DEFAULT_HOME="D:\\workspace\\db";

    private StoredMap<Integer, Url> urlMap=null;//a database view
    private Integer head;// key of the queue head (next URL to return)
    private Integer tail;// key one past the queue end (next key to assign)

    /**
     * Opens the underlying databases and creates a writable map view on them.
     *
     * @param homeDirectory directory in which the environment files are stored
     * @throws DatabaseException if the environment or a database cannot be opened
     * @throws FileNotFoundException propagated from the superclass constructor
     */
    public Frontier(String homeDirectory) throws DatabaseException, FileNotFoundException {
        super(homeDirectory);
        EntryBinding<Integer> keyBinding=new SerialBinding<Integer>(catalog,Integer.class);
        SerialBinding<Url> valueBinding=new SerialBinding<Url>(catalog,Url.class);
        // Create the view; the final 'true' makes it writable.
        urlMap=new StoredMap<Integer,Url>(database,keyBinding,valueBinding,true);
        // The database persists between runs, so resume from the stored keys
        // instead of always restarting at 0 (which would overwrite or skip
        // entries left over from an earlier run).
        restoreQueueBounds();
    }

    /**
     * Sets {@code head} to the smallest stored key and {@code tail} to one
     * past the largest stored key; both become 0 when nothing is stored.
     */
    private void restoreQueueBounds(){
        int lowest=0;
        int next=0;
        boolean first=true;
        for(Integer key:urlMap.keySet()){
            if(first){
                lowest=key;
                next=key+1;
                first=false;
            }else{
                if(key<lowest){
                    lowest=key;
                }
                if(key>=next){
                    next=key+1;
                }
            }
        }
        head=lowest;
        tail=next;
    }

    /**
     * Removes and returns the URL at the head of the queue.
     *
     * @return the next URL, or {@code null} when the map is empty
     */
    @Override
    public Url getNext() throws Exception {
        Url result=null;
        if(!urlMap.isEmpty()){
            result=urlMap.get(head);
            delete(head++);
        }
        return result;
    }

    /**
     * Appends a URL to the end of the queue.
     *
     * <p>The original returned {@code put(...) != null}; since a map put
     * returns the <em>previous</em> value, which is {@code null} for a new
     * key, every successful enqueue was reported as {@code false}.
     *
     * @param url the URL to enqueue
     * @return {@code true} if the URL was stored, {@code false} if {@code url} is {@code null}
     */
    @Override
    public boolean putUrl(Url url) throws Exception {
        if(url==null){
            // a stored null could not be told apart from "queue empty" in getNext()
            return false;
        }
        put(tail,url);
        // advance only after the put succeeded so a failure leaves no gap
        tail++;
        return true;
    }

    /** Stores {@code value} under {@code key} and returns what the map's put returns. */
    @Override
    protected Object put(Object key, Object value) {
        return urlMap.put((Integer)key, (Url)value);
    }

    /** Returns the value stored under {@code key}, as returned by the map's get. */
    @Override
    protected Object get(Object key) {
        return urlMap.get(key);
    }

    /** Removes the entry stored under {@code key} and returns what the map's remove returns. */
    @Override
    protected Object delete(Object key) {
        return urlMap.remove(key);
    }

    /** @return {@code true} when the stored map holds no entries */
    public boolean isEmpty() {
        return urlMap.isEmpty();
    }

    /** @return {@code true} when the stored map contains {@code url} as a value */
    public boolean contains(Url url) {
        return urlMap.containsValue(url);
    }

    /**
     * Small demo: enqueues three URLs and prints them back in order.
     *
     * @param args optional; {@code args[0]} overrides the default environment directory
     */
    public static void main(String[] args){
        String home=args.length>0?args[0]:DEFAULT_HOME;
        String[] samples={"http://www.163.com","http://www.164.com","http://www.165.com"};
        Frontier frontier=null;
        try{
            frontier=new Frontier(home);
            for(String sample:samples){
                Url url=new Url();
                url.setOriUrl(sample);
                frontier.putUrl(url);
            }
            System.out.println(frontier.getNext().getOriUrl());
            System.out.println(frontier.getNext().getOriUrl());
            System.out.println(frontier.getNext().getOriUrl());
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            // always release the database handles, also after a failure above
            if(frontier!=null){
                try{
                    frontier.close();
                }catch(DatabaseException e){
                    e.printStackTrace();
                }
            }
        }
    }
}

參考文獻
自己動手寫網絡爬蟲
嵌入式數據庫系統Berkeley DB
Berkeley DB 使用經驗總結

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章