ZooKeeper 源碼解析（一）：數據初始化

一、初始化調用層級關係

org.apache.zookeeper.server.ZooKeeperServerMain

public class ZooKeeperServerMain {

    private ServerCnxnFactory cnxnFactory;

    public static void main(String[] args) {
        ZooKeeperServerMain main = new ZooKeeperServerMain();
        try {
            main.initializeAndRun(args);
        } catch (IllegalArgumentException e) {
        }
    }

    protected void initializeAndRun(String[] args) throws ConfigException, IOException
    {
       // 其他代碼省略.... ... 
        runFromConfig(config);
    }

    /**
     * Run from a ServerConfig.
     * @param config ServerConfig to use.
     * @throws IOException
     */
    public void runFromConfig(ServerConfig config) throws IOException {
        LOG.info("Starting server");
        FileTxnSnapLog txnLog = null;
        try {
            // Note that this thread isn't going to be doing anything else,
            // so rather than spawning another thread, we will just call
            // run() in this thread.
            // create a file logger url from the command line args
            final ZooKeeperServer zkServer = new ZooKeeperServer();
            // Registers shutdown handler which will be used to know the
            // server error or shutdown state changes.
            final CountDownLatch shutdownLatch = new CountDownLatch(1);
            zkServer.registerServerShutdownHandler(
                    new ZooKeeperServerShutdownHandler(shutdownLatch));

            txnLog = new FileTxnSnapLog(new File(config.dataLogDir), new File(
                    config.dataDir));
            txnLog.setServerStats(zkServer.serverStats());
            zkServer.setTxnLogFactory(txnLog);
            zkServer.setTickTime(config.tickTime);
            zkServer.setMinSessionTimeout(config.minSessionTimeout);
            zkServer.setMaxSessionTimeout(config.maxSessionTimeout);
            cnxnFactory = ServerCnxnFactory.createFactory();
            cnxnFactory.configure(config.getClientPortAddress(),
                    config.getMaxClientCnxns());
            
            //開始初始化zk服務........        
            cnxnFactory.startup(zkServer);
            
            // Watch status of ZooKeeper server. It will do a graceful shutdown
            // if the server is not running or hits an internal error.
            shutdownLatch.await();
            shutdown();
            cnxnFactory.join();
            if (zkServer.canShutdown()) {
                zkServer.shutdown(true);
            }
        } catch (InterruptedException e) {
            // warn, but generally this is ok
            LOG.warn("Server interrupted", e);
        } finally {
            if (txnLog != null) {
                txnLog.close();
            }
        }
    }
}

org.apache.zookeeper.server.NettyServerCnxnFactory#startup（ServerCnxnFactory 的通信實現有多種，這裏以 Netty 實現爲例）:

/**
 * Starts this connection factory and then the given server.
 * The four calls below run in exactly this order.
 *
 * @param zks the server to attach to this factory, load and start
 * @throws IOException declared; may come from any of the calls below
 * @throws InterruptedException declared; may come from the server start-up calls
 */
@Override
public void startup(ZooKeeperServer zks) throws IOException,
        InterruptedException {
    // Start the factory itself. NOTE(review): the original note says this
    // binds the communication port; start() is not shown here - confirm.
    start();
    // Hand the server instance to this factory (setZooKeeperServer is not shown here).
    setZooKeeperServer(zks);
    // Load the server's data before the server itself is started.
    zks.startdata();
    zks.startup();
}

org.apache.zookeeper.server.ZooKeeperServer#startdata

/**
 * Makes sure the in-memory database exists and has been loaded.
 * <p>
 * A {@code ZKDatabase} backed by {@code txnLogFactory} is created when none
 * is set yet; {@code loadData()} is then invoked only if the database
 * reports that it is not initialized.
 *
 * @throws IOException propagated from {@code loadData()}
 * @throws InterruptedException propagated from {@code loadData()}
 */
public void startdata() throws IOException, InterruptedException {
    // Lazily create the database on first use.
    if (zkDb == null) {
        zkDb = new ZKDatabase(this.txnLogFactory);
    }
    if (zkDb.isInitialized()) {
        // Data is already in memory; nothing to load.
        return;
    }
    loadData();
}

/**
 * Restore sessions and data.
 * <p>
 * Sets this server's zxid from the database (loading the database first if
 * necessary) and then kills every session that has no timeout entry.
 *
 * @throws IOException if loading the database fails
 * @throws InterruptedException declared for callers
 */
public void loadData() throws IOException, InterruptedException {
    /*
     * When a new leader starts executing Leader#lead, it
     * invokes this method. The database, however, has been
     * initialized before running leader election so that
     * the server could pick its zxid for its initial vote.
     * It does it by invoking QuorumPeer#getLastLoggedZxid.
     * Consequently, we don't need to initialize it once more
     * and avoid the penalty of loading it a second time. Not
     * reloading it is particularly important for applications
     * that host a large database.
     *
     * The expression below checks whether the database has
     * been initialized or not. Note that this method is
     * invoked by at least one other method:
     * ZooKeeperServer#startdata.
     *
     * See ZOOKEEPER-1642 for more detail.
     */
    // Already initialized: reuse the data tree's last processed zxid.
    // Otherwise: load the database and adopt the zxid it returns.
    final long currentZxid = zkDb.isInitialized()
            ? zkDb.getDataTreeLastProcessedZxid()
            : zkDb.loadDataBase();
    setZxid(currentZxid);

    // Clean up dead sessions: a session without a timeout entry is dead.
    LinkedList<Long> sessionsToKill = new LinkedList<Long>();
    for (Long sessionId : zkDb.getSessions()) {
        if (zkDb.getSessionWithTimeOuts().get(sessionId) == null) {
            sessionsToKill.add(sessionId);
        }
    }
    zkDb.setDataTreeInit(true);
    for (long sessionId : sessionsToKill) {
        // XXX: Is lastProcessedZxid really the best thing to use?
        killSession(sessionId, zkDb.getDataTreeLastProcessedZxid());
    }
}

org.apache.zookeeper.server.ZKDatabase#loadDataBase

/**
 * Loads the database from disk into memory by delegating to
 * {@code snapLog.restore(...)}, passing {@code commitProposalPlaybackListener}
 * as the playback listener, and then marks this database as initialized.
 *
 * @return the zxid returned by the restore (the last valid zxid on disk)
 * @throws IOException if the restore fails; the database is then left
 *         un-initialized
 */
public long loadDataBase() throws IOException {
    // Restore snapshot data first, then replay the transaction log on top of it.
    final long lastRestoredZxid =
            snapLog.restore(dataTree, sessionsWithTimeouts, commitProposalPlaybackListener);
    initialized = true;
    return lastRestoredZxid;
}

org.apache.zookeeper.server.persistence.FileTxnSnapLog#restore

/**
 * Restores the server database in two consecutive steps: first from the
 * snapshots, then from the transaction logs.
 *
 * @param dt the datatree to be restored
 * @param sessions the sessions to be restored
 * @param listener the playback listener to run on the
 * database restoration
 * @return the highest zxid restored
 * @throws IOException if either step fails
 */
public long restore(DataTree dt, Map<Long, Integer> sessions,
        PlayBackListener listener) throws IOException {
    // Step 1: populate the DataTree and the session map from a snapshot.
    snapLog.deserialize(dt, sessions);
    // Step 2: replay the transaction log on top of the snapshot state.
    final long highestRestoredZxid = fastForwardFromEdits(dt, sessions, listener);
    return highestRestoredZxid;
}

到這裏流程分成先後兩步：第一步從快照文件中恢復數據（見下文「二、快照文件處理部分」），第二步從事務日誌中回放快照之後的事務（見下文「三、事務日誌處理部分」）。原文分別用藍色和紅色標識這兩部分代碼，純文本中顏色已丟失，請以章節標題區分。

二、快照文件處理部分：

*org.apache.zookeeper.server.persistence.FileSnap#deserialize(org.apache.zookeeper.server.DataTree, java.util.Map<java.lang.Long,java.lang.Integer>) (核心代碼)*

/**
 * Deserializes a data tree from the most recent snapshot that can be read
 * and whose checksum matches.
 *
 * @param dt the data tree to populate; its {@code lastProcessedZxid} is set
 *        from the name of the snapshot that was loaded
 * @param sessions the session map to populate
 * @return the zxid of the snapshot, or {@code -1} when no candidate snapshot
 *         exists
 * @throws IOException if none of the candidate snapshots could be loaded
 */
public long deserialize(DataTree dt, Map<Long, Integer> sessions)
        throws IOException {
    // We run through at most 100 snapshots (not all of them); if none of
    // those can be loaded we give up. Asking for several candidates gives
    // the loop below a fallback when the newest snapshot is unusable.
    List<File> candidates = findNValidSnapshots(100);
    if (candidates.isEmpty()) {
        return -1L;
    }
    File snap = null;
    boolean loaded = false;
    for (File candidate : candidates) {
        snap = candidate;
        InputStream rawIn = null;
        CheckedInputStream checkedIn = null;
        try {
            LOG.info("Reading snapshot " + snap);
            rawIn = new BufferedInputStream(new FileInputStream(snap));
            checkedIn = new CheckedInputStream(rawIn, new Adler32());
            InputArchive archive = BinaryInputArchive.getArchive(checkedIn);
            // Deserialize the snapshot contents into dt / sessions.
            deserialize(dt, sessions, archive);

            // The checksum is accumulated by checkedIn while the data is
            // being read, and the expected value is stored after the data,
            // so it can only be verified once deserialization has finished.
            // The computed value must be taken BEFORE reading the stored one.
            long computedChecksum = checkedIn.getChecksum().getValue();
            long storedChecksum = archive.readLong("val");
            if (storedChecksum != computedChecksum) {
                throw new IOException("CRC corruption in snapshot :  " + snap);
            }
            loaded = true;
            // Candidates are tried in list order (most recent first per
            // findNValidSnapshots), so the first one that loads is the
            // newest usable snapshot; older ones need not be tried.
            break;
        } catch (IOException e) {
            // Fall through to the next candidate.
            LOG.warn("problem reading snap file " + snap, e);
        } finally {
            if (rawIn != null) {
                rawIn.close();
            }
            if (checkedIn != null) {
                checkedIn.close();
            }
        }
    }
    if (!loaded) {
        throw new IOException("Not able to find valid snapshots in " + snapDir);
    }
    // This zxid comes from the snapshot file name. It is the newest zxid as
    // far as the snapshot is concerned, which is why the caller replays the
    // transaction log afterwards to pick up anything more recent.
    dt.lastProcessedZxid = Util.getZxidFromName(snap.getName(), SNAPSHOT_FILE_PREFIX);
    return dt.lastProcessedZxid;
}



/**
 * Collects snapshot files.
 * <p>
 * find the last (maybe) valid n snapshots. this does some
 * minor checks on the validity of the snapshots. It just
 * checks for / at the end of the snapshot. This does
 * not mean that the snapshot is truly valid but is
 * valid with a high probability. also, the most recent
 * will be first on the list.
 * @param n the number of most recent snapshots
 * @return the last n snapshots (the number might be
 * less than n in case enough snapshots are not available).
 * @throws IOException
 */
private List<File> findNValidSnapshots(int n) throws IOException {
    // The whole directory has to be listed and sorted before the first n can
    // be picked. NOTE(review): per the original notes the trailing `false`
    // requests descending order by file name, so the newest snapshot comes
    // first; sortDataDir itself is not shown here.
    List<File> sortedSnapshots = Util.sortDataDir(snapDir.listFiles(), SNAPSHOT_FILE_PREFIX, false);
    List<File> accepted = new ArrayList<File>();
    for (File candidate : sortedSnapshots) {
        // An exception raised while validating one file must not stop the
        // search: log it and move on to the next candidate.
        try {
            if (!Util.isValidSnapshot(candidate)) {
                continue;
            }
            accepted.add(candidate);
            // Stop as soon as the requested number has been collected.
            if (accepted.size() == n) {
                break;
            }
        } catch (IOException e) {
            LOG.info("invalid snapshot " + candidate, e);
        }
    }
    return accepted;
}

三、事務日誌處理部分

org.apache.zookeeper.server.persistence.FileTxnSnapLog#fastForwardFromEdits （核心代碼）

/**
 * This function will fast forward the server database to have the latest
 * transactions in it.  This is the same as restore, but only reads from
 * the transaction logs and not restores from a snapshot.
 * @param dt the datatree to write transactions to.
 * @param sessions the sessions to be restored.
 * @param listener the playback listener to run on the
 * database transactions.
 * @return the highest zxid restored; {@code dt.lastProcessedZxid} when the
 * logs hold no transaction to replay.
 * @throws IOException if reading the log fails or a transaction cannot be
 * applied because a node is missing.
 */
public long fastForwardFromEdits(DataTree dt, Map<Long, Integer> sessions,
                                 PlayBackListener listener) throws IOException {

    FileTxnLog txnLog = new FileTxnLog(dataDir);
    // Start reading right after the last zxid the data tree has processed.
    TxnIterator txns = txnLog.read(dt.lastProcessedZxid+1);
    long highestZxid = dt.lastProcessedZxid;
    TxnHeader header;
    try {
        do {
            // iterator points to
            // the first valid txn when initialized
            header = txns.getHeader();
            if (header == null) {
                // empty logs: nothing to replay, the data tree's zxid stands
                return dt.lastProcessedZxid;
            }
            if (highestZxid == 0 || header.getZxid() >= highestZxid) {
                // Normal case: the log moves forward, so track its zxid.
                highestZxid = header.getZxid();
            } else {
                // The log went backwards relative to what was already seen;
                // this is reported, but the transaction is still applied below.
                LOG.error("{}(higestZxid) > {}(next log) for type {}",
                        new Object[] { highestZxid, header.getZxid(),
                                header.getType() });
            }
            try {
                // Apply the transaction to the data tree / session map.
                processTransaction(header, dt, sessions, txns.getTxn());
            } catch(KeeperException.NoNodeException e) {
               throw new IOException("Failed to process transaction type: " +
                     header.getType() + " error: " + e.getMessage(), e);
            }
            // Tell the playback listener about the transaction just applied.
            listener.onTxnLoaded(header, txns.getTxn());
        } while (txns.next());
    } finally {
        if (txns != null) {
            txns.close();
        }
    }
    return highestZxid;
}

org.apache.zookeeper.server.persistence.FileTxnLog#read

/**
 * start reading all the transactions from the given zxid
 * <p>
 * Creates a new {@code FileTxnIterator} over {@code logDir}; the iterator's
 * constructor positions it, so this call already touches the log files.
 * @param zxid the zxid to start reading transactions from
 * @return returns an iterator to iterate through the transaction
 * logs
 * @throws IOException if the iterator cannot be initialized
 */
public TxnIterator read(long zxid) throws IOException {
    return new FileTxnIterator(logDir, zxid);
}



/**
 * Creates an iterator over the transaction logs in the given directory and
 * immediately positions it by calling {@code init()}.
 * @param logDir the directory holding the transaction log files
 * @param zxid the zxid to start from
 * @throws IOException if {@code init()} fails
 */
public FileTxnIterator(File logDir, long zxid) throws IOException {
  this.logDir = logDir;
  this.zxid = zxid;
  init();
}

/**
 * initialize to the zxid specified
 * this is inclusive of the zxid
 * <p>
 * Selects the log files that may contain the requested zxid, opens the first
 * of them and skips transactions until one with a zxid that is not smaller
 * than the requested one is reached (or the logs run out).
 * @throws IOException
 */
void init() throws IOException {
    storedFiles = new ArrayList<File>();
    // All transaction log files of the directory, sorted.
    // NOTE(review): the trailing `false` is understood to mean descending
    // order by name; sortDataDir is not shown here - confirm.
    List<File> sortedLogs = Util.sortDataDir(
            FileTxnLog.getLogFiles(logDir.listFiles(), 0), LOG_FILE_PREFIX, false);
    for (File candidate : sortedLogs) {
        long fileZxid = Util.getZxidFromName(candidate.getName(), LOG_FILE_PREFIX);
        // Every file visited is kept: those named with a zxid >= the target,
        // plus the first one named with a smaller zxid, because the target
        // may lie inside that file. Nothing past that file is needed.
        storedFiles.add(candidate);
        if (fileZxid < zxid) {
            break;
        }
    }
    goToNextLog();
    // Advance until the current transaction is at or past the target zxid;
    // stop silently if the logs are exhausted first.
    do {
        if (!next()) {
            return;
        }
    } while (hdr.getZxid() < zxid);
}

/**
 * go to the next logfile
 * <p>
 * Takes the last entry off {@code storedFiles} (the file is only removed
 * from the list, not deleted from disk) and opens an input archive on it.
 * @return true if there is one and false if there is no
 * new file to be read
 * @throws IOException
 */
private boolean goToNextLog() throws IOException {
    if (storedFiles.isEmpty()) {
        // No pending log file left.
        return false;
    }
    // Pop the tail of the pending list and make it the current log file.
    this.logFile = storedFiles.remove(storedFiles.size()-1);
    ia = createInputArchive(this.logFile);
    return true;
}



/**
 * the iterator that moves to the next transaction
 * <p>
 * Reads one CRC-protected record from the current log file into
 * {@code hdr} / {@code record}. When the current file is exhausted the
 * iterator switches to the next pending file and tries again.
 * @return true if there is more transactions to be read
 * false if not.
 * @throws IOException if a record fails its CRC check or cannot be read
 */
public boolean next() throws IOException {
    if (ia == null) {
        return false;
    }
    try {
        long storedCrc = ia.readLong("crcvalue");
        byte[] payload = Util.readTxnBytes(ia);
        // Since we preallocate, we define EOF to be an
        // empty transaction (EOF or corrupted record).
        if (payload == null || payload.length == 0) {
            throw new EOFException("Failed to read " + logFile);
        }
        // validate CRC
        Checksum computedCrc = makeChecksumAlgorithm();
        computedCrc.update(payload, 0, payload.length);
        if (storedCrc != computedCrc.getValue()) {
            throw new IOException(CRC_ERROR);
        }
        hdr = new TxnHeader();
        record = SerializeUtils.deserializeTxn(payload, hdr);
    } catch (EOFException e) {
        LOG.debug("EOF excepton " + e);
        inputStream.close();
        inputStream = null;
        ia = null;
        hdr = null;
        // this means that the file has ended:
        // move to the next file and, if there is one, read from it
        return goToNextLog() && next();
    } catch (IOException e) {
        inputStream.close();
        throw e;
    }
    return true;
}