《Lucene的索引文件格式(2)》這篇文章寫得非常好,參考它寫了解析segments.gen和segments_N(例如segments_1)的代碼。
代碼如下(lucene版本爲:lucene-core-3.4.0.jar)
package format;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IndexInput;
import constant.Constant;
/**
 * Dumps the contents of a Lucene index's {@code segments.gen} and
 * {@code segments_N} files to standard output.
 *
 * <p>Written against lucene-core-3.4.0. Reference (index file format analysis):
 * http://www.cnblogs.com/forfuture1978/archive/2009/12/14/1623599.html
 */
public class SegmentFormatAnalysis {

    /** Utility class; not meant to be instantiated. */
    private SegmentFormatAnalysis() {
    }

    /**
     * Prints the segment metadata of the index located at
     * {@code Constant.INDEX_PATH} and, on Windows, opens the index folder.
     *
     * @param args ignored
     * @throws IOException if an index file cannot be read or has an
     *         unexpected format
     */
    public static void main(String[] args) throws IOException {
        getSEGMENTS_GEN();
        // Open the index folder. The command relies on cmd.exe, so it is only
        // attempted on Windows instead of failing after the dump elsewhere.
        if (System.getProperty("os.name").startsWith("Windows")) {
            Runtime.getRuntime().exec("cmd.exe /c start " + Constant.INDEX_PATH);
        }
    }

    /**
     * Reads one per-segment record from {@code input} and prints every field.
     * The read order follows the one used by Lucene 3.4 when it loads a
     * segment record.
     *
     * @param input  stream positioned at the start of a segment record
     * @param format format number read from the head of the segments file
     * @throws CorruptIndexException if {@code format} denotes a layout this
     *         method does not handle
     * @throws IOException on any read failure
     */
    private static void readSegmentInfo(IndexInput input, int format)
            throws IOException {
        // The per-segment version string is only present from FORMAT_3_1 on;
        // reading it for older formats would shift every following field.
        if (format <= SegmentInfos.FORMAT_3_1) {
            System.out.println("version:" + input.readString());
        }
        String name = input.readString();
        System.out.println("SegName(段名):" + name);
        int docCount = input.readInt();
        System.out.println("SegSize(此段中包含的文檔數):" + docCount);

        System.out.println("format <= SegmentInfos.FORMAT_LOCKLESS:"
                + (format <= SegmentInfos.FORMAT_LOCKLESS));
        if (format > SegmentInfos.FORMAT_LOCKLESS) {
            // Raised as an exception (not System.exit) so callers can clean up.
            throw new CorruptIndexException("format 有誤: " + format);
        }
        long delGen = input.readLong();
        System.out.println("delGen :" + delGen);

        // Shared doc store: defaults describe a segment that stores its own
        // stored fields and term vectors.
        int docStoreOffset = -1;
        String docStoreSegment = name;
        boolean docStoreIsCompoundFile = false;
        if (format <= SegmentInfos.FORMAT_SHARED_DOC_STORE) {
            docStoreOffset = input.readInt();
            if (docStoreOffset != -1) {
                docStoreSegment = input.readString();
                docStoreIsCompoundFile = (1 == input.readByte());
            }
        }
        System.out
                .println("docStoreOffset(如果DocStoreOffset爲-1,則此段單獨存儲自己的域(Stored Field)和詞向量(Term Vector)) :"
                        + docStoreOffset);
        System.out.println("docStoreSegment (docStoreSegment是域和詞向量信息存儲的段):"
                + docStoreSegment);
        System.out.println("docStoreIsCompoundFile :" + docStoreIsCompoundFile);

        boolean hasSingleNormFile = false;
        if (format <= SegmentInfos.FORMAT_SINGLE_NORM_FILE) {
            hasSingleNormFile = (1 == input.readByte());
        }
        System.out.println("hasSingleNormFile :" + hasSingleNormFile);

        int numNormGen = input.readInt();
        System.out.println("numNormGen :" + numNormGen);
        // When the count is positive, that many norm generations (one long
        // each) follow and must be consumed, otherwise the remaining fields
        // are read from the wrong offset. A count of -1 means none are stored.
        for (int j = 0; j < numNormGen; j++) {
            System.out.println("normGen[" + j + "] :" + input.readLong());
        }

        byte isCompoundFile = input.readByte();
        System.out.println("isCompoundFile :" + isCompoundFile);

        int delCount;
        if (format <= SegmentInfos.FORMAT_DEL_COUNT) {
            delCount = input.readInt();
            assert delCount <= docCount;
        } else {
            delCount = -1;
        }
        System.out.println("delCount :" + delCount);

        boolean hasProx = true;
        if (format <= SegmentInfos.FORMAT_HAS_PROX) {
            hasProx = input.readByte() == 1;
        }
        System.out.println("hasProx :" + hasProx);

        Map<String, String> diagnostics;
        if (format <= SegmentInfos.FORMAT_DIAGNOSTICS) {
            diagnostics = input.readStringStringMap();
        } else {
            diagnostics = Collections.<String, String> emptyMap();
        }
        System.out.println("diagnostics :" + diagnostics);

        if (format > SegmentInfos.FORMAT_HAS_VECTORS) {
            // Layouts without the hasVectors byte are not handled by this tool.
            throw new CorruptIndexException("format 有誤: " + format);
        }
        boolean hasVectors = input.readByte() == 1;
        System.out.println("hasVectors :" + hasVectors);
    }

    /**
     * Opens the given segments file (for example {@code segments_1}) in the
     * index directory and prints its contents. The input and the directory
     * are closed even when parsing fails.
     *
     * @param segmentFileName name of the segments file inside the index directory
     * @throws IOException if the file cannot be read, has an unexpected
     *         format, or fails the checksum comparison
     */
    private static void getSEGMENTS_N(String segmentFileName)
            throws IOException {
        System.out.println("---------------------------");
        Directory directory = FSDirectory.open(new File(Constant.INDEX_PATH));
        try {
            ChecksumIndexInput input = new ChecksumIndexInput(directory
                    .openInput(segmentFileName));
            try {
                dumpSegmentsFile(input);
            } finally {
                input.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Prints the header, every segment record, the user data and the checksum
     * of an already opened segments file.
     *
     * @param input checksum-tracking stream positioned at the start of the file
     * @throws IOException on read failure, unexpected format or checksum mismatch
     */
    private static void dumpSegmentsFile(ChecksumIndexInput input)
            throws IOException {
        // The format is -3 for Lucene 2.1 and -9 for Lucene 2.9.
        int format = input.readInt();
        System.out.println("Format:" + format);
        System.out.println("Version(估計其實存儲的時索引最後修改的時間毫秒數):" + input.readLong());
        // NameCount is the name of the next new segment. All index files that
        // belong to one segment share the segment name as file name, usually
        // _0.xxx, _0.yyy, _1.xxx, _1.yyy ... A new segment normally gets the
        // largest existing name plus one: if NameCount reads 2, the next
        // segment will be _2.xxx, _2.yyy.
        System.out.println("NameCount:" + input.readInt());
        int segCount = input.readInt();
        System.out.println("SegCount(Segment的個數):" + segCount);
        // Number the records in the order they are stored in the file.
        for (int i = 1; i <= segCount; i++) {
            System.out.println("第" + i + "段信息 begin+++++++++++");
            readSegmentInfo(input, format);
            System.out.println("第" + i + "段信息 end +++++++++++");
        }

        if (format >= 0) { // in old format the version number may be at the end
            long version;
            if (input.getFilePointer() >= input.length()) {
                version = System.currentTimeMillis(); // old file format without version
            } else {
                version = input.readLong(); // read version
            }
            System.out.println("version:" + version);
        }

        // User-supplied String-to-String map stored with the commit.
        Map<String, String> userData;
        if (format > SegmentInfos.FORMAT_USER_DATA) {
            userData = Collections.<String, String> emptyMap();
        } else if (format <= SegmentInfos.FORMAT_DIAGNOSTICS) {
            userData = input.readStringStringMap();
        } else if (0 != input.readByte()) {
            userData = Collections.singletonMap("userData", input.readString());
        } else {
            userData = Collections.<String, String> emptyMap();
        }
        System.out.println("遍歷userData:");
        for (Map.Entry<String, String> entry : userData.entrySet()) {
            System.out.println(entry.getKey() + "--->" + entry.getValue());
        }

        if (format <= SegmentInfos.FORMAT_CHECKSUM) {
            // The running checksum must be taken before the stored one is read.
            final long checksumNow = input.getChecksum();
            final long checksumThen = input.readLong();
            System.out.println("checksumNow:" + checksumNow + "\tchecksumThen:"
                    + checksumThen);
            if (checksumNow != checksumThen) {
                throw new CorruptIndexException(
                        "checksum mismatch in segments file");
            }
        }
    }

    /**
     * Reads {@code segments.gen}, prints the two generation values it holds
     * and then dumps the segments file of the second generation value.
     *
     * @throws CorruptIndexException if {@code segments.gen} does not start
     *         with {@code SegmentInfos.FORMAT_LOCKLESS}
     * @throws IOException if an index file cannot be read
     */
    public static void getSEGMENTS_GEN() throws IOException {
        String segmentFileName;
        Directory dir = FSDirectory.open(new File(Constant.INDEX_PATH));
        try {
            IndexInput genInput = dir.openInput(IndexFileNames.SEGMENTS_GEN);// "segments.gen"
            try {
                int version = genInput.readInt();// format marker of segments.gen
                System.out.println("version:" + version);
                if (version != SegmentInfos.FORMAT_LOCKLESS) {
                    throw new CorruptIndexException("version錯誤:" + version);
                }
                long gen0 = genInput.readLong();// first copy of the generation
                long gen1 = genInput.readLong();// second copy of the generation
                System.out.println("gen0:" + gen0 + "\tgen1:" + gen1);
                // Lucene encodes the generation in base 36 in the file name
                // (generation 10 is "segments_a"), so a decimal suffix would
                // point at the wrong file from generation 10 onwards.
                segmentFileName = IndexFileNames.SEGMENTS + "_"
                        + Long.toString(gen1, Character.MAX_RADIX);
            } finally {
                genInput.close();
            }
        } finally {
            dir.close();
        }
        System.out.println("*************************" + segmentFileName
                + "分析結果爲:" + "*************************");
        getSEGMENTS_N(segmentFileName);
    }
}
………………