- package com.jadyer.lucene;
- import java.io.File;
- import java.io.IOException;
- import java.text.SimpleDateFormat;
- import java.util.Date;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.NumericField;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.queryParser.ParseException;
- import org.apache.lucene.queryParser.QueryParser;
- import org.apache.lucene.search.BooleanQuery;
- import org.apache.lucene.search.FuzzyQuery;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.NumericRangeQuery;
- import org.apache.lucene.search.PhraseQuery;
- import org.apache.lucene.search.PrefixQuery;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TermQuery;
- import org.apache.lucene.search.TermRangeQuery;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.search.WildcardQuery;
- import org.apache.lucene.search.BooleanClause.Occur;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.util.Version;
- /**
- * 【Lucene3.6.2入門系列】第03節_簡述Lucene中常見的搜索功能
- * @create Aug 1, 2013 3:54:27 PM
- * @author 玄玉<http://blog.csdn.net/jadyer>
- */
- public class HelloSearch {
- private Directory directory;
- private IndexReader reader;
- private String[] ids = {"1", "2", "3", "4", "5", "6"};
- private String[] names = {"Michael", "Scofield", "Tbag", "Jack", "Jade", "Jadyer"};
- private String[] emails = {"[email protected]", "[email protected]", "[email protected]", "[email protected]", "[email protected]", "[email protected]"};
- private String[] contents = {"my java blog is http://blog.csdn.net/jadyer", "my website is http://www.jadyer.cn", "my name is jadyer", "I am JavaDeveloper", "I am from Haerbin", "I like Lucene"};
- private int[] attachs = {9,3,5,4,1,2};
- private Date[] dates = new Date[ids.length];
- public HelloSearch(){
- IndexWriter writer = null;
- Document doc = null;
- SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
- try {
- dates[0] = sdf.parse("20120601");
- dates[1] = sdf.parse("20120603");
- dates[2] = sdf.parse("20120605");
- dates[3] = sdf.parse("20120607");
- dates[4] = sdf.parse("20120609");
- dates[5] = sdf.parse("20120611");
- directory = FSDirectory.open(new File("myExample/03_index/"));
- writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
- writer.deleteAll(); //創建索引之前,先把文檔清空掉
- for(int i=0; i<ids.length; i++){ //遍歷ID來創建文檔
- doc = new Document();
- doc.add(new Field("id", ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.ANALYZED_NO_NORMS));
- doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
- doc.add(new Field("email", "test"+i+""+i+"@jadyer.com", Field.Store.YES, Field.Index.NOT_ANALYZED));
- doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
- doc.add(new NumericField("attach", Field.Store.YES, true).setIntValue(attachs[i])); //爲數字加索引(第三個參數指定是否索引)
- doc.add(new NumericField("attach", Field.Store.YES, true).setIntValue((i+1)*100)); //假設有多個附件
- doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(dates[i].getTime())); //爲日期加索引
- writer.addDocument(doc);
- }
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- if(null != writer){
- try {
- writer.close();
- } catch (IOException ce) {
- ce.printStackTrace();
- }
- }
- }
- }
- /**
- * 針對分頁搜索創建索引
- */
- public HelloSearch(boolean pageFlag){
- String[] myNames = new String[50];
- String[] myContents = new String[50];
- for(int i=0; i<50; i++){
- myNames[i] = "file(" + i + ")";
- myContents[i] = "I love JavaSE, also love Lucene(" + i + ")";
- }
- IndexWriter writer = null;
- Document doc = null;
- try {
- directory = FSDirectory.open(new File("myExample/03_index/"));
- writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
- writer.deleteAll();
- for(int i=0; i<myNames.length; i++){
- doc = new Document();
- doc.add(new Field("myname", myNames[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(new Field("mycontent", myContents[i], Field.Store.YES, Field.Index.ANALYZED));
- writer.addDocument(doc);
- }
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if(null != writer){
- try {
- writer.close();
- } catch (IOException ce) {
- ce.printStackTrace();
- }
- }
- }
- }
- /**
- * 獲取IndexSearcher實例
- */
- private IndexSearcher getIndexSearcher(){
- try {
- if(reader == null){
- reader = IndexReader.open(directory);
- }else{
- //if the index was changed since the provided reader was opened, open and return a new reader; else,return null
- //如果當前reader在打開期間index發生改變,則打開並返回一個新的IndexReader,否則返回null
- IndexReader ir = IndexReader.openIfChanged(reader);
- if(ir != null){
- reader.close(); //關閉原reader
- reader = ir; //賦予新reader
- }
- }
- return new IndexSearcher(reader);
- }catch(Exception e) {
- e.printStackTrace();
- }
- return null; //發生異常則返回null
- }
- /**
- * 執行搜索操作
- * @param query 搜索的Query對象
- */
- private void doSearch(Query query){
- IndexSearcher searcher = this.getIndexSearcher();
- try {
- //第二個參數指定搜索後顯示的最多的記錄數,其與tds.totalHits沒有聯繫
- TopDocs tds = searcher.search(query, 10);
- System.out.println("本次搜索到[" + tds.totalHits + "]條記錄");
- for(ScoreDoc sd : tds.scoreDocs){
- Document doc = searcher.doc(sd.doc);
- System.out.print("文檔編號=" + sd.doc + " 文檔權值=" + doc.getBoost() + " 文檔評分=" + sd.score + " ");
- System.out.print("id=" + doc.get("id") + " email=" + doc.get("email") + " name=" + doc.get("name") + " ");
- //獲取多個同名域的方式
- String[] attachValues = doc.getValues("attach");
- for(String attach : attachValues){
- System.out.print("attach=" + attach + " ");
- }
- System.out.println();
- }
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if(null != searcher){
- try {
- searcher.close(); //記得關閉IndexSearcher
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
- /**
- * 精確匹配搜索
- * @param fieldName 域名(相當於表的字段名)
- * @param keyWords 搜索的關鍵字
- */
- public void searchByTerm(String fieldName, String keyWords){
- Query query = new TermQuery(new Term(fieldName, keyWords));
- this.doSearch(query);
- }
- /**
- * 基於範圍的搜索
- * @param fieldName 域名(相當於表的字段名)
- * @param start 開始字符
- * @param end 結束字符
- */
- public void searchByTermRange(String fieldName, String start, String end){
- Query query = new TermRangeQuery(fieldName, start, end, true, true); //後面兩個參數用於指定開區間或閉區間
- this.doSearch(query);
- }
- /**
- * 針對數字的搜索
- */
- public void searchByNumericRange(String fieldName, int min, int max){
- Query query = NumericRangeQuery.newIntRange(fieldName, min, max, true, true);
- this.doSearch(query);
- }
- /**
- * 基於前綴的搜索
- * @see 它是對Field分詞後的結果進行前綴查找的結果
- */
- public void searchByPrefix(String fieldName, String prefix){
- Query query = new PrefixQuery(new Term(fieldName, prefix));
- this.doSearch(query);
- }
- /**
- * 基於通配符的搜索
- * @see *-->任意多個字符
- * @see ?-->一個字符
- */
- public void searchByWildcard(String fieldName, String wildcard){
- Query query = new WildcardQuery(new Term(fieldName, wildcard));
- this.doSearch(query);
- }
- /**
- * 模糊搜索
- * @see 與通配符搜索不同
- */
- public void searchByFuzzy(String fieldName, String fuzzy){
- Query query = new FuzzyQuery(new Term(fieldName, fuzzy));
- this.doSearch(query);
- }
- /**
- * 多條件搜索
- * @see 本例中搜索name值中以Ja開頭,且content中包含am的內容
- * @see Occur.MUST------表示此條件必須爲true
- * @see Occur.MUST_NOT--表示此條件必須爲false
- * @see Occur.SHOULD----表示此條件非必須
- */
- public void searchByBoolean(){
- BooleanQuery query = new BooleanQuery();
- query.add(new WildcardQuery(new Term("name", "Ja*")), Occur.MUST);
- query.add(new TermQuery(new Term("content", "am")), Occur.MUST);
- this.doSearch(query);
- }
- /**
- * 短語搜索
- * @see 很遺憾的是短語查詢對中文搜索沒有太大的作用,但對英文搜索是很好用的,但它的開銷比較大,儘量少用
- */
- public void searchByPhrase(){
- PhraseQuery query = new PhraseQuery();
- query.setSlop(1); //設置跳數
- query.add(new Term("content", "am")); //第一個Term
- query.add(new Term("content", "Haerbin")); //產生距離之後的第二個Term
- this.doSearch(query);
- }
- /**
- * 基於QueryParser的搜索
- */
- public void searchByQueryParse(){
- QueryParser parser = new QueryParser(Version.LUCENE_36, "content", new StandardAnalyzer(Version.LUCENE_36));
- Query query = null;
- try {
- // query = parser.parse("Haerbin"); //搜索content中包含[Haerbin]的記錄
- // query = parser.parse("I AND Haerbin"); //搜索content中包含[I]和[Haerbin]的記錄
- // query = parser.parse("Lucene OR Haerbin"); //搜索content中包含[Lucene]或者[Haerbin]的記錄
- // query = parser.parse("Lucene Haerbin"); //搜索content中包含[Lucene]或者[Haerbin]的記錄
- // parser.setDefaultOperator(Operator.AND); //將空格的默認操作OR修改爲AND
- // //1)如果name域在索引時,不進行分詞,那麼無論這裏寫成[name:Jadyer]還是[name:jadyer],最後得到的都是0條記錄
- // //2)由於name原值爲大寫[J],若索引時不對name分詞,除非修改name原值爲小寫[j],並且搜索[name:jadyer]才能得到記錄
- // query = parser.parse("name:Jadyer"); //修改搜索域爲name=Jadyer的記錄
- // query = parser.parse("name:Ja*"); //支持通配符
- // query = parser.parse("\"I am\""); //搜索content中包含[I am]的記錄(注意不能使用parse("content:'I am'"))
- // parser.setAllowLeadingWildcard(true); //設置允許[*]或[?]出現在查詢字符的第一位,即[name:*de],否則[name:*de]會報異常
- // query = parser.parse("name:*de"); //Lucene默認的第一個字符不允許爲通配符,因爲這樣效率比較低
- // //parse("+am +name:Jade")--------------搜索content中包括[am]的,並且name=Jade的記錄
- // //parse("am AND NOT name:Jade")--------搜索content中包括[am]的,並且nam不是Jade的記錄
- // //parse("(blog OR am) AND name:Jade")--搜索content中包括[blog]或者[am]的,並且name=Jade的記錄
- // query = parser.parse("-name:Jack +I"); //搜索content中包括[I]的,並且name不是Jack的記錄(加減號要放到域說明的前面)
- // query = parser.parse("id:[1 TO 3]"); //搜索id值從1到3的記錄(TO必須大寫,且這種方式沒有辦法匹配數字)
- // query = parser.parse("id:{1 TO 3}"); //搜索id=2的記錄
- query = parser.parse("name:Jadk~"); //模糊搜索
- } catch (ParseException e) {
- e.printStackTrace();
- }
- this.doSearch(query);
- }
- /**
- * 普通的分頁搜索
- * @see 適用於lucene3.5之前
- * @param expr 搜索表達式
- * @param pageIndex 頁碼
- * @param pageSize 分頁大小
- */
- public void searchPage(String expr, int pageIndex, int pageSize){
- IndexSearcher searcher = this.getIndexSearcher();
- QueryParser parser = new QueryParser(Version.LUCENE_36, "mycontent", new StandardAnalyzer(Version.LUCENE_36));
- try {
- Query query = parser.parse(expr);
- TopDocs tds = searcher.search(query, pageIndex*pageSize);
- ScoreDoc[] sds = tds.scoreDocs;
- for(int i=(pageIndex-1)*pageSize; i<pageIndex*pageSize; i++){
- Document doc = searcher.doc(sds[i].doc);
- System.out.println("文檔編號:" + sds[i].doc + "-->" + doc.get("myname") + "-->" + doc.get("mycontent"));
- }
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- if(null != searcher){
- try {
- searcher.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
- /**
- * 基於searchAfter的分頁搜索
- * @see 適用於Lucene3.5
- * @param expr 搜索表達式
- * @param pageIndex 頁碼
- * @param pageSize 分頁大小
- */
- public void searchPageByAfter(String expr, int pageIndex, int pageSize){
- IndexSearcher searcher = this.getIndexSearcher();
- QueryParser parser = new QueryParser(Version.LUCENE_36, "mycontent", new StandardAnalyzer(Version.LUCENE_36));
- try {
- Query query = parser.parse(expr);
- TopDocs tds = searcher.search(query, (pageIndex-1)*pageSize);
- //使用IndexSearcher.searchAfter()搜索,該方法第一個參數爲上一頁記錄中的最後一條記錄
- if(pageIndex > 1){
- tds = searcher.searchAfter(tds.scoreDocs[(pageIndex-1)*pageSize-1], query, pageSize);
- }else{
- tds = searcher.searchAfter(null, query, pageSize);
- }
- for(ScoreDoc sd : tds.scoreDocs){
- Document doc = searcher.doc(sd.doc);
- System.out.println("文檔編號:" + sd.doc + "-->" + doc.get("myname") + "-->" + doc.get("mycontent"));
- }
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- if(null != searcher){
- try {
- searcher.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
- }
下面是JUnit4.x編寫的測試
- package com.jadyer.test;
- import java.io.File;
- import org.junit.Before;
- import org.junit.Test;
- import com.jadyer.lucene.HelloSearch;
- public class HelloSearchTest {
- private HelloSearch hello;
- @Before
- public void init(){
- hello = new HelloSearch();
- }
- @Test
- public void searchByTerm(){
- hello.searchByTerm("content", "my");
- }
- @Test
- public void searchByTermRange(){
- hello.searchByTermRange("name", "M", "o");
- }
- @Test
- public void searchByNumericRange(){
- hello.searchByNumericRange("attach", 2, 5);
- }
- @Test
- public void searchByPrefix(){
- hello.searchByPrefix("content", "b");
- }
- @Test
- public void searchByWildcard(){
- hello.searchByWildcard("name", "Ja??er");
- }
- @Test
- public void searchByFuzzy(){
- hello.searchByFuzzy("name", "Jadk");
- }
- @Test
- public void searchByBoolean(){
- hello.searchByBoolean();
- }
- @Test
- public void searchByPhrase(){
- hello.searchByPhrase();
- }
- @Test
- public void searchByQueryParse(){
- hello.searchByQueryParse();
- }
- @Test
- public void searchPage(){
- for(File file : new File("myExample/03_index/").listFiles()){
- file.delete();
- }
- hello = new HelloSearch(true);
- hello.searchPage("mycontent:javase", 2, 10);
- }
- @Test
- public void searchPageByAfter(){
- for(File file : new File("myExample/03_index/").listFiles()){
- file.delete();
- }
- hello = new HelloSearch(true);
- hello.searchPageByAfter("mycontent:javase", 3, 10);
- }
- }