Fixing duplicate highlights when searching terms that combine digits and English letters

The problem:

When digits and English letters appear together in a search term, the highlighted result contains duplicated text.

For example, searching for "220" returns the expected title:

         關於同意220kV佛山變電站#1、#2主變報廢的批覆 .txt

but searching for "220kv" returns a title with a duplicated "220":

         關於同意220220kV佛山變電站#1、#2主變報廢的批覆 .txt

Highlighting is based on the Term positions recorded at index time, retrieved via
TermPositionVector termFreqVector = (TermPositionVector) ireader.getTermFreqVector(doc, fieldname);.
The main code is as follows:

     QueryParser queryParser = new QueryParser(Version.LUCENE_30, fieldname, queryAnalyzer);
     Query query = queryParser.parse(keyWordLc);
     Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(
       "<font color=\"red\">", "</font>"), new QueryScorer(query));
     highlighter.setTextFragmenter(new SimpleFragmenter(50));
      
     TermPositionVector termFreqVector = (TermPositionVector)ireader.getTermFreqVector(doc, fieldname);
     /**
      * Note: it is best to pass true here. It costs some performance, but it avoids
      * the following problem. For a document titled 索引測試新建文檔1.txt the
      * token dump is: [(1,8,9), (1.txt,8,13), (文檔,6,8), (新建,4,6), (測試,2,4), (索引,0,2), (txt,10,13)]
      * and the title would be highlighted as <font color="red">索引測試新建文檔</font>1.txt.
      * The highlighting method relies on position information: a term is grouped for
      * highlighting only when the current matched term starts before the largest end
      * position seen so far; otherwise the whole string from the first matched term's
      * start position to the last matched term's end position is highlighted in one piece.
      */
           TokenStream tokenStream = TokenSources.getTokenStream(termFreqVector,true);  
          
           String content = hitDoc.get(fieldname);
           String result = highlighter.getBestFragments(tokenStream, content, 5, "...");

Debugging shows that the paoding analyzer tokenizes "220kv" into three overlapping terms: "220", "kv" and "220kv". When these overlapping terms are highlighted through Lucene's lucene-highlighter-3.0.2.jar and lucene-memory-3.0.2.jar, the text already emitted for "220" is emitted again for "220kv", producing the duplicated "220220kV".
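The overlap can be confirmed by dumping the token stream directly. Below is a minimal sketch, assuming paoding's PaodingAnalyzer is on the classpath with its dictionary configured; the class name TokenDump and the field name "f" are illustrative only:

    import java.io.StringReader;

    import net.paoding.analysis.analyzer.PaodingAnalyzer;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class TokenDump {
        public static void main(String[] args) throws Exception {
            Analyzer analyzer = new PaodingAnalyzer(); // needs the paoding dictionary configured
            TokenStream ts = analyzer.tokenStream("f", new StringReader("220kv"));
            TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // expected terms per the debugging above: 220, kv, 220kv -
                // note that "220kv" overlaps the offsets of the first two
                System.out.printf("(%s,%d,%d)%n",
                        termAtt.term(), offsetAtt.startOffset(), offsetAtt.endOffset());
            }
            ts.close();
        }
    }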

The fix is to modify the Highlighter class in lucene-highlighter-3.0.2.jar; the code is as follows:

  public final TextFragment[] getBestTextFragments(
      TokenStream tokenStream,
      String text,
      boolean mergeContiguousFragments,
      int maxNumFragments)
      throws IOException, InvalidTokenOffsetsException
  {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
      tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try
    {
      String tokenText;
      int startOffset;
      int endOffset;
      int lastEndOffset = 0;
      int lastStartOffset = 0; // records the start offset of the substring currently being extracted

      textFragmenter.start(text, tokenStream);

      TokenGroup tokenGroup = new TokenGroup(tokenStream);

      for (boolean next = tokenStream.incrementToken();
           next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
           next = tokenStream.incrementToken())
      {
        if ((offsetAtt.endOffset() > text.length())
            || (offsetAtt.startOffset() > text.length()))
        {
          throw new InvalidTokenOffsetsException("Token " + termAtt.term()
              + " exceeds length of provided text sized " + text.length());
        }
        if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct()))
        {
          // the current token is distinct from previous tokens -
          // markup the cached token group info
          startOffset = tokenGroup.matchStartOffset;
          endOffset = tokenGroup.matchEndOffset;

          // The two lines below replace: tokenText = text.substring(startOffset, endOffset);
          // They fix the duplicate highlighting of "digits+letters" / "letters+digits" keywords,
          // e.g. searching "220KV" used to highlight "220220KV".
          lastStartOffset = Math.max(startOffset, lastEndOffset);
          tokenText = text.substring(lastStartOffset, endOffset);

          String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
          // store any whitespace etc from between this and last group
          if (startOffset > lastEndOffset)
            newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
          newText.append(markedUpText);
          lastEndOffset = Math.max(endOffset, lastEndOffset);

          tokenGroup.clear();

          // check if current token marks the start of a new fragment
          if (textFragmenter.isNewFragment())
          {
            currentFrag.setScore(fragmentScorer.getFragmentScore());
            // record stats for a new fragment
            currentFrag.textEndPos = newText.length();
            currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
            fragmentScorer.startFragment(currentFrag);
            docFrags.add(currentFrag);
          }
        }

        tokenGroup.addToken(fragmentScorer.getTokenScore());

//        if (lastEndOffset > maxDocBytesToAnalyze)
//        {
//          break;
//        }
      }
      currentFrag.setScore(fragmentScorer.getFragmentScore());

      if (tokenGroup.numTokens > 0)
      {
        // flush the accumulated text (same code as in above loop)
        startOffset = tokenGroup.matchStartOffset;
        endOffset = tokenGroup.matchEndOffset;
        tokenText = text.substring(startOffset, endOffset);
        String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
        // store any whitespace etc from between this and last group
        if (startOffset > lastEndOffset)
          newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
        newText.append(markedUpText);
        lastEndOffset = Math.max(lastEndOffset, endOffset);
      }

      // Test what remains of the original text beyond the point where we stopped analyzing
      if (
          // if there is text beyond the last token considered..
          (lastEndOffset < text.length())
          &&
          // and that text is not too large...
          (text.length() <= maxDocCharsToAnalyze)
         )
      {
        // append it to the last fragment
        newText.append(encoder.encodeText(text.substring(lastEndOffset)));
      }

      currentFrag.textEndPos = newText.length();

      // sort the most relevant sections of the text
      for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
      {
        currentFrag = i.next();

        // If you are running with a version of Lucene before 11th Sept 03
        // you do not have PriorityQueue.insert() - so uncomment the code below
        /*
        if (currentFrag.getScore() >= minScore)
        {
          fragQueue.put(currentFrag);
          if (fragQueue.size() > maxNumFragments)
          { // if hit queue overfull
            fragQueue.pop(); // remove lowest in hit queue
            minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
          }
        }
        */
        // The above code caused a problem as a result of Christoph Goller's 11th Sept 03
        // fix to PriorityQueue. The correct method to use here is the new "insert" method
        // USE ABOVE CODE IF THIS DOES NOT COMPILE!
        fragQueue.insertWithOverflow(currentFrag);
      }

      // return the most relevant fragments
      TextFragment frag[] = new TextFragment[fragQueue.size()];
      for (int i = frag.length - 1; i >= 0; i--)
      {
        frag[i] = fragQueue.pop();
      }

      // merge any contiguous fragments to improve readability
      if (mergeContiguousFragments)
      {
        mergeContiguousFragments(frag);
        ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
        for (int i = 0; i < frag.length; i++)
        {
          if ((frag[i] != null) && (frag[i].getScore() > 0))
          {
            fragTexts.add(frag[i]);
          }
        }
        frag = fragTexts.toArray(new TextFragment[0]);
      }
      return frag;
    }
    finally
    {
      if (tokenStream != null)
      {
        try
        {
          tokenStream.close();
        }
        catch (Exception e)
        {
        }
      }
    }
  }
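To see why clamping the substring start removes the duplication, here is a simplified, self-contained replay of the loop's offset bookkeeping. The OffsetClampDemo class, the sample text and the hard-coded group offsets are illustrative assumptions, not part of the Highlighter source; the two matched groups mimic the overlapping "220" (offsets 0-3) and "220kv" (offsets 0-5) terms:

    public class OffsetClampDemo {
        public static void main(String[] args) {
            String text = "220kv station";
            // matched token groups as (startOffset, endOffset): "220", then the overlapping "220kv"
            int[][] groups = { { 0, 3 }, { 0, 5 } };
            StringBuilder newText = new StringBuilder();
            int lastEndOffset = 0;
            for (int[] g : groups) {
                int startOffset = g[0];
                int endOffset = g[1];
                // the fix: never re-extract text that has already been appended
                int lastStartOffset = Math.max(startOffset, lastEndOffset);
                String tokenText = text.substring(lastStartOffset, endOffset);
                if (startOffset > lastEndOffset) {
                    newText.append(text.substring(lastEndOffset, startOffset));
                }
                newText.append("<font color=\"red\">").append(tokenText).append("</font>");
                lastEndOffset = Math.max(endOffset, lastEndOffset);
            }
            newText.append(text.substring(lastEndOffset));
            // emits "220" and "kv" once each; with the original
            // tokenText = text.substring(startOffset, endOffset), the second group
            // would re-emit characters 0-3 and produce the duplicated "220220kv"
            System.out.println(newText);
        }
    }

In practice the patched Highlighter class can be compiled and placed ahead of lucene-highlighter-3.0.2.jar on the classpath, or the jar can be rebuilt with the modified class.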