問題如下:
數字與英文結合在一起檢索,出現高亮重複問題
如:檢索“220”則
關於同意220kV佛山變電站#1、#2主變報廢的批覆 .txt 檢索“220kv”則 關於同意220220kV佛山變電站#1、#2主變報廢的批覆 .txt 高亮採用的是索引時記錄Term的位置,高亮處理採用TermPositionVector termFreqVector = (TermPositionVector)ireader.getTermFreqVector(doc, fieldname);方式,其中主要代碼如下: QueryParser queryParser = new QueryParser(Version.LUCENE_30,fieldname,queryAnalyzer);Query query = queryParser.parse(keyWordLc);
Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(
"<font color=\"red\">", "</font>"), new QueryScorer(
query));
highlighter.setTextFragmenter(new SimpleFragmenter(50));
TermPositionVector termFreqVector = (TermPositionVector)ireader.getTermFreqVector(doc, fieldname);
/**
* 注意這裏最好設爲true,雖然會影響性能,但是避免出現:
* 文檔題名爲:索引測試新建文檔1.txt
* 查看tokens結果:[(1,8,9), (1.txt,8,13), (文檔,6,8), (新建,4,6), (測試,2,4), (索引,0,2), (txt,10,13)]
* 這樣高亮顯示的時候<font color="red">索引測試新建文檔</font>1.txt
* 因爲高亮顯示的方法裏是按位置信息,當當前匹配的term小於前面最大的最後位置時纔去高亮,
* 不然則在最後獲取到最小匹配的term的首位置到最後匹配的term的末位置的字符串全部高亮起來了。
*/
TokenStream tokenStream = TokenSources.getTokenStream(termFreqVector,true);
String content = hitDoc.get(fieldname);
String result = highlighter.getBestFragments(tokenStream, content, 5,"..."); 通過調試跟蹤,paoding分詞器對“220kv”會分詞爲“220 kv 220kv”,而通過Lucene提供的lucene-highlighter-3.0.2.jar、lucene-memory-3.0.2.jar
解決方法是修改lucene-highlighter-3.0.2.jar中Highlighter類,代碼如下:
public final TextFragment[] getBestTextFragments(TokenStream tokenStream,
String text,
boolean mergeContiguousFragments,
int maxNumFragments)
throws IOException, InvalidTokenOffsetsException
{
ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
StringBuilder newText=new StringBuilder();
TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
tokenStream.addAttribute(PositionIncrementAttribute.class);
tokenStream.reset();
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
TokenStream newStream = fragmentScorer.init(tokenStream);
if(newStream != null) {
tokenStream = newStream;
}
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag); FragmentQueue fragQueue = new FragmentQueue(maxNumFragments); try
{ String tokenText;
int startOffset;
int endOffset;
int lastEndOffset = 0;
int lastStartOffset = 0; //用來記錄當前所取的字符串起點位置 textFragmenter.start(text, tokenStream); TokenGroup tokenGroup=new TokenGroup(tokenStream); for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
next = tokenStream.incrementToken())
{
if( (offsetAtt.endOffset()>text.length())
||
(offsetAtt.startOffset()>text.length())
)
{
throw new InvalidTokenOffsetsException("Token "+ termAtt.term()
+" exceeds length of provided text sized "+text.length());
}
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
{
//the current token is distinct from previous tokens -
// markup the cached token group info
startOffset = tokenGroup.matchStartOffset;
endOffset = tokenGroup.matchEndOffset;
//用下面兩行替代代碼tokenText = text.substring(startOffset, endOffset);
//解決“數字+英文或英文+數字”格式關鍵詞出現高亮重複問題,如:檢索“220KV”會高亮“220220KV”
lastStartOffset = Math.max(startOffset, lastEndOffset);
tokenText = text.substring(lastStartOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
lastEndOffset=Math.max(endOffset, lastEndOffset);
tokenGroup.clear(); //check if current token marks the start of a new fragment
if(textFragmenter.isNewFragment())
{
currentFrag.setScore(fragmentScorer.getFragmentScore());
//record stats for a new fragment
currentFrag.textEndPos = newText.length();
currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
}
} tokenGroup.addToken(fragmentScorer.getTokenScore());// if(lastEndOffset>maxDocBytesToAnalyze)
// {
// break;
// }
}
currentFrag.setScore(fragmentScorer.getFragmentScore()); if(tokenGroup.numTokens>0)
{
//flush the accumulated text (same code as in above loop)
startOffset = tokenGroup.matchStartOffset;
endOffset = tokenGroup.matchEndOffset;
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
lastEndOffset=Math.max(lastEndOffset,endOffset);
} //Test what remains of the original text beyond the point where we stopped analyzing
if (
// if there is text beyond the last token considered..
(lastEndOffset < text.length())
&&
// and that text is not too large...
(text.length()<= maxDocCharsToAnalyze)
)
{
//append it to the last fragment
newText.append(encoder.encodeText(text.substring(lastEndOffset)));
} currentFrag.textEndPos = newText.length(); //sort the most relevant sections of the text
for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
{
currentFrag = i.next(); //If you are running with a version of Lucene before 11th Sept 03
// you do not have PriorityQueue.insert() - so uncomment the code below
/*
if (currentFrag.getScore() >= minScore)
{
fragQueue.put(currentFrag);
if (fragQueue.size() > maxNumFragments)
{ // if hit queue overfull
fragQueue.pop(); // remove lowest in hit queue
minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
}
}
*/
//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
//fix to PriorityQueue. The correct method to use here is the new "insert" method
// USE ABOVE CODE IF THIS DOES NOT COMPILE!
fragQueue.insertWithOverflow(currentFrag);
} //return the most relevant fragments
TextFragment frag[] = new TextFragment[fragQueue.size()];
for (int i = frag.length - 1; i >= 0; i--)
{
frag[i] = fragQueue.pop();
} //merge any contiguous fragments to improve readability
if(mergeContiguousFragments)
{
mergeContiguousFragments(frag);
ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
for (int i = 0; i < frag.length; i++)
{
if ((frag[i] != null) && (frag[i].getScore() > 0))
{
fragTexts.add(frag[i]);
}
}
frag= fragTexts.toArray(new TextFragment[0]);
} return frag; }
finally
{
if (tokenStream != null)
{
try
{
tokenStream.close();
}
catch (Exception e)
{
}
}
}
}