6.Synonyms, aliases, and words that mean the same
关于同义词的处理 可以提供一个SynonymAnalyzer 来处理,这样可以把同一个词的同义词索引到同一个位置这样在搜索的时候就可以根据同义词来搜索了. 可以看看书中的测试代码,带有详细注释(点击我).
7.Stemming Analyzer
PositionalPorterStopAnalyzer 是一个非内置(built-in)的Analyzer实现,它把所有的词都分析为词根形式(root form),例如:
breathe,breathes,breathing , and breathed 都分析为breath了.
它还会去除所有的Stop word,并且保留Stop word 的位置。例如: the quick brown fox jumps over the lazy dog 就被分析为下面的term:
2: [quick]
3: [brown]
4: [fox]
5: [jump]
6: [over]
8: [lazi]
9: [dog]
因为去掉了 the 这个词,所以位置 1 和 7 没有东西。它是利用一个filter来实现该功能的,如:
01 package lia.analysis.positional; 02 03 import org.apache.lucene.analysis.TokenStream; 04 import org.apache.lucene.analysis.Token; 05 import org.apache.lucene.analysis.TokenFilter; 06 import java.util.Set; 07 import java.io.IOException; 08 09 public class PositionalStopFilter extends TokenFilter { 10 private Set stopWords; 11 12 public PositionalStopFilter(TokenStream in, Set stopWords) { 13 super(in); 14 this.stopWords = stopWords; 15 } 16 17 public final Token next() throws IOException { 18 int increment = 0; 19 for (Token token = input.next(); 20 token != null; token = input.next()) { 21 22 if (!stopWords.contains(token.termText())) { 23 token.setPositionIncrement( //为Stop word 保留Position位置 24 token.getPositionIncrement() + increment); 25 return token; 26 } 27 28 increment++; 29 } 30 31 return null; 32 } 33 }
由PositionalPorterStopAnalyzer.java 提供Stop word 列表.
01 package lia.analysis.positional; 02 03 import org.apache.lucene.analysis.Analyzer; 04 import org.apache.lucene.analysis.LowerCaseTokenizer; 05 import org.apache.lucene.analysis.PorterStemFilter; 06 import org.apache.lucene.analysis.StopAnalyzer; 07 import org.apache.lucene.analysis.StopFilter; 08 import org.apache.lucene.analysis.TokenStream; 09 10 import java.io.Reader; 11 import java.util.Set; 12 13 public class PositionalPorterStopAnalyzer extends Analyzer { 14 private Set stopWords; 15 16 public PositionalPorterStopAnalyzer() { 17 this(StopAnalyzer.ENGLISH_STOP_WORDS); /// 使用默认的英文Stop word 18 } 19 20 public PositionalPorterStopAnalyzer(String[] stopList) { /// 使用自己指定的Stop word 21 stopWords = StopFilter.makeStopSet(stopList); 22 } 23 24 public TokenStream tokenStream(String fieldName, Reader reader) { 25 return new PorterStemFilter( 26 new PositionalStopFilter( 27 new LowerCaseTokenizer(reader), stopWords)); 28 } 29 }
测试代码如下:
01 package lia.analysis.positional; 02 03 import junit.framework.TestCase; 04 import lia.analysis.AnalyzerUtils; 05 import org.apache.lucene.document.Document; 06 import org.apache.lucene.document.Field; 07 import org.apache.lucene.index.IndexWriter; 08 import org.apache.lucene.queryParser.QueryParser; 09 import org.apache.lucene.search.Hits; 10 import org.apache.lucene.search.IndexSearcher; 11 import org.apache.lucene.search.Query; 12 import org.apache.lucene.store.RAMDirectory; 13 14 import java.io.IOException; 15 16 public class PositionalPorterStopAnalyzerTest extends TestCase { 17 private static PositionalPorterStopAnalyzer porterAnalyzer = 18 new PositionalPorterStopAnalyzer(); 19 20 private RAMDirectory directory; 21 22 public void setUp() throws Exception { 23 directory = new RAMDirectory(); 24 IndexWriter writer = 25 new IndexWriter(directory, porterAnalyzer, true); 26 27 Document doc = new Document(); 28 doc.add(Field.Text("contents", 29 "The quick brown fox jumps over the lazy dogs")); 30 writer.addDocument(doc); 31 writer.close(); 32 } 33 34 public void testStems() throws Exception { //(3) 35 IndexSearcher searcher = new IndexSearcher(directory); 36 Query query = QueryParser.parse("laziness", 37 "contents", 38 porterAnalyzer); 39 Hits hits = searcher.search(query); 40 assertEquals("lazi", 1, hits.length()); 41 42 43 query = QueryParser.parse("\"fox jumped\"", 44 "contents", 45 porterAnalyzer); 46 47 hits = searcher.search(query); 48 assertEquals("jump jumps jumped jumping", 1, hits.length()); 49 } 50 51 public void testExactPhrase() throws Exception {//// 测试 丢失Position信息 引起的麻烦 (1) 52 IndexSearcher searcher = new IndexSearcher(directory); 53 Query query = QueryParser.parse("\"over the lazy\"", 54 "contents", 55 porterAnalyzer); 56 57 Hits hits = searcher.search(query); 58 assertEquals("exact match not found!", 0, hits.length()); 59 } 60 61 public void testWithSlop() throws Exception { 62 IndexSearcher searcher = new IndexSearcher(directory); 63 64 
QueryParser parser = new QueryParser("contents", 65 porterAnalyzer); 66 parser.setPhraseSlop(1); // (2) 67 68 Query query = parser.parse("\"over the lazy\""); 69 70 Hits hits = searcher.search(query); 71 assertEquals("hole accounted for", 1, hits.length()); 72 } 73 74 public static void main(String[] args) throws IOException { 75 AnalyzerUtils.displayTokensWithPositions(porterAnalyzer, 76 "The quick brown fox jumps over the lazy dogs"); 77 } 78 }
共3页 1 2 3
|
来源:
Java爱好者
| 作者:
| 发表时间:
2006-11-8 10:17:00
|
|
|