使用Lucene实现全文检索,主要有下面三个步骤: 1、建立索引库:根据网站新闻信息库中已有的数据资料建立Lucene索引文件。 2、通过索引库搜索:有了索引后,即可使用标准的词法分析器或直接的词法分析器进行全文检索。 3、维护索引库:网站新闻信息库中的信息会不断地变动,包括新增、修改及删除等,这些信息的变动都需要进一步反映到Lucene索引文件中。
下面是myrss.easyjf.com相关代码!
一、索引管理(建立及维护) 索引管理类MyRssIndexManage主要实现根据网站信息库中的数据建立索引,维护索引等。由于索引的过程需要消耗一定的时间,因此,索引管理类实现Runnable接口,使得我们可以在程序中开新线程来运行。 package com.easyjf.lucene;
import java.util.Date; import java.util.List;
import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Searcher;
import com.easyjf.dbo.EasyJDB; import com.easyjf.news.business.NewsDir; import com.easyjf.news.business.NewsDoc; import com.easyjf.news.business.NewsUtil; import com.easyjf.web.tools.IPageList; public class MyRssIndexManage implements Runnable { private String indexDir; private String indexType="add"; public void run() { // TODO Auto-generated method stub if("add".equals(indexType)) normalIndex(); else if ("init".equals(indexType)) reIndexAll(); } public void normalIndex() { try{ Date start = new Date(); int num=0; IndexWriter writer=new IndexWriter(indexDir,new StandardAnalyzer(),false); //NewsDir dir=NewsDir.readBySn(); String scope="(needIndex<2) or(needIndex is null)"; IPageList pList=NewsUtil.pageList(scope,1,50); for(int p=0;p<pList.getPages();p++) { pList=NewsUtil.pageList(scope,p,100); List list=pList.getResult(); for(int i=0;i<list.size();i++) { NewsDoc doc=(NewsDoc)list.get(i); writer.addDocument(newsdoc2lucenedoc(doc)); num++; } } writer.optimize(); writer.close(); EasyJDB.getInstance().execute("update NewsDoc set needIndex=2 where "+scope); Date end = new Date(); System.out.print("新增索引"+num+"条信息,一共花:"+(end.getTime() - start.getTime())/60000+"分钟!"); } catch(Exception e) { e.printStackTrace(); } } public void reIndexAll() { try{ Date start = new Date(); int num=0; IndexWriter writer=new IndexWriter(indexDir,new StandardAnalyzer(),true); NewsDir dir=NewsDir.readBySn("easyjf"); IPageList pList=NewsUtil.pageList(dir,1,50); for(int p=0;p<pList.getPages();p++) { pList=NewsUtil.pageList(dir,p,100); List list=pList.getResult(); for(int i=0;i<list.size();i++) { NewsDoc doc=(NewsDoc)list.get(i); writer.addDocument(newsdoc2lucenedoc(doc)); num++; } } writer.optimize(); writer.close(); EasyJDB.getInstance().execute("update NewsDoc set needIndex=2 where dirPath like 'easyjf%'"); Date end = new Date(); System.out.print("全部重新做了一次索引,一共处理了"+num+"条信息,花:"+(end.getTime() - start.getTime())/60000+"分钟!"); } catch(Exception e) { e.printStackTrace(); } } private Document 
newsdoc2lucenedoc(NewsDoc doc) { Document lDoc=new Document(); lDoc.add(new Field("title",doc.getTitle(),Field.Store.YES,Field.Index.TOKENIZED)); lDoc.add(new Field("content",doc.getContent(),Field.Store.YES,Field.Index.TOKENIZED)); lDoc.add(new Field("url",doc.getRemark(),Field.Store.YES,Field.Index.NO)); lDoc.add(new Field("cid",doc.getCid(),Field.Store.YES,Field.Index.NO)); lDoc.add(new Field("source",doc.getSource(),Field.Store.YES,Field.Index.NO)); lDoc.add(new Field("inputTime",doc.getInputTime().toString(),Field.Store.YES,Field.Index.NO)); return lDoc; } public String getIndexDir() { return indexDir; } public void setIndexDir(String indexDir) { this.indexDir = indexDir; } public String getIndexType() { return indexType; } public void setIndexType(String indexType) { this.indexType = indexType; } }
共3页 1 2 3
|
来源:
| 作者:
| 发表时间:
2006-10-27 12:13:00
|
|
|