在实践中,我先为 PHP 中文手册中的 HTML 文件生成索引,然后通过一个 JSP 页面对其进行全文检索。
生成索引的 Java 代码:
/** * PHPDocIndexer.java * 用于对 PHPDoc 的 HTML 页面生成索引文件。 */ import java.io.File; import java.io.FileReader; import java.io.BufferedReader; import java.io.IOException; import java.util.Date; import java.text.DateFormat; import java.lang.*;
import org.apache.lucene.analysis.cjk.CJKAnalyzer; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.DateField;
class PHPDocIndexer { public static void main(String[] args) throws ClassNotFoundException, IOException { try { Date start = new Date();
IndexWriter writer = new IndexWriter(”/home/nio/indexes-phpdoc”, new CJKAnalyzer(), true); //索引保存目录,必须存在 indexDocs(writer, new File(”/home/nio/phpdoc-zh”)); //HTML 文件保存目录
System.out.println(”Optimizing ….”); writer.optimize(); writer.close();
Date end = new Date();
System.out.print(”Total time: “); System.out.println(end.getTime() - start.getTime()); } catch (Exception e) { System.out.println(”Class ” + e.getClass() + ” throws error!n errmsg: ” + e.getMessage()); } //end try } //end main
public static void indexDocs(IndexWriter writer, File file) throws Exception { if (file.isDirectory()) { String[] files = file.list(); for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } //end for } else if (file.getPath().endsWith(”.html”)) { //只对 HTML 文件做索引 System.out.print(”Add file:” + file + ” ….”); // Add html file …. Document doc = new Document(); doc.add(Field.UnIndexed(”file”, file.getName())); //索引文件名 doc.add(Field.UnIndexed(”modified”, DateFormat.getDateTimeInstance().format(new Date(file.lastModified())))); //索引最后修改时间
String title = “”; String content = “”; String status = “start”;
FileReader fReader = new FileReader(file); BufferedReader bReader = new BufferedReader(fReader); String line = bReader.readLine();
while (line != null) { content += line; //截取 HTML 标题 <title> if (”start” == status && line.equalsIgnoreCase(”><TITLE”)) { status = “match”; } else if (”match” == status) { title = line.substring(1, line.length() - 7); doc.add(Field.Text(”title”, title)); //索引标题 status = “end”; } //end if line = bReader.readLine(); } //end while bReader.close(); fReader.close(); doc.add(Field.Text(”content”, content.replaceAll(”<[^<>]+>”, “”))); //索引内容 writer.addDocument(doc); System.out.println(” [OK]”); } //end if }
} //end class
共2页 1 2
|
来源:
| 作者:
Nio
| 发表时间:
2006-10-15 17:20:00
|
|
|