建立搜索引擎四步:
抓取数据 → 解析数据 → 创建索引 → 执行搜索
第一部分:体会
今天做了一个简单示例,用的版本是 Lucene 2.4.1,跟教科书上的有些不同,但教科书上的示例都能正常执行。
第二部分:遇到的问题
刚把程序写出来的时候,出现了搜索不到结果的情况,后来发现是在添加Field的时候没有把搜索的字段设置成
Field.Index.ANALYZED,
如果你在搜索器中设定的搜索字段是 text,那么在创建索引的时候应该把这个字段设置成可以被索引。
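也就是:new Field("text", text, Field.Store.YES, Field.Index.ANALYZED)
而不是:new Field("text", text, Field.Store.YES, Field.Index.NO)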
第三部分:新老版本变更的问题
以下是 Lucene 2.4.1 版本与老版本的不同之处。
创建索引时的不同
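老版本:new IndexWriter(indexPath, new StandardAnalyzer(), true)
新版本:new IndexWriter(indexPath, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED)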
在执行搜索时的不同:对比 search 和 searchNew 两个方法就可以了
第四部分:成果
LuceneIndexer.java
import java.awt.BorderLayout;
import java.awt.Container;
import java.awt.GridLayout;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import javax.swing.JButton;
import javax.swing.JFileChooser;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import javax.swing.JTextField;
import javax.swing.SwingUtilities;
import javax.swing.UIManager;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
/**
 * Swing front end that builds a Lucene index from the files under a chosen directory.
 *
 * <p>The window has one row for the source directory, one row for the index directory
 * and a button that runs {@link LuceneIndexerTool#index(String, String)}. Progress lines
 * are written to a text area below the controls.
 */
public class LuceneIndexer {
    /** Shows the directory whose files will be indexed. */
    private JTextField jtfa;
    /** Opens the chooser that fills {@link #jtfa}. */
    private JButton jba;
    /** Shows the directory the index is written to. */
    private JTextField jtfb;
    /** Opens the chooser that fills {@link #jtfb}. */
    private JButton jbb;
    /** Starts indexing. */
    private JButton jbc;
    /** Progress log; static because the nested LuceneIndexerTool appends to it. */
    private static JTextArea jta;

    /** Builds the window, wires the button listeners and shows the frame. */
    private void createAndShowGUI() {
        String lf = "com.sun.java.swing.plaf.windows.WindowsLookAndFeel";
        try {
            UIManager.setLookAndFeel(lf);
        } catch (Exception ce) {
            JOptionPane.showMessageDialog(null, "无法设定外观感觉");
        }
        JFrame.setDefaultLookAndFeelDecorated(true);
        JFrame frame = new JFrame("dogdog Indexer!dogdog7788@foxmail.com");
        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);

        // One chooser is shared by both directory buttons.
        final JFileChooser fc = new JFileChooser();
        fc.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY);

        Container con = frame.getContentPane();
        con.setLayout(new BorderLayout());
        JPanel jpup = new JPanel();
        jpup.setLayout(new GridLayout(3, 2));

        jtfa = new JTextField(30);
        jba = new JButton("选择被缩影的文件存放路径");
        jba.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent a) {
                int r = fc.showOpenDialog(null);
                if (r == JFileChooser.APPROVE_OPTION) {
                    jtfa.setText(fc.getSelectedFile().getPath());
                    jbc.setEnabled(true);
                }
            }
        });

        jtfb = new JTextField(30);
        // Assign the field; the original declared a local that shadowed it.
        jbb = new JButton("选择索引存放的路径");
        jbb.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent a) {
                int r = fc.showOpenDialog(null);
                if (r == JFileChooser.APPROVE_OPTION) {
                    jtfb.setText(fc.getSelectedFile().getPath());
                    jbc.setEnabled(true);
                }
            }
        });

        // Empty label used as a spacer in the left cell of the last grid row.
        JLabel jl = new JLabel("");
        jbc = new JButton("建立索引");
        jbc.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent arg0) {
                try {
                    LuceneIndexerTool.index(jtfa.getText(), jtfb.getText());
                } catch (Exception e) {
                    e.printStackTrace();
                    jbc.setEnabled(true);
                    JOptionPane.showMessageDialog(null, "索引创建失败!");
                    System.out.println(e.getMessage());
                }
            }
        });

        // Three rows of two cells: field + button, field + button, spacer + button.
        // The index-path field was previously never added, so it was invisible.
        jpup.add(jtfa);
        jpup.add(jba);
        jpup.add(jtfb);
        jpup.add(jbb);
        jpup.add(jl);
        jpup.add(jbc);

        jta = new JTextArea(10, 60);
        JScrollPane jsp = new JScrollPane(jta);
        con.add(jpup, BorderLayout.NORTH);
        con.add(jsp, BorderLayout.CENTER);
        frame.setSize(200, 100);
        frame.pack();
        frame.setVisible(true);
    }

    /**
     * Launches the indexer window on the Swing event thread.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        SwingUtilities.invokeLater(new Runnable() {
            public void run() {
                try {
                    new LuceneIndexer().createAndShowGUI();
                } catch (Exception e) {
                    JOptionPane.showMessageDialog(null, "程序加载失败");
                }
            }
        });
    }

    /** Index-building logic used by the GUI. */
    static class LuceneIndexerTool {
        /** Maximum number of characters of the file text kept in the "digest" field. */
        private static final int DIGEST_LENGTH = 200;

        /**
         * Creates a fresh index at {@code indexPath} from every .html/.htm/.txt file that
         * {@code FileList.getFiles(filesPath)} returns, then shows a completion dialog.
         *
         * @param filesPath directory (or file) to index
         * @param indexPath directory the index is written to; an existing index is replaced
         * @throws IOException if the index cannot be created or written
         */
        public static void index(String filesPath, String indexPath) throws IOException {
            IndexWriter writer = new IndexWriter(indexPath, new StandardAnalyzer(), true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                // "MM" is month; the former "mm" printed the minute of the hour.
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd E");
                String[] paths = FileList.getFiles(filesPath);
                for (int i = 0; i < paths.length; i++) {
                    File f = new File(paths[i]);
                    String ext = getExt(f);
                    boolean indexable = ext.equalsIgnoreCase("html")
                            || ext.equalsIgnoreCase("htm")
                            || ext.equalsIgnoreCase("txt");
                    if (!indexable) {
                        continue;
                    }
                    String cdate = sdf.format(new Date(f.lastModified()));
                    writer.addDocument(buildDocument(f, cdate));
                    if (jta != null) {
                        jta.append("已经归入索引:" + f.getName() + "___" + cdate + "\n");
                    }
                }
            } finally {
                // Always release the writer so a failed run does not leave the index locked.
                writer.close();
            }
            JOptionPane.showMessageDialog(null, "索引建立完毕", "dogdog advice",
                    JOptionPane.INFORMATION_MESSAGE);
        }

        /** Builds the Lucene document (filename, uri, cdate, size, text, digest) for one file. */
        private static Document buildDocument(File f, String cdate) {
            Document doc = new Document();
            doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("uri", f.getPath(), Field.Store.YES, Field.Index.NO));
            // "cdate" holds the formatted last-modified time of the file.
            doc.add(new Field("cdate", cdate, Field.Store.YES, Field.Index.NO));

            // Integer arithmetic avoids the "3.0K" rendering that a double produced.
            long bytes = f.length();
            String size = bytes > 1024 ? (bytes / 1024) + "K" : bytes + "Bytes";
            doc.add(new Field("size", size, Field.Store.YES, Field.Index.NO));

            // The searched field must be ANALYZED, otherwise queries on "text" find nothing.
            String text = FileText.getText(f);
            doc.add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED));

            String digest = text.length() > DIGEST_LENGTH ? text.substring(0, DIGEST_LENGTH) : text;
            doc.add(new Field("digest", digest, Field.Store.YES, Field.Index.ANALYZED));
            return doc;
        }

        /**
         * Returns the text after the last '.' in the file name.
         *
         * @param f file whose name is inspected
         * @return the extension without the dot, or "" when the name contains no dot
         */
        public static String getExt(File f) {
            String name = f.getName();
            int dot = name.lastIndexOf('.');
            return dot < 0 ? "" : name.substring(dot + 1);
        }
    }
}
LuceneSearcher.java
import java.awt.BorderLayout;
import java.awt.Container;
import java.awt.FlowLayout;
import java.awt.GridLayout;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import javax.swing.JButton;
import javax.swing.JFileChooser;
import javax.swing.JFrame;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import javax.swing.JTextField;
import javax.swing.SwingUtilities;
import javax.swing.UIManager;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.TopDocs;
import creatindex.LuceneIndexer;
/**
 * Swing front end that queries a Lucene index and optionally exports the result text.
 *
 * <p>The top panel selects the index directory and takes the query, the centre text
 * area shows the hits, and the bottom panel writes the displayed text to a file.
 */
public class LuceneSearcher {
    /** Shows the index directory. */
    private JTextField jtfa;
    /** Opens the chooser that fills {@link #jtfa}. */
    private JButton jba;
    /** Holds the query text. */
    private JTextField jtfb;
    /** Runs the search. */
    private JButton jbb;
    /** Not used by this class. */
    private JButton jbc;
    /** Result display; static because the nested LuceneSearcherTool writes to it. */
    private static JTextArea jta;
    /** Shows the export target path. */
    private JTextField jtfc;
    /** Opens the chooser that fills {@link #jtfc}. */
    private JButton jbd;
    /** Writes the displayed results to the export path. */
    private JButton jbe;

    /** Builds the window, wires the button listeners and shows the frame. */
    private void createAndShowGUI() {
        String lf = "com.sun.java.swing.plaf.windows.WindowsLookAndFeel";
        try {
            UIManager.setLookAndFeel(lf);
        } catch (Exception e) {
            JOptionPane.showMessageDialog(null, "无法设定外观感觉");
        }
        JFrame.setDefaultLookAndFeelDecorated(true);
        JFrame frame = new JFrame("dogdog searcher");
        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);

        // One chooser is shared; each listener sets the selection mode it needs right
        // before showing it. Setting the mode once during construction let the later
        // FILES_AND_DIRECTORIES call override DIRECTORIES_ONLY for the index picker.
        final JFileChooser fc = new JFileChooser();

        Container con = frame.getContentPane();
        con.setLayout(new BorderLayout());
        JPanel jpup = new JPanel();
        jpup.setLayout(new GridLayout(2, 2));

        jtfa = new JTextField(30);
        jba = new JButton("选择索引的存放路径");
        jba.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent a) {
                fc.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY);
                int r = fc.showOpenDialog(null);
                if (r == JFileChooser.APPROVE_OPTION) {
                    jtfa.setText(fc.getSelectedFile().getPath());
                }
            }
        });

        jtfb = new JTextField(30);
        // Assign the field; the original declared a local that shadowed it.
        jbb = new JButton("搜索");
        jbb.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent arg0) {
                try {
                    String indexPath = jtfa.getText();
                    String phrase = jtfb.getText();
                    // LuceneSearcherTool.search(phrase, indexPath) is the pre-2.4 variant.
                    LuceneSearcherTool.searchNew(phrase, indexPath);
                } catch (Exception e) {
                    JOptionPane.showMessageDialog(null, "搜索失败!", "提示",
                            JOptionPane.ERROR_MESSAGE);
                    e.printStackTrace();
                }
            }
        });

        jpup.add(jtfa);
        jpup.add(jba);
        jpup.add(jtfb);
        jpup.add(jbb);

        jta = new JTextArea(10, 30);
        JScrollPane jsp = new JScrollPane(jta);

        JPanel jpdown = new JPanel();
        jpdown.setLayout(new FlowLayout());
        jtfc = new JTextField(35);
        jbd = new JButton("设定导出路径");
        jbd.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent arg0) {
                fc.setFileSelectionMode(JFileChooser.FILES_AND_DIRECTORIES);
                int r = fc.showOpenDialog(null);
                if (r == JFileChooser.APPROVE_OPTION) {
                    jtfc.setText(fc.getSelectedFile().getPath());
                }
            }
        });

        jbe = new JButton("导出搜索结果");
        jbe.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent arg0) {
                try {
                    // Write through FileWriter directly: PrintWriter swallows I/O errors,
                    // which made a failed export report success.
                    FileWriter fw = new FileWriter(new File(jtfc.getText()));
                    try {
                        fw.write(jta.getText());
                        fw.flush();
                    } finally {
                        fw.close();
                    }
                    JOptionPane.showMessageDialog(null, "成功写入文件");
                } catch (Exception e) {
                    JOptionPane.showMessageDialog(null, "写入文件失败");
                    e.printStackTrace();
                }
            }
        });

        jpdown.add(jtfc);
        jpdown.add(jbd);
        jpdown.add(jbe);
        con.add(jpup, BorderLayout.NORTH);
        con.add(jsp, BorderLayout.CENTER);
        con.add(jpdown, BorderLayout.SOUTH);
        frame.setSize(200, 100);
        frame.pack();
        frame.setVisible(true);
    }

    /** Search logic used by the GUI; results are written to the shared text area. */
    static class LuceneSearcherTool {
        /** Maximum number of hits fetched and listed by {@link #searchNew}. */
        private static final int MAX_HITS = 100;

        /**
         * Pre-2.4 style search: looks up {@code phrase} as a single exact term in the
         * "text" field using the deprecated Hits API and lists every hit.
         *
         * <p>The phrase is not analyzed, so it is matched as given.
         *
         * @param phrase    term to look up
         * @param indexPath directory containing the index
         * @throws IOException if the index cannot be opened or read
         * @throws Exception   declared for callers; nothing beyond IOException is thrown here
         */
        @SuppressWarnings("deprecation")
        public static void search(String phrase, String indexPath) throws IOException, Exception {
            System.out.println(phrase);
            IndexSearcher searcher = new IndexSearcher(indexPath);
            try {
                Query q = new TermQuery(new Term("text", phrase));
                Hits hs = searcher.search(q);
                int num = hs.length();
                jta.setText("检索到的记录数量:" + num + "\n");
                jta.append("***************" + "\n\n");
                for (int i = 0; i < num; i++) {
                    Document doc = hs.doc(i);
                    if (doc == null) {
                        continue;
                    }
                    jta.append(formatHit(doc));
                }
            } finally {
                // Close the searcher even when reading a hit fails.
                searcher.close();
            }
        }

        /**
         * Lucene 2.4.1 style search: parses {@code phrase} with QueryParser against the
         * "text" field and lists at most {@link #MAX_HITS} hits.
         *
         * <p>An exact-term lookup is also possible by building
         * {@code new TermQuery(new Term("text", phrase))} instead of parsing.
         *
         * @param phrase    query string in QueryParser syntax
         * @param indexPath directory containing the index
         * @throws IOException if the index cannot be opened or read
         * @throws Exception   if the query cannot be parsed
         */
        public static void searchNew(String phrase, String indexPath) throws IOException, Exception {
            IndexSearcher newSearcher = new IndexSearcher(indexPath);
            try {
                QueryParser parser = new QueryParser("text", new StandardAnalyzer());
                Query q = parser.parse(phrase);
                TopDocs topDocs = newSearcher.search(q, MAX_HITS);
                ScoreDoc[] hits = topDocs.scoreDocs;
                // totalHits is the real match count; scoreDocs is capped at MAX_HITS.
                jta.setText("共有" + newSearcher.maxDoc() + "条记录,其中"
                        + topDocs.totalHits + "条符合搜索条件\n");
                jta.append("************\n");
                for (int i = 0; i < hits.length; i++) {
                    Document doc = newSearcher.doc(hits[i].doc);
                    if (doc == null) {
                        continue;
                    }
                    jta.append(formatHit(doc));
                }
            } finally {
                // Close the searcher even when parsing or reading fails.
                newSearcher.close();
            }
        }

        /** Renders the stored uri, filename, cdate and digest of one hit as display text. */
        private static String formatHit(Document doc) {
            // Document.get returns null for a missing field instead of throwing.
            StringBuffer sb = new StringBuffer();
            sb.append("URI:").append(doc.get("uri")).append("\n");
            sb.append("filename:").append(doc.get("filename")).append("\n");
            sb.append("cdate:").append(doc.get("cdate")).append("\n");
            sb.append("digest:").append(doc.get("digest")).append("\n");
            sb.append("******************").append("\n");
            return sb.toString();
        }
    }

    /**
     * Launches the searcher window on the Swing event thread.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        SwingUtilities.invokeLater(new Runnable() {
            public void run() {
                try {
                    new LuceneSearcher().createAndShowGUI();
                } catch (Exception e) {
                    JOptionPane.showMessageDialog(null, "程序加载失败");
                }
            }
        });
    }
}
另有两个辅助类
FileList和FileText
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class FileList {
private static final String SEP = "/";
private static StringBuffer sb = new StringBuffer("");
//返回目录下的文件名的数组
public static String[] getFiles(File f) throws IOException{
if(f.isDirectory()){
File[] fs = f.listFiles();
for(int i=0;i<fs.length;i++){
getFiles(fs[i]);
}
}
else{
sb.append(f.getPath()+SEP);
}
String s = sb.toString();
String[] ss = s.split(SEP);
return ss;
}
//返回目录下的文件名的数组--重载
public static String[] getFiles(String t) throws IOException{
File f = new File(t);
return getFiles(f);
}
public static void main(String[] args) throws IOException{
String s[] = FileList.getFiles("D:/apache-tomcat-5.5.26/bin");
for(int i=0;i<s.length;i++){
System.out.println(s[i]);
}
}
}
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
/**
 * Reads whole text files into strings using the platform default charset.
 */
public class FileText {
    /**
     * Returns the complete contents of {@code f}, line terminators included.
     *
     * <p>Reading is best-effort: if the file cannot be opened the result is "", and if
     * reading fails part-way the text read so far is returned.
     *
     * @param f file to read
     * @return the file contents, never null
     */
    public static String getText(File f) {
        StringBuffer sb = new StringBuffer();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(f));
            // Copy raw characters rather than using readLine, which strips line
            // terminators and fuses the last word of a line with the first of the next.
            char[] buf = new char[4096];
            int n = br.read(buf);
            while (n != -1) {
                sb.append(buf, 0, n);
                n = br.read(buf);
            }
        } catch (Exception e) {
            // Deliberately ignored: callers get whatever was read before the failure.
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if close fails.
                }
            }
        }
        return sb.toString();
    }

    /**
     * Same as {@link #getText(File)} for a path given as a string.
     *
     * @param s path of the file to read
     * @return the file contents, or "" when {@code s} is null or the file is unreadable
     */
    public static String getText(String s) {
        if (s == null) {
            return "";
        }
        return getText(new File(s));
    }
}