现在的位置: 首页 > 综合 > 正文

Lucene入门例子

2014年11月10日 / 综合 / 共 6205字 / 字号 评论关闭

1.建立索引

package org.senssic.lucene;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * jdk7+
 * 
 * @ClassName: IndexFiles
 * @Description: 索引建立
 * @author senssic
 * @date 2014年7月8日 上午9:39:30
 * 
 */
public class IndexFiles {
	// File extensions considered indexable (compared case-insensitively).
	private static String[] name = { ".txt", ".html" };

	// Utility class: not meant to be instantiated.
	private IndexFiles() {
	}

	/**
	 * Builds (or rebuilds) a Lucene index under {@code indexPath} from all
	 * .txt/.html files found recursively under {@code docsPath}.
	 */
	public static void main(String[] args) {
		String indexPath = "D:\\Index";// directory where the index is written
		String docsPath = "D:\\LuceneIndex";// directory being indexed

		boolean create = true;// true = wipe and rebuild the index from scratch
		final File docDir = new File(docsPath);

		Date start = new Date();
		try {
			System.out.println("索引目录中 '" + indexPath + "'...");

			Directory dir = FSDirectory.open(new File(indexPath));
			// MMSeg Chinese analyzer (NOT Lucene's StandardAnalyzer).
			Analyzer analyzer = new MMSegAnalyzer();
			IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48,
					analyzer);
			if (create) {
				// Delete any existing index and build a fresh one.
				iwc.setOpenMode(OpenMode.CREATE);
			} else {
				// Append to / update an existing index.
				iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
			}

			// In-memory buffer size before segments are flushed to disk.
			iwc.setRAMBufferSizeMB(100);

			// try-with-resources guarantees the writer (and its lock) is
			// released even if indexing throws.
			try (IndexWriter writer = new IndexWriter(dir, iwc)) {
				// Recursively index the document tree.
				indexDocs(writer, docDir);
			}
			Date end = new Date();
			System.out.println("总耗时\t" + (end.getTime() - start.getTime())
					+ "\t毫秒");

		} catch (IOException e) {
			System.out.println(" 异常: " + e.getClass() + "\n异常信息: "
					+ e.getMessage());
		}
	}

	/**
	 * Recursively indexes {@code file}: directories are descended into,
	 * regular files matching an indexable extension are added to (or updated
	 * in) the index depending on the writer's OpenMode.
	 *
	 * @param writer open IndexWriter to add/update documents on
	 * @param file   file or directory to index
	 * @throws IOException if the writer fails or the file cannot be read
	 */
	static void indexDocs(IndexWriter writer, File file) throws IOException {

		if (file.canRead()) {
			if (file.isDirectory()) {
				String[] files = file.list(new FilenameFilter() {

					@Override
					public boolean accept(File paramFile, String pString) {
						// Accept subdirectories so recursion can descend.
						if (new File(paramFile, pString).isDirectory()) {
							return true;
						}
						// BUGFIX: the original test was inverted
						// (!endsWith -> true) and effectively accepted every
						// file. Accept only files with a listed extension.
						for (String stn : name) {
							if (pString.toLowerCase().endsWith(stn)) {
								return true;
							}
						}
						return false;
					}
				});
				if (files != null) {
					for (int i = 0; i < files.length; i++) {
						indexDocs(writer, new File(file, files[i]));
					}
				}
			} else {
				// Slurp the whole file line-by-line; Scanner is closed by
				// try-with-resources even if indexing below throws.
				StringBuilder sb = new StringBuilder();
				try (Scanner scanner = new Scanner(file)) {
					scanner.useDelimiter("\n");
					while (scanner.hasNext()) {
						sb.append(scanner.next() + "\n");
					}

					Document doc = new Document();
					// StringField = stored, not tokenized; TextField =
					// stored AND analyzed (searchable full text).
					Field pathField = new StringField("path", file.getPath(),
							Field.Store.YES);
					doc.add(pathField);
					doc.add(new TextField("contents", sb.toString(),
							Field.Store.YES));
					doc.add(new StringField("lastmodified",
							new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
									.format(new Date(file.lastModified())),
							Field.Store.YES));
					doc.add(new StringField("filename", file.getName(),
							Field.Store.YES));

					float length = (float) file.length() / (float) 1024;
					doc.add(new StringField("filelength", String.format("%.3f",
							length) + "kB", Field.Store.YES));
					doc.add(new StringField("absolutepath", file
							.getAbsolutePath(), Field.Store.YES));

					// Mirror the OpenMode chosen in main(): CREATE means a
					// fresh index, so plain add; otherwise update-by-path so
					// re-runs don't duplicate documents.
					if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
						System.out.println("添加中 " + file);
						writer.addDocument(doc);
					} else {
						System.out.println("更新中 " + file);
						writer.updateDocument(new Term("path", file.getPath()),
								doc);
					}
				}
			}
		}
	}
}

2.查询

package org.senssic.lucene;

import java.io.File;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * Searches the index built by IndexFiles for a fixed Chinese query string and
 * prints the stored fields of each hit.
 */
public class SearchFiles {

	// Utility class: not meant to be instantiated.
	private SearchFiles() {
	}

	public static void main(String[] args) throws Exception {
		String index = "D:\\Index";
		String queryString = "我爱你";
		// try-with-resources closes the reader even if searching throws.
		try (IndexReader reader = DirectoryReader.open(FSDirectory
				.open(new File(index)))) {

			IndexSearcher searcher = new IndexSearcher(reader);

			// Must match the analyzer used at index time (MMSeg, not the
			// standard analyzer) or Chinese terms won't line up.
			Analyzer analyzer = new MMSegAnalyzer();

			// Parse the query against the analyzed "contents" field.
			QueryParser parser = new QueryParser(Version.LUCENE_48, "contents",
					analyzer);
			Query query = parser.parse(queryString);
			System.out.println("查询内容: " + query.toString("contents"));
			Date start = new Date();
			// Top 100 hits by relevance score.
			TopDocs results = searcher.search(query, null, 100);
			ScoreDoc sDoc[] = results.scoreDocs;
			int i = 0;
			for (ScoreDoc scoreDoc : sDoc) {
				Document document = searcher.doc(scoreDoc.doc);
				System.out.println("\n\n\n第" + ++i + "个文件:");
				// BUGFIX: the original printed the "内容:" (content) label but
				// followed it with the last-modified value because the
				// contents line was commented out, mislabeling two fields.
				System.out.println("文件名称:" + document.get("filename") + "\n路径:"
						+ document.get("path") + "\n绝对路径:"
						+ document.get("absolutepath") + "\n最后修改时间:"
						+ document.get("lastmodified") + "\n文件大小:"
						+ document.get("filelength"));
			}

			Date end = new Date();
			System.out.println("\n\n\n耗时: " + (end.getTime() - start.getTime())
					+ "ms");
			System.out.println(results.totalHits);
		}
	}
}

3.使用mmseg4j分词的例子

package org.senssic.lucene.util;

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * Debug helper that tokenizes a string with a given Analyzer and prints each
 * token's text, position increment, character offsets and type.
 */
public class AnalyzerUtils {

	public static void displayAllTokenInfo(String str, Analyzer a) {
		// TokenStream is Closeable in Lucene 4.x; try-with-resources releases
		// it even on failure.
		try (TokenStream stream = a.tokenStream("content",
				new StringReader(str))) {
			// Position increment: distance (in positions) from the previous
			// token.
			PositionIncrementAttribute pia = stream
					.addAttribute(PositionIncrementAttribute.class);
			// Start/end character offsets of the token in the input.
			OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
			// The token's text itself.
			CharTermAttribute cta = stream
					.addAttribute(CharTermAttribute.class);
			// Token type reported by the tokenizer.
			TypeAttribute ta = stream.addAttribute(TypeAttribute.class);

			// BUGFIX: Lucene 4.x requires reset() before the first
			// incrementToken(); without it the stream throws/misbehaves.
			stream.reset();
			while (stream.incrementToken()) {
				System.out.print("[" + cta + "]");
				System.out.print(pia.getPositionIncrement() + ":");
				System.out.print(cta + "[" + oa.startOffset() + "-"
						+ oa.endOffset() + "]-->" + ta.type() + "\n");
			}
			// end() finalizes offsets per the TokenStream contract.
			stream.end();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	public static void main(String[] args) {
		AnalyzerUtils.displayAllTokenInfo("我爱你中国", new MMSegAnalyzer());
	}

}

需要的jar包

抱歉!评论已关闭.