现在的位置: 首页 > 综合 > 正文

Lucene分词与查询

2017年12月08日 ⁄ 综合 ⁄ 共 6243字 ⁄ 字号 评论关闭
 

 

 

package com.demo.ajax;

/**
 * Mutable bean describing one building: a numeric id, a display name and a
 * free-text information string. All properties start out as {@code null}.
 */
public class Building
{
	/** Identifier of the building. */
	private Integer id;

	/** Display name of the building. */
	private String name;

	/** Free-text description of the building. */
	private String information;

	/**
	 * @return the building id, or {@code null} if none has been set
	 */
	public Integer getId()
	{
		return this.id;
	}

	/**
	 * @param id the building id to store
	 */
	public void setId(Integer id)
	{
		this.id = id;
	}

	/**
	 * @return the display name, or {@code null} if none has been set
	 */
	public String getName()
	{
		return this.name;
	}

	/**
	 * @param name the display name to store
	 */
	public void setName(String name)
	{
		this.name = name;
	}

	/**
	 * @return the information text, or {@code null} if none has been set
	 */
	public String getInformation()
	{
		return this.information;
	}

	/**
	 * @param information the information text to store
	 */
	public void setInformation(String information)
	{
		this.information = information;
	}
}

 

package com.demo.ajax;

import java.util.ArrayList;
import java.util.List;

/**
 * Supplies the sample {@link Building} data that the indexing demo works on.
 */
public class InitTool
{
	/**
	 * Creates the demo data set: one building for every id from 60 (inclusive)
	 * up to 100 (exclusive), in ascending id order. Each building's name is the
	 * id followed by "号楼" and every building carries the same information text.
	 *
	 * @return a new, mutable list holding the generated buildings
	 */
	public static List<Building> initBuilding()
	{
		List<Building> buildings = new ArrayList<Building>();

		int number = 60;
		while (number < 100)
		{
			Building entry = new Building();
			entry.setId(number);
			entry.setName(number + "号楼");
			entry.setInformation("总统套间");

			buildings.add(entry);
			number++;
		}

		return buildings;
	}
}

 

package com.demo.ajax;

import java.io.File;
import java.net.URLDecoder;
import java.util.Iterator;
import java.util.List;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;


/**
 * Builds the on-disk Lucene index for the demo buildings returned by
 * {@link InitTool#initBuilding()}, tokenizing text with the Paoding Chinese
 * analyzer.
 */
public class AnalyzerTool
{
	/** File-system location of the index. */
	private static final String INDEX_PATH = "d:\\data\\index";

	/** Upper bound on the number of leading entries echoed by the preview loop. */
	private static final int PREVIEW_COUNT = 30;

	/**
	 * (Re)creates the index at {@link #INDEX_PATH}. Any existing index at that
	 * location is overwritten. The buildings are echoed to standard output
	 * (first the leading {@value #PREVIEW_COUNT} entries, then the whole list)
	 * before they are written.
	 *
	 * @throws IllegalStateException if the index directory does not exist and
	 *         cannot be created
	 * @throws Exception if Lucene fails to open, write or optimize the index
	 */
	public void createIndex() throws Exception
	{
		// Chinese-aware analyzer used to tokenize the indexed text.
		Analyzer analyzer = new PaodingAnalyzer();

		File indexDir = new File(INDEX_PATH);
		if (!indexDir.exists())
		{
			indexDir.mkdirs();
		}
		// Checked after the fact so that a failed mkdirs() is reported here
		// with the offending path instead of being silently ignored.
		if (!indexDir.isDirectory())
		{
			throw new IllegalStateException("Cannot create index directory: " + INDEX_PATH);
		}

		FSDirectory directory = FSDirectory.getDirectory(INDEX_PATH);

		// true = overwrite an existing index; false would append to it instead.
		IndexWriter writer = new IndexWriter(directory, analyzer, true);
		try
		{
			List<Building> list = InitTool.initBuilding();

			// Preview of the leading entries; bounded by the list size so a
			// smaller data set cannot cause an IndexOutOfBoundsException.
			int previewEnd = Math.min(PREVIEW_COUNT, list.size());
			for (int i = 0; i < previewEnd; i++)
			{
				printBuilding(list.get(i));
			}

			for (Building building : list)
			{
				printBuilding(building);
			}

			for (Building building : list)
			{
				writer.addDocument(toDocument(building));
			}
			writer.optimize();
		}
		finally
		{
			// Always release the writer (and its lock on the index), even
			// when adding documents or optimizing fails.
			writer.close();
		}
	}

	/** Writes one building to standard output as id, name and information. */
	private static void printBuilding(Building building)
	{
		System.out.println(building.getId()+"-------------->"+building.getName()+"---------->"+building.getInformation());
	}

	/**
	 * Converts a building into a Lucene document: the id is stored untokenized,
	 * name and information are stored and tokenized.
	 */
	private static Document toDocument(Building building)
	{
		Document doc = new Document();
		doc.add(new Field("id", String.valueOf(building.getId()), Field.Store.YES,
				Field.Index.UN_TOKENIZED));
		doc.add(new Field("building_name", building.getName(), Field.Store.YES,
				Field.Index.TOKENIZED));
		doc.add(new Field("building_information", building.getInformation(), Field.Store.YES,
				Field.Index.TOKENIZED));
		return doc;
	}

	/**
	 * Command-line entry point: rebuilds the index once.
	 *
	 * @param args ignored
	 * @throws Exception if index creation fails
	 */
	public static void main(String[] args) throws Exception
	{
		AnalyzerTool analyzerTool = new AnalyzerTool();
		analyzerTool.createIndex();
	}
}

 

package com.demo.ajax;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;

/**
 * Searches the building index over the name and information fields and returns
 * the hits as {@link Building} objects whose text has the matched terms wrapped
 * in HTML highlight markup.
 */
public class LuceneSearchTool
{
	/** Index location used by {@link #search(String)}. */
	private static final String DEFAULT_INDEX_PATH = "d:\\data\\index";

	/** Opening and closing markup placed around each highlighted term. */
	private static final String HIGHLIGHT_PREFIX = "<font color=red>";
	private static final String HIGHLIGHT_SUFFIX = "</font>";

	/** Maximum size, in characters, of the fragment cut around a match. */
	private static final int FRAGMENT_SIZE = 120;

	/** Result of the most recent successful search (empty before the first one). */
	List<Building> searcheResult = new ArrayList<Building>();

	/**
	 * @return the list produced by the most recent successful search
	 */
	public List<Building> getSearcheResult()
	{
		return searcheResult;
	}

	/**
	 * @param searcheResult the list to expose through {@link #getSearcheResult()}
	 */
	public void setSearcheResult(List<Building> searcheResult)
	{
		this.searcheResult = searcheResult;
	}

	/**
	 * Searches the default index location.
	 *
	 * @param keywords the query text, in Lucene query-parser syntax
	 * @return the matching buildings, best match first
	 * @throws Exception if the index cannot be read or the query cannot be parsed
	 */
	public List<Building> search(String keywords)throws Exception
	{
		return searchIndex(DEFAULT_INDEX_PATH, keywords);
	}

	/**
	 * Searches the index at {@code path} for {@code keywords} in the
	 * {@code building_name} and {@code building_information} fields (a match in
	 * either field is sufficient). For every hit the stored name and
	 * information are replaced by their highlighted fragment when the
	 * highlighter produces one.
	 * <p>
	 * Each call produces a fresh list; results of earlier calls are not carried
	 * over. The list is also remembered for {@link #getSearcheResult()}.
	 *
	 * @param path     file-system location of the index
	 * @param keywords the query text, in Lucene query-parser syntax
	 * @return the matching buildings in the order Lucene returned them
	 * @throws Exception if the index cannot be read or the query cannot be parsed
	 */
	public List<Building> searchIndex(String path, String keywords) throws Exception
	{
		List<Building> results = new ArrayList<Building>();

		FSDirectory directory = FSDirectory.getDirectory(path);
		IndexReader reader = IndexReader.open(directory);
		try
		{
			// Search through the reader opened above so that exactly one
			// reader exists and it is the one closed in the finally block.
			Searcher searcher = new IndexSearcher(reader);
			try
			{
				// Arguments of MultiFieldQueryParser.parse:
				// 1. the keywords
				// 2. the fields to query
				String[] field = { "building_name", "building_information" };
				// 3. how each field takes part in the query (must / should / must not)
				BooleanClause.Occur[] flags = new BooleanClause.Occur[] {
						BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
				// 4. the analyzer used to tokenize the keywords
				Analyzer analyzer = new PaodingAnalyzer();
				Query query = MultiFieldQueryParser.parse(keywords, field, flags,
						analyzer);

				// The highlighter depends only on the query, so it is built
				// once here rather than once per hit.
				Highlighter highlighter = new Highlighter(
						new SimpleHTMLFormatter(HIGHLIGHT_PREFIX, HIGHLIGHT_SUFFIX),
						new QueryScorer(query));
				Fragmenter fragmenter = new SimpleFragmenter(FRAGMENT_SIZE);
				highlighter.setTextFragmenter(fragmenter);

				Hits hits = searcher.search(query);

				for (int i = 0; i < hits.length(); i++)
				{
					Document doc = hits.doc(i);

					Building building = new Building();
					building.setId(Integer.valueOf(doc.get("id")));
					building.setName(highlight(highlighter, analyzer,
							"building_name", doc.get("building_name")));
					building.setInformation(highlight(highlighter, analyzer,
							"building_information", doc.get("building_information")));

					results.add(building);
				}
			}
			finally
			{
				searcher.close();
			}
		}
		finally
		{
			reader.close();
		}

		searcheResult = results;
		return results;
	}

	/**
	 * Returns the best highlighted fragment of {@code text} for the given
	 * field, or {@code text} unchanged when it is {@code null} or the
	 * highlighter yields no fragment.
	 */
	private static String highlight(Highlighter highlighter, Analyzer analyzer,
			String fieldName, String text) throws Exception
	{
		if (text == null)
		{
			// Field not stored for this document; nothing to tokenize.
			return null;
		}

		TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
		String fragment = highlighter.getBestFragment(tokenStream, text);
		if (fragment == null)
		{
			return text;
		}
		return fragment;
	}
}

 

package com.demo.ajax;

import java.util.List;

/**
 * Manual smoke test: runs one search against the default index and dumps the
 * hits to standard output.
 */
public class Junit
{
	/**
	 * Searches for "号楼", prints the number of hits and then one line per hit
	 * with its id, name and information.
	 *
	 * @param args ignored
	 * @throws Exception if the search fails
	 */
	public static void main(String[] args) throws Exception
	{
		List<Building> found = new LuceneSearchTool().search("号楼");

		System.out.println(found.size());

		for (int index = 0; index < found.size(); index++)
		{
			Building hit = found.get(index);
			String line = hit.getId()+"------------->"+hit.getName()+"-------------->"+hit.getInformation();
			System.out.println(line);
		}
	}
}

 

 

【上篇】
【下篇】

抱歉!评论已关闭。