分享下自己的 Lucene 工具类,不足之处欢迎吐槽!
0.2版
package com.jiuxing.qa.util.lucene; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.paoding.analysis.analyzer.PaodingAnalyzer; import net.paoding.analysis.knife.Paoding; import net.paoding.analysis.knife.PaodingMaker; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import 
org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; import com.jiuxing.qa.util.PropertyUtil; /** * luence操作工具类 提供索引创建、查询功能 lucene vsrsion 3.6.1 * * @author jiaojun [junjiao.j@gmail.com] * @version v0.0.2 * @param <T> * @date 2012-08-20 */ public class LuceneUtil<T> { private static Log log = LogFactory.getLog(LuceneUtil.class); /** * 索引优化后文件段的数量,数量越大,优化效率越大 */ private static final int DEFAULT_MAX_NUM_SEGMENTS = 3; /** * 低版本的查询索引存活周期 */ private static final long STALE_INDEXREADER_SURVIVAL_TIME = 60000; private static Map<String, IndexWriter> writerPool = new HashMap<String, IndexWriter>(); private static Map<String, IndexReader> readerPool = new HashMap<String, IndexReader>(); /** * 存放IndexReader的Map,Map里存放的都是已经实例化好的IndexReader */ private static Map<Long, IndexReader> stalereaderPool = new HashMap<Long, IndexReader>(); private static LuceneUtil util = null; private LuceneUtil() { } public synchronized static LuceneUtil getInstance() { if (util == null) { util = new LuceneUtil(); } return util; } static { init(); } /** * 始化索引池初 */ public static void init() { log.info("索引池初始化开始"); String indexDir = PropertyUtil.getPropertiesByKey("lucene.properties", "lucene.index.dir"); String pool = PropertyUtil.getPropertiesByKey("lucene.properties", "lucene.index.pool"); for (String poolDir : pool.split(",")) { synchronized (writerPool) { try { IndexWriter iw = createIndexWriter(indexDir + poolDir); if (iw != null) writerPool.put(poolDir, iw); } catch (IOException e) { log.error("writerPool初始化失败,原因:" + e.getMessage()); } } synchronized (readerPool) { try { IndexReader ir = IndexReader.open(FSDirectory .open(getIndexFile(indexDir + poolDir)),false); if (ir != null) readerPool.put(poolDir, ir); } catch (Exception e) { log.error("readerPool初始化失败,原因:" + e.getMessage()); } } } log.info("索引池初始化完成"); } /** * 创建索引池初始化 */ public static void initIndexWriter() { log.info("【创建索引池】初始化开始"); String indexDir = 
PropertyUtil.getPropertiesByKey("lucene.properties", "lucene.index.dir"); String pool = PropertyUtil.getPropertiesByKey("lucene.properties", "lucene.index.pool"); for (String poolDir : pool.split(",")) { synchronized (writerPool) { try { IndexWriter iw = createIndexWriter(indexDir + poolDir); if (iw != null) writerPool.put(poolDir, iw); } catch (IOException e) { log.error("writerPool初始化失败,原因:" + e.getMessage()); } } } log.info("【创建索引池】初始化完成"); } /** * 创建索引,建议定时更新即可 * * @param <T> * @param indexDir * 索引根保存位置 * @param poolDir * 索引池保存位置 * @param list * 需要创建索引的数据 * @param clz * 数据绑定的对象 * @param fields * 须创建索引的属性(小写) * @throws IOException * @throws NoSuchMethodException * @throws SecurityException * @throws InvocationTargetException * @throws IllegalAccessException * @throws IllegalArgumentException */ public static <T> void createIndex(String indexDir, String poolDir, List<?> list, Class<T> clz, String[] fields) throws IOException, SecurityException, NoSuchMethodException, IllegalArgumentException, IllegalAccessException, InvocationTargetException { createIndex(indexDir,poolDir,list,clz, fields,false); } /** * 创建索引,建议定时更新即可 * * @param <T> * @param indexDir * 索引根保存位置 * @param poolDir * 索引池保存位置 * @param list * 需要创建索引的数据 * @param clz * 数据绑定的对象 * @param fields * 须创建索引的属性(小写) * @param isDel * 是否删除原索引重新创建 * @throws IOException * @throws NoSuchMethodException * @throws SecurityException * @throws InvocationTargetException * @throws IllegalAccessException * @throws IllegalArgumentException */ public static <T> void createIndex(String indexDir, String poolDir, List<?> list, Class<T> clz, String[] fields,boolean isDel) throws IOException, SecurityException, NoSuchMethodException, IllegalArgumentException, IllegalAccessException, InvocationTargetException { log.info("索引开始创建,服务于 " + clz + " | " + fields.toString()); long start = new Date().getTime(); IndexWriter writer = getIndexWriter(indexDir, poolDir); if (null == writer) { log.error("IndexWriter获取失败"); return; } // 删除全部索引 
if(isDel){ //writer.deleteAll(); } SimpleDateFormat simpleDateFormat = new SimpleDateFormat( "yyyy-MM-dd hh:mm:ss"); if (null != list && list.size() > 0) { for (int i = 0; i < list.size(); i++) { Document doc = new Document(); java.lang.reflect.Field[] cfs = clz.getDeclaredFields(); for (java.lang.reflect.Field cf : cfs) { String fieldName = cf.getName(); String stringLetter = fieldName.substring(0, 1) .toUpperCase(); String getName = "get" + stringLetter + fieldName.substring(1); // String setName="set"+stringLetter+fieldName.substring(1); Method getMethod = clz.getMethod(getName); // Method setMethod=clz.getMethod(setName, new // Class[]{cf.getType()}); Object value = getMethod.invoke((T) list.get(i)); if (Arrays.asList(fields).contains(fieldName)) { if (value != null && !"".equals(value.toString())) { String tmp = ""; if (cf.getGenericType().toString().equals( "class java.util.Date")) { tmp = simpleDateFormat.format(value); } else { tmp = value.toString(); } doc.add(new Field(fieldName, tmp, Field.Store.YES, Field.Index.ANALYZED)); } } } if(!isDel){ /** * 先将fields[0]的索引查找到,然后再删除,最后将新的索引添加到索引文件中 */ if(null != doc.get(fields[0])){ writer.updateDocument(new Term(fields[0], doc.get(fields[0])), doc); } } } log.info("索引创建完成,保存目录:" + indexDir + poolDir + ",索引创建/记录:" + writer.maxDoc() + "/" + list.size() + "条,花费时间:" + (new Date().getTime() - start) / 1000 + "秒!" 
+ writer); list.clear(); } writer.forceMerge(DEFAULT_MAX_NUM_SEGMENTS); writer.commit(); } /** * 分页查询索引 排序就默认按传入的fields属性的第一个元素的匹配度降序排列 * * @param indexDir * 索引根保存位置 * @param poolDir * 索引池保存位置 * @param keyWords * 关键词 * @param fields * 属性 * @param pageSize * 每页记录数 * @param currentPage * 当前页数 * @throws IOException * @return SearchResult 查询结果集 * @throws IOException * @throws InvalidTokenOffsetsException */ public static SearchResult searchPage(String indexDir, String poolDir, String[] keyWords, String[] fields, int pageSize, int currentPage) throws IOException, InvalidTokenOffsetsException { return searchPage(indexDir, poolDir, keyWords, fields, true, pageSize, currentPage); } /** * 分页查询索引 排序就默认按传入的fields属性的第一个元素的匹配度降序排列 * * @param indexDir * 索引根保存位置 * @param poolDir * 索引池保存位置 * @param keyWords * 关键词 * @param fields * 属性 * @param isHighlighter * 是否高亮显示 * @param pageSize * 每页记录数 * @param currentPage * 当前页数 * @throws IOException * @return SearchResult 查询结果集 * @throws IOException * @throws InvalidTokenOffsetsException */ public static SearchResult searchPage(String indexDir, String poolDir, String[] keyWords, String[] fields, boolean isHighlighter, int pageSize, int currentPage) throws IOException, InvalidTokenOffsetsException { return searchPage(indexDir, poolDir, keyWords, fields, true, pageSize, currentPage,true); } /** * 分页查询索引 排序就默认按传入的fields属性的第一个元素的匹配度降序排列 * * @param indexDir * 索引根保存位置 * @param poolDir * 索引池保存位置 * @param keyWords * 关键词 * @param fields * 属性 * @param isHighlighter * 是否高亮显示 * @param pageSize * 每页记录数 * @param currentPage * 当前页数 * @param isPage * 是否分页,如无需分页只查条数的话,直接传入条数即可,大大优化索引查询效率 * @throws IOException * @return SearchResult 查询结果集 * @throws IOException * @throws InvalidTokenOffsetsException */ public static SearchResult searchPage(String indexDir, String poolDir, String[] keyWords, String[] fields, boolean isHighlighter, int pageSize, int currentPage,boolean isPage) throws IOException, InvalidTokenOffsetsException { //将关键字中的特殊符号过滤 if(null != keyWords 
&& keyWords.length>0){ String[] tmp = new String[keyWords.length]; for(int i = 0;i<keyWords.length;i++){ tmp[i] = stringFilter(keyWords[i]); } keyWords = tmp; } SearchResult searchResult = new SearchResult(); IndexSearcher searcher = null; try { IndexReader reader = getIndexReader(indexDir, poolDir); if (null != reader) { reader = refreshIndexReader(poolDir, reader); } if(null == reader){ log.error("索引文件为空,请检查!"); return null; } searcher = new IndexSearcher(reader); searcher.setDefaultFieldSortScoring(true, false); Analyzer analyzer = getAnalyzer(); if (keyWords[0].length() < 2) { analyzer = new StandardAnalyzer(Version.LUCENE_36); } Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, keyWords, fields, analyzer); // query.setBoost(0.1f); /* * 1.被排序的字段必须被索引过(Indexecd),在索引时不能 用 Field.Index.TOKENIZED * (用UN_TOKENIZED可以正常实现.用NO时查询正常,但排序不能正常设置升降序) 2.SortField类型 * SCORE、DOC、AUTO、STRING、INT、FLOAT、CUSTOM 此类型主要是根据字段的类型选择 * 3.SortField的第三个参数代表是否是降序true:降序 false:升序 */ Sort sort = new Sort(new SortField[] { SortField.FIELD_SCORE, new SortField(fields[0], SortField.STRING, true) }); TopDocs topDocs = null; if(isPage){ topDocs = searcher.search(query, searcher.maxDoc(), sort); }else{ int searchNum = pageSize<searcher.maxDoc()?pageSize:searcher.maxDoc(); topDocs = searcher.search(query, searchNum, sort); } ScoreDoc[] hits = topDocs.scoreDocs; int begin = pageSize * (currentPage - 1); int end = Math.min(begin + pageSize, hits.length); List<Document> documents = new ArrayList<Document>(); for (int i = begin; i < end; i++) { Document document = searcher.doc(hits[i].doc); if (isHighlighter) { document.getField(fields[0]) .setValue(toHighlighter(query, document, fields[0], analyzer)); } documents.add(document); // hits[i].score 匹配度分值 } searchResult.setPageSize(pageSize); searchResult.setCurrentPage(currentPage); searchResult.setDocuments(documents); searchResult.setTotalCount(hits.length); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { 
e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } return searchResult; } /** * 使用Field信息来批量删除文档 * @description: <br> * @author:jiaojun * @param indexDir * @param poolDir * @param field * @param keyWord * @throws IOException * @throws CorruptIndexException */ public static void deleteIndex(String indexDir, String poolDir,String field,String keyWord) { IndexWriter writer = null; try { writer = getIndexWriter(indexDir, poolDir); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } if (null == writer) { log.error("IndexWriter获取失败"); return; } Term term=new Term(field,keyWord); //分别代表FieldName,和field的值。 try { writer.deleteDocuments(term); writer.forceMerge(DEFAULT_MAX_NUM_SEGMENTS); writer.commit(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } //reader.close();//在调用close方法前的删除只是标记删除,只有调用了writer.optimize后才//是真正的在物理上删除,否则是可以使用reader.undeleteAll(),方法进行恢复的 } /** * 释放索引 */ public static void destroy() { synchronized (writerPool) { Iterator<Entry<String, IndexWriter>> iterator = writerPool .entrySet().iterator(); while (iterator.hasNext()) { Entry<String, IndexWriter> entry = iterator.next(); IndexWriter indexWriter = entry.getValue(); try { indexWriter.commit(); indexWriter.close(); } catch (Exception e) { log.error("writerPool销毁失败,原因:" + e.getMessage()); } } writerPool.clear(); } synchronized (readerPool) { Iterator<Entry<String, IndexReader>> iterator = readerPool .entrySet().iterator(); while (iterator.hasNext()) { Entry<String, IndexReader> entry = iterator.next(); IndexReader indexReader = entry.getValue(); try { indexReader.close(); } catch (Exception e) { log.error("readerPool销毁失败,原因:" + e.getMessage()); } } readerPool.clear(); } } /** * 释放创建索引 */ public static void destroyIndexWriter() { synchronized (writerPool) { Iterator<Entry<String, IndexWriter>> iterator = writerPool 
.entrySet().iterator(); while (iterator.hasNext()) { Entry<String, IndexWriter> entry = iterator.next(); IndexWriter indexWriter = entry.getValue(); try { indexWriter.close(); } catch (Exception e) { log.error("writerPool销毁失败,原因:" + e.getMessage()); } } writerPool.clear(); } log.info("【创建索引池】完成销毁"); } /** * 释放旧查询索引 */ public static void destroyIndexReader(Map<Long, IndexReader> readerPool) { synchronized (readerPool) { Iterator<Entry<Long, IndexReader>> iterator = readerPool.entrySet() .iterator(); while (iterator.hasNext()) { Entry<Long, IndexReader> entry = iterator.next(); if ((System.currentTimeMillis() - entry.getKey()) >= STALE_INDEXREADER_SURVIVAL_TIME) { IndexReader indexReader = entry.getValue(); try { indexReader.close(); log.info("【查询索引池】完成销毁" + entry.getValue()); } catch (Exception e) { log.error("readerPool销毁失败,原因:" + e.getMessage()); } } } readerPool.clear(); } } /** * 刷新指定的indexReader--加载新的索引数据,若产生新的indexReader, * 则在indexReaderMap里替换旧的indexReader * * @param indexDirName * @param indexReader * @return {@link IndexReader} */ private synchronized static IndexReader refreshIndexReader(String poolDir, IndexReader indexReader) { try { destroyIndexReader(stalereaderPool); IndexReader newIndexReader = indexReader.reopen(false); if (newIndexReader != indexReader) { IndexReader oldIndexReader = indexReader; stalereaderPool.put(System.currentTimeMillis(), oldIndexReader); readerPool.put(poolDir, newIndexReader); } } catch (Exception e) { log.error("刷新索引失败" + e.getMessage()); } // return newest IndexReader return readerPool.get(poolDir); } /** * 过滤特殊符号 * * @param str * @return * @throws PatternSyntaxException */ public static String stringFilter(String str) throws PatternSyntaxException { String regEx = "[`~!@#$%^&*()+=|{ }':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?·\'\"\\-\t\n\r]"; Pattern p = Pattern.compile(regEx); Matcher m = p.matcher(str); return m.replaceAll("").trim(); } /** * 高亮设置 * * @param query * @param doc * @param field * @return */ private 
static String toHighlighter(Query query, Document doc, String field, Analyzer analyzer) { try { SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter( "<font color=\"red\">", "</font>"); Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter(doc.get(field) .length() + 100)); TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(doc.get(field))); String highlighterStr = highlighter.getBestFragment(tokenStream, doc.get(field)); return highlighterStr == null ? doc.get(field) : highlighterStr; } catch (IOException e) { log.error(e.getMessage()); } catch (InvalidTokenOffsetsException e) { log.error(e.getMessage()); } return null; } @SuppressWarnings("static-access") private static IndexWriter getIndexWriter(String indexDir, String poolDir) throws CorruptIndexException, LockObtainFailedException, IOException { IndexWriter writer = writerPool.get(poolDir); if (writer == null) { synchronized (writerPool) { if (!writerPool.containsKey(poolDir)) { try { writer = createIndexWriter(indexDir + poolDir); if (writer != null) writerPool.put(poolDir, writer); } catch (IOException e) { if (IndexWriter.isLocked(FSDirectory .open(getIndexFile(indexDir + poolDir)))) { IndexWriter.unlock(FSDirectory .open(getIndexFile(indexDir + poolDir))); } log.error(e.getMessage()); e.printStackTrace(); destroy(); } } } } return writer; } private static IndexReader getIndexReader(String indexDir, String poolDir) throws CorruptIndexException, IOException { IndexReader reader = readerPool.get(poolDir); synchronized (readerPool) { if (!readerPool.containsKey(poolDir)) { try { reader = IndexReader.open(FSDirectory .open(getIndexFile(indexDir + poolDir)),false); if (reader != null) readerPool.put(poolDir, reader); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } return reader; } private static IndexWriter createIndexWriter(String dir) 
throws CorruptIndexException, LockObtainFailedException, IOException { /* * mmseg4j:ComplexAnalyzer 适用于高匹配度的中文 lucene标准:StandardAnalyzer */ IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, getAnalyzer()); /* * 创建索引模式:CREATE,覆盖模式; conf.setOpenMode(OpenMode.CREATE); * * APPEND,追加模式 conf.setOpenMode(OpenMode.CREATE_OR_APPEND); */ conf.setOpenMode(OpenMode.CREATE_OR_APPEND); if (IndexWriter.isLocked(FSDirectory.open(getIndexFile(dir)))) { IndexWriter.unlock(FSDirectory.open(getIndexFile(dir))); } IndexWriter writer = new IndexWriter(FSDirectory .open(getIndexFile(dir)), conf); return writer; } /** * 获取分词模式 paodingAnalyer Paoding paoding = PaodingMaker.make(); return * PaodingAnalyzer.writerMode(paoding); //writer mode意味要同时支持最大和最小切词 * lucene标准:StandardAnalyzer new StandardAnalyzer(Version.LUCENE_36); * * @return */ private static Analyzer getAnalyzer() { Paoding paoding = PaodingMaker.make(); return PaodingAnalyzer.writerMode(paoding); } private static File getIndexFile(String dir) { return new File(new StringBuilder(new File(LuceneUtil.class .getResource("/").getPath()).getParentFile().getParentFile() .getPath().replace('\\', '/').toString()).append(dir) .toString()); } public static void main(String[] args) { System.out.println(stringFilter("[不懂就要问]请问 H6不能插u盘听歌吗 知道的说下 谢谢!")); // init(); // for (int i = 0; i < 50; i++) { // new Thread(new Runnable() { // // @Override // public void run() { // try { // Thread.currentThread().sleep(500); // } catch (InterruptedException e) { // e.printStackTrace(); // } // // IndexWriter close = null; // IndexWriter noClose = null; // IndexWriter searchLog = null; // try { // close = getIndexWriter("/WEB-INF/index/", "close"); // noClose = getIndexWriter("/WEB-INF/index/", "noClose"); // searchLog = getIndexWriter("/WEB-INF/index/", // "searchLog"); // // IndexReader readerc = getIndexReader("/WEB-INF/index/", // "close"); // IndexReader readern = getIndexReader("/WEB-INF/index/", // "noClose"); // IndexReader readers = 
getIndexReader("/WEB-INF/index/", // "searchLog"); // // // System.out.println(readerc); // // System.out.println(readern); // // System.out.println(readers); // // } catch (CorruptIndexException e) { // e.printStackTrace(); // } catch (LockObtainFailedException e) { // e.printStackTrace(); // } catch (IOException e) { // e.printStackTrace(); // } // // if (close == null || noClose == null) { // System.out.println("-----------"); // } // // System.out.println(close); // // System.out.println(noClose); // // System.out.println(searchLog); // // } // }).start(); // } // // // destroy(); } }
0.1版
package com.junjiao.util.search; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.paoding.analysis.analyzer.PaodingAnalyzer; import net.paoding.analysis.knife.Paoding; import net.paoding.analysis.knife.PaodingMaker; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import 
org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; import com.junjiao.util.java.PropertyUtil; /** * luence操作工具类 提供索引创建、查询功能 lucene vsrsion 3.6.1 * * 【索引的检查与修复】 * CheckIndex在lucene-core jar包的org.apache.lucene.index目录下。它的功能是检查索引的的健康情况和修复索引。<br/> * 如果检查出某些segments有错误, 可以通过-fix参数执行修复操作,修复的过程就是创建一个新的segments,把所有引 <br/> * 用错误segments的索引数据删除。 * * cd /var/www/virtualhost/qa.51auto.cn/WEB-INF/lib * java -cp /var/www/virtualhost/qa.51auto.cn/WEB-INF/lib/lucene-core-3.6.1.jar -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex /var/www/virtualhost/qa.51auto.cn/WEB-INF/index/all * 检查 * java -cp lucene-core-3.6.1.jar -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex /var/www/virtualhost/qa.51auto.cn/WEB-INF/index/all * 修复 * java -cp lucene-core-3.6.1.jar -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex /var/www/virtualhost/qa.51auto.cn/WEB-INF/index/tag -fix * @author jiaojun [junjiao.j@gmail.com] * @version v0.0.1 * @param <T> * @date 2012-08-20 */ public class LuceneUtil<T> { private static Log log = LogFactory.getLog(LuceneUtil.class); /** * 索引优化后文件段的数量,数量越大,优化效率越大 */ private static final int DEFAULT_MAX_NUM_SEGMENTS = 3; /** * 低版本的查询索引存活周期 */ private static final long STALE_INDEXREADER_SURVIVAL_TIME = 60000; private static Map<String, IndexWriter> writerPool = new HashMap<String, IndexWriter>(); private static Map<String, IndexReader> readerPool = new HashMap<String, IndexReader>(); /** * 存放IndexReader的Map,Map里存放的都是已经实例化好的IndexReader */ private static Map<Long, IndexReader> stalereaderPool = new HashMap<Long, IndexReader>(); private static LuceneUtil util = null; private LuceneUtil() { } public synchronized static LuceneUtil getInstance() { if (util == null) { util = new LuceneUtil(); } return util; } static { init(); } /** * 始化索引池初 */ public static void init() { log.info("索引池初始化开始"); String indexDir = PropertyUtil.getPropertiesByKey("lucene.properties", 
"lucene.index.dir"); String pool = PropertyUtil.getPropertiesByKey("lucene.properties", "lucene.index.pool"); for (String poolDir : pool.split(",")) { synchronized (writerPool) { try { IndexWriter iw = createIndexWriter(indexDir + poolDir); if (iw != null) writerPool.put(poolDir, iw); } catch (IOException e) { log.error("writerPool初始化失败,原因:" + e.getMessage()); } } synchronized (writerPool) { try { IndexReader ir = IndexReader.open(FSDirectory .open(getIndexFile(indexDir + poolDir))); if (ir != null) readerPool.put(poolDir, ir); } catch (Exception e) { log.error("readerPool初始化失败,原因:" + e.getMessage()); } } } log.info("索引池初始化完成"); } /** * 创建索引池初始化 */ public static void initIndexWriter() { log.info("【创建索引池】初始化开始"); String indexDir = PropertyUtil.getPropertiesByKey("lucene.properties", "lucene.index.dir"); String pool = PropertyUtil.getPropertiesByKey("lucene.properties", "lucene.index.pool"); for (String poolDir : pool.split(",")) { synchronized (writerPool) { try { IndexWriter iw = createIndexWriter(indexDir + poolDir); if (iw != null) writerPool.put(poolDir, iw); } catch (IOException e) { log.error("writerPool初始化失败,原因:" + e.getMessage()); } } } log.info("【创建索引池】初始化完成"); } /** * 创建索引,建议定时更新即可 * * @param <T> * @param indexDir * 索引根保存位置 * @param poolDir * 索引池保存位置 * @param list * 需要创建索引的数据 * @param clz * 数据绑定的对象 * @param fields * 须创建索引的属性(小写) * @throws IOException * @throws NoSuchMethodException * @throws SecurityException * @throws InvocationTargetException * @throws IllegalAccessException * @throws IllegalArgumentException */ public static <T> void createIndex(String indexDir, String poolDir, List<?> list, Class<T> clz, String[] fields) throws IOException, SecurityException, NoSuchMethodException, IllegalArgumentException, IllegalAccessException, InvocationTargetException { createIndex(indexDir,poolDir,list,clz, fields,false); } /** * 创建索引,建议定时更新即可 * * @param <T> * @param indexDir * 索引根保存位置 * @param poolDir * 索引池保存位置 * @param list * 需要创建索引的数据 * @param clz * 数据绑定的对象 * 
@param fields * 须创建索引的属性(小写) * @param isDel * 是否删除原索引重新创建 * @throws IOException * @throws NoSuchMethodException * @throws SecurityException * @throws InvocationTargetException * @throws IllegalAccessException * @throws IllegalArgumentException */ public static <T> void createIndex(String indexDir, String poolDir, List<?> list, Class<T> clz, String[] fields,boolean isDel) throws IOException, SecurityException, NoSuchMethodException, IllegalArgumentException, IllegalAccessException, InvocationTargetException { log.info("索引开始创建,服务于 " + clz + " | " + fields.toString()); long start = new Date().getTime(); IndexWriter writer = getIndexWriter(indexDir, poolDir); if (null == writer) { log.error("IndexWriter获取失败"); return; } // 删除全部索引 if(isDel){ writer.deleteAll(); } SimpleDateFormat simpleDateFormat = new SimpleDateFormat( "yyyy-MM-dd hh:mm:ss"); if (null != list && list.size() > 0) { for (int i = 0; i < list.size(); i++) { Document doc = new Document(); java.lang.reflect.Field[] cfs = clz.getDeclaredFields(); for (java.lang.reflect.Field cf : cfs) { String fieldName = cf.getName(); String stringLetter = fieldName.substring(0, 1) .toUpperCase(); String getName = "get" + stringLetter + fieldName.substring(1); // String setName="set"+stringLetter+fieldName.substring(1); Method getMethod = clz.getMethod(getName); // Method setMethod=clz.getMethod(setName, new // Class[]{cf.getType()}); Object value = getMethod.invoke((T) list.get(i)); if (Arrays.asList(fields).contains(fieldName)) { if (value != null && !"".equals(value.toString())) { String tmp = ""; if (cf.getGenericType().toString().equals( "class java.util.Date")) { tmp = simpleDateFormat.format(value); } else { tmp = value.toString(); } doc.add(new Field(fieldName, tmp, Field.Store.YES, Field.Index.ANALYZED)); } } } if(!isDel){ /** * 先将fields[0]的索引查找到,然后再删除,最后将新的索引添加到索引文件中 */ if(null != doc.get(fields[0])){ writer.updateDocument(new Term(fields[0], doc.get(fields[0])), doc); } } } log.info("索引创建完成,保存目录:" + indexDir + 
poolDir + ",索引创建/记录:" + writer.maxDoc() + "/" + list.size() + "条,花费时间:" + (new Date().getTime() - start) / 1000 + "秒!" + writer); list.clear(); } writer.forceMerge(DEFAULT_MAX_NUM_SEGMENTS); writer.commit(); } /** * 分页查询索引 排序就默认按传入的fields属性的第一个元素的匹配度降序排列 * * @param indexDir * 索引根保存位置 * @param poolDir * 索引池保存位置 * @param keyWords * 关键词 * @param fields * 属性 * @param pageSize * 每页记录数 * @param currentPage * 当前页数 * @throws IOException * @return SearchResult 查询结果集 * @throws IOException * @throws InvalidTokenOffsetsException */ public static SearchResult searchPage(String indexDir, String poolDir, String[] keyWords, String[] fields, int pageSize, int currentPage) throws IOException, InvalidTokenOffsetsException { return searchPage(indexDir, poolDir, keyWords, fields, true, pageSize, currentPage); } /** * 分页查询索引 排序就默认按传入的fields属性的第一个元素的匹配度降序排列 * * @param indexDir * 索引根保存位置 * @param poolDir * 索引池保存位置 * @param keyWords * 关键词 * @param fields * 属性 * @param isHighlighter * 是否高亮显示 * @param pageSize * 每页记录数 * @param currentPage * 当前页数 * @throws IOException * @return SearchResult 查询结果集 * @throws IOException * @throws InvalidTokenOffsetsException */ public static SearchResult searchPage(String indexDir, String poolDir, String[] keyWords, String[] fields, boolean isHighlighter, int pageSize, int currentPage) throws IOException, InvalidTokenOffsetsException { return searchPage(indexDir, poolDir, keyWords, fields, true, pageSize, currentPage,true); } /** * 分页查询索引 排序就默认按传入的fields属性的第一个元素的匹配度降序排列 * * @param indexDir * 索引根保存位置 * @param poolDir * 索引池保存位置 * @param keyWords * 关键词 * @param fields * 属性 * @param isHighlighter * 是否高亮显示 * @param pageSize * 每页记录数 * @param currentPage * 当前页数 * @param isPage * 是否分页,如无需分页只查条数的话,直接传入条数即可,大大优化索引查询效率 * @throws IOException * @return SearchResult 查询结果集 * @throws IOException * @throws InvalidTokenOffsetsException */ public static SearchResult searchPage(String indexDir, String poolDir, String[] keyWords, String[] fields, boolean isHighlighter, int 
pageSize, int currentPage,boolean isPage) throws IOException, InvalidTokenOffsetsException { //将关键字中的特殊符号过滤 if(null != keyWords && keyWords.length>0){ String[] tmp = new String[keyWords.length]; for(int i = 0;i<keyWords.length;i++){ tmp[i] = stringFilter(keyWords[i]); } keyWords = tmp; } SearchResult searchResult = new SearchResult(); IndexSearcher searcher = null; try { IndexReader reader = getIndexReader(indexDir, poolDir); if (null != reader) { reader = refreshIndexReader(poolDir, reader); } if(null == reader){ log.error("索引文件为空,请检查!"); return null; } searcher = new IndexSearcher(reader); searcher.setDefaultFieldSortScoring(true, false); Analyzer analyzer = getAnalyzer(); if (keyWords[0].length() < 2) { analyzer = new StandardAnalyzer(Version.LUCENE_36); } Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, keyWords, fields, analyzer); // query.setBoost(0.1f); /* * 1.被排序的字段必须被索引过(Indexecd),在索引时不能 用 Field.Index.TOKENIZED * (用UN_TOKENIZED可以正常实现.用NO时查询正常,但排序不能正常设置升降序) 2.SortField类型 * SCORE、DOC、AUTO、STRING、INT、FLOAT、CUSTOM 此类型主要是根据字段的类型选择 * 3.SortField的第三个参数代表是否是降序true:降序 false:升序 */ Sort sort = new Sort(new SortField[] { SortField.FIELD_SCORE, new SortField(fields[0], SortField.STRING, true) }); TopDocs topDocs = null; if(isPage){ topDocs = searcher.search(query, searcher.maxDoc(), sort); }else{ int searchNum = pageSize<searcher.maxDoc()?pageSize:searcher.maxDoc(); topDocs = searcher.search(query, searchNum, sort); } ScoreDoc[] hits = topDocs.scoreDocs; int begin = pageSize * (currentPage - 1); int end = Math.min(begin + pageSize, hits.length); List<Document> documents = new ArrayList<Document>(); for (int i = begin; i < end; i++) { Document document = searcher.doc(hits[i].doc); if (isHighlighter) { document.getField(fields[0]) .setValue(toHighlighter(query, document, fields[0], analyzer)); } documents.add(document); // hits[i].score 匹配度分值 } searchResult.setPageSize(pageSize); searchResult.setCurrentPage(currentPage); searchResult.setDocuments(documents); 
searchResult.setTotalCount(hits.length); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } return searchResult; } /** * 释放索引 */ public static void destroy() { synchronized (writerPool) { Iterator<Entry<String, IndexWriter>> iterator = writerPool .entrySet().iterator(); while (iterator.hasNext()) { Entry<String, IndexWriter> entry = iterator.next(); IndexWriter indexWriter = entry.getValue(); try { indexWriter.commit(); indexWriter.close(); } catch (Exception e) { log.error("writerPool销毁失败,原因:" + e.getMessage()); } } writerPool.clear(); } synchronized (readerPool) { Iterator<Entry<String, IndexReader>> iterator = readerPool .entrySet().iterator(); while (iterator.hasNext()) { Entry<String, IndexReader> entry = iterator.next(); IndexReader indexReader = entry.getValue(); try { indexReader.close(); } catch (Exception e) { log.error("readerPool销毁失败,原因:" + e.getMessage()); } } readerPool.clear(); } } /** * 释放创建索引 */ public static void destroyIndexWriter() { synchronized (writerPool) { Iterator<Entry<String, IndexWriter>> iterator = writerPool .entrySet().iterator(); while (iterator.hasNext()) { Entry<String, IndexWriter> entry = iterator.next(); IndexWriter indexWriter = entry.getValue(); try { indexWriter.close(); } catch (Exception e) { log.error("writerPool销毁失败,原因:" + e.getMessage()); } } writerPool.clear(); } log.info("【创建索引池】完成销毁"); } /** * 释放旧查询索引 */ public static void destroyIndexReader(Map<Long, IndexReader> readerPool) { synchronized (readerPool) { Iterator<Entry<Long, IndexReader>> iterator = readerPool.entrySet() .iterator(); while (iterator.hasNext()) { Entry<Long, IndexReader> entry = iterator.next(); if ((System.currentTimeMillis() - entry.getKey()) >= STALE_INDEXREADER_SURVIVAL_TIME) { IndexReader indexReader = entry.getValue(); try { indexReader.close(); log.info("【查询索引池】完成销毁" + entry.getValue()); } catch (Exception e) { 
log.error("readerPool销毁失败,原因:" + e.getMessage()); } } } readerPool.clear(); } } /** * 刷新指定的indexReader--加载新的索引数据,若产生新的indexReader, * 则在indexReaderMap里替换旧的indexReader * * @param indexDirName * @param indexReader * @return {@link IndexReader} */ private synchronized static IndexReader refreshIndexReader(String poolDir, IndexReader indexReader) { try { destroyIndexReader(stalereaderPool); IndexReader newIndexReader = indexReader.reopen(); if (newIndexReader != indexReader) { IndexReader oldIndexReader = indexReader; stalereaderPool.put(System.currentTimeMillis(), oldIndexReader); readerPool.put(poolDir, newIndexReader); } } catch (Exception e) { log.error("刷新索引失败" + e.getMessage()); } // return newest IndexReader return readerPool.get(poolDir); } /** * 过滤特殊符号 * * @param str * @return * @throws PatternSyntaxException */ public static String stringFilter(String str) throws PatternSyntaxException { String regEx = "[`~!@#$%^&*()+=|{ }':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?·\'\"\\-\t\n\r]"; Pattern p = Pattern.compile(regEx); Matcher m = p.matcher(str); return m.replaceAll("").trim(); } /** * 高亮设置 * * @param query * @param doc * @param field * @return */ private static String toHighlighter(Query query, Document doc, String field, Analyzer analyzer) { try { SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter( "<font color=\"red\">", "</font>"); Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter(doc.get(field) .length() + 100)); TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(doc.get(field))); String highlighterStr = highlighter.getBestFragment(tokenStream, doc.get(field)); return highlighterStr == null ? 
doc.get(field) : highlighterStr; } catch (IOException e) { log.error(e.getMessage()); } catch (InvalidTokenOffsetsException e) { log.error(e.getMessage()); } return null; } @SuppressWarnings("static-access") private static IndexWriter getIndexWriter(String indexDir, String poolDir) throws CorruptIndexException, LockObtainFailedException, IOException { IndexWriter writer = writerPool.get(poolDir); if (writer == null) { synchronized (writerPool) { if (!writerPool.containsKey(poolDir)) { try { writer = createIndexWriter(indexDir + poolDir); if (writer != null) writerPool.put(poolDir, writer); } catch (IOException e) { if (IndexWriter.isLocked(FSDirectory .open(getIndexFile(indexDir + poolDir)))) { IndexWriter.unlock(FSDirectory .open(getIndexFile(indexDir + poolDir))); } log.error(e.getMessage()); e.printStackTrace(); destroy(); } } } } return writer; } private static IndexReader getIndexReader(String indexDir, String poolDir) throws CorruptIndexException, IOException { IndexReader reader = readerPool.get(poolDir); synchronized (readerPool) { if (!readerPool.containsKey(poolDir)) { try { reader = IndexReader.open(FSDirectory .open(getIndexFile(indexDir + poolDir))); if (reader != null) readerPool.put(poolDir, reader); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } return reader; } private static IndexWriter createIndexWriter(String dir) throws CorruptIndexException, LockObtainFailedException, IOException { /* * mmseg4j:ComplexAnalyzer 适用于高匹配度的中文 lucene标准:StandardAnalyzer */ IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, getAnalyzer()); /* * 创建索引模式:CREATE,覆盖模式; conf.setOpenMode(OpenMode.CREATE); * * APPEND,追加模式 conf.setOpenMode(OpenMode.CREATE_OR_APPEND); */ conf.setOpenMode(OpenMode.CREATE_OR_APPEND); if (IndexWriter.isLocked(FSDirectory.open(getIndexFile(dir)))) { IndexWriter.unlock(FSDirectory.open(getIndexFile(dir))); } IndexWriter writer = new IndexWriter(FSDirectory 
.open(getIndexFile(dir)), conf); return writer; } /** * 获取分词模式 paodingAnalyer Paoding paoding = PaodingMaker.make(); return * PaodingAnalyzer.writerMode(paoding); //writer mode意味要同时支持最大和最小切词 * lucene标准:StandardAnalyzer new StandardAnalyzer(Version.LUCENE_36); * * @return */ private static Analyzer getAnalyzer() { Paoding paoding = PaodingMaker.make(); return PaodingAnalyzer.writerMode(paoding); } private static File getIndexFile(String dir) { return new File(new StringBuilder(new File(LuceneUtil.class .getResource("/").getPath()).getParentFile().getParentFile() .getPath().replace('\\', '/').toString()).append(dir) .toString()); } public static void main(String[] args) { System.out.println(stringFilter("[不懂就要问]请问 H6不能插u盘听歌吗 知道的说下 谢谢!")); // init(); // for (int i = 0; i < 50; i++) { // new Thread(new Runnable() { // // @Override // public void run() { // try { // Thread.currentThread().sleep(500); // } catch (InterruptedException e) { // e.printStackTrace(); // } // // IndexWriter close = null; // IndexWriter noClose = null; // IndexWriter searchLog = null; // try { // close = getIndexWriter("/WEB-INF/index/", "close"); // noClose = getIndexWriter("/WEB-INF/index/", "noClose"); // searchLog = getIndexWriter("/WEB-INF/index/", // "searchLog"); // // IndexReader readerc = getIndexReader("/WEB-INF/index/", // "close"); // IndexReader readern = getIndexReader("/WEB-INF/index/", // "noClose"); // IndexReader readers = getIndexReader("/WEB-INF/index/", // "searchLog"); // // // System.out.println(readerc); // // System.out.println(readern); // // System.out.println(readers); // // } catch (CorruptIndexException e) { // e.printStackTrace(); // } catch (LockObtainFailedException e) { // e.printStackTrace(); // } catch (IOException e) { // e.printStackTrace(); // } // // if (close == null || noClose == null) { // System.out.println("-----------"); // } // // System.out.println(close); // // System.out.println(noClose); // // System.out.println(searchLog); // // } // 
}).start(); // } // // // destroy(); } }