现在的位置: 首页 > 综合 > 正文

Lucene TermVector用法:实现相关搜索功能及提高高亮显示性能

2013年08月21日 ⁄ 综合 ⁄ 共 4753字 ⁄ 字号 评论关闭

转自:http://hi.baidu.com/z57354658/blog/item/b80f524b2c92e1fa82025cbd.html

  1. public class TermVectorTest {  
  2.       
  3.     Analyzer analyzer = new SimpleAnalyzer();  
  4.     Directory ramDir = new RAMDirectory();  
  5.       
  6.     public void createRamIndex() throws CorruptIndexException, LockObtainFailedException, IOException{  
  7.           
  8.         IndexWriter writer = new IndexWriter(ramDir,analyzer,IndexWriter.MaxFieldLength.LIMITED);  
  9.           
  10.         Document doc1 = new Document();  
  11.         doc1.add(new Field("title","java",Store.YES,Index.ANALYZED));  
  12.         doc1.add(new Field("author","callan",Store.YES,Index.ANALYZED));  
  13.         doc1.add(new Field("subject","java一门编程语言,用java的人很多,编程语言也不少,但是java最流行",Store.YES,Index.ANALYZED,TermVector.WITH_POSITIONS_OFFSETS));  
  14.           
  15.         Document doc2 = new Document();  
  16.         doc2.add(new Field("title","english",Store.YES,Index.ANALYZED));  
  17.         doc2.add(new Field("author","wcq",Store.YES,Index.ANALYZED));  
  18.         doc2.add(new Field("subject","英语用的人很多",Store.YES,Index.ANALYZED,TermVector.WITH_POSITIONS_OFFSETS));  
  19.       
  20.         Document doc3 = new Document();  
  21.         doc3.add(new Field("title","asp",Store.YES,Index.ANALYZED));  
  22.         doc3.add(new Field("author","ca",Store.YES,Index.ANALYZED));  
  23.         doc3.add(new Field("subject","英语用的人很多",Store.YES,Index.ANALYZED,TermVector.WITH_POSITIONS_OFFSETS));  
  24.           
  25.         writer.addDocument(doc1);  
  26.         writer.addDocument(doc2);  
  27.         writer.addDocument(doc3);  
  28.           
  29.         writer.optimize();  
  30.         writer.close();  
  31.     }  
  32.       
  33.     public void search() throws CorruptIndexException, IOException{  
  34.         IndexReader reader = IndexReader.open(ramDir);  
  35.         IndexSearcher searcher = new IndexSearcher(reader);  
  36.         Term term = new Term("title","java");   //在title里查询java词条  
  37.         TermQuery query = new TermQuery(term);  
  38.         Hits hits = searcher.search(query);  
  39.         for (int i = 0; i < hits.length(); i++)  
  40.         {  
  41.             Document doc = hits.doc(i);  
  42.             System.out.println(doc.get("title"));  
  43.             System.out.println(doc.get("subject"));  
  44.             System.out.println("moreLike search: ");  
  45.               
  46.             morelikeSearch(reader,hits.id(i));  
  47.         }  
  48.     }  
  49.   
  50.     private void morelikeSearch(IndexReader reader,int id) throws IOException  
  51.     {  
  52.         //根据这个document的id获取这个field的Term Vector 信息,就是这个field分词之后在这个field里的频率、位置、等信息  
  53.         TermFreqVector vector = reader.getTermFreqVector(id, "subject");  
  54.           
  55.         BooleanQuery query = new BooleanQuery();    
  56.           
  57.         for (int i = 0; i < vector.size(); i++)  
  58.         {  
  59.              TermQuery tq = new TermQuery(new Term("subject",     
  60.                         vector.getTerms()[i]));   //获取每个term保存的Token  
  61.                      
  62.                  query.add(tq, BooleanClause.Occur.SHOULD);     
  63.   
  64.         }  
  65.           
  66.         IndexSearcher searcher = new IndexSearcher(ramDir);     
  67.              
  68.         Hits hits = searcher.search(query);     
  69.           
  70.         //显示代码,略  
  71.   
  72.           
  73.     }  
  74.   
  75. //Lucene使用TermVector提高高亮显示性能  
  76.     public void highterLightSearch() throws CorruptIndexException, IOException{  
  77.         IndexReader reader = IndexReader.open(ramDir);     
  78.           
  79.         IndexSearcher searcher = new IndexSearcher(reader);     
  80.              
  81.         TermQuery query = new TermQuery(new Term("subject","java"));     
  82.              
  83.         Hits hits = searcher.search(query);     
  84.              
  85.         //高亮显示设置     
  86.         SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>","</font>");  
  87.              
  88.         Highlighter highlighter =new Highlighter(simpleHTMLFormatter,new QueryScorer(query));     
  89.           
  90.          // 这个100是指定关键字字符串的context的长度,你可以自己设定,因为不可能返回整篇正文内容     
  91.         highlighter.setTextFragmenter(new SimpleFragmenter(100));     
  92.     
  93.         for(int i = 0; i < hits.length(); i++){     
  94.                  
  95.             Document doc = hits.doc(i);     
  96.                  
  97.             TermPositionVector termFreqVector = (TermPositionVector)reader.getTermFreqVector(hits.id(i), "subject");     
  98.                
  99.             TermFreqVector vector = reader.getTermFreqVector(hits.id(i), "subject");  
  100.             TokenStream tokenStream = TokenSources.getTokenStream(termFreqVector);     
  101.                  
  102.             String result = highlighter.getBestFragment(tokenStream, doc.get("subject"));     
  103.     
  104.             System.out.println(doc.get("title"));     
  105.                  
  106.             System.out.println(result);     
  107.                  
  108.         }     
  109.   
  110.           
  111.     }  
  112.       
  113.     public static void main(String[] args) throws CorruptIndexException, IOException  
  114.     {  
  115.         TermVectorTest  t = new TermVectorTest();  
  116.         t.createRamIndex();  
  117.         t.search();  
  118.     }  
  119.   

抱歉!评论已关闭.