源码地址:http://download.csdn.net/detail/yx511500623/6258163
开发环境
Eclipse IDE for Java EE + JDK 7 + Tomcat 7
Lucene 4.4 + crawler4j 3.5
索引文件位置:/csdn-blog-crawler/data
记得把生成的索引放入:/csdn-blog-crawler/WebContent
关键code如下:
/csdn-blog-crawler/src/cn/crawler/lucene/util/HtmlUtil.java
package cn.crawler.lucene.util; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.CssSelectorNodeFilter; import org.htmlparser.util.NodeList; public class HtmlUtil{ public static String getText(String html,String id){ try { Parser parser = new Parser(html); NodeFilter filter=new CssSelectorNodeFilter("#"+id); NodeList nList=parser.extractAllNodesThatMatch(filter); return nList==null||nList.size()==0?null: nList.elementAt(0).toPlainTextString(); } catch (Exception e) { e.printStackTrace(); return null; } } public static String getTextByClass(String html,String css_class){ try { Parser parser = new Parser(html); NodeFilter filter=new CssSelectorNodeFilter("."+css_class); NodeList nList=parser.extractAllNodesThatMatch(filter); return nList==null||nList.size()==0?null: nList.elementAt(0).toPlainTextString(); } catch (Exception e) { e.printStackTrace(); return null; } } public static String filterText(String text){ if(text==null) return null; text=text.replace(">",">"); text=text.replace("<","<"); text=text.replace(""","\""); text=text.replace(" "," "); text=text.replace("&","&"); text=text.replace("©","©"); text=text.replace(" ",""); return text; } }
截图如下: