For detailed steps, see http://blog.csdn.net/sxyx2008/article/details/7904860
NutchAnalysis.jj after adding Chinese tokenization. The key change is the <SIGRAM: (<CJK>)+ > rule: a run of consecutive CJK characters is now matched as a single token, where the stock grammar (<SIGRAM: <CJK> >) emitted one single-character token per CJK character.
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** JavaCC code for the Nutch lexical analyzer. */

options {
  STATIC = false;
  USER_CHAR_STREAM = true;
  OPTIMIZE_TOKEN_MANAGER = true;
  UNICODE_INPUT = true;
  //DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(NutchAnalysis)

package org.apache.nutch.analysis;

import java.io.StringReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.util.NutchConfiguration;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

import java.io.*;
import java.util.*;

/** The JavaCC-generated Nutch lexical analyzer and query parser. */
public class NutchAnalysis {

  private static final String[] STOP_WORDS = {
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "s", "such",
    "t", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };

  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);

  private Analyzer analyzer = null;
  private String queryString;
  private QueryFilters queryFilters;

  /** Constructs a nutch analysis. */
  public NutchAnalysis(String query, Analyzer analyzer) {
    this(new FastCharStream(new StringReader(query)));
    this.analyzer = analyzer;
  }

  /** True iff word is a stop word.  Stop words are only removed from queries.
   * Every word is indexed. */
  public static boolean isStopWord(String word) {
    return STOP_SET.contains(word);
  }

  /** Construct a query parser for the text in a reader. */
  public static Query parseQuery(String queryString, Configuration conf)
      throws IOException {
    return parseQuery(queryString, null, conf);
  }

  /** Construct a query parser for the text in a reader. */
  public static Query parseQuery(String queryString, Analyzer analyzer,
      Configuration conf) throws IOException {
    NutchAnalysis parser = new NutchAnalysis(
        queryString,
        (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf));
    parser.queryString = queryString;
    parser.queryFilters = new QueryFilters(conf);
    return parser.parse(conf);
  }

  /** For debugging. */
  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    while (true) {
      System.out.print("Query: ");
      String line = in.readLine();
      System.out.println(parseQuery(line, NutchConfiguration.create()));
    }
  }

}

PARSER_END(NutchAnalysis)

TOKEN_MGR_DECLS : {

  /** Constructs a token manager for the provided Reader. */
  public NutchAnalysisTokenManager(Reader reader) {
    this(new FastCharStream(reader));
  }

}

TOKEN : {                                       // token regular expressions

  // basic word -- lowercase it
  <WORD: ((<LETTER>|<DIGIT>|<WORD_PUNCT>)+ | <IRREGULAR_WORD>)>
  { matchedToken.image = matchedToken.image.toLowerCase(); }

  // special handling for acronyms: U.S.A., I.B.M., etc: dots are removed
| <ACRONYM: <LETTER> "." (<LETTER> ".")+ >
  {
    // remove dots
    for (int i = 0; i < image.length(); i++) {
      if (image.charAt(i) == '.')
        image.deleteCharAt(i--);
    }
    matchedToken.image = image.toString().toLowerCase();
  }

  // chinese, japanese and korean characters
| <SIGRAM: (<CJK>)+ >

  // irregular words
| <#IRREGULAR_WORD: (<C_PLUS_PLUS>|<C_SHARP>)>
| <#C_PLUS_PLUS: ("C"|"c") "++" >
| <#C_SHARP: ("C"|"c") "#" >

  // query syntax characters
| <PLUS: "+" >
| <MINUS: "-" >
| <QUOTE: "\"" >
| <COLON: ":" >
| <SLASH: "/" >
| <DOT: "." >
| <ATSIGN: "@" >
| <APOSTROPHE: "'" >

| <WHITE: ~[] >                                 // treat unrecognized chars
                                                // as whitespace

  // primitive, non-token patterns
| <#WORD_PUNCT: ("_"|"&")>                      // allowed anywhere in words

| < #LETTER:                                    // alphabets
    [ "\u0041"-"\u005a", "\u0061"-"\u007a",
      "\u00c0"-"\u00d6", "\u00d8"-"\u00f6", "\u00f8"-"\u00ff",
      "\u0100"-"\u1fff" ]
  >

| <#CJK:                                        // non-alphabets
    [ "\u3040"-"\u318f", "\u3300"-"\u337f", "\u3400"-"\u3d2d",
      "\u4e00"-"\u9fff", "\uf900"-"\ufaff" ]
  >

| < #DIGIT:                                     // unicode digits
    [ "\u0030"-"\u0039", "\u0660"-"\u0669", "\u06f0"-"\u06f9",
      "\u0966"-"\u096f", "\u09e6"-"\u09ef", "\u0a66"-"\u0a6f",
      "\u0ae6"-"\u0aef", "\u0b66"-"\u0b6f", "\u0be7"-"\u0bef",
      "\u0c66"-"\u0c6f", "\u0ce6"-"\u0cef", "\u0d66"-"\u0d6f",
      "\u0e50"-"\u0e59", "\u0ed0"-"\u0ed9", "\u1040"-"\u1049" ]
  >

}

/** Parse a query. */
Query parse(Configuration conf) :
{
  Query query = new Query(conf);
  ArrayList terms;
  Token token;
  String field;
  boolean stop;
  boolean prohibited;
}
{
  nonOpOrTerm()                                 // skip noise
  (
    { stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; }

    // optional + or - operator
    ( <PLUS> {stop=false;} | (<MINUS> { stop=false;prohibited=true; } ))?

    // optional field spec.
    ( LOOKAHEAD(<WORD><COLON>(phrase(field)|compound(field)))
      token=<WORD> <COLON> { field = token.image; } )?

    ( terms=phrase(field) {stop=false;} |       // quoted terms or
      terms=compound(field))                    // single or compound term

    nonOpOrTerm()                               // skip noise

    {
      String[] array = (String[])terms.toArray(new String[terms.size()]);

      if (stop
          && field == Clause.DEFAULT_FIELD
          && terms.size()==1
          && isStopWord(array[0])) {
        // ignore stop words only when single, unadorned terms in default field
      } else {
        if (prohibited)
          query.addProhibitedPhrase(array, field);
        else
          query.addRequiredPhrase(array, field);
      }
    }
  )*

  { return query; }
}

/** Parse an explicitly quoted phrase query.  Note that this may return a
 * single term, a trivial phrase. */
ArrayList phrase(String field) :
{
  int start;
  int end;
  ArrayList result = new ArrayList();
  String term;
}
{
  <QUOTE> { start = token.endColumn; }

  (nonTerm())*                                  // skip noise

  ( term = term() { result.add(term); }         // parse a term
    (nonTerm())*)*                              // skip noise

  { end = token.endColumn; }

  (<QUOTE>|<EOF>)

  {
    if (this.queryFilters.isRawField(field)) {
      result.clear();
      result.add(queryString.substring(start, end));
    }
    return result;
  }
}

/** Parse a compound term that is interpreted as an implicit phrase query.
 * Compounds are a sequence of terms separated by infix characters.  Note that
 * this may return a single term, a trivial compound. */
ArrayList compound(String field) :
{
  int start;
  ArrayList result = new ArrayList();
  String term;
  StringBuffer terms = new StringBuffer();
}
{
  { start = token.endColumn; }

  term = term() {
    terms.append(term).append(" ");
    //result.add(term);
  }

  ( LOOKAHEAD( (infix())+ term() )
    (infix())+
    term = term() {
      terms.append(term).append(" ");
      //result.add(term);
    })*

  {
    if (this.queryFilters.isRawField(field)) {
      // result.clear();
      result.add(queryString.substring(start, token.endColumn));
    } else {
      org.apache.lucene.analysis.Token token;
      TokenStream tokens = analyzer.tokenStream(
          field, new StringReader(terms.toString()));
      while (true) {
        try {
          token = tokens.next();
        } catch (IOException e) {
          token = null;
        }
        if (token == null) { break; }
        result.add(token.termText());
      }
      try {
        tokens.close();
      } catch (IOException e) {
        // ignore
      }
    }
    return result;
  }
}

/** Parse a single term. */
String term() :
{
  Token token;
}
{
  ( token=<WORD> | token=<ACRONYM> | token=<SIGRAM>)
  { return token.image; }
}

/** Parse anything but a term or a quote. */
void nonTerm() : {}
{
  <WHITE> | infix()
}

void nonTermOrEOF() : {}
{
  nonTerm() | <EOF>
}

/** Parse anything but a term or an operator (plus or minus or quote). */
void nonOpOrTerm() : {}
{
  (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
}

/** Characters which can be used to form compound terms. */
void infix() : {}
{
  <PLUS> | <MINUS> | nonOpInfix()
}

/** Parse infix characters except plus and minus. */
void nonOpInfix() : {}
{
  <COLON>|<SLASH>|<DOT>|<ATSIGN>|<APOSTROPHE>
}
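After editing the grammar, regenerate the parser sources before compiling: ant compiles the checked-in Java sources under src/java/org/apache/nutch/analysis rather than regenerating them from the .jj file. A minimal command sketch, assuming JavaCC is installed and on your PATH:

cd src/java/org/apache/nutch/analysis
javacc NutchAnalysis.jj

This rewrites NutchAnalysis.java, NutchAnalysisTokenManager.java, and the other generated files in that directory. In some setups the regenerated parse() declares ParseException, in which case callers such as parseQuery() must be updated to declare or handle it before the build succeeds.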
Modify NutchDocumentAnalyzer.java so that the content, title, and DEFAULT fields are analyzed with je-analysis's MMAnalyzer:
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.analysis;

// JDK imports
import java.io.Reader;
import java.io.IOException;

// Lucene imports
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;

import org.apache.hadoop.conf.Configuration;

import jeasy.analysis.*;

/**
 * The analyzer used for Nutch documents.  Uses the JavaCC-defined lexical
 * analyzer {@link NutchDocumentTokenizer}, with no stop list.  This keeps it
 * consistent with query parsing.
 */
public class NutchDocumentAnalyzer extends NutchAnalyzer {

  /** Analyzer used to index textual content. */
  private static Analyzer CONTENT_ANALYZER;

  // Anchor Analysis
  // Like content analysis, but leave gap between anchors to inhibit
  // cross-anchor phrase matching.

  /**
   * The number of unused term positions between anchors in the anchor field.
   */
  public static final int INTER_ANCHOR_GAP = 4;

  /** Analyzer used to analyze anchors. */
  private static Analyzer ANCHOR_ANALYZER;

  /**
   * @param conf
   */
  public NutchDocumentAnalyzer(Configuration conf) {
    this.conf = conf;
    CONTENT_ANALYZER = new ContentAnalyzer(conf);
    ANCHOR_ANALYZER = new AnchorAnalyzer();
  }

  /** Analyzer used to index textual content. */
  private static class ContentAnalyzer extends Analyzer {
    private CommonGrams commonGrams;

    public ContentAnalyzer(Configuration conf) {
      this.commonGrams = new CommonGrams(conf);
    }

    /** Constructs a {@link NutchDocumentTokenizer}. */
    public TokenStream tokenStream(String field, Reader reader) {
      // return this.commonGrams.getFilter(new NutchDocumentTokenizer(reader), field);
      if ("content".equals(field) || "title".equals(field) || "DEFAULT".equals(field)) {
        MMAnalyzer analyzer = new MMAnalyzer();
        return analyzer.tokenStream(field, reader);
      } else {
        return this.commonGrams.getFilter(new NutchDocumentTokenizer(reader), field);
      }
    }
  }

  private static class AnchorFilter extends TokenFilter {
    private boolean first = true;

    public AnchorFilter(TokenStream input) {
      super(input);
    }

    public final Token next() throws IOException {
      Token result = input.next();
      if (result == null)
        return result;
      if (first) {
        result.setPositionIncrement(INTER_ANCHOR_GAP);
        first = false;
      }
      return result;
    }
  }

  private static class AnchorAnalyzer extends Analyzer {
    public final TokenStream tokenStream(String fieldName, Reader reader) {
      return new AnchorFilter(CONTENT_ANALYZER.tokenStream(fieldName, reader));
    }
  }

  /** Returns a new token stream for text from the named field. */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    Analyzer analyzer;
    if ("anchor".equals(fieldName))
      analyzer = ANCHOR_ANALYZER;
    else
      analyzer = CONTENT_ANALYZER;
    return analyzer.tokenStream(fieldName, reader);
  }
}
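To sanity-check the segmentation before rebuilding all of Nutch, you can exercise MMAnalyzer on its own. Below is a minimal sketch (the class name and sample text are illustrative), using the same Lucene 2.x next()/termText() loop that compound() in NutchAnalysis.jj uses, and assuming je-analysis-1.5.3.jar plus the Lucene jar shipped with Nutch 1.0 are on the classpath:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

import jeasy.analysis.MMAnalyzer;

/** Illustrative smoke test: prints the tokens MMAnalyzer produces. */
public class MMAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    MMAnalyzer analyzer = new MMAnalyzer();
    // The same call the modified ContentAnalyzer makes for "content"/"title".
    TokenStream tokens = analyzer.tokenStream("content",
        new StringReader("中华人民共和国成立了"));
    Token token;
    while ((token = tokens.next()) != null) {   // Lucene 2.x token iteration
      System.out.println(token.termText());
    }
    tokens.close();
  }
}

If the segmenter is wired up correctly, this prints multi-character words rather than one line per character.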
Copy "je-analysis-1.5.3.jar" into the lib/ directory of the Nutch source tree.
Recompile Nutch:
Add one line to build.xml (immediately after line 195 of that file) so that the je-analysis jar is packaged when the war file is built. In build.xml:
<lib dir="${lib.dir}">
<include name="lucene*.jar"/>
<include name="taglibs-*.jar"/>
<include name="hadoop-*.jar"/>
<include name="dom4j-*.jar"/>
<include name="xerces-*.jar"/>
<include name="tika-*.jar"/>
<include name="apache-solr-*.jar"/>
<include name="commons-httpclient-*.jar"/>
<include name="commons-codec-*.jar"/>
<include name="commons-collections-*.jar"/>
<include name="commons-beanutils-*.jar"/>
<include name="commons-cli-*.jar"/>
<include name="commons-lang-*.jar"/>
<include name="commons-logging-*.jar"/>
<include name="log4j-*.jar"/>
<include name="je-analysis-*.jar"/> <!-- add this line -->
</lib>
compile:
cd nutch-1.0
export ANT_HOME=/usr/local/apache-ant-1.7.1
/usr/local/apache-ant-1.7.1/bin/ant
/usr/local/apache-ant-1.7.1/bin/ant war
Using the newly built modules with Chinese-segmentation support: only the following three build artifacts are needed; use them to replace the corresponding files of your Nutch 1.0 installation.
build/nutch-1.0.jar
build/nutch-1.0.job
build/nutch-1.0.war
Don't forget to copy "je-analysis-1.5.3.jar" into the lib/ directory (WEB-INF/lib) of the unpacked Nutch 1.0 war, for example:
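Example commands (paths are illustrative; adjust NUTCH_HOME and the Tomcat webapps directory to your layout):

cp build/nutch-1.0.jar $NUTCH_HOME/
cp build/nutch-1.0.job $NUTCH_HOME/
cp build/nutch-1.0.war $CATALINA_HOME/webapps/
cp lib/je-analysis-1.5.3.jar $CATALINA_HOME/webapps/nutch-1.0/WEB-INF/lib/

The last command is run after Tomcat has unpacked the war.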
Modify the WEB-INF/classes/nutch-site.xml file inside the war that ant produced:
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

<property>
  <name>http.agent.name</name>
  <value>My Nutch Spider</value>
</property>

<property>
  <name>searcher.dir</name>
  <value>E:/nutch-1.0/crawldata</value>
</property>

<property>
  <name>searcher.summary.length</name>
  <value>50</value>
</property>

</configuration>
http.agent.name: the crawler's name. It must match the agent name configured under conf/ in your Nutch installation, otherwise searches return no results.
searcher.dir: where the data Nutch crawled is stored, here E:/nutch-1.0/crawldata. Point it at the root of the crawl data; there is no need to point it at the index files themselves.
searcher.summary.length: the length of the result summaries.