
Adding the JE Chinese Word Segmenter to Nutch 1.0: Notes on the Required Changes


For the detailed procedure, see http://blog.csdn.net/sxyx2008/article/details/7904860

NutchAnalysis.jj after adding Chinese word segmentation (the key change is noted after the listing):

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** JavaCC code for the Nutch lexical analyzer. */

options {
  STATIC = false;
  USER_CHAR_STREAM = true;
  OPTIMIZE_TOKEN_MANAGER = true;
  UNICODE_INPUT = true;
//DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(NutchAnalysis)

package org.apache.nutch.analysis;

import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;



import java.io.*;
import java.util.*;

/** The JavaCC-generated Nutch lexical analyzer and query parser. */
public class NutchAnalysis {

  private static final String[] STOP_WORDS = {
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "s", "such",
    "t", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };

  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);

  private Analyzer analyzer = null;
  private String queryString;
  private QueryFilters queryFilters;
  

  /** Constructs a nutch analysis. */
  public NutchAnalysis(String query, Analyzer analyzer) {
    this(new FastCharStream(new StringReader(query)));
    this.analyzer = analyzer;
  }

  /** True iff word is a stop word.  Stop words are only removed from queries.
   * Every word is indexed.  */
  public static boolean isStopWord(String word) {
    return STOP_SET.contains(word);
  }

  /** Construct a query parser for the text in a reader. */
  public static Query parseQuery(String queryString, Configuration conf) throws IOException {
    return parseQuery(queryString, null, conf);
  }

  /** Construct a query parser for the text in a reader. */
  public static Query parseQuery(String queryString, Analyzer analyzer, Configuration conf)
    throws IOException {
    NutchAnalysis parser = new NutchAnalysis(
          queryString, (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf));
    parser.queryString = queryString;
    parser.queryFilters = new QueryFilters(conf);
    return parser.parse(conf);
  }

  /** For debugging. */
  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    while (true) {
      System.out.print("Query: ");
      String line = in.readLine();
      System.out.println(parseQuery(line, NutchConfiguration.create()));
    }
  }

}

PARSER_END(NutchAnalysis)

TOKEN_MGR_DECLS : {


  /** Constructs a token manager for the provided Reader. */
  public NutchAnalysisTokenManager(Reader reader) {
    this(new FastCharStream(reader));
  }

}

TOKEN : {					  // token regular expressions

  // basic word -- lowercase it
<WORD: ((<LETTER>|<DIGIT>|<WORD_PUNCT>)+ | <IRREGULAR_WORD>)>
  { matchedToken.image = matchedToken.image.toLowerCase(); }

  // special handling for acronyms: U.S.A., I.B.M., etc: dots are removed
| <ACRONYM: <LETTER> "." (<LETTER> ".")+ > 
    {                                             // remove dots
      for (int i = 0; i < image.length(); i++) {
	if (image.charAt(i) == '.')
	  image.deleteCharAt(i--);
      }
      matchedToken.image = image.toString().toLowerCase();
    }

  // chinese, japanese and korean characters
| <SIGRAM: (<CJK>)+ >

   // irregular words
| <#IRREGULAR_WORD: (<C_PLUS_PLUS>|<C_SHARP>)>
| <#C_PLUS_PLUS: ("C"|"c") "++" >
| <#C_SHARP: ("C"|"c") "#" >

  // query syntax characters
| <PLUS: "+" >
| <MINUS: "-" >
| <QUOTE: "\"" >
| <COLON: ":" >
| <SLASH: "/" >
| <DOT: "." >
| <ATSIGN: "@" >
| <APOSTROPHE: "'" >

| <WHITE: ~[] >                                   // treat unrecognized chars
                                                  // as whitespace
// primitive, non-token patterns

| <#WORD_PUNCT: ("_"|"&")>                        // allowed anywhere in words

| < #LETTER:					  // alphabets
    [
        "\u0041"-"\u005a",
        "\u0061"-"\u007a",
        "\u00c0"-"\u00d6",
        "\u00d8"-"\u00f6",
        "\u00f8"-"\u00ff",
        "\u0100"-"\u1fff"
    ]
    >

|  <#CJK:                                        // non-alphabets
      [
       "\u3040"-"\u318f",
       "\u3300"-"\u337f",
       "\u3400"-"\u3d2d",
       "\u4e00"-"\u9fff",
       "\uf900"-"\ufaff"
      ]
    >    

| < #DIGIT:					  // unicode digits
      [
       "\u0030"-"\u0039",
       "\u0660"-"\u0669",
       "\u06f0"-"\u06f9",
       "\u0966"-"\u096f",
       "\u09e6"-"\u09ef",
       "\u0a66"-"\u0a6f",
       "\u0ae6"-"\u0aef",
       "\u0b66"-"\u0b6f",
       "\u0be7"-"\u0bef",
       "\u0c66"-"\u0c6f",
       "\u0ce6"-"\u0cef",
       "\u0d66"-"\u0d6f",
       "\u0e50"-"\u0e59",
       "\u0ed0"-"\u0ed9",
       "\u1040"-"\u1049"
      ]
  >

}


/** Parse a query. */
Query parse(Configuration conf) :
{
  Query query = new Query(conf);
  ArrayList terms;
  Token token;
  String field;
  boolean stop;
  boolean prohibited;

}
{
  nonOpOrTerm()                                   // skip noise
  (
    { stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; }

                                                  // optional + or - operator
    ( <PLUS> {stop=false;} | (<MINUS> { stop=false;prohibited=true; } ))?

                                                  // optional field spec.
    ( LOOKAHEAD(<WORD><COLON>(phrase(field)|compound(field)))
      token=<WORD> <COLON> { field = token.image; } )?

    ( terms=phrase(field) {stop=false;} |         // quoted terms or
      terms=compound(field))                      // single or compound term

    nonOpOrTerm()                                 // skip noise

    {
      String[] array = (String[])terms.toArray(new String[terms.size()]);

      if (stop
          && field == Clause.DEFAULT_FIELD
          && terms.size()==1
          && isStopWord(array[0])) {
        // ignore stop words only when single, unadorned terms in default field
      } else {
        if (prohibited)
          query.addProhibitedPhrase(array, field);
        else
          query.addRequiredPhrase(array, field);
      }
    }
  )*
  
  { return query; }

}

/** Parse an explicitly quoted phrase query.  Note that this may return a single
 * term, a trivial phrase.*/
ArrayList phrase(String field) :
{
  int start;
  int end;
  ArrayList result = new ArrayList();
  String term;
}
{
  <QUOTE>

  { start = token.endColumn; }
  
  (nonTerm())*                                    // skip noise
  ( term = term() { result.add(term); }           // parse a term
    (nonTerm())*)*                                // skip noise

  { end = token.endColumn; }

  (<QUOTE>|<EOF>)
    
  {
    if (this.queryFilters.isRawField(field)) {
      result.clear();
      result.add(queryString.substring(start, end));
    }
    return result;
  }

}

/** Parse a compound term that is interpreted as an implicit phrase query.
 * Compounds are a sequence of terms separated by infix characters.  Note that
 * this may return a single term, a trivial compound. */
ArrayList compound(String field) :
{
  int start;
  ArrayList result = new ArrayList();
  String term;
  StringBuffer terms = new StringBuffer();
}
{
  { start = token.endColumn; }

  term = term() {
    terms.append(term).append(" ");
    //result.add(term);
  }
  ( LOOKAHEAD( (infix())+ term() )
    (infix())+
    term = term() {
      terms.append(term).append(" ");
      //result.add(term);
    })*

  {
    if (this.queryFilters.isRawField(field)) {
//      result.clear();
      result.add(queryString.substring(start, token.endColumn));

    } else {
      org.apache.lucene.analysis.Token token;
      TokenStream tokens = analyzer.tokenStream(
                              field, new StringReader(terms.toString()));

      while (true) {
        try {
          token = tokens.next();
        } catch (IOException e) {
          token = null;
        }
        if (token == null) { break; }
        result.add(token.termText());
      }
      try {
        tokens.close();
      } catch (IOException e) {
        // ignore
      }
    }
    return result;
  }

}

/** Parse a single term. */
String term() :
{
  Token token;
}
{
  ( token=<WORD> | token=<ACRONYM> | token=<SIGRAM>)

  { return token.image; }
}


/** Parse anything but a term or a quote. */
void nonTerm() :
{}
{
  <WHITE> | infix()
}

void nonTermOrEOF() :
{}
{
  nonTerm() | <EOF>
}

/** Parse anything but a term or an operator (plus or minus or quote). */
void nonOpOrTerm() :
{}
{
  (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
}

/** Characters which can be used to form compound terms. */
void infix() :
{}
{
  <PLUS> | <MINUS> | nonOpInfix()
}

/** Parse infix characters except plus and minus. */
void nonOpInfix() :
{}
{
  <COLON>|<SLASH>|<DOT>|<ATSIGN>|<APOSTROPHE>
}

Modify NutchDocumentAnalyzer.java (a quick check of the analyzer's output follows the listing):

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.analysis;

// JDK imports
import java.io.Reader;
import java.io.IOException;

// Lucene imports
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.hadoop.conf.Configuration;
import jeasy.analysis.*;

/**
 * The analyzer used for Nutch documents. Uses the JavaCC-defined lexical
 * analyzer {@link NutchDocumentTokenizer}, with no stop list. This keeps it
 * consistent with query parsing.
 */
public class NutchDocumentAnalyzer extends NutchAnalyzer {

  /** Analyzer used to index textual content. */
  private static Analyzer CONTENT_ANALYZER;
  // Anchor Analysis
  // Like content analysis, but leave gap between anchors to inhibit
  // cross-anchor phrase matching.
  /**
   * The number of unused term positions between anchors in the anchor field.
   */
  public static final int INTER_ANCHOR_GAP = 4;
  /** Analyzer used to analyze anchors. */
  private static Analyzer ANCHOR_ANALYZER;

  /**
   * @param conf
   */
  public NutchDocumentAnalyzer(Configuration conf) {
    this.conf = conf;
    CONTENT_ANALYZER = new ContentAnalyzer(conf);
    ANCHOR_ANALYZER = new AnchorAnalyzer();
  }

  /** Analyzer used to index textual content. */
  private static class ContentAnalyzer extends Analyzer {
    private CommonGrams commonGrams;

    public ContentAnalyzer(Configuration conf) {
      this.commonGrams = new CommonGrams(conf);
    }

    /** Constructs a {@link NutchDocumentTokenizer}. */
    public TokenStream tokenStream(String field, Reader reader) {
      // original: return this.commonGrams.getFilter(new NutchDocumentTokenizer(reader), field);
      // Route the content, title and default fields through the JE analyzer;
      // all other fields keep the original Nutch tokenizer.
      if ("content".equals(field) || "title".equals(field) || "DEFAULT".equals(field)) {
        MMAnalyzer analyzer = new MMAnalyzer();
        return analyzer.tokenStream(field, reader);
      } else {
        return this.commonGrams.getFilter(new NutchDocumentTokenizer(reader), field);
      }
    }
  }

  private static class AnchorFilter extends TokenFilter {
    private boolean first = true;

    public AnchorFilter(TokenStream input) {
      super(input);
    }

    public final Token next() throws IOException {
      Token result = input.next();
      if (result == null)
        return result;
      if (first) {
        result.setPositionIncrement(INTER_ANCHOR_GAP);
        first = false;
      }
      return result;
    }
  }

  private static class AnchorAnalyzer extends Analyzer {
    public final TokenStream tokenStream(String fieldName, Reader reader) {
      return new AnchorFilter(CONTENT_ANALYZER.tokenStream(fieldName, reader));
    }
  }

  /** Returns a new token stream for text from the named field. */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    Analyzer analyzer;
    if ("anchor".equals(fieldName))
      analyzer = ANCHOR_ANALYZER;
    else
      analyzer = CONTENT_ANALYZER;

    return analyzer.tokenStream(fieldName, reader);
  }
}
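To see what the JE analyzer will produce for a field, a standalone check is handy. This is a minimal sketch, assuming je-analysis-1.5.3 on the classpath and the same Lucene 2.x token API already used in this file (TokenStream.next(), Token.termText()); the class name and sample sentence are arbitrary:

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import jeasy.analysis.MMAnalyzer;

public class MMAnalyzerCheck {
  public static void main(String[] args) throws Exception {
    // Segment a Chinese sentence the same way ContentAnalyzer does
    // for the "content" and "title" fields.
    MMAnalyzer analyzer = new MMAnalyzer();
    TokenStream tokens = analyzer.tokenStream("content",
        new StringReader("中文分词测试"));
    Token token;
    while ((token = tokens.next()) != null) {
      System.out.println(token.termText());  // one segmented word per line
    }
    tokens.close();
  }
}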

Copy je-analysis-1.5.3.jar into the lib/ directory of the Nutch source tree.

Recompile Nutch:

在build.xml添加一条指令(在第195行的下面加入一行),使的编译war文件的时候加入je-analysis的jar文件。build.xml

      <lib dir="${lib.dir}">
        <include name="lucene*.jar"/>
        <include name="taglibs-*.jar"/>
        <include name="hadoop-*.jar"/>
        <include name="dom4j-*.jar"/>
        <include name="xerces-*.jar"/>
        <include name="tika-*.jar"/>
        <include name="apache-solr-*.jar"/>
        <include name="commons-httpclient-*.jar"/>
        <include name="commons-codec-*.jar"/>
        <include name="commons-collections-*.jar"/>
        <include name="commons-beanutils-*.jar"/>
        <include name="commons-cli-*.jar"/>
        <include name="commons-lang-*.jar"/>
        <include name="commons-logging-*.jar"/>
        <include name="log4j-*.jar"/>
        <include name="je-analysis-*.jar"/>   <!-- add this line -->
      </lib>

Compile:

cd nutch-1.0
export ANT_HOME=/usr/local/apache-ant-1.7.1
/usr/local/apache-ant-1.7.1/bin/ant
/usr/local/apache-ant-1.7.1/bin/ant war
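To confirm the jar actually made it into the war (a sketch; assumes the build writes its outputs under build/), list the archive contents with the JDK jar tool:

jar tf build/nutch-1.0.war | grep je-analysis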

Use the newly built modules with Chinese segmentation support. Only the following three build outputs are needed; replace the corresponding files of your Nutch 1.0 installation with them:

build/nutch-1.0.jar
build/nutch-1.0.job
build/nutch-1.0.war

Don't forget to copy je-analysis-1.5.3.jar into lib/ inside the unpacked Nutch 1.0 war.
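One way to do this without unpacking the whole archive (a sketch; the war name and local paths are assumptions based on this setup) is to let the jar tool update the war in place:

mkdir -p WEB-INF/lib
cp je-analysis-1.5.3.jar WEB-INF/lib/
jar uf nutch-1.0.war WEB-INF/lib/je-analysis-1.5.3.jar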

Modify the WEB-INF\classes\nutch-site.xml file inside the war built by Ant:

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

<property>
  <name>http.agent.name</name>
  <value>My Nutch Spider</value>
</property>

<property>
  <name>searcher.dir</name>
  <value>E:/nutch-1.0/crawldata</value>
</property>

<property>
  <name>searcher.summary.length</name>
  <value>50</value>
</property>

</configuration>

http.agent.name: the crawler's name. It must match the agent name configured under conf/ in your Nutch installation directory, otherwise searches return no results.

searcher.dir: the directory where Nutch stores the data it crawled, here E:/nutch-1.0/crawldata. Point it at the root of the crawl data, not at the index files themselves.

searcher.summary.length: the length of the result summaries.
