现在的位置: 首页 > 综合 > 正文

solr 自定义分词器

2012年09月14日 ⁄ 综合 ⁄ 共 3251字 ⁄ 字号 评论关闭


package com.besttone.analyzer;

import java.io.Reader;
import java.util.Map;

import org.apache.solr.analysis.BaseTokenizerFactory;

public class CommaTokenizerFactory extends BaseTokenizerFactory {

	public void init(Map<String, String> args) {

	public CommaTokenizer create(Reader input) {
		return new CommaTokenizer(luceneMatchVersion, input);


package com.besttone.analyzer;

import java.io.Reader;

import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

public class CommaTokenizer extends CharTokenizer {

	 * Construct a new WhitespaceTokenizer. * @param matchVersion Lucene version
	 * to match See {@link <a href="#version">above</a>}
	 * @param in
	 *            the input to split up into tokens
	public CommaTokenizer(Version matchVersion, Reader in) {
		super(matchVersion, in);

	 * Construct a new WhitespaceTokenizer using a given {@link AttributeSource}
	 * .
	 * @param matchVersion
	 *            Lucene version to match See
	 *            {@link <a href="#version">above</a>}
	 * @param source
	 *            the attribute source to use for this {@link Tokenizer}
	 * @param in
	 *            the input to split up into tokens
	public CommaTokenizer(Version matchVersion, AttributeSource source,
			Reader in) {
		super(matchVersion, source, in);

	 * Construct a new WhitespaceTokenizer using a given
	 * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
	 * @param matchVersion
	 *            Lucene version to match See
	 *            {@link <a href="#version">above</a>}
	 * @param factory
	 *            the attribute factory to use for this {@link Tokenizer}
	 * @param in
	 *            the input to split up into tokens
	public CommaTokenizer(Version matchVersion, AttributeFactory factory,
			Reader in) {
		super(matchVersion, factory, in);

	 * Construct a new CommaTokenizer.
	 * @deprecated use {@link #CommaTokenizer(Version, Reader)} instead. This
	 *             will be removed in Lucene 4.0.
	public CommaTokenizer(Reader in) {

	 * Construct a new CommaTokenizer using a given {@link AttributeSource}.
	 * @deprecated use {@link #CommaTokenizer(Version, AttributeSource, Reader)}
	 *             instead. This will be removed in Lucene 4.0.
	public CommaTokenizer(AttributeSource source, Reader in) {
		super(source, in);

	 * Construct a new CommaTokenizer using a given
	 * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
	 * @deprecated use
	 *             {@link #CommaTokenizer(Version, AttributeSource.AttributeFactory, Reader)}
	 *             instead. This will be removed in Lucene 4.0.
	public CommaTokenizer(AttributeFactory factory, Reader in) {
		super(factory, in);

	 * Collects only characters which do not satisfy
	 * {@link Character#isWhitespace(int)}.
	protected boolean isTokenChar(int c) {
		// return !Character.isWhitespace(c);
		// 44表示逗号
		return !(c == 44);



protected boolean isTokenChar(int c) {
		// return !Character.isWhitespace(c);
		// 44表示逗号
		return !(c == 44);


// Shows where the constant 44 comes from: index 1 of this array holds ','.
char[] c = new char[]{'a',',','b'};

// Evaluates to 44, the code point of the comma stored at index 1.
Character.codePointAt(c, 1);





    <!-- Field type whose text is split into tokens by the custom comma tokenizer. -->
    <fieldType name="text_comma" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="com.besttone.analyzer.CommaTokenizerFactory"/>
      </analyzer>
    </fieldType>


