
Implementing pinyin tokenization in Solr for short-text search, e.g. movie search


1. How pinyin search is added to Solr, and dependencies:

The idea: at index time, each Chinese character is expanded into its full pinyin, its first letter (initial), and pinyin n-grams, all indexed alongside the original characters. Short titles can then be found by typing full pinyin or just the initials.

Converting characters to pinyin is done with pinyin4j-2.5.0.jar.
Download: http://sourceforge.net/projects/pinyin4j/
The Solr version used here is 3.6.2.
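
Before the tokenizer itself, here is a minimal sketch of the pinyin4j call it relies on (the demo class name is mine, not from the original post):

import java.util.Arrays;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;

public class Pinyin4jDemo {
	public static void main(String[] args) throws Exception {
		HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
		format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
		format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
		format.setVCharType(HanyuPinyinVCharType.WITH_V);
		// A polyphonic character can have several readings;
		// the tokenizer below simply takes the first one.
		String[] readings = PinyinHelper.toHanyuPinyinStringArray('长', format);
		System.out.println(Arrays.toString(readings)); // e.g. [chang, zhang]
	}
}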

2. Tokenizer code:

package com.freewebsys.index.analysis;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

public class PinyinTokenizer extends Tokenizer {

	private static final int DEFAULT_BUFFER_SIZE = 512;

	private boolean done = false;
	private int finalOffset;
	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
	private HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
	// Separator used to join the generated terms.
	private String padding_char = " ";

	// Constructors.
	public PinyinTokenizer(Reader reader) {
		this(reader, DEFAULT_BUFFER_SIZE);
	}

	public PinyinTokenizer(Reader input, int bufferSize) {
		super(input);
		termAtt.resizeBuffer(bufferSize);
		format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
		format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
		format.setVCharType(HanyuPinyinVCharType.WITH_V);
	}

	@Override
	public final boolean incrementToken() throws IOException {
		if (!done) {
			clearAttributes();
			done = true;
			int upto = 0;
			char[] buffer = termAtt.buffer();
			while (true) {
				final int length = input.read(buffer, upto, buffer.length
						- upto);
				if (length == -1)
					break;
				upto += length;
				if (upto == buffer.length)
					buffer = termAtt.resizeBuffer(1 + buffer.length);
			}
			termAtt.setLength(upto);
			String str = termAtt.toString();
			termAtt.setEmpty();
			StringBuilder stringBuilder = new StringBuilder();
			StringBuilder firstLetters = new StringBuilder();
			StringBuilder cnLetters = new StringBuilder();
			StringBuilder allPinYinLetters = new StringBuilder();

			for (int i = 0; i < str.length(); i++) {
				char c = str.charAt(i);
				if (c < 128) {
					stringBuilder.append(c);
				} else {
					try {
						String[] strs = PinyinHelper.toHanyuPinyinStringArray(
								c, format);
						if (strs != null) {
							// Take the first reading by default.
							String first_value = strs[0];
							// TODO: handle polyphonic characters (more than one pinyin).
							// Append the original Chinese character.
							cnLetters.append(c);
							cnLetters.append(this.padding_char);
							// Collect the full pinyin, without separators, for n-gram processing.
							allPinYinLetters.append(first_value);
							// Append the full pinyin of this character.
							stringBuilder.append(first_value);
							stringBuilder.append(this.padding_char);
							// Append the first letter (initial) of the pinyin.
							firstLetters.append(first_value.charAt(0));

						}
					} catch (BadHanyuPinyinOutputFormatCombination badHanyuPinyinOutputFormatCombination) {
						badHanyuPinyinOutputFormatCombination.printStackTrace();
					}
				}
			}

			// Join all variants into one token: pinyin, original characters,
			// initials, and pinyin n-grams.

			termAtt.append(stringBuilder.toString());
			termAtt.append(this.padding_char);
			termAtt.append(cnLetters.toString());
			termAtt.append(this.padding_char);
			termAtt.append(firstLetters.toString());
			termAtt.append(this.padding_char);
			// Split the full pinyin string into n-grams so each fragment is indexed.
			termAtt.append(mergeNGramPinYin(allPinYinLetters.toString()));

			finalOffset = correctOffset(upto);
			offsetAtt.setOffset(correctOffset(0), finalOffset);
			return true;
		}
		return false;
	}

	@Override
	public final void end() {
		// set final offset
		offsetAtt.setOffset(finalOffset, finalOffset);
	}

	@Override
	public void reset(Reader input) throws IOException {
		super.reset(input);
		this.done = false;
	}

	public static String mergeNGramPinYin(String allPinYin) {
		if (StringUtils.isBlank(allPinYin)) {
			return "";
		}
		// Tokenize the full pinyin string into n-grams (default min/max gram of 1/2).
		NGramTokenizer nGramTokenizer = new NGramTokenizer(new StringReader(
				allPinYin));
		CharTermAttribute charTermAttribute = nGramTokenizer
				.getAttribute(CharTermAttribute.class);
		StringBuilder result = new StringBuilder();
		try {
			// Iterate until the tokenizer is exhausted instead of guessing the
			// token count, and read the term with toString() rather than the raw
			// buffer, which may contain stale characters past the term length.
			while (nGramTokenizer.incrementToken()) {
				if (result.length() > 0) {
					result.append(" ");
				}
				result.append(charTermAttribute.toString());
			}
			nGramTokenizer.end();
			nGramTokenizer.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
		// Return the n-grams joined by spaces.
		return result.toString();
	}

}
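
A quick way to sanity-check the tokenizer outside Solr is a small driver like the following (a sketch, assumed to live in the same package as PinyinTokenizer; the printed token is approximate):

import java.io.StringReader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PinyinTokenizerTest {
	public static void main(String[] args) throws Exception {
		PinyinTokenizer tokenizer = new PinyinTokenizer(new StringReader("你好"));
		CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
		// Emits a single space-padded token containing the full pinyin, the
		// original characters, the initials, and the n-grams, roughly:
		// "ni hao 你 好 nh n i h a o ni ih ha ao"
		while (tokenizer.incrementToken()) {
			System.out.println(term.toString());
		}
		tokenizer.end();
		tokenizer.close();
	}
}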

PinyinTokenizerFactory:

package com.freewebsys.index.analysis;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.BaseTokenizerFactory;
import java.io.Reader;

/**
 * Solr factory for PinyinTokenizer; referenced from schema.xml.
 */
public class PinyinTokenizerFactory extends BaseTokenizerFactory {

	@Override
	public Tokenizer create(Reader input) {
		return new PinyinTokenizer(input);
	}

}
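
Note that Solr can only instantiate this factory if the compiled classes and pinyin4j-2.5.0.jar are on its classpath, for example by placing the jars in the core's lib directory or declaring them in solrconfig.xml (the path below is illustrative):

	<lib dir="./lib" />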

3. Solr schema.xml configuration:

Index and query use separate analyzer chains:
		<!-- standard_text: pinyin tokenizer at index time; plain standard tokenizer at query time. -->
		<fieldType name="standard_text" class="solr.TextField"
			positionIncrementGap="100">
			<analyzer type="index">
				<tokenizer class="com.freewebsys.index.analysis.PinyinTokenizerFactory" />
				<filter class="solr.LowerCaseFilterFactory" />
				<filter class="solr.WordDelimiterFilterFactory"
					generateWordParts="1" generateNumberParts="1" catenateWords="0"
					catenateNumbers="1" catenateAll="0" splitOnCaseChange="1" />
			</analyzer>
			<analyzer type="query">
				<tokenizer class="solr.StandardTokenizerFactory" />
			</analyzer>
		</fieldType>
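
A field can then reference the new type; the field name here is just an example:

		<field name="movieName" type="standard_text" indexed="true" stored="true" />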

4. Tokenization in practice:

If the analysis page in the Solr admin UI shows tokens like the ones below, the pinyin tokenizer is configured correctly.
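
For example, analyzing 你好 at index time should first produce a single token, roughly (this listing is illustrative, not captured from the admin UI):

	ni hao 你 好 nh n i h a o ni ih ha ao

The WordDelimiterFilter then splits it on the padding spaces, so each fragment becomes a separate indexed term and queries for ni, hao, or the initials nh can all match.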

5. Limitations:

As it stands, hit highlighting after a search is still problematic: the index contains many extra pinyin terms, and their offsets no longer correspond to positions in the original text.

This approach is also only suited to tokenizing fairly short content such as product names, movie titles, or book titles. For long text, the NGramTokenizer would flood the index with a pile of useless pinyin grams.
