现在的位置: 首页 > 综合 > 正文

向量空间模型文档相似度计算实现(C#)

2019年08月13日 ⁄ 综合 ⁄ 共 4473字 ⁄ 字号 评论关闭

读者可以根据自己的需要进行加壳或改写,本文权当抛砖引玉。

笔者加的壳在:

http://download.csdn.net/source/1143450

 

VSM模型介绍:

http://blog.csdn.net/Felomeng/archive/2009/03/25/4024078.aspx

 

using System;

using System.Collections.Generic;

using System.Linq;

using System.Text;

using System.Text.RegularExpressions;

 

namespace Felomeng.VSMSimilarity

{

    class SVMModle

    {

        /// <summary>

        /// 降维词表

        /// </summary>

        private List<string> reducingKeys = new List<string>();

        /// <summary>

        /// 构造函数:使用降维表

        /// </summary>

        /// <param name="reducingKeys">降维词表</param>

        public SVMModle(List<string> reducingKeys)

        {

            this.reducingKeys = reducingKeys;

        }

        /// <summary>

        /// 构造函数:不使用降维表

        /// </summary>

        public SVMModle()

        {

        }

        /// <summary>

        /// 相似度计算

        /// </summary>

        /// <param name="text1">文档1(分好词的,分词符为非汉字字符)</param>

        /// <param name="text2">文档2(分好词的,分词符为非汉字字符)</param>

        /// <returns>两篇文章的相似度</returns>

        public double Similarity(string text1, string text2)

        {

            double similarity = 0.0, numerator = 0.0, denominator1 = 0.0, denominator2 = 0.0;

            int temp1, temp2;

            Dictionary<string, int> dictionary1 = GetDictionary(text1);

            Dictionary<string, int> dictionary2 = GetDictionary(text2);

            if ((dictionary1.Count < 1) || (dictionary2.Count < 1))//如果任一篇文章中不含有汉字

            {

                return 0.0;

            }

            Dictionary<string, int>.KeyCollection keys1 = dictionary1.Keys;

            foreach (string key in keys1)

            {

                dictionary1.TryGetValue(key, out temp1);

                if (!dictionary2.TryGetValue(key, out temp2))

                {

                    temp2 = 0;

                }

                dictionary2.Remove(key);

                numerator += temp1 * temp2;

                denominator1 += temp1 * temp1;

                denominator2 += temp2 * temp2;

            }

            Dictionary<string, int>.KeyCollection keys2 = dictionary2.Keys;

            foreach (string key in keys2)

            {

                dictionary2.TryGetValue(key, out temp2);

                denominator2 += temp2 * temp2;

            }

            similarity = numerator / (Math.Sqrt(denominator1 * denominator2));

            return similarity;

        }

        /// <summary>

        /// 相似度计算

        /// </summary>

        /// <param name="text1">第一篇文档的词频词典</param>

        /// <param name="text2">第二篇文档的词频词典</param>

        /// <returns>两篇文档的相似度</returns>

        public double Similarity(Dictionary<string, int> text1, Dictionary<string, int> text2)

        {

            double similarity = 0.0, numerator = 0.0, denominator1 = 0.0, denominator2 = 0.0;

            int temp1, temp2;

            Dictionary<string, int> dictionary1 = new Dictionary<string,int>( text1);

            Dictionary<string, int> dictionary2 = new Dictionary<string,int>( text2);

            if ((dictionary1.Count < 1) || (dictionary2.Count < 1))//如果任一篇文章中不含有汉字

            {

                return 0.0;

            }

            Dictionary<string, int>.KeyCollection keys1 = dictionary1.Keys;

            foreach (string key in keys1)

            {

                dictionary1.TryGetValue(key, out temp1);

                if (!dictionary2.TryGetValue(key, out temp2))

                {

                    temp2 = 0;

                }

                dictionary2.Remove(key);

                numerator += temp1 * temp2;

                denominator1 += temp1 * temp1;

                denominator2 += temp2 * temp2;

            }

            Dictionary<string, int>.KeyCollection keys2 = dictionary2.Keys;

            foreach (string key in keys2)

            {

                dictionary2.TryGetValue(key, out temp2);

                denominator2 += temp2 * temp2;

            }

            similarity = numerator / (Math.Sqrt(denominator1 * denominator2));

            return similarity;

        }

        /// <summary>

        /// 统计文档词频词典

        /// </summary>

        /// <param name="text">已分词文档,分隔符为非汉语字符</param>

        /// <returns>该文档词频词典</returns>

        public Dictionary<string, int> GetDictionary(string text)

        {

            Dictionary<string, int> dictionary = new Dictionary<string, int>();

            Regex regex = new Regex(@"[/u4e00-/u9fa5]+");

            MatchCollection results = regex.Matches(text);

            int temp;

            foreach (Match word in results)

            {

                if (dictionary.TryGetValue(word.Value, out temp))

                {

                    temp++;

                    dictionary.Remove(word.Value);

                    dictionary.Add(word.Value, temp);

                }

                else

                {

                    dictionary.Add(word.Value, 1);

                }

            }

            return dictionary;

        }

    }

}

还有很多可以优化的地方,大家多加思考。如果能够得到适当优化的话,速度还能提高很多。

 

抱歉!评论已关闭.