现在的位置: 首页 > 综合 > 正文

用mysql数据库写的分词算法代码

2014年08月29日 ⁄ 综合 ⁄ 共 6724字 ⁄ 字号 评论关闭

我辛苦地整了几天才整好的，拿来给大家分享一下，希望可以帮助大家。以下分为四步，每步都有注释说明。

#region  一.先从article表里查询数据
    /// <summary>

    /// 一.先从article表里查询数据

    /// </summary>

    public void fenciBind()

    {

        string sql = "select * from article;";

        string str = ConfigurationManager.ConnectionStrings["ConnectionString"].ToString();

        MySqlConnection con = new MySqlConnection(str);

        con.Open();

        MySqlDataAdapter msda = new MySqlDataAdapter(sql, con);

        DataTable dt = new DataTable();

        msda.Fill(dt);

        for (int i = 0; i < dt.Rows.Count; i++)//循环数据库里的数据

        {

            string strcon = dt.Rows[i][1].ToString();//标题

            strcon += dt.Rows[i][3].ToString();//内容

            DateTime strtime = Convert.ToDateTime(dt.Rows[i][4]);//时间

       //判断时间 在一天内容不让他进行分词

            if (strtime < DateTime.Now.AddDays(-1))

            {

                string artsrt = dt.Rows[i]["id"].ToString();

                fencistr(strcon, artsrt);

                con.Close();

            }

        }

    }

    #endregion

#region  二.article表里的数据进行分词
    /// <summary>

    /// 二.article表里的数据进行分词

    /// </summary>

    /// <param name="strcon">article表里要分词标题和内容的数据</param>

    /// <param name="artsrt">article表里数据id编号</param>

    /// <returns></returns>

    public string fencistr(string strcon, string artsrt)

    {

        StringBuilder sb = new StringBuilder();

        sb.Remove(0, sb.Length);

        string t1 = "";

        Analyzer analyzer = new Lucene.China.ChineseAnalyzer();

        StringReader sr = new StringReader(strcon);

        TokenStream stream = analyzer.TokenStream(null, sr);

        Token t = stream.Next();

        while (t != null)

        {

            t1 = t.ToString();   //显示格式: (关键词,0,2) ,需要处理

            t1 = t1.Replace("(", "");

            char[] separator = { ',' };

            t1 = t1.Split(separator)[0];

            sb.Append("," + t1);

            t = stream.Next();

        }

        //三.汉字转换拼音

        pinyinstr(sb.ToString(), sb.ToString(), artsrt);

        return strcon;

    }

    #endregion

#region 三.汉字转换拼音
    /// <summary>

    ///三.汉字转换拼音

    /// </summary>

    /// <param name="sb">用于转化为pingyin的汉字</param>

    /// <param name="sbstr">用于往keywords表里添加的汉字</param>

    /// <param name="artsrt">article表里数据id编号</param>

    /// <returns></returns>

    public string pinyinstr(string sb, string sbstr, string artsrt)

    {

        string pystr = null;//pinyin表用逗号隔开的集合

        string s = sb.ToString();

        for (int i = 0; i < s.Length; i++)

        {

            if (ChineseChar.IsValidChar(s[i]))

            {

                ChineseChar CString = new ChineseChar(s[i]);

                for (int ii = 0; ii < CString.PinyinCount; ii++)

                {

                    string PinYins = CString.Pinyins[ii].ToString().ToLower();

                    if (PinYins[PinYins.Length - 1].CompareTo('5') < 0)

                    {

                        pystr += PinYins;

                    }

                }

                pystr += "|";

            }

            pystr += ",";

        }

        pystr = pystr.Replace("|,", "").TrimEnd(',');

        Opestr(pystr, sbstr, artsrt);//四.创建拼音数据表并添加数据

        return pystr;

    }

    #endregion

#region 四.创建拼音数据表并添加数据 同时往keywords表里添加数据
    /// <summary>

    /// 创建拼音数据表并添加数据 同时往keywords表里添加数据

    /// </summary>

    /// <param name="pystr">要创建的每个pinyin表</param>

    /// <param name="sbstr">article表里的数据分词后逗号隔开的字符串</param>

    /// <param name="artsrt">article表里数据id编号</param>

    public void Opestr(string pystr, string sbstr, string artsrt)

    {

        string[] PinYins = pystr.Trim().Split(',');

        for (int i = 1; i < PinYins.Length; i++)

        {

            //四.创建拼音数据表并添加数据

            //*************************1.往keywords表里添加数***********************

            //(1).查询keywords表,并判断keywords表里pinyin是否存在相同的

            string str = ConfigurationManager.ConnectionStrings["ConnectionString"].ToString();

            MySqlConnection con = new MySqlConnection(str);

            con.Open();

            string kwssql = "select * from `hww_article_search`.`keywords`";

            MySqlDataAdapter kwsmda = new MySqlDataAdapter(kwssql, con);

            DataTable kwdt = new DataTable();

            kwsmda.Fill(kwdt);

            string[] hzstr = sbstr.Trim().Split(',');//汉字用逗号分割的数据

            for (int ii = 1; ii < hzstr.Length; ii++)

            {

                if (kwdt.Rows.Count != 0) //(2).如果pinyin表里有数据,则先进行判断是否有相同的pinyin值

                {

                    string kwstr = null;

                    for (int ll = 0; ll < kwdt.Rows.Count; ll++)

                    {

                        kwstr += "," + kwdt.Rows[ll]["pinyin"].ToString();

                    }

                    string kwpy = PinYins[i];

                    if (!kwstr.Contains(kwpy))

                    {

                        //(3).不存在相同的pinyin则添加

                        string kwsql = "INSERT INTO `hww_article_search`.`keywords` (`keyword`, `pinyin`) VALUES ('" + hzstr[ii] + "', '" + PinYins[ii] + "');";

                        MySqlCommand kwcom = new MySqlCommand(kwsql, con);

                        kwcom.ExecuteNonQuery();//添加

                    }

                }

                else//如果没有数据则添加

                {

                    string kwsql = "INSERT INTO `hww_article_search`.`keywords` (`keyword`, `pinyin`) VALUES ('" + hzstr[ii] + "', '" + PinYins[ii] + "');";

                    MySqlCommand kwcom = new MySqlCommand(kwsql, con);

                    kwcom.ExecuteNonQuery();//添加

                }

            }

        //*******************************2.建库建表*****************************

            //(1).先建库

            string sqlcre = "create table if not exists `hww_article_search`.`" + PinYins[i] + "` ( `id` int(10) not null auto_increment, `article_id` int(10) unsigned not null,primary key(`id`));";

            MySqlCommand com = new MySqlCommand(sqlcre, con);

            com.ExecuteNonQuery();

            //(2).查询pinyin表,并判断yinpin表里article_id是否存在此相同的id

            string sqlif = "select * from `hww_article_search`.`" + PinYins[i] + "`";

            MySqlDataAdapter msdaif = new MySqlDataAdapter(sqlif, con);

            DataTable dtif = new DataTable();

            msdaif.Fill(dtif);

            //(3).往pinyin表里添加数据sql语句

            string sqladd = " insert into `hww_article_search`.`" + PinYins[i] + "`(`article_id`) values(" + artsrt + ");";

            MySqlCommand comadd = new MySqlCommand(sqladd, con);

            //(4).添加成功后修改article表里的时间为当前时间sql语句

            string uptimesql = "update `hww_article_search`.`article` set `update_time`='" + DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss") + "' where `id`=" + artsrt + ";";

            MySqlCommand comtime = new MySqlCommand(uptimesql, con);

            //如果pinyin表里有数据,则先进行判断是否有相同的id值

            if (dtif.Rows.Count != 0)

            {

                string sdi = null;

                for (int j = 0; j < dtif.Rows.Count; j++)

                {

                    sdi += dtif.Rows[j]["article_id"].ToString() + ",";

                }

                string sad = artsrt;

                if (!sdi.Contains(sad))

                {

                    //3.不存在相同的id号则添加

                    comadd.ExecuteNonQuery();//添加

                    comtime.ExecuteNonQuery();//修改

                }

            }

            else//如果没有数据则添加

            {

                comadd.ExecuteNonQuery();//添加

                comtime.ExecuteNonQuery();//修改

            }

        }

    }

    #endregion

运行结果如下图:




 

抱歉!评论已关闭.