现在的位置: 首页 > 综合 > 正文

页面信息采集

2013年10月21日 ⁄ 综合 ⁄ 共 6324字 ⁄ 字号 评论关闭

原理:目标页面以 GET 方式传递页码参数,因此可以逐页请求分页内容,批量分析数据后录入数据库

 

注:此处用到了别人写的字符串处理方法,需要 using Globe.Common;    @-@|||

 

页面:

输入要采集的网址:|   http://...    |

输入总页数:|   350   |

输入希望学的语言:|   英语   |

{ 开始采集分析数据 }

 

代码:

    /// <summary>
    /// 返回网页的二进制流
    /// </summary>
    /// <param name="url">要返回网页的URL地址</param>
    /// <returns></returns>
    public byte[] GetWebByte(string url)
    {
        byte[] pagedata = null;
        try
        {
            using (WebClient wb = new WebClient())   //创建一个WebClient实例
            {
                pagedata = wb.DownloadData(url);
                return pagedata;
            }
        }
        catch
        {
            return null;
        }

    }

 

    /// <summary>
    /// 返回网页的源代码
    /// </summary>
    /// <param name="url">要返回网页的URL地址</param>
    /// <param name="EncodingName">网页使用的编码名称(如“gb2312”)</param>
    public string GetWebText(string url, string EncodingName)
    {
        byte[] pageData = null;
        try
        {
            pageData = GetWebByte(url);
            return System.Text.Encoding.GetEncoding(EncodingName).GetString(pageData);
        }
        catch
        {
            return null;
        }
    }

 

    /// <SUMMARY>
    /// 得到网页的编码方式
    /// </SUMMARY>
    /// <PARAM name="url">
    /// </PARAM>
    /// <RETURNS></RETURNS>
    private System.Text.Encoding GetPageEncoding(string url)
    {   
        System.Text.Encoding encod = System.Text.Encoding.Default;   
        try
        {       
            System.Net.WebRequest request = System.Net.WebRequest.Create(url);       
            System.Net.WebResponse response = request.GetResponse();       
            System.IO.Stream stream = response.GetResponseStream();       
            byte[] header = new byte[1024];       
            stream.Read(header, 0, header.Length);       
            string head = Encoding.Default.GetString(header);       
            stream.Close();       
            stream.Dispose();       
            string patternEncode = "<META[^>]+charset//s*=//s*(?<EN>[//w-]+)";       
            Regex regEncode = new Regex(patternEncode, RegexOptions.IgnoreCase);       
            Match match = regEncode.Match(head);       
            if (match.Success)       
            {           
                encod = System.Text.Encoding.GetEncoding(match.Groups["En"].Value);       
            }   
        }   
        catch   
        { }   
       
        return encod;
    }

 

    protected void Button1_Click(object sender, EventArgs e)
    {
        string content;
        ArrayList list;

        string url = this.txtURL.Text;
        string wishLang = this.txtWish.Text;
        int num = Convert.ToInt32(this.txtNUM.Text);

        SqlConnection con = new SqlConnection("server=.;database=InterSpeakers;pwd=;uid=sa;");
        con.Open();

        SqlCommand cmd;

        //循环采集
        for(int n = 1; n <= num; n++)
        {
            content = GetWebText(url + "&page=" + n, "UTF-8");

            list = CharManage.GetCharMatching(content, "short_desc", "display:none", true);

            int count = list.Count;

            for (int i = 0; i < count; i++)
            {
                string name = CharManage.GetCharOnceFront(list[i].ToString(), "<a", 0, "<", 0);
                name = CharManage.GetCharOnceFront(name,">",1,"<",-1).Replace("'",".");

                //查看该数据是否已经存在(防止录入重复数据,此处以用户名为依据)
                cmd = new SqlCommand("select count(1) from info where name = '" + name + "'",con);
                int isHave = Convert.ToInt32(cmd.ExecuteScalar());

                //存在
                if (isHave > 0)
                {
                    cmd = new SqlCommand("select wishLang from info where name = '" + name + "'",con);
                    string w = cmd.ExecuteScalar().ToString();

                    w += "," + wishLang;

                    //Globe.Common.Javascript.JsAlert(w);
                    cmd = new SqlCommand("update info set wishLang = '" + w + "' where name='" + name +"'",con);
                    cmd.ExecuteNonQuery();
                }
                else
                {
                    string age_from = CharManage.GetCharOnceFront(list[i].ToString(), "age_from", 0, "<", -1);

                    string country = CharManage.GetCharOnceFront(age_from, ", ", 2, "&", -1).Replace("'", ".");

                    age_from = CharManage.GetCharOnceFront(age_from, ">", 1, ",", -1).Replace("'", ".");

                    cmd = new SqlCommand("insert into info (name,ageForm,country,wishLang) values('" + name + "','" + age_from + "','" + country + "','" + wishLang + "')", con);
                    cmd.ExecuteNonQuery();
                }
            }
        }

        con.Close();

}

 

统计注册人员相关信息,分析上面采集的数据

囧啊。。。

此处我用了个很衰的方法,其实不用这样的。。。

我应该用GROUP BY ,COUNT就可以解决。。。

但我的思路是,先从数据库中读出全部国家到内存中,

然后再从内存中一个一个地取出国家,再到数据库中统计属于该国家的总人数。。。

囧啊。。。

太丢人啦。。。

 

页面:

 

输入国家:|             |

想学的语言:|              |

{ 开始统计 }

 

代码:

 

    string s;

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    protected void Button1_Click(object sender, EventArgs e)
    {
        s = "select count(0) from info ";

       

         //关键字country存在
        if (this.txtCountry.Text.Trim() != "")
        {
            s += "where country = '" + this.txtCountry.Text + "'" ;

            //关键字希望学的语言存在
            if (this.txtWish.Text.Trim() != "")
            {
                s += " and wishLang like '%" + this.txtWish.Text + "%'";
            }
            else
            { }
        }
        else
        {
            if (this.txtWish.Text.Trim() != "")
            {
                s += "where wishLang like '%" + this.txtWish.Text + "%'";
            }
            else
            { }
        }

 

        SqlConnection con = new SqlConnection("server=.;database=InterSpeakers;pwd=;uid=sa;");
        con.Open();

        SqlCommand cmd = new SqlCommand(s,con);

        int count = Convert.ToInt32(cmd.ExecuteScalar());

        con.Close();

        this.labNum.Text = count.ToString();

        readCount();
    }

    protected void readCount()
    {
        string country;
        int count;
        int sum = 0;

        SqlConnection con = new SqlConnection("server=.;database=InterSpeakers;pwd=;uid=sa;");
        con.Open();

        //搜索国家列表

        SqlDataAdapter sda = new SqlDataAdapter();
        sda.SelectCommand = new SqlCommand("select name from country",con);

        DataSet ds = new DataSet();
        sda.Fill(ds, "country");

        for(int i = 0;i<ds.Tables["country"].Rows.Count;i++)
        {
            country = ds.Tables["country"].Rows[i][0].ToString();

            string scountry = s + " and country = '" + country + "'";
            SqlCommand cmd = new SqlCommand(scountry, con);
            count = Convert.ToInt32(cmd.ExecuteScalar());

            sum += count;

            if (count > 0)
            {
                Response.Write("国家:" + country.Replace(".", "'") + "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;学习" + this.txtWish.Text + "的人数:" + count + "<br/>");
            }
            else
            { }
        }

        ds.Clear();
        con.Close();

        Response.Write("总人数:" + sum);
    }

抱歉!评论已关闭.