现在的位置: 首页 > 综合 > 正文

业余写的一个小工具_XPathTool(C#源码)

2013年12月07日 ⁄ 综合 ⁄ 共 7155字 ⁄ 字号 评论关闭

     因为前段时间对爬虫有些兴趣,所以研究了一下HtmlAgilityPack.dll,它可以基于XPath来解析HTML文档。

     关于XPath的相关内容可以查看这里: http://www.w3school.com.cn/xpath/index.asp

     在网上找了半天,没找到几个XPath工具。后来找到一份源代码,在它的基础上自己做了一个XPath工具。

如图,这里是通过XPath获取百度音乐歌曲名。

源代码

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using HtmlAgilityPack;
using System.Threading;
using System.Text.RegularExpressions;

namespace XPathTools
{
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form: calls <c>InitializeComponent</c> and then subscribes the
        /// selection-changed handlers of both XPath combo boxes.
        /// </summary>
        public Form1()
        {
            InitializeComponent();

            // Re-evaluate the XPath whenever the selected entry of either combo box changes.
            comboBox2.SelectedIndexChanged += new EventHandler(comboBox2_SelectedIndexChanged);
            comboBox1.SelectedIndexChanged += new EventHandler(comboBox1_SelectedIndexChanged);
        }

        /// <summary>
        /// Evaluates the XPath currently shown in comboBox2 against the loaded document
        /// and displays the inner HTML of the first matching node in textBox3.
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void comboBox2_SelectedIndexChanged(object sender, EventArgs e)
        {
            try
            {
                HtmlNodeCollection nodes = hd.DocumentNode.SelectNodes(comboBox2.Text);

                // SelectNodes yields null when nothing matches; report that explicitly
                // instead of surfacing a NullReferenceException stack trace to the user.
                if (nodes == null || nodes.Count == 0)
                {
                    MessageBox.Show("表达式有误: 未匹配到任何节点");
                    return;
                }

                textBox3.Text = nodes[0].InnerHtml;
            }
            catch (System.Exception ex)
            {
                // Evaluation failures (for example an empty or malformed expression) end up here.
                MessageBox.Show("表达式有误" + ex.ToString());
            }
        }

        /// <summary>
        /// Evaluates the XPath currently shown in comboBox1 against the loaded document,
        /// displays the inner HTML of the first matching node in textBox3 and, on success,
        /// copies the expression into comboBox2.
        /// </summary>
        /// <remarks>
        /// The original note on this handler read "mouse wheel" — presumably because the
        /// selection is usually changed by scrolling through the list (not verifiable here).
        /// </remarks>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void comboBox1_SelectedIndexChanged(object sender, EventArgs e)
        {
            try
            {
                HtmlNodeCollection nodes = hd.DocumentNode.SelectNodes(comboBox1.Text);

                // SelectNodes yields null when nothing matches; report that explicitly
                // instead of surfacing a NullReferenceException stack trace to the user.
                if (nodes == null || nodes.Count == 0)
                {
                    MessageBox.Show("表达式有误: 未匹配到任何节点");
                    return;
                }

                textBox3.Text = nodes[0].InnerHtml;
            }
            catch (System.Exception ex)
            {
                // Evaluation failures (for example an empty or malformed expression) end up here.
                MessageBox.Show("表达式有误" + ex.ToString());
                return;
            }

            // Only a successfully evaluated expression is mirrored into the second combo box.
            comboBox2.Text = comboBox1.Text;
        }

        /// <summary>
        /// Lets the user pick a local file, shows its path in textBox1 and its text
        /// (decoded as GB2312) in textBox2, then starts the XPath analysis.
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            textBox3.Text = textBox2.Text = null;

            string path;
            // The dialog wraps native resources, so dispose it deterministically.
            using (OpenFileDialog ofg = new OpenFileDialog())
            {
                ofg.Filter = "网页文件(*.html)|*.html;*.xml;*.htm;*.txt";
                ofg.Multiselect = false;
                if (ofg.ShowDialog() != DialogResult.OK || string.IsNullOrEmpty(ofg.FileName))
                {
                    return;
                }
                path = ofg.FileName;
            }

            textBox1.Text = path;
            textBox1.ReadOnly = true;
            textBox2.ReadOnly = true;

            // Load the chosen file into textBox2. File.ReadAllText opens the file
            // read-only (it never creates a missing file) and always closes it,
            // even when decoding fails.
            string content;
            try
            {
                content = File.ReadAllText(path, Encoding.GetEncoding("GB2312"));
            }
            catch (IOException ex)
            {
                MessageBox.Show("读取文件失败: " + ex.Message);
                return;
            }
            catch (UnauthorizedAccessException ex)
            {
                MessageBox.Show("读取文件失败: " + ex.Message);
                return;
            }
            textBox2.AppendText(content);

            // Start analysing the loaded markup.
            StartAnalyse();
        }

        // Document that all XPath queries in this form run against. StartAnalyse reloads it
        // from textBox2; the remote-fetch handler replaces it with the downloaded document.
        HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();

        /// <summary>
        /// Resets both combo boxes and the XPath cache, re-parses the markup shown in
        /// textBox2 and starts a worker thread (NewMethod) that discovers the XPath of
        /// each element.
        /// </summary>
        private void StartAnalyse()
        {
            comboBox1.Items.Clear();
            comboBox2.Items.Clear();

            // Forget the XPath -> inner HTML pairs of the previously analysed document;
            // otherwise a keyword search would keep returning paths that belong to it.
            // NOTE(review): a worker from an earlier analysis may still be running and
            // adding entries; analyses are not cancelled or serialized here.
            D.Clear();

            hd.LoadHtml(textBox2.Text);

            Thread th = new Thread(NewMethod);
            // Background thread: an unfinished analysis must not keep the process alive
            // after the form has been closed.
            th.IsBackground = true;
            th.Start();
        }

        /// <summary>
        /// Appends one XPath to comboBox1's list and makes it the displayed text of both
        /// combo boxes.
        /// </summary>
        /// <param name="str">XPath expression to publish.</param>
        private void UIContorol(string str)
        {
            this.comboBox1.Items.Add(str);
            this.comboBox1.Text = str;

            // comboBox2 mirrors comboBox1 here; it can also be filled by the keyword search.
            this.comboBox2.Text = str;
        }

        // Signature of the callback that publishes one discovered XPath to the UI.
        private delegate void Dg(string str);

        // XPath -> inner HTML of its first matching node, for every path that resolved
        // during analysis. button3_Click searches these values by keyword.
        Dictionary<string, string> D = new Dictionary<string, string>();

        /// <summary>
        /// Worker-thread body: scans the markup shown in textBox2 tag by tag, maintains a
        /// stack of open elements to build an indexed XPath (for example
        /// <c>./html[1]/body[1]/div[2]</c>) for each opening tag, and publishes every path
        /// that actually resolves in <c>hd</c> to the combo boxes and to <c>D</c>.
        /// </summary>
        private void NewMethod()
        {
            Dg publishXPath = new Dg(UIContorol);

            // Tags that are ignored completely: no XPath is produced and they are never
            // treated as a parent of the tags that follow.
            HashSet<string> skippedTags = new HashSet<string>(StringComparer.Ordinal)
            {
                "link", "meta", "script", "img", "input", "form"
            };

            // HTML void elements: they get an XPath of their own but have no closing tag,
            // so they must not stay on the open-element stack. Leaving e.g. <br> there
            // would turn every later tag into a bogus descendant of it.
            HashSet<string> voidTags = new HashSet<string>(StringComparer.Ordinal)
            {
                "area", "base", "br", "col", "embed", "hr", "param", "source", "track", "wbr"
            };

            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

            // Control properties must be read on the UI thread, not on this worker.
            string html;
            try
            {
                html = (string)textBox2.Invoke(new Func<string>(() => textBox2.Text));
            }
            catch (InvalidOperationException)
            {
                // The control has no window handle (any more), e.g. the form was closed.
                return;
            }

            // Script bodies may contain '<' and '>' that are not markup; strip them first.
            html = Regex.Replace(html, "<script[\\s\\S]*?</script>", "", options);

            // Occurrence counters keyed by "<indexed parent path>/<tag name>".
            Dictionary<string, int> counters = new Dictionary<string, int>();
            // Stack of currently open elements, rooted at "." (the document node).
            List<string> openTags = new List<string>();
            openTags.Add(".");

            foreach (Match tagMatch in Regex.Matches(html, "<([^<>]*?)>", options))
            {
                string tag = tagMatch.Value;

                // Self-closing tags, comments and declarations never open a new level.
                if (tag.EndsWith("/ >", StringComparison.Ordinal)
                    || tag.EndsWith("/>", StringComparison.Ordinal)
                    || tag.StartsWith("<!", StringComparison.Ordinal))
                {
                    continue;
                }

                if (tag.StartsWith("</", StringComparison.Ordinal))
                {
                    // A closing tag pops the stack only when it matches the innermost open element.
                    string opening = tag.Replace("</", "<");
                    if (string.Equals(opening, openTags[openTags.Count - 1], StringComparison.OrdinalIgnoreCase))
                    {
                        openTags.RemoveAt(openTags.Count - 1);
                    }
                    continue;
                }

                string parentPath = XpathRow(openTags, counters);

                // Tag name = text up to the first space, or the whole tag content.
                // Invariant lowering keeps names stable under every UI culture.
                string name = Regex.Match(tag, "(<(?<body>[^>]*?) [^>]*?>)|(<(?<body>[^>]*?)>)", options)
                    .Groups["body"].Value.ToLowerInvariant();
                if (skippedTags.Contains(name))
                {
                    continue;
                }

                string node = "<" + name + ">";
                AddRowNumber(parentPath, node, counters);
                openTags.Add(node);
                string xpath = XpathRow(openTags, counters);
                if (voidTags.Contains(name))
                {
                    openTags.RemoveAt(openTags.Count - 1);
                }

                try
                {
                    HtmlNodeCollection nodes = hd.DocumentNode.SelectNodes(xpath);
                    if (nodes == null || nodes.Count == 0)
                    {
                        // The textual scan and the parsed document disagree; skip this path.
                        continue;
                    }
                    string innerHtml = nodes[0].InnerHtml;

                    // Update the combo boxes and the cache together on the UI thread, so the
                    // dictionary is not written from this worker while the UI reads it.
                    textBox3.Invoke(new MethodInvoker(delegate
                    {
                        publishXPath(xpath);
                        D[xpath] = innerHtml;
                    }));
                }
                catch (InvalidOperationException)
                {
                    // The form was closed while the analysis was running; stop quietly.
                    return;
                }
                catch (Exception ex)
                {
                    // Best effort by design: a path that cannot be evaluated is skipped,
                    // but leave a trace for debugging instead of swallowing it silently.
                    System.Diagnostics.Debug.WriteLine("XPath skipped: " + xpath + " (" + ex.Message + ")");
                }
            }
        }

        /// <summary>
        /// Builds an indexed XPath from a stack of open elements.
        /// </summary>
        /// <remarks>
        /// Each entry of the form <c>&lt;name&gt;</c> becomes <c>name[n]</c>, where <c>n</c> is
        /// the counter stored in <paramref name="dic"/> under "path built so far + name".
        /// Entries that are not bracketed (such as the "." root), or that have no counter,
        /// are emitted unchanged.
        /// </remarks>
        /// <param name="strList">Open elements, outermost first.</param>
        /// <param name="dic">Occurrence counters keyed by "indexed parent path/name".</param>
        /// <returns>The segments joined by '/', without a trailing '/'.</returns>
        private string XpathRow(List<string> strList, Dictionary<string, int> dic)
        {
            StringBuilder sb = new StringBuilder();
            foreach (string str in strList)
            {
                // Default: emit the entry verbatim ("." root, unknown or malformed entries).
                string segment = str;

                // Explicit checks replace the former try/catch, which threw and caught an
                // exception for the "." root on every single call.
                Match m = str == null
                    ? Match.Empty
                    : Regex.Match(str, "<(?<body>[^>]*?)>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
                if (m.Success && dic != null)
                {
                    string name = m.Groups["body"].Value;
                    int index;
                    if (dic.TryGetValue(sb.ToString() + name, out index))
                    {
                        segment = name + "[" + index.ToString() + "]";
                    }
                }

                sb.Append(segment).Append('/');
            }
            return sb.ToString().TrimEnd('/');
        }

        /// <summary>
        /// Counts one more occurrence of a tag underneath the given parent path.
        /// </summary>
        /// <param name="strOldXpatch">Indexed XPath of the parent; an empty string resets the "." counter to 0.</param>
        /// <param name="NewNode">Tag in the form <c>&lt;name&gt;</c>.</param>
        /// <param name="dic">Occurrence counters keyed by "parent path/name"; updated in place.</param>
        private void AddRowNumber(string strOldXpatch, string NewNode, Dictionary<string, int> dic)
        {
            // No parent path: only (re)initialise the root counter.
            if (strOldXpatch == "")
            {
                dic["."] = 0;
                return;
            }

            MatchCollection found = Regex.Matches(NewNode, "<(?<body>[^>]*?)>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
            string key = strOldXpatch + "/" + found[0].Groups["body"].Value;

            // First sighting starts at 1; later sightings increment the stored count.
            int seen;
            if (dic.TryGetValue(key, out seen))
            {
                dic[key] = seen + 1;
            }
            else
            {
                dic.Add(key, 1);
            }
        }


        // Fetch the specified remote web page.
        // NOTE(review): this handler has an empty body; the remote fetch itself is
        // implemented in button2_Click_1. Whether this method is still wired to a
        // control cannot be seen from this file.
        private void button2_Click(object sender, EventArgs e)
        {

        }
        
        // Form load handler; intentionally empty — no start-up work is done here.
        // NOTE(review): presumably kept because the designer references it — confirm before removing.
        private void Form1_Load(object sender, EventArgs e)
        {

        }

        /// <summary>
        /// Keyword search: fills comboBox2 with every cached XPath whose inner HTML
        /// contains the text of textBox5 (case-insensitive) and selects the last match.
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button3_Click(object sender, EventArgs e)
        {
            comboBox2.Items.Clear();

            // Read the keyword once, and compare ordinally instead of lower-casing both
            // strings for every cache entry.
            string keyword = textBox5.Text;
            string[] matches = D
                .Where(pair => pair.Value.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) >= 0)
                .Select(pair => pair.Key)
                .ToArray();
            if (matches.Length == 0)
            {
                return;
            }

            // Add all hits in one batch, then select only the last one. Assigning Text
            // per hit would run a SelectedIndexChanged XPath query for every entry.
            comboBox2.BeginUpdate();
            try
            {
                comboBox2.Items.AddRange(matches);
            }
            finally
            {
                comboBox2.EndUpdate();
            }
            comboBox2.Text = matches[matches.Length - 1];
        }
        /// <summary>
        /// Downloads the page whose address is in textBox1, shows its markup in textBox2
        /// and starts the XPath analysis.
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button2_Click_1(object sender, EventArgs e)
        {
            textBox3.Text = textBox2.Text = null;

            // A TextBox reports an empty string rather than null when nothing is typed,
            // so test for blank input; a "== null" check never fires.
            string strUrl = textBox1.Text;
            if (string.IsNullOrWhiteSpace(strUrl))
            {
                MessageBox.Show("地址不能为空!");
                return;
            }

            HtmlWeb hw = new HtmlWeb();
            try
            {
                // Surrounding whitespace is not part of the address.
                hd = hw.Load(strUrl.Trim());
            }
            catch (System.Exception ex)
            {
                MessageBox.Show(ex.ToString());
                return;
            }
            textBox2.Text = hd.DocumentNode.InnerHtml;

            // Start analysing the downloaded markup.
            StartAnalyse();
        }

        /// <summary>
        /// Re-runs the XPath analysis on the markup currently shown in textBox2.
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button4_Click(object sender, EventArgs e)
        {
            this.StartAnalyse();
        }

        // Text-changed handler for textBox2; intentionally empty — edits do not trigger any work.
        // NOTE(review): presumably kept because the designer references it — confirm before removing.
        private void textBox2_TextChanged(object sender, EventArgs e)
        {

        }

        /// <summary>
        /// Runs the XPath shown in comboBox1, displays the inner HTML of the first
        /// matching node in textBox3 and copies the expression into comboBox2.
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void OnXPath(object sender, EventArgs e)
        {
            try
            {
                HtmlNodeCollection nodes = hd.DocumentNode.SelectNodes(comboBox1.Text);

                // SelectNodes yields null when nothing matches; report that explicitly
                // instead of surfacing a NullReferenceException stack trace to the user.
                if (nodes == null || nodes.Count == 0)
                {
                    MessageBox.Show("表达式有误: 未匹配到任何节点");
                }
                else
                {
                    textBox3.Text = nodes[0].InnerHtml;
                }
            }
            catch (System.Exception ex)
            {
                // Evaluation failures (for example an empty or malformed expression) end up here.
                MessageBox.Show("表达式有误" + ex.ToString());
            }

            // As before, the expression is mirrored into comboBox2 whether or not it matched.
            comboBox2.Text = comboBox1.Text;
        }

        /// <summary>
        /// Collects attribute values below the node selected by comboBox2's XPath: for
        /// every descendant whose tag name is in textBox6, the value of the attribute
        /// named in textBox7 is added to a new Form2 list, which is then shown.
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button6_Click(object sender, EventArgs e)
        {
            // Tag name to look for.
            string strLabel = textBox6.Text;
            // Attribute whose value is collected.
            string strValue = textBox7.Text;
            // Relative query: descendants of the selected node, not of the whole document.
            string strXPathLabel_Val = "descendant::" + strLabel;
            // XPath of the node to search below.
            string strXPath = comboBox2.Text;

            HtmlNode node;
            try
            {
                node = hd.DocumentNode.SelectSingleNode(strXPath);
            }
            catch (System.Exception ex)
            {
                // An empty or malformed expression in comboBox2 must not crash the form.
                MessageBox.Show("表达式有误" + ex.ToString());
                return;
            }
            if (node == null)
            {
                return;
            }

            Form2 f2 = new Form2();
            try
            {
                HtmlNodeCollection hrefs = node.SelectNodes(strXPathLabel_Val);
                if (hrefs == null)
                {
                    // No descendant with that tag name (the result form is still shown, empty).
                    return;
                }

                foreach (HtmlNode href in hrefs)
                {
                    // Look the attribute up once; nodes without it are skipped.
                    HtmlAttribute attribute = href.Attributes[strValue];
                    if (attribute == null)
                    {
                        continue;
                    }
                    f2.AddData2ListView(strLabel, strValue, attribute.Value);
                }
            }
            catch (System.Exception ex)
            {
                MessageBox.Show(ex.ToString());
            }
            finally
            {
                f2.Show();
            }
        }
    }
}

源代码下载地址:

http://download.csdn.net/detail/witch_soya/4978587

【上篇】
【下篇】

抱歉!评论已关闭.