因为前段时间对爬虫有些兴趣,所以研究了一下 HtmlAgilityPack.dll,它可以基于 XPath 来解析 HTML。
关于 XPath 的相关资料可以查看这里:http://www.w3school.com.cn/xpath/index.asp
在网上找了半天,没找到几个 XPath 工具。后来找到一份源代码,在它的基础上自己做了一个 XPath 工具。
如图,这里是通过 XPath 获取百度音乐的歌曲名。
源代码
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using HtmlAgilityPack;

namespace XPathTools
{
    /// <summary>
    /// Main window of the XPath tool. It loads HTML from a local file or a URL,
    /// derives an XPath expression for each element it finds in the markup, and
    /// lets the user evaluate XPath expressions against the loaded document.
    /// </summary>
    public partial class Form1 : Form
    {
        // Matches a whole <script>...</script> block so it can be removed before tag scanning.
        private static readonly Regex ScriptBlockRegex =
            new Regex("<script[\\s\\S]*?</script>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        // Matches any single tag-like token "<...>" that contains no nested angle brackets.
        private static readonly Regex TagRegex =
            new Regex("<([^<>]*?)>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        // Captures the element name of an opening tag: everything after '<' up to the first
        // whitespace character or '>'. The previous pattern only recognised a literal space,
        // so a tab or newline after the tag name produced a bogus element name.
        private static readonly Regex TagNameRegex =
            new Regex("<(?<body>[^\\s>]*)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        // Captures the full content between '<' and '>' of a stack entry such as "<div>".
        private static readonly Regex TagBodyRegex =
            new Regex("<(?<body>[^>]*?)>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        // Elements that are never pushed on the open-element stack. The first six are the
        // ones the original code skipped; the rest are HTML void elements, which have no
        // closing tag and would therefore never be popped, corrupting every later path.
        private static readonly HashSet<string> SkippedTags =
            new HashSet<string>(StringComparer.OrdinalIgnoreCase)
            {
                "link", "meta", "script", "img", "input", "form",
                "br", "hr", "area", "base", "col", "embed", "param", "source", "track", "wbr"
            };

        // Document that the XPath queries of the UI handlers run against.
        HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();

        // Maps each generated XPath to the inner HTML of the node it selects.
        // Only read and written on the UI thread.
        readonly Dictionary<string, string> D = new Dictionary<string, string>();

        // Incremented on the UI thread each time an analysis starts, so results coming
        // from an older, superseded worker thread can be recognised and dropped.
        private int analysisGeneration;

        /// <summary>
        /// Builds the form and subscribes to the selection-changed events of both combo boxes.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
            comboBox1.SelectedIndexChanged += comboBox1_SelectedIndexChanged;
            comboBox2.SelectedIndexChanged += comboBox2_SelectedIndexChanged;
        }

        /// <summary>
        /// Evaluates <paramref name="xpath"/> against the loaded document and shows the inner
        /// HTML of the first matching node in textBox3.
        /// </summary>
        /// <param name="xpath">XPath expression to evaluate.</param>
        /// <returns><c>true</c> when a node was found and displayed; otherwise <c>false</c>
        /// (a message box has already been shown).</returns>
        private bool ShowXPathResult(string xpath)
        {
            try
            {
                HtmlNodeCollection nodes = hd.DocumentNode.SelectNodes(xpath);

                // SelectNodes yields null when nothing matches; report that explicitly
                // instead of relying on a NullReferenceException.
                if (nodes == null || nodes.Count == 0)
                {
                    MessageBox.Show("表达式有误");
                    return false;
                }

                textBox3.Text = nodes[0].InnerHtml;
                return true;
            }
            catch (Exception ex)
            {
                // Any failure while evaluating a user-supplied expression is reported, not fatal.
                MessageBox.Show("表达式有误" + ex.ToString());
                return false;
            }
        }

        /// <summary>
        /// Shows the result of the XPath currently held by comboBox2.
        /// </summary>
        private void comboBox2_SelectedIndexChanged(object sender, EventArgs e)
        {
            ShowXPathResult(comboBox2.Text);
        }

        /// <summary>
        /// Shows the result of the XPath currently held by comboBox1 and, on success,
        /// copies that expression into comboBox2.
        /// </summary>
        /// <remarks>The original comment on this handler read "mouse wheel".</remarks>
        private void comboBox1_SelectedIndexChanged(object sender, EventArgs e)
        {
            if (!ShowXPathResult(comboBox1.Text))
            {
                return;
            }

            comboBox2.Text = comboBox1.Text;
        }

        /// <summary>
        /// Lets the user pick a local file, loads its text (decoded as GB2312) into textBox2
        /// and starts the XPath analysis.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            textBox3.Text = textBox2.Text = null;

            string path;
            using (OpenFileDialog ofg = new OpenFileDialog())
            {
                ofg.Filter = "网页文件(*.html)|*.html;*.xml;*.htm;*.txt";
                ofg.Multiselect = false;
                if (ofg.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                path = ofg.FileName;
            }

            textBox1.Text = path;
            if (string.IsNullOrEmpty(path))
            {
                return;
            }

            textBox1.ReadOnly = true;
            textBox2.ReadOnly = true;

            // Load the selected file into textBox2. File.ReadAllText opens the file for
            // reading only (it never creates a missing file) and always releases the handle.
            string content;
            try
            {
                content = File.ReadAllText(path, Encoding.GetEncoding("GB2312"));
            }
            catch (IOException ex)
            {
                MessageBox.Show(ex.ToString());
                return;
            }
            catch (UnauthorizedAccessException ex)
            {
                MessageBox.Show(ex.ToString());
                return;
            }

            textBox2.AppendText(content);

            // Start analysing the file.
            StartAnalyse();
        }

        /// <summary>
        /// Parses the text of textBox2 into a fresh document and starts a background thread
        /// that derives an XPath for each element found in that text.
        /// </summary>
        private void StartAnalyse()
        {
            comboBox1.Items.Clear();
            comboBox2.Items.Clear();

            // Forget the results of any previous analysis; otherwise stale entries remain
            // searchable and re-analysing the same markup collides with existing keys.
            D.Clear();

            // Snapshot the text here: control properties must not be read on the worker thread.
            string html = textBox2.Text;

            // Use a new document per run so a still-running older worker never queries a
            // document that is being reloaded.
            HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
            document.LoadHtml(html);
            hd = document;

            int generation = ++analysisGeneration;

            Thread worker = new Thread(() => AnalyseMarkup(html, document, generation));
            // A background thread does not keep the process alive after the form is closed.
            worker.IsBackground = true;
            worker.Start();
        }

        /// <summary>
        /// Adds an XPath to comboBox1 and makes it the current text of both combo boxes.
        /// comboBox2 can alternatively be filled by the keyword search.
        /// </summary>
        /// <param name="str">XPath expression to add.</param>
        private void UIContorol(string str)
        {
            comboBox1.Items.Add(str);
            comboBox1.Text = str;
            comboBox2.Text = str;
        }

        /// <summary>
        /// Hands one analysis result to the UI thread, where it is added to comboBox1 and
        /// recorded in <see cref="D"/>.
        /// </summary>
        /// <param name="xpath">Generated XPath expression.</param>
        /// <param name="innerHtml">Inner HTML of the node selected by <paramref name="xpath"/>.</param>
        /// <param name="generation">Generation number of the analysis that produced the result.</param>
        /// <returns><c>false</c> when the worker should stop because its analysis has been
        /// superseded or the UI call failed; otherwise <c>true</c>.</returns>
        private bool PublishResult(string xpath, string innerHtml, int generation)
        {
            bool accepted = false;
            try
            {
                textBox3.Invoke(new Action(() =>
                {
                    if (generation != analysisGeneration)
                    {
                        return; // a newer analysis has started; drop this result
                    }

                    UIContorol(xpath);
                    D[xpath] = innerHtml;
                    accepted = true;
                }));
            }
            catch (InvalidOperationException ex)
            {
                // Raised when the control has no handle or is disposed (ObjectDisposedException
                // derives from InvalidOperationException), e.g. when the form is closing.
                Debug.WriteLine("Stopping XPath analysis: " + ex.Message);
                return false;
            }

            return accepted;
        }

        /// <summary>
        /// Worker-thread routine: scans the markup tag by tag, keeps a stack of open
        /// elements, builds an indexed XPath for every opening tag and publishes each path
        /// that selects a node in <paramref name="document"/>.
        /// </summary>
        /// <param name="html">Markup text to scan.</param>
        /// <param name="document">Parsed document used to verify each generated path.</param>
        /// <param name="generation">Generation number of this analysis run.</param>
        private void AnalyseMarkup(string html, HtmlAgilityPack.HtmlDocument document, int generation)
        {
            // Script bodies may contain '<' and '>' that would be mistaken for tags.
            string markup = ScriptBlockRegex.Replace(html, "");

            // Key: parent path + "/" + element name; value: occurrences seen so far under that parent.
            Dictionary<string, int> siblingCounts = new Dictionary<string, int>();

            // Stack of currently open elements, rooted at "." (the document node).
            List<string> openTags = new List<string>();
            openTags.Add(".");

            foreach (Match tag in TagRegex.Matches(markup))
            {
                string tagText = tag.Value;

                // Self-closing tags, comments and doctype declarations never open an element.
                if (tagText.EndsWith("/ >", StringComparison.Ordinal)
                    || tagText.EndsWith("/>", StringComparison.Ordinal)
                    || tagText.StartsWith("<!", StringComparison.Ordinal))
                {
                    continue;
                }

                if (tagText.StartsWith("</", StringComparison.Ordinal))
                {
                    // Closing tag: pop the stack only when it closes the innermost open element.
                    // Trimming tolerates whitespace before '>' as in "</div >".
                    string closingName = tagText.Substring(2, tagText.Length - 3).Trim();
                    string innermost = openTags[openTags.Count - 1];
                    if (string.Equals("<" + closingName + ">", innermost, StringComparison.OrdinalIgnoreCase))
                    {
                        openTags.RemoveAt(openTags.Count - 1);
                    }

                    continue;
                }

                string tagName = TagNameRegex.Match(tagText).Groups["body"].Value.ToLowerInvariant();
                if (tagName.Length == 0 || SkippedTags.Contains(tagName))
                {
                    continue;
                }

                string parentPath = XpathRow(openTags, siblingCounts);
                string element = "<" + tagName + ">";
                AddRowNumber(parentPath, element, siblingCounts);
                openTags.Add(element);

                string xpath = XpathRow(openTags, siblingCounts);
                string innerHtml;
                try
                {
                    HtmlNode node = document.DocumentNode.SelectSingleNode(xpath);
                    if (node == null)
                    {
                        continue; // the generated path selects nothing in the parsed document
                    }

                    innerHtml = node.InnerHtml;
                }
                catch (Exception ex)
                {
                    // Best effort: a generated path that cannot be evaluated is skipped
                    // rather than allowed to terminate the worker thread.
                    Debug.WriteLine("Skipping XPath '" + xpath + "': " + ex.Message);
                    continue;
                }

                if (!PublishResult(xpath, innerHtml, generation))
                {
                    return;
                }
            }
        }

        /// <summary>
        /// Builds the XPath for the current stack of open elements, e.g. "./html[1]/body[1]".
        /// </summary>
        /// <param name="strList">Stack entries: "." followed by elements written as "&lt;name&gt;".</param>
        /// <param name="dic">Sibling counters keyed by the un-indexed path of each element.</param>
        /// <returns>The entries joined by '/', each element followed by its index when a
        /// counter exists; entries without a counter are emitted verbatim.</returns>
        private string XpathRow(List<string> strList, Dictionary<string, int> dic)
        {
            StringBuilder sb = new StringBuilder();
            foreach (string str in strList)
            {
                string segment = str;
                Match match = TagBodyRegex.Match(str);
                if (match.Success)
                {
                    string name = match.Groups["body"].Value;
                    int index;
                    // The counter key is the path built so far (with its trailing '/') plus the name.
                    if (dic.TryGetValue(sb.ToString() + name, out index))
                    {
                        segment = name + "[" + index.ToString() + "]";
                    }
                }

                sb.Append(segment + "/");
            }

            return sb.ToString().TrimEnd('/');
        }

        /// <summary>
        /// Increments the sibling counter of <paramref name="NewNode"/> under the parent path
        /// <paramref name="strOldXpatch"/>, creating it with the value 1 when absent.
        /// </summary>
        /// <param name="strOldXpatch">XPath of the parent element; when empty, the counter
        /// for "." is reset to 0 and nothing else happens.</param>
        /// <param name="NewNode">Element written as "&lt;name&gt;".</param>
        /// <param name="dic">Sibling counters to update.</param>
        private void AddRowNumber(string strOldXpatch, string NewNode, Dictionary<string, int> dic)
        {
            if (strOldXpatch == "")
            {
                dic["."] = 0;
                return;
            }

            Match match = TagBodyRegex.Match(NewNode);
            if (!match.Success)
            {
                return;
            }

            string key = strOldXpatch + "/" + match.Groups["body"].Value;
            int count;
            dic.TryGetValue(key, out count); // count stays 0 when the key is absent
            dic[key] = count + 1;
        }

        /// <summary>
        /// Empty handler (original comment: "fetch the specified remote page").
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Empty load handler.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Fills comboBox2 with every generated XPath whose inner HTML contains the keyword
        /// typed in textBox5 (case-insensitive) and selects the last match.
        /// </summary>
        private void button3_Click(object sender, EventArgs e)
        {
            comboBox2.Items.Clear();

            string keyword = textBox5.Text;
            List<string> matches = D
                .Where(pair => pair.Value.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) >= 0)
                .Select(pair => pair.Key)
                .ToList();

            if (matches.Count == 0)
            {
                return;
            }

            comboBox2.BeginUpdate();
            try
            {
                foreach (string xpath in matches)
                {
                    comboBox2.Items.Add(xpath);
                }
            }
            finally
            {
                comboBox2.EndUpdate();
            }

            // Select once at the end: assigning Text for every match would re-run the XPath
            // query each time, while the final state is the same.
            comboBox2.Text = matches[matches.Count - 1];
        }

        /// <summary>
        /// Downloads the page at the address in textBox1, shows its HTML in textBox2 and
        /// starts the XPath analysis.
        /// </summary>
        private void button2_Click_1(object sender, EventArgs e)
        {
            textBox3.Text = textBox2.Text = null;

            // TextBox.Text is never null, so test for an empty or blank address instead.
            string url = textBox1.Text;
            if (string.IsNullOrWhiteSpace(url))
            {
                MessageBox.Show("地址不能为空!");
                return;
            }

            HtmlWeb hw = new HtmlWeb();
            try
            {
                hd = hw.Load(url.Trim());
            }
            catch (Exception ex)
            {
                // Malformed addresses and network failures are reported to the user.
                MessageBox.Show(ex.ToString());
                return;
            }

            textBox2.Text = hd.DocumentNode.InnerHtml;

            // Start analysing the tags.
            StartAnalyse();
        }

        /// <summary>
        /// Re-runs the XPath analysis on the current content of textBox2.
        /// </summary>
        private void button4_Click(object sender, EventArgs e)
        {
            StartAnalyse();
        }

        /// <summary>
        /// Empty text-changed handler.
        /// </summary>
        private void textBox2_TextChanged(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Executes the XPath expression held by comboBox1 and shows the result in textBox3.
        /// </summary>
        private void OnXPath(object sender, EventArgs e)
        {
            ShowXPathResult(comboBox1.Text);
        }

        /// <summary>
        /// Collects, below the node selected by the XPath in comboBox2, every descendant
        /// element named in textBox6 that has the attribute named in textBox7, and lists
        /// those attribute values in a new Form2 window.
        /// </summary>
        private void button6_Click(object sender, EventArgs e)
        {
            string strLabel = textBox6.Text;   // element name
            string strValue = textBox7.Text;   // attribute name

            // A "descendant::" step is relative to the selected node, whereas a leading
            // "//" would search the whole document.
            string strXPathLabel_Val = "descendant::" + strLabel;

            string strXPath = comboBox2.Text;

            HtmlNode node;
            try
            {
                node = hd.DocumentNode.SelectSingleNode(strXPath);
            }
            catch (Exception ex)
            {
                // An empty or malformed expression previously escaped the handler unhandled.
                MessageBox.Show(ex.ToString());
                return;
            }

            if (node == null)
            {
                return;
            }

            Form2 f2 = new Form2();
            try
            {
                HtmlNodeCollection hrefs = node.SelectNodes(strXPathLabel_Val);
                if (hrefs == null)
                {
                    return; // the finally block still shows the (empty) result window
                }

                foreach (HtmlNode href in hrefs)
                {
                    HtmlAttribute attribute = href.Attributes[strValue];
                    if (attribute == null)
                    {
                        continue;
                    }

                    f2.AddData2ListView(strLabel, strValue, attribute.Value);
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.ToString());
            }
            finally
            {
                f2.Show();
            }
        }
    }
}
源代码下载地址: