现在的位置: 首页 > 综合 > 正文

网页提取正文

2012年01月22日 ⁄ 综合 ⁄ 共 6507字 ⁄ 字号 评论关闭

上半年发了一篇《网页提取正文代码》的博客,今天想到其中还有些问题没有解决,于是花了几个小时做了一下修正,再将新的代码贴出来,供需要的朋友使用。补充说明:这些代码只适合新闻、资讯、博客类的网页,论坛、知道(问答)、微博等都不适用。欢迎朋友提出更好的解决方案,谢谢。 chenping@live.cn

 

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;

namespace SmartReader
{
    /// <summary>
    /// Extracts the main body text of a web page by downloading its HTML,
    /// loading it into a WinForms <see cref="WebBrowser"/> control and picking
    /// the DOM element with the best text-to-markup ratio.
    /// Intended for news/article/blog style pages.
    /// </summary>
    public class WebContent
    {
        /// <summary>Elements whose inner text is not longer than this are ignored as candidates.</summary>
        private const int MinTextLength = 100;

        /// <summary>Timeout, in milliseconds, applied to the HTTP request.</summary>
        private const int RequestTimeoutMilliseconds = 30000;

        /// <summary>
        /// Regex fragment matching the text that precedes a charset name, e.g.
        /// <c>charset=</c> or <c>charset="</c> (covers both the http-equiv and the HTML5 meta forms).
        /// </summary>
        private const string CharsetPrefixPattern = @"charset\s*=\s*[""']?";

        /// <summary>Regex fragment matching the first character that terminates a charset name.</summary>
        private const string CharsetSuffixPattern = @"[""'\s;/>]";

        /// <summary>Creates an extractor with no default address.</summary>
        public WebContent() { }

        /// <summary>Creates an extractor with a default address.</summary>
        /// <param name="url">Address used by <see cref="GetContent"/> when it is called with a null or empty url.</param>
        public WebContent(string url)
        {
            this.Url = url;
        }

        private string Url { get; set; }

        private string Content { get; set; }

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its main text.
        /// </summary>
        /// <param name="url">
        /// Address of the page. When null or empty, the address given to the constructor is used.
        /// </param>
        /// <returns>
        /// The extracted text, or <see cref="string.Empty"/> when no address is available,
        /// the download or parsing fails, or no element holds more than
        /// <see cref="MinTextLength"/> characters of text.
        /// </returns>
        public string GetContent(string url)
        {
            string target = string.IsNullOrEmpty(url) ? this.Url : url;

            // Reset first so that a failed run can never hand back the text of a previous call.
            this.Content = string.Empty;
            if (string.IsNullOrEmpty(target))
            {
                return string.Empty;
            }

            ThreadWebBrowser(target);
            return this.Content ?? string.Empty;
        }

        /// <summary>
        /// Runs <see cref="BeginCatch"/> on a dedicated STA thread (the WebBrowser control
        /// is created there) and blocks until that thread has finished.
        /// </summary>
        /// <param name="url">Address passed through to the worker.</param>
        private void ThreadWebBrowser(string url)
        {
            Thread thread = new Thread(new ParameterizedThreadStart(BeginCatch));
            thread.SetApartmentState(ApartmentState.STA);
            thread.Start(url);

            // Join only returns once the worker has exited, so the DoEvents polling loop
            // that used to follow it could never execute and has been removed.
            thread.Join();
        }

        /// <summary>
        /// Worker body: downloads the page, decodes it with the charset the page declares,
        /// renders it into a WebBrowser document and stores the best text block in
        /// <see cref="Content"/>. Never throws; failures are traced and leave
        /// <see cref="Content"/> empty.
        /// </summary>
        /// <param name="obj">The page address (boxed string supplied by <see cref="Thread.Start(object)"/>).</param>
        private void BeginCatch(object obj)
        {
            try
            {
                string url = Convert.ToString(obj);

                // Download once and decode from memory; re-decoding with another
                // charset no longer costs a second HTTP request.
                byte[] rawPage = DownloadPage(url);
                string htmlCode = DecodeHtml(rawPage, Encoding.Default);

                string charset = SniffwebCodeReturnList(htmlCode, CharsetPrefixPattern, CharsetSuffixPattern);
                Encoding declared = ResolveEncoding(charset);

                // Compare code pages rather than names: aliases such as "gbk"/"gb2312"
                // can denote the same encoding. A missing or unknown charset keeps the default decoding.
                if (declared != null && declared.CodePage != Encoding.Default.CodePage)
                {
                    htmlCode = DecodeHtml(rawPage, declared);
                }

                using (WebBrowser webBrowser = new WebBrowser())
                {
                    webBrowser.ScriptErrorsSuppressed = true;
                    webBrowser.Navigate("about:blank");

                    HtmlDocument document = webBrowser.Document;
                    if (document == null)
                    {
                        return;
                    }

                    // NOTE(review): the downloaded markup is handed to the browser control as-is;
                    // presumably scripts embedded in the page can run here - confirm this is acceptable.
                    document.Write(htmlCode);

                    HtmlElement body = document.Body;
                    if (body == null)
                    {
                        return;
                    }

                    this.Content = ExtractMainText(body);
                }
            }
            catch (Exception ex)
            {
                // Best effort by design: an exception escaping a thread would take the process down,
                // so record it instead of swallowing it silently.
                this.Content = string.Empty;
                System.Diagnostics.Trace.TraceError("WebContent: failed to extract text from {0}: {1}", obj, ex);
            }
        }

        /// <summary>
        /// Returns the inner text of the descendant of <paramref name="body"/> that maximises
        /// textLength * textLength / outerHtmlLength, considering only elements whose text is
        /// longer than <see cref="MinTextLength"/>. On ties the earliest element wins.
        /// </summary>
        /// <param name="body">The document body whose descendants are examined.</param>
        /// <returns>The winning element's text, or <see cref="string.Empty"/> when there is no candidate.</returns>
        private static string ExtractMainText(HtmlElement body)
        {
            string bestText = string.Empty;
            long bestScore = -1;

            HtmlElementCollection allElement = body.All;
            int count = allElement.Count;
            for (int i = 0; i < count; i++)
            {
                HtmlElement element = allElement[i];

                string text = element.InnerText;
                if (text == null || text.Length <= MinTextLength)
                {
                    continue;
                }

                string outerHtml = element.OuterHtml;
                if (string.IsNullOrEmpty(outerHtml))
                {
                    continue;
                }

                // long arithmetic: squaring an int length overflows once the text
                // exceeds roughly 46,000 characters, which used to corrupt the ranking.
                long score = (long)text.Length * text.Length / outerHtml.Length;
                if (score > bestScore)
                {
                    bestScore = score;
                    bestText = text;
                }
            }

            return bestText;
        }

        /// <summary>
        /// Downloads the raw bytes of the resource at <paramref name="url"/>.
        /// gzip and deflate responses are decompressed by the request itself.
        /// </summary>
        /// <param name="url">Address to request with HTTP GET.</param>
        /// <returns>The (decompressed) response body.</returns>
        private static byte[] DownloadPage(string url)
        {
            HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(url);
            webRequest.Timeout = RequestTimeoutMilliseconds;
            webRequest.Method = "GET";
            webRequest.UserAgent = "Mozilla/4.0";

            // Replaces the hand-written Accept-Encoding header plus gzip-only decoding:
            // the header advertised deflate, but deflate responses were never decompressed.
            webRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

            using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
            using (Stream responseStream = webResponse.GetResponseStream())
            using (MemoryStream buffer = new MemoryStream())
            {
                byte[] chunk = new byte[8192];
                int read;
                while ((read = responseStream.Read(chunk, 0, chunk.Length)) > 0)
                {
                    buffer.Write(chunk, 0, read);
                }

                return buffer.ToArray();
            }
        }

        /// <summary>Decodes downloaded page bytes into text.</summary>
        /// <param name="rawPage">The response body.</param>
        /// <param name="encode">Encoding to decode with.</param>
        /// <returns>The decoded HTML source.</returns>
        private static string DecodeHtml(byte[] rawPage, Encoding encode)
        {
            using (MemoryStream stream = new MemoryStream(rawPage, false))
            using (StreamReader reader = new StreamReader(stream, encode))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>Maps a charset name to an <see cref="Encoding"/>.</summary>
        /// <param name="charset">Charset name as found in the page; may be null or empty.</param>
        /// <returns>The encoding, or null when the name is blank or not recognised.</returns>
        private static Encoding ResolveEncoding(string charset)
        {
            if (charset == null)
            {
                return null;
            }

            string name = charset.Trim();
            if (name.Length == 0)
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(name);
            }
            catch (ArgumentException)
            {
                return null;
            }
        }

        /// <summary>
        /// Returns the shortest non-empty text found between the first occurrence of
        /// <paramref name="wordsBegin"/> and the following <paramref name="wordsEnd"/>
        /// (case-insensitive).
        /// </summary>
        /// <param name="code">HTML source to search.</param>
        /// <param name="wordsBegin">Regular-expression fragment preceding the wanted text (not escaped).</param>
        /// <param name="wordsEnd">Regular-expression fragment following the wanted text (not escaped).</param>
        /// <returns>The captured text, or <see cref="string.Empty"/> when nothing matches or the pattern is invalid.</returns>
        private static string SniffwebCodeReturnList(string code, string wordsBegin, string wordsEnd)
        {
            if (string.IsNullOrEmpty(code))
            {
                return string.Empty;
            }

            try
            {
                // Only the first match is needed, so stop there instead of collecting every match.
                // RegexOptions.Compiled is dropped: the pattern is built per call and used once.
                Regex regex = new Regex(wordsBegin + @"(?<code>[\s\S]+?)" + wordsEnd, RegexOptions.IgnoreCase);
                Match match = regex.Match(code);
                return match.Success ? match.Groups["code"].Value : string.Empty;
            }
            catch (ArgumentException)
            {
                // Invalid pattern fragments: keep the original "no result" contract.
                return string.Empty;
            }
        }
    }
}

 

调用方式很简单,见下面的代码:

 

 1             WebContent webContent = new WebContent();

2             string content = webContent.GetContent("http://www.cnblogs.com/kandy/archive/2011/08/30/how_to_get_web_content_from_url.html");
3             MessageBox.Show(content);

 

 

 

 

 

抱歉!评论已关闭.