现在的位置: 首页 > 综合 > 正文

C# 獲取網頁中的字符串(不包含html)大顯神通

2014年09月05日 ⁄ 综合 ⁄ 共 5350字 ⁄ 字号 评论关闭

1:首先要獲取網頁中的全部信息,有下面3種方法

 

 using System.Net;
using System.IO;

   private void Page_Load(object sender, System.EventArgs e)
   {
    // 在此处放置用户代码以初始化页面
    string url="http:www.sina.com";
    System.IO.Stream stream=null;
    WebClient client=new WebClient();
    stream=client.OpenRead(url);
   
    StreamReader readerOfStream = new StreamReader(stream,System.Text.Encoding.GetEncoding("GB2312"));
    string aa= readerOfStream.ReadToEnd();
    Response.Write(aa);
    // Close the stream.
    stream.Close();

   }

/// <summary>
/// Subscribes <c>Page_Load</c> to this object's <c>Load</c> event.
/// </summary>
private void InitializeComponent()
{
    System.EventHandler loadHandler = new System.EventHandler(this.Page_Load);
    this.Load += loadHandler;
}

Microsoft Visual Studio .net 2003 下运行成功。

接上文,以下是普遍采用的 c# 用System.Net 读取网页源代码 的书写方式。

   System.Net.HttpWebRequest req;
    System.Net.HttpWebResponse res;
    string url="www.taobao.com";
    req = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
    res = (System.Net.HttpWebResponse)req.GetResponse();
    System.IO.StreamReader strm = new System.IO.StreamReader(res.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312"));
    string aa=strm.ReadToEnd();
    Response.Write(aa);

   方法2

在WindowsApplication 用WebBrowser控件实现 读取网页。  
在工具箱里右击,选添加/删除选项,在弹出的对话框里选Com组件选项卡,找到   
Microsoft Web 浏览器 选定 后确定, 在工具箱里选择WebBrowser控件,拖放到窗体上,然后写代码:   

/// <summary>
/// Form load handler: asks the <c>axWebBrowser1</c> control to navigate to a fixed address.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void Form1_Load(object sender, System.EventArgs e)
{
    // Navigate takes its trailing arguments by reference, so placeholder
    // objects are needed: a boxed zero and an empty string.
    System.Object zeroArg = 0;
    System.Object emptyStringArg = "";

    this.axWebBrowser1.Navigate("www.taobao.com", ref zeroArg, ref emptyStringArg, ref emptyStringArg, ref emptyStringArg);
}

 

2:把剛才獲取的網頁信息作為一個字符串(去掉html格式)
  
/// <summary>
/// Strips HTML markup from a string and decodes a small set of common
/// character entities, leaving plain text.
/// </summary>
/// <param name="Htmlstring">HTML source text. Null or empty input is returned unchanged.</param>
/// <returns>
/// The input with script blocks, tags, comment remnants, stray angle brackets
/// and CRLF pairs removed. The entities quot, amp, lt, gt, nbsp, iexcl, cent,
/// pound and copy (named or numeric) are decoded; any other numeric entity is
/// dropped. The result is plain text: HTML-encode it before writing it into a page.
/// </returns>
public static string DelHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return Htmlstring;
    }

    const System.Text.RegularExpressions.RegexOptions ignoreCase =
        System.Text.RegularExpressions.RegexOptions.IgnoreCase;

    // Remove script blocks. Singleline lets '.' cross line breaks, otherwise a
    // multi-line script body would survive as visible text.
    Htmlstring = System.Text.RegularExpressions.Regex.Replace(
        Htmlstring,
        @"<script[^>]*?>.*?</script>",
        "",
        ignoreCase | System.Text.RegularExpressions.RegexOptions.Singleline);

    // Remove tags.
    Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", ignoreCase);

    // Remove a line break together with the whitespace that follows it.
    Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", ignoreCase);

    // Remove comment remnants that the tag pattern did not consume.
    Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"-->", "", ignoreCase);
    Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"<!--.*", "", ignoreCase);

    // Strings are immutable, so the result of Replace must be assigned.
    // Stray raw brackets are removed BEFORE entity decoding so that text the
    // author deliberately escaped as &lt; / &gt; is kept.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");
    Htmlstring = Htmlstring.Replace("\r\n", "");

    // Decode entities in one pass so that decoded output is never decoded a
    // second time (e.g. "&amp;lt;" must become "&lt;", not "<").
    Htmlstring = System.Text.RegularExpressions.Regex.Replace(
        Htmlstring,
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
        new System.Text.RegularExpressions.MatchEvaluator(DecodeHtmlEntity),
        ignoreCase);

    return Htmlstring;
}

/// <summary>
/// Maps one matched entity (group 1 holds the name or "#number") to its replacement text.
/// </summary>
private static string DecodeHtmlEntity(System.Text.RegularExpressions.Match match)
{
    switch (match.Groups[1].Value.ToLowerInvariant())
    {
        case "quot":
        case "#34":
            return "\"";
        case "amp":
        case "#38":
            return "&";
        case "lt":
        case "#60":
            return "<";
        case "gt":
        case "#62":
            return ">";
        case "nbsp":
        case "#160":
            return " ";
        case "iexcl":
        case "#161":
            return "\u00A1";
        case "cent":
        case "#162":
            return "\u00A2";
        case "pound":
        case "#163":
            return "\u00A3";
        case "copy":
        case "#169":
            return "\u00A9";
        default:
            // Any other numeric entity is dropped.
            return "";
    }
}
 

 

3:可以提取字符串中的漢字

 (1)方法

string str="56756 後即可更換過分 215 後即可更換過分 12131sfagsa";
// Delete every character outside U+4E00..U+9FA5 so only the Chinese characters remain.
// The escapes need backslashes; with forward slashes the class contains the reversed
// range "0-/" and the Regex constructor throws.
string ret=System.Text.RegularExpressions.Regex.Replace(str,@"[^\u4e00-\u9fa5]","");

 

 

(2)方法

/// <summary>
/// Returns the characters of <paramref name="str"/> that fall in the range
/// U+4E00..U+9FFF, in their original order.
/// </summary>
/// <param name="str">Text to filter; null or empty yields an empty string.</param>
/// <returns>A string containing only the characters in the range above.</returns>
private string getTemp(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return string.Empty;
    }

    System.Text.StringBuilder kept = new System.Text.StringBuilder(str.Length);

    foreach (char c in str)
    {
        // Compare the UTF-16 code unit directly. The whole range lies in the
        // Basic Multilingual Plane, and Char.ConvertToUtf32(str, i) throws when
        // i lands on the second half of a surrogate pair (e.g. inside an emoji).
        if (c >= '\u4e00' && c <= '\u9fff')
        {
            kept.Append(c);
        }
    }

    return kept.ToString();
}

 

4:可以去掉數字

 

 public static string RemoveNumber(string key)
        {
            return Regex.Replace(key, @"/d", "");
        }

 

5:可以去掉非數字

  public static string RemoveNotNumber(string key)
        {
            return Regex.Replace(key, @"[^/d]*", "");
        }

抱歉!评论已关闭.