1.简单的获得页面内容
# Download a page and save it to disk (Python 2 / urllib2).
import urllib2

response = urllib2.urlopen('http://www.hao123.com')
try:
    content = response.read()
finally:
    # Always release the network connection, even if read() fails.
    response.close()

# 'wb': the response is raw bytes; text mode ('w') would mangle them
# on Windows via newline translation.
f = open("1.html", 'wb')
try:
    f.write(content)
finally:
    # Close the file even if the write raises (original leaked the handle).
    f.close()
但是有些站点采取了反爬虫保护,直接请求会被拒绝,因此需要把请求伪装成浏览器发出的正常请求。
2.伪装成浏览器访问并支持Cookie
# -*- coding: cp936 -*- import urllib, urllib2, cookielib class Dawn: '''这是一个访问浏览器的方法,目前只是写着玩,已经一年没有写Python,该忘的都忘了吧''' timeout = 30 def __init__(self): '''初始化模块,增加cookie支持''' httpHandler = urllib2.HTTPHandler() httpsHandler = urllib2.HTTPSHandler() cookie = cookielib.CookieJar() cookie_support = urllib2.HTTPCookieProcessor(cookie) opener = urllib2.build_opener(cookie_support, httpHandler, httpsHandler) urllib2.install_opener(opener) def getHeader(self): '''返回浏览器header''' header = { "User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13", #"User-Agent" = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language":"zh-cn,zh;q=0.5", #"Accept-Encoding":"gzip,deflate", "Accept-Charset":"GB2312,utf-8;q=0.7,*;q=0.7", "Keep-Alive":"115", "Connection":"keep-alive" } return header def request(self, url, headers=None, data = None): '''请求处理''' if headers is None: header = self.getHeader() #开始设置请求数据 req = urllib2.Request( url = url, headers = header ) if data is not None: data = urllib.urlencode(data) #请求开始 try: request = urllib2.urlopen(req, data, self.timeout) source = request.read() request.close() except: source = None #print "connect faild..." return source if __name__ == "__main__": dawn = Dawn() urls=["www.baidu.com","www.163.com","oschina.net","www.sina.com"] for item in urls: url="http://"+item fileName=item+".html" content=dawn.request(url) f=open(fileName,"w") f.write(content) f.close()