
Scraping and Validating Proxies with Python Multithreading

2013-09-05

In the simplest case, Python's urllib2.urlopen() function is all you need.
Then again, one of the sites I scrape likes to ban visitors, so I had to gather a batch of proxies and rotate through them while fetching its pages.
Some sites refuse requests from programs, so you have to add some request headers.
Some sites require a login, and that is where Cookies come in.
Finally, to improve throughput it is best to use multiple threads. (One pitfall to note: urlopen() goes through a single global opener object, so if you run several threads with a different proxy in each, you must not call urlopen(); build a separate opener per thread and call opener.open() instead, as in the sketch below.)
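Here is a minimal sketch of that idea; the fetch_via_proxy name, the proxy address format, and the URL are placeholders of mine, not part of the script further down. Each thread builds its own opener with its own ProxyHandler, attaches a CookieJar for login-protected sites, and fetches through opener.open():

import urllib2, cookielib

def fetch_via_proxy(url, proxy):    # proxy is an 'ip:port' string
    cookies = cookielib.CookieJar()    # per-thread cookie store, for sites that need a login
    opener = urllib2.build_opener(
        urllib2.ProxyHandler({'http': 'http://'+proxy}),
        urllib2.HTTPCookieProcessor(cookies))
    request = urllib2.Request(url)
    # extra headers make the request look like a browser and help avoid 403 errors
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 6.0)')
    return opener.open(request).read()    # NOT urllib2.urlopen(request)

Because each opener is a local object, nothing is shared between threads, which is exactly what urlopen()'s global opener cannot guarantee.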

Below is the proxy-scraping script I wrote in Python. I am no longer inside the education network, but a proxy still comes in handy from time to time :)


# -*- coding: cp936 -*-
import urllib2,re,thread,time

import socket
socket.setdefaulttimeout(10)    # give up on any single request after 10 seconds
   
#----------------------- proxy-scraping functions -----------------------#

def getcnproxy(name):
    # walk www.proxycn.com's HTTP proxy list page by page until the
    # last-page marker ('下一页|尾页', i.e. "next page|last page") shows up
    pagenum=0
    result=[]
    getallpages=0
    trycount=0
    while getallpages==0 and trycount<=6:
        pagenum=pagenum+1
        url='http://www.proxycn.com/html_proxy/http-'+str(pagenum)+'.html'
        try:
            html=urllib2.urlopen(url)
            for line in html:
                if '''onDblClick="clip''' in line:
                    # the ip:port string sits inside clip('...') in the row's double-click handler
                    proxy=line[line.find("clip('")+6:line.find("')")]
                    lock.acquire()
                    print name,proxy
                    lock.release()
                    result.append(proxy)
                if '下一页|尾页' in line:
                    getallpages=1
        except:
            trycount=trycount+1
            pagenum=pagenum-1    # retry the same page after a failure
    proxylist[0]=result
    return result

def getproxycn(name):
    # www.cnproxy.com keeps its list on pages proxy1.html .. proxy10.html
    pagenum=0
    result=[]
    trycount=0
    while pagenum<=9 and trycount<=2:
        pagenum=pagenum+1
        url='http://www.cnproxy.com/proxy'+str(pagenum)+'.html'
        try:
            html=urllib2.urlopen(url)
            for line in html:
                if "HTTP" in line:
                    # splice the ip (the cell text before the '&#820' entity) and the ':port' out of the table row
                    proxy=line[line.find('<td>')+4:line.find('&#820')]+line[line.find(':'):line.find('</td><td>')]
                    lock.acquire()
                    print name,proxy
                    lock.release()
                    result.append(proxy)
        except:
            trycount=trycount+1
            pagenum=pagenum-1    # retry the same page after a failure
    proxylist[1]=result
    return result

   
#----------------------- end of proxy-scraping functions -----------------------#

#----------------------- proxy-validation functions -----------------------#

def proxycheckone(proxy):
    # fetch www.facebook.com through the given proxy and time the round trip
    url='http://www.facebook.com'
    proxy_url = 'http://'+proxy
    proxy_support = urllib2.ProxyHandler({'http': proxy_url})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    r=urllib2.Request(url)
    r.add_header("Accept-Language","zh-cn")    # add headers so the request looks like a browser; helps avoid 403 errors
    r.add_header("Content-Type","text/html; charset=gb2312")
    r.add_header("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)")
    trycount=1
    while trycount<=2:
        try:
            T0=time.time()
            f=opener.open(r)    # use the per-proxy opener, not the global urlopen()
            data=f.read()
            if 'Welcome to Facebook!' in data:
                T=time.time()-T0
                break
            else:return []    # reachable, but served the wrong page
        except:
            time.sleep(3)
            trycount=trycount+1
    if trycount>2:
        return []    # both attempts failed
    else:
        return proxy+'$'+str(trycount)+'#'+str(T)    # format: ip:port$tries#seconds

def proxycheck(idnum):
    # worker thread: pop proxies off the shared list until it is empty
    while 1:
        r.acquire()
        try:
            i=proxylist[0]
            del proxylist[0]
            r.release()
        except:
            r.release()
            x[idnum]=1    # list exhausted, mark this thread as done
            break
        b=proxycheckone(i)
        if len(b)>0:
            a.acquire()
            y.append(b)    # collect the proxies that passed
            a.release()

#----------------------- end of proxy-validation functions -----------------------#

#----------- scrape the proxies; they are appended to proxies.txt, separated by \n -----------#

#x='''    (remove the leading '#' here and on the matching line below to turn this block into a string and skip the scraping stage)
lock=thread.allocate_lock()
proxylist=[[],[]]    # slot 0 is filled by getcnproxy, slot 1 by getproxycn
thread.start_new(getcnproxy,('cnproxy',))
thread.start_new(getproxycn,('proxycn',))
while [] in proxylist:    # poll until both scraper threads have filled their slot
    time.sleep(30)
proxylist=proxylist[0]+proxylist[1]
w=open('proxies.txt','a')
w.write('\n'.join(proxylist))
w.close()
del proxylist
print 'got all proxies!\n\n'
#'''

#----------- scraping done; the proxies are in proxies.txt, separated by \n -----------#

#----------------------- validate the proxies -----------------------#

w=open('proxies.txt')
# drop any tab-separated annotations and blank lines, then de-duplicate
proxylist=list(set((re.sub(r'(\t+[^\n]*\n|\n)',',',w.read())).split(',')))
while '' in proxylist:
    del proxylist[proxylist.index('')]
w.close()

lock=thread.allocate_lock()
r=thread.allocate_lock()    # guards the shared proxylist
a=thread.allocate_lock()    # guards the result list y
y=[]    # proxies that pass validation
x=[0]*120    # per-thread "finished" flags

for idnum in range(0,120):    # 120 checker threads
    thread.start_new(proxycheck,(idnum,))

while 0 in x:    # poll until every checker thread has flagged itself done
    print len(proxylist),sum(x),"left",len(y)
    time.sleep(10)

w=open('proxies.txt','w')
w.write(re.sub('^\n','',re.sub(r'\n+','\n','\n'.join(y)+'\n')))    # overwrite with only the working proxies
w.close()

#----------------------- validation done -----------------------#
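Each line that survives validation has the form ip:port$tries#seconds, as built in proxycheckone(). As a small usage sketch (the parse_result helper is hypothetical, not part of the script above), the validated list can be re-read and sorted fastest-first like this:

def parse_result(line):    # hypothetical helper: split 'ip:port$tries#seconds'
    addr, rest = line.split('$', 1)
    tries, seconds = rest.split('#', 1)
    return addr, int(tries), float(seconds)

results = [parse_result(l) for l in open('proxies.txt') if l.strip()]
results.sort(key=lambda t: t[2])    # smallest response time first
for addr, tries, seconds in results:
    print addr, '-', '%.2fs after %d tries' % (seconds, tries)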


