参考了别人的代码。给代码添加了多线程和Queue的结合应用。
#!/usr/bin/env python # -*- coding:utf-8 -*- ''' Created on 2013-3-21 @author: naughty ''' # author: wklken from sgmllib import SGMLParser import os import urllib import urllib2 import urlparse from Queue import Queue from threading import Thread save_path = '/home/zoer' passUrls = set() qimg = Queue() class URLLister(SGMLParser): def reset(self): SGMLParser.reset(self) self.urls = [] self.imgs = [] def start_a(self, attrs): href = [ v for k, v in attrs if k == "href" and v.startswith("http")] if href: self.urls.extend(href) def start_img(self, attrs): src = [ v for k, v in attrs if k == "src" and v.startswith("http") ] if src: self.imgs.extend(src) def get_url_of_page(url, if_img=False): ''' 获取一个页面上的所有链接。 if_img:如果为true,则获取的是页面上的所有图片的链接 ''' urls = [] try: f = urllib2.urlopen(url, timeout=3).read() url_listen = URLLister() url_listen.feed(f) if if_img: urls.extend(url_listen.imgs) else: urls.extend(url_listen.urls) except urllib2.URLError, e: print e return urls def get_page_html(begin_url, depth, main_site_domain): ''' 递归处理页面 ''' if depth <= 0: return print 'handle ' + begin_url passUrls.add(begin_url) #=========================================================================== # 读取本页面上的图片 #=========================================================================== urls = get_url_of_page(begin_url, True) #=========================================================================== # 添加图片到queue #=========================================================================== for murl in urls: firstindex = murl.find('?') if firstindex != -1: print firstindex murl = murl[:firstindex] print 'add img url:' + murl qimg.put(murl) #=============================================================================== # 读取本页面上的链接 #=============================================================================== urls = get_url_of_page(begin_url) if urls: for murl in urls: if not murl in passUrls: get_page_html(murl, depth - 1, main_site_domain) class DPThread(Thread): ''' 下载线程 ''' def 
run3(self): while True: filename = qimg.get() filename = filename.split("/")[-1] #dist = os.path.join(save_path, filename) dist = save_path + '/' + filename print dist print 'try connecting ' + filename if filename.endswith('jpg') or filename.endswith('png') or filename.endswith('gif') or filename.endswith('bmp') or filename.endswith('jpeg'): print 'downloading ' + filename dist = dist.replace('\\', '/') urllib.urlretrieve(filename, dist, None) print "Done: ", filename qimg.task_done() def run(self): while True: murl = qimg.get() print 'one '+murl filename = murl.split("/")[-1] urlopen = urllib.URLopener() try: fp = urlopen.open(murl) data = fp.read() fp.close() f = open(save_path + "/" + filename, 'w+b') f.write(data) f.close() except IOError: print "download error!" + url qimg.task_done() if __name__ == "__main__": #=========================================================================== # 抓取图片首个页面 #=========================================================================== url = "http://image.baidu.com" #url='http://bringgooglereaderback.com/' #=========================================================================== # 图片保存路径 #=========================================================================== if not os.path.exists(save_path): os.mkdir(save_path) #=========================================================================== # 遍历深度 #=========================================================================== max_depth = 1 main_site_domain = urlparse.urlsplit(url).netloc get_page_html(url, max_depth, main_site_domain) for i in range(1): t = DPThread() t.setDaemon(True) t.start() qimg.join() print 'end'
上面代码中有一个run方法和一个run3方法。
run3方法中使用urllib的urlretrieve来读取图片。发现这样速度很慢。所以直接在run方法中用了urllib的URLopener来打开图片地址并读取数据直接写到本地磁盘的文件中。
----------------------------------------------------------------------
遇到严重的问题:
由于一开始线程没有设置成Daemon的，所以即使Queue中没有内容了，脚本还是不会退出。设置成Daemon之后，在Queue中没有内容之后，脚本就会退出了。
---------------------------------------------------------
原因解释如下:
python中的thread的一些机制和C/C++不同：在C/C++中，主线程结束后，其子线程会默认被主线程kill掉。而在python中，主线程结束后，默认会等待子线程结束后，主线程才退出。
python对于thread的管理中有两个函数:join和setDaemon
- join:如在一个线程B中调用threada.join(),则threada结束后,线程B才会接着threada.join()往后运行。
- setDaemon:主线程A启动了子线程B，调用b.setDaemon(True)，则主线程结束时，会把子线程B也杀死，与C/C++中的默认效果是一样的。