现在的位置: 首页 > 综合 > 正文

Python爬虫——人人好友相册多线程下载(二)

2014年04月05日 ⁄ 综合 ⁄ 共 3085字 ⁄ 字号 评论关闭

改进:加入了多线程下载,提高了照片匹配的准确度,好友相册能够完整下载

一、Cookie获得

chrome 浏览器 Mac 下按 Command + Alt + I,Windows 下按 F12,打开开发者工具进行抓包

二、抓取图片

http://friend.renren.com/groupsdata   从此页面用正则表达式获取全部好友的ID

http://photo.renren.com/photo/' + 好友ID + '/album/relatives/profile  
从此页面可获得好友相册的ID

http://photo.renren.com/photo/好友ID/album-相册ID?frommyphoto   从相册页面获取照片ID

三、多线程下载

Download类继承了threading.Thread,重写了run()方法,传入了一个存放照片URL的set(),遍历集合进行下载

在实际抓取照片中,每一个相册将会开启一个线程进行下载

代码需附上你自己的人人Cookie

# coding=utf8
import os
import re
import threading
import urllib2


# Session cookie copied from the browser's developer tools (see the notes
# above); every request below authenticates with this header.
COOKIE = '你自己人人的Cookie'
HEADERS = {'cookie': COOKIE}

# find title     
def find_title(mypage):
    """Return the page's <title> text sanitized for use as a file name.

    mypage -- decoded HTML source (unicode string).
    Returns u'undefined' when the page has no <title> tag.
    """
    match = re.search(r'<title>(.+?)</title>', mypage, re.S)
    if match:
        title = match.group(1)
    else:
        print('find no title')
        title = u'undefined'
    # BUG FIX: the sanitization below used to live inside the no-match branch,
    # so real titles kept characters that are illegal in file names:
    # \ / : * ? " < > |
    for ch in u'\\/:*?"<>|':
        title = title.replace(ch, u'')
    return title

def login_renren(url):
    """Fetch *url* with the renren cookie and return the decoded page.

    Prints the page title as a progress indicator.  Returns an empty
    unicode string when the request or the utf-8 decode fails.
    """
    try:
        req = urllib2.Request(url, headers=HEADERS)
        page = urllib2.urlopen(req).read()
        page = page.decode('utf-8')
    except (urllib2.URLError, UnicodeDecodeError) as e:
        # Narrowed from a bare except: a bare except also hid programming
        # errors (e.g. NameError) and even KeyboardInterrupt.
        print('request failed for %s: %s' % (url, e))
        return u''
    print(find_title(page))
    return page

def find_friendlist():
    """Download the friends page and write one friend ID per line to id.txt.

    Prints a diagnostic when the cookie is rejected or no IDs are found.
    """
    url_friend = 'http://friend.renren.com/groupsdata'  # friend list
    req = urllib2.Request(url_friend, headers=HEADERS)
    try:
        page = urllib2.urlopen(req).read().decode('utf-8')
    except (urllib2.URLError, UnicodeDecodeError):
        print('cookie is error')
        page = u''
    # Tokens look like '"fid":12345,' -- the digits are the friend ID.
    fid_tokens = re.findall(r'"fid":\d*?,', page)
    if not fid_tokens:
        print('find no friendID')
        return
    # 'with' guarantees the file is closed even if a write fails
    # (the original leaked the handle on error).
    with open('id.txt', 'w') as friend_file:
        for token in fid_tokens:
            friend_file.write(token[6:-1])  # strip '"fid":' prefix and ','
            # '\n', not os.linesep: text mode already translates newlines,
            # so os.linesep would produce '\r\r\n' on Windows and break the
            # one-ID-per-line format the readers rely on.
            friend_file.write('\n')

# http://photo.renren.com/photo/XXXXXXXXX/album/relatives/profile
# http://photo.renren.com/photo/XXXXXXXXX/album-535947620?frommyphoto
# http://photo.renren.com/photo/XXXXXXXXX/album/relatives/profile
# http://photo.renren.com/photo/XXXXXXXXX/album-535947620?frommyphoto
def find_ablumUrl():
    """For every friend ID in id.txt, collect album URLs into albumlist.txt.

    One line per album URL; duplicates within a friend's page are removed.
    """
    # 'with' closes both files even on error (the original leaked both).
    with open('id.txt') as id_file:
        with open('albumlist.txt', 'w') as album_file:
            for line in id_file:
                fid = line.strip()
                if not fid:
                    continue  # tolerate blank lines instead of aborting
                photo_url = ('http://photo.renren.com/photo/' + fid +
                             '/album/relatives/profile')
                print(photo_url)
                data = login_renren(photo_url)
                album_ids = re.findall(
                    r'http://photo.renren.com/photo/(.+?)frommyphoto', data)
                if not album_ids:
                    # BUG FIX: the result variable was previously not reset
                    # per friend, so on a miss the PREVIOUS friend's albums
                    # were written again.
                    print('find no album id')
                # set() removes duplicate album ids within this page.
                for album_id in set(album_ids):
                    album_url = ('http://photo.renren.com/photo/' +
                                 album_id + 'frommyphoto')
                    print(album_url)
                    album_file.write(album_url)
                    # '\n' (not os.linesep) keeps one URL per line on Windows.
                    album_file.write('\n')

def download_album():
    """Spawn one Download thread per album listed in albumlist.txt."""
    with open('albumlist.txt') as album_file:
        for line in album_file:
            # BUG FIX: strip the trailing newline before using the line as a
            # URL (the original passed 'http://...\n' to login_renren).
            album_url = line.strip()
            if not album_url:
                continue
            data = login_renren(album_url)
            # 'large:' entries hold the full-size jpg URLs (xlarge also exists).
            matches = re.findall(r'large:.*?\.jpg', data, re.I)
            if not matches:
                # BUG FIX: the match list was previously carried over between
                # iterations, so an album page with no match re-downloaded the
                # PREVIOUS album's photos.
                print('found no image')
            photo_urls = set()
            for m in matches:
                url = m[7:]  # drop the "large:'" prefix, keep the bare URL
                photo_urls.add(url)
                print(url)  # test
            try:
                worker = Download(photo_urls)
                print(worker.name)
                worker.start()
            except RuntimeError:
                # Narrowed from a bare except; Thread.start raises
                # RuntimeError on failure.
                print(u'download error   ' + album_url)

#download by thread
class Download(threading.Thread):
    def __init__(self, que):
        threading.Thread.__init__(self)
        self.que = que

    def run(self):
        for i in self.que:
            data = urllib2.urlopen(i).read()
            path = str(i[-15:-5]) + '.jpg'
            f = open(path, 'wb')  # 存储下载的图片
            f.write(data)
            f.close()
        return

#start
def start_photo_grap():
    login_renren(URL)
    find_friendlist()
    find_ablumUrl()
    download_album()


# renren front page; fetched first as a quick check that the cookie works.
URL = r'http://www.renren.com'

# Script entry point.
if __name__ == '__main__':
    start_photo_grap()
    print 'success '

抱歉!评论已关闭.