现在的位置: 首页 > 综合 > 正文

Python爬虫——人人好友相册自动下载(一)

2014年04月05日 ⁄ 综合 ⁄ 共 2977字 ⁄ 字号 评论关闭

#版本一:使用COOKIE登录,不支持多线程下载,也不能刷新加载更多的好友
#登录参数:需要你的COOKIE和人人ID(即个人主页网址末尾的9位数字)
# coding=utf8
import os
import re
import urllib2


# Cookie string of a logged-in renren session; the value below is a
# placeholder that has to be replaced with your own cookie.
COOKIE = '你自己人人的COOKIE'
# Request headers passed to urllib2.Request by login_renren.
HEADERS = {'cookie' : COOKIE}

# find title     
def find_title(mypage):
    """Return the page's <title> text, made safe for use as a file name.

    Args:
        mypage: HTML source of the page, as text.

    Returns:
        The text of the first ``<title>`` element with the characters that
        are illegal in file names (backslash, / : * ? " < > |) removed.
        When the page has no title, a notice is printed and
        ``u'undefined'`` is returned.
    """
    match = re.search(r'<title>(.+?)</title>', mypage, re.S)
    if match:
        title = match.group(1)
    else:
        print(u'find no title')
        title = u'undefined'
    # Sanitise unconditionally: a title that was actually found is the one
    # that can contain forbidden characters, not the fixed fallback.
    return re.sub(r'[\\/:*?"<>|]', '', title)

def login_renren(url):
    """Fetch *url* with the session cookie and return the decoded page.

    The request is sent with the module-level HEADERS. The title of the
    fetched page is printed as a progress indicator.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8, or an empty unicode string
        when the request or the decoding fails.
    """
    try:
        req = urllib2.Request(url, headers=HEADERS)
        page = urllib2.urlopen(req).read().decode('utf-8')
    except Exception:
        # Deliberately best-effort: any network, HTTP or decoding failure
        # yields an empty page so the crawl can continue. Unlike a bare
        # ``except:`` this still lets Ctrl-C / SystemExit through.
        return u''

    try:
        print(find_title(page))
    except UnicodeError:
        # The title is informational only; a console that cannot encode
        # it must not cause the downloaded page to be thrown away.
        pass
    return page
    
def downImage(filePath, savePath):
    """Download every JPEG listed in *filePath* and store it under *savePath*.

    Args:
        filePath: Text file holding one image URL per line. Lines that do
            not contain ``jpg`` are ignored.
        savePath: Prefix put in front of every output file name. To save
            into a directory it must end with a path separator; an empty
            string saves into the current directory.

    Each image is written to ``savePath + <stem> + <counter> + '.jpg'``,
    where ``<stem>`` is the five characters that precede the last four
    characters of the URL (its ``.jpg`` extension) and ``<counter>``
    counts the images downloaded by this call, starting at 0.
    """
    count = 0  # running index, keeps the generated file names distinct
    with open(filePath, 'r') as sour:
        for line in sour:
            # Remove the line terminator: it is not part of the URL and
            # must not be sent to the server.
            url = line.strip()
            # Only treat the line as an image URL when it mentions 'jpg'
            # somewhere after its first character.
            if url.find('jpg') <= 0:
                continue
            data = urllib2.urlopen(url).read()
            path = savePath + url[-9:-4] + str(count) + '.jpg'
            with open(path, 'wb') as f:
                f.write(data)
            count += 1
      
# http://www.renren.com/你的人人ID号码#!//friend/manage     
def find_friendId(loginID):
    """Collect the IDs of the user's friends and write them to ``id.txt``.

    The friend-management page of *loginID* is fetched through
    login_renren, every ``namecard="..." href`` fragment in it is scanned
    for a nine-digit number, and each number found is written on its own
    line. ``id.txt`` is overwritten on every call.

    Args:
        loginID: The renren ID of the logged-in user; it is inserted into
            ``http://www.renren.com/<loginID>#!//friend/manage``.
    """
    friendManager_url = r'http://www.renren.com/' + str(loginID) + '#!//friend/manage'
    print(friendManager_url)
    page = login_renren(friendManager_url)

    cards = re.findall(r'namecard=".*?"\shref', page)
    if not cards:
        print('find no friend id')

    id_pattern = re.compile(r'\d{9}')
    with open('id.txt', 'w') as sour:
        for card in cards:
            found = id_pattern.search(card)
            # Write only when this fragment really contains an ID;
            # otherwise the previous friend's ID would be written again
            # (or the name would be undefined on the first fragment).
            if found:
                sour.write(found.group())
                # Text mode already translates '\n' to the platform line
                # ending, so os.linesep must not be written here.
                sour.write('\n')
    
# http://photo.renren.com/photo/你好友的ID/album/relatives/profile   		这里是你好友的相册目录
# http://photo.renren.com/photo/你好友的ID/album-535947620?frommyphoto            这里是你好友的相册
def find_AblumUrl():
    """Write the album URLs of every friend in ``id.txt`` to ``ablumlist.txt``.

    For each friend ID read from ``id.txt`` the page
    ``http://photo.renren.com/photo/<id>/album/relatives/profile`` is
    fetched through login_renren and every link of the form
    ``http://photo.renren.com/photo/...frommyphoto`` found in it is
    written on its own line. ``ablumlist.txt`` is overwritten on every
    call. The URLs visited and found are printed as progress output.
    """
    pattern = re.compile(r'http://photo.renren.com/photo/(.+?)frommyphoto')
    with open('id.txt') as id_file:
        with open('ablumlist.txt', 'w') as ablum:
            for line in id_file:
                # strip() removes any line terminator ('\n' or '\r\n'),
                # unlike cutting off exactly one character.
                friend_id = line.strip()
                if not friend_id:
                    continue
                photo_url = 'http://photo.renren.com/photo/' + friend_id + '/album/relatives/profile'
                print(photo_url)
                data = login_renren(photo_url)

                # Start from this friend's matches only, so a friend
                # without albums does not repeat the previous friend's.
                albums = pattern.findall(data)
                if not albums:
                    print('find no ablum id')

                for album in albums:
                    print(album)
                    album_list = 'http://photo.renren.com/photo/' + album + 'frommyphoto'
                    print(album_list)
                    ablum.write(album_list)
                    # Text mode supplies the platform line ending itself.
                    ablum.write('\n')
        
# xLarge:'http://fmn.rrfmn.com/fmn058/20130603/0035/original_1l5N_40d00000290b125d.jpg' 这是一个大图的URL地址,通过他可以进行下载    
def getImageUrl(data, filePath):
    """Extract the large-image URLs from an album page and save them.

    Every fragment of *data* that starts with ``xLarge:`` and ends with
    ``.jpg`` is taken as one image entry, for example
    ``xLarge:'http://host/path/name.jpg``.

    Args:
        data: Source text of the album page.
        filePath: File that receives the URLs, one per line. It is
            overwritten; when nothing is found it is left empty and a
            notice is printed.
    """
    matches = re.findall(r'xLarge:.*?\.jpg', data)
    if not matches:
        print('found no image')

    with open(filePath, 'w') as sour:
        for match in matches:
            # Drop the first 8 characters: "xLarge:" plus the opening quote.
            sour.write(match[8:])
            # Text mode converts '\n' to the platform line ending; writing
            # os.linesep here would produce '\r\r\n' on Windows.
            sour.write('\n')

def searchAlbum(filePath):
    """Download the images of every album listed in ``ablumlist.txt``.

    For each album URL the page is fetched with login_renren, its image
    URLs are written to *filePath* by getImageUrl, and those images are
    then downloaded by downImage into the module-level ``savePath``.

    Args:
        filePath: Scratch file for the image URLs of the album currently
            being processed; it is overwritten for every album.
    """
    with open('ablumlist.txt') as album_file:
        for line in album_file:
            # The line terminator is not part of the URL.
            album_url = line.strip()
            if not album_url:
                continue
            data = login_renren(album_url)
            getImageUrl(data, filePath)
            downImage(filePath, savePath)
    
# Your renren ID (placeholder, replace it): http://www.renren.com/XXXXXXXXX
LOGINID = '你的ID号'
URL = r'http://www.renren.com'
# Prefix for the saved image files; empty saves into the current directory.
savePath = r''
# Scratch file that holds the image URLs of the album being downloaded.
filePath = r'image_list.txt'

# Pipeline: friend IDs -> album URLs -> image URLs and downloads.
find_friendId(LOGINID)
find_AblumUrl()
# searchAlbum already downloads the images of every album, so no separate
# downImage call is needed afterwards: it would only fetch the last album
# a second time, or fail on a missing image list when no album was found.
searchAlbum(filePath)
print('OK ')
    

抱歉!评论已关闭.