现在的位置: 首页 > 综合 > 正文

Python – 获取校内(人人网)所有好友的照片并存储到本地

2012年06月25日 ⁄ 综合 ⁄ 共 3512字 ⁄ 字号 评论关闭

涉及内容包括:

0.POST DATA到校内网

1.正则表达式

2.urllib2的使用

3.Cache机制

[TODO]

1.SMTP的使用:自动发邮件到指定邮箱

2.SQLITE的使用:把图片存储到数据库

有问题或者建议,欢迎联系:lihe21327@gmail.com

注意:运行前要把代码末尾 main() 中用于 Login 的账号和密码改成你自己的。

只贴代码:

#coding=utf-8
from urllib import urlencode
from time import sleep
from random import randint
import sqlite3,cookielib,time,re,os
import urllib2,shutil

# Module-level counter initialised to zero; it is not referenced anywhere
# else in the visible code (NOTE(review): looks unused -- confirm before removing).
add_num = 0
class RenrenRobot:
    """Crawler that logs in to renren.com and mirrors friends' photos to disk.

    Typical use is ``RenrenRobot().Login(email, password)``: a successful
    login installs a cookie-aware global ``urllib2`` opener, walks the friend
    list and downloads every photo that is not already present on disk.
    The code targets Python 2 (``urllib2`` / ``cookielib``) and Windows paths.

    Attributes:
        allCount: number of photo URLs seen so far.
        newCount: number of photos that were not found on disk.
        strSpiderBasePath: root directory holding one ``id_<friend id>``
            sub-directory per friend.
        strBaseFilePath: directory of the friend currently being processed
            (set by ``Download``).
    """

    # Links from an album page to the individual photo pages.
    _ALBUM_LINK_RE = re.compile(
        r'<li>.<a href="(http://photo.renren.com/photo/[^<>]*?)" class="picture">',
        re.S | re.I)
    # The full-size image URL embedded in a photo page.
    _PHOTO_SRC_RE = re.compile(
        r'<img id="photo" src="(http://.*?)" title=".*?" style=".*?">',
        re.S | re.I)
    # Friend ids inside the friend-list response.
    _FRIEND_ID_RE = re.compile(r'{"id":([0-9]*?),"vip":', re.S | re.I)
    # Characters turned into '_' when a URL becomes a file name.  '/', ':',
    # '&' and '\' were always replaced; '?', '*', '"', '<', '>' and '|' are
    # added because Windows rejects them in file names as well.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[/:&\\?*"<>|]')

    def __init__(self):
        """Print the banner, reset the counters and create the download root."""
        print('*********************************')
        print('*Hi,This is Hark\'s Robot For RR*')
        print('*********************************')
        print('Contact With me: lihe21327@gmail.com')
        print('Version: 0.1')
        print('')
        self.allCount = 0
        self.newCount = 0
        # Create Dir
        self.strSpiderBasePath = r'c:\leehark_spider'
        if not os.path.exists(self.strSpiderBasePath):
            os.mkdir(self.strSpiderBasePath)

    @staticmethod
    def _safe_filename(url):
        """Return ``url`` with every character unusable in a file name replaced by '_'."""
        return RenrenRobot._UNSAFE_FILENAME_CHARS.sub('_', url)

    @staticmethod
    def _read_url(url):
        """Fetch ``url`` and return its body, always closing the response."""
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    def _save_image(self, url, path):
        """Download ``url`` into ``path`` without ever leaving a partial file there.

        The data is written to ``path + '.part'`` and renamed only after the
        copy finished, so an interrupted transfer is not mistaken for a cached
        image on the next run.
        """
        response = urllib2.urlopen(url)
        try:
            # Get Image Size (0 when the server sends no Content-Length)
            length = int(response.info().getheader('Content-Length') or 0)
            print('Image length =  %s' % (length,))

            # Copy Content To Disk in 64 KiB chunks
            partial_path = path + '.part'
            with open(partial_path, 'wb') as out:
                shutil.copyfileobj(response, out, 0x10000)
        finally:
            response.close()
        os.rename(partial_path, path)

    def Download(self, id):
        """Fetch every photo of the 'relatives' album of the friend ``id``.

        Images are stored under ``<strSpiderBasePath>\\id_<id>``; an image
        whose file already exists there is skipped.  ``allCount`` and
        ``newCount`` are updated as photos are seen / fetched.

        Args:
            id: the friend's renren id (the name shadows the builtin but is
                kept for caller compatibility).

        Raises:
            Whatever ``urllib2.urlopen`` or the file operations raise; network
            errors are not handled here.
        """
        self.strBaseFilePath = '%s\\id_%s' % (self.strSpiderBasePath, id)
        if not os.path.exists(self.strBaseFilePath):
            os.mkdir(self.strBaseFilePath)

        album_url = 'http://photo.renren.com/photo/%s/album/relatives' % (id,)
        response = urllib2.urlopen(album_url)
        try:
            print(response.geturl())
            print(response.getcode())
            print(response.info())
            album_html = response.read()
        finally:
            response.close()

        # findall() returns an empty list when nothing matches, so no
        # explicit "no result" check is needed.
        for photo_page_url in self._ALBUM_LINK_RE.findall(album_html):
            print('Picture\'s URL:  %s' % (photo_page_url,))
            photo_html = self._read_url(photo_page_url)
            for image_url in self._PHOTO_SRC_RE.findall(photo_html):
                self.allCount += 1
                # Construct FilePath
                target_path = (self.strBaseFilePath + '\\'
                               + self._safe_filename(image_url))

                if os.path.exists(target_path):
                    # Hit the Old one
                    print('Fetched Already :  %s' % (image_url,))
                    continue

                # This is New Image,Fetch it
                self.newCount += 1
                print('Fetching New Image :')
                print('%s  -->  %s' % (image_url, target_path))
                self._save_image(image_url, target_path)

                # Do not overload download
                time.sleep(1)

    def WalkFriends(self):
        """Read the friend list and call ``Download`` for every friend id.

        A failure to open the friend list is reported on stdout and ends the
        walk; errors raised while downloading propagate to the caller.
        """
        friend_list_url = r"http://friend.renren.com/myfriendlistx.do"
        try:
            response = urllib2.urlopen(friend_list_url)
        except Exception as exc:
            print('****** visit MY FRIEND LIST * BIG ERROR ******')
            print('Reason: %s' % (exc,))
            return

        try:
            friend_list = response.read()
        finally:
            response.close()
        for friend_id in self._FRIEND_ID_RE.findall(friend_list):
            print('Fetching :  %s' % (friend_id,))
            self.Download(friend_id)

    def Login(self, username, pwd):
        """Log in to renren.com, crawl all friends and print the totals.

        Installs a process-wide ``urllib2`` opener with a cookie jar so that
        every later ``urllib2.urlopen`` call carries the session cookies.

        Args:
            username: account e-mail address.
            pwd: account password.  It is sent in the POST body but masked in
                the debug output.
        """
        # Set Cookie Jar
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        urllib2.install_opener(opener)

        # Construct Post Data
        data = {
            'email': username,
            'password': pwd,
            'origURL': '',
            'domain': 'renren.com',
            'formName': '',
            'method': '',
            'isplogin': 'true',
            'submit': '登陆',
        }
        web_data = urlencode(data)
        # Never echo the clear-text password to the console.
        print('web_data :  %s' % (urlencode(dict(data, password='******')),))
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2'}
        req = urllib2.Request(url='http://www.renren.com/PLogin.do',
                              data=web_data,
                              headers=header)
        try:
            result = urllib2.urlopen(req)
        except Exception as exc:
            print('****** %s BIG ERROR ******' % (username,))
            print('Reason: %s' % (exc,))
            return

        print(result.geturl())
        print(result.getcode())
        print(result.info())
        print('Go to YOURPAGE')
        self.WalkFriends()
        print('Picture Count :  %s' % (self.allCount,))
        print('New Count :  %s' % (self.newCount,))
            
            
def main(username='lihe21327@hotmail.com', password='******'):
    """Create a RenrenRobot and run a full crawl with the given credentials.

    Args:
        username: renren account e-mail; defaults to the sample address.
        password: account password; the default is only a placeholder and
            must be replaced with a real password for the login to succeed.
    """
    robot = RenrenRobot()
    robot.Login(username, password)


if __name__ == '__main__':
    main()

抱歉!评论已关闭.