最近写了个豆瓣个人电台自动下载加心歌曲的小程序,基本能够下载,但需要手动将"http://douban.fm/mine?type=liked"的各页全都保存下来,有点麻烦。由于还没有实现程序自动登录豆瓣的功能,暂时先这样用吧。
#!/usr/bin/python2.7 # -*- coding:utf -*- import urllib import re import socket import cookielib import urllib2 socket.setdefaulttimeout(1) def getpag(url): done = False try: response = urllib.urlopen(url) re = response.read() done = True except Exception as e: print "error in getpag({0})".format(url) if done: return re else: return "" def removehtml(s): p = re.compile(r'(<.*?>)|(&.*?;)', re.S) return p.sub("", s) def removeotherword(s): p = re.compile(r'((([\(\[{])|(\xef\xbc\x88)).*?(([\)\]}])|(\xef\xbc\x89)))|(^\s+)|(\s+$)') return p.sub("", s) # login douban & sv cookie # todo def logindouban(): loginurl = "http://www.douban.com/accounts/login" data = urllib.urlencode ({'source':'simple','form_email':'vodmaker@gmail.com','form_password':'xxx','remember':'on',}) print data cj = cookielib.CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) urllib2.install_opener(opener) opener.addheaders = [("User-agent", "Mozilla/5.0 (X11; Linux i686; rv:2.0.1) Gecko/20100101 Firefox/4.0.1")] req = urllib2.Request(loginurl, data) print urllib2.urlopen(req).geturl() # # s1 match exactly to s2 def matchexact(s1, s2): ps1 = re.compile(s1, re.I) if ps1.search(s2) is not None: return True else: return False # s1 match to s2 def matchmost(s1, s2): s1 = re.compile(r'\s').sub("\\s*", s1) ps1 = re.compile(s1, re.I) if ps1.search(s2) is not None: return True else: return False # [[name, artist], ...] 
per list_pag def getmusiclist_perpag(list_pag): ptable = re.compile(r'<table\s*class="olts"\s*width="100%">.*?</table>', re.S) htable = ptable.search(list_pag).group() ptbody = re.compile(r'<tbody>.*?</tbody>', re.S) htbody = ptbody.search(htable).group() ptr = re.compile(r'<tr>\s*<td>(.*?)</td>.*?<span>(.*?)</span>.*?</tr>', re.S) musiclist = [] for m in ptr.finditer(htbody): print "music:" + removeotherword(m.group(1)) + "artist:" + removeotherword(m.group(2)) musiclist.append([removeotherword(m.group(1)), removeotherword(m.group(2))]) return musiclist # download music from mp3.yahoo.com # parameter muscilist [[name, artist], ...] def downloadfromyahoo(musiclist): listurl = "http://music.yahoo.cn/s?q={0}&m=0" for ma in musiclist: music = ma[0] artist = ma[1] print "Music:\t" + music + "\tArtist:\t" + artist + "is Downloading..." u = listurl.format(urllib.quote_plus(music)) listpag = getpag(u) ptable = re.compile(r'<div class="yst-music">.*?</table>', re.S) if ptable.search(listpag) is None: print "No search result of {0} in yahoo.cn".format(music) continue htable = ptable.search(listpag).group() ptr = re.compile(r'<tr>\s*<td class="m_song">\s*<a href=".*?url=(.*?)"' r'.*?>(.*?)</a>' r'.*?<td class="m_singer">.*?>(.*?)</a>' r'.*?<td.*?<td>(.*?)</td>' r'.*?<td>(.*?)[mM][bB]' r'.*?</tr>' , re.S); find = False for m in ptr.finditer(htable): downurl = urllib.unquote(m.group(1)) music_t = removeotherword(removehtml(m.group(2))) artist_t = removeotherword(removehtml(m.group(3))) type_t = removeotherword(removehtml(m.group(4))) size_t = removeotherword(removehtml(m.group(5))) if matchexact(music_t, music) and matchexact(artist_t, artist) and float(size_t) > 2: print "download from :" + downurl + "" try: music_stream = urllib.urlopen(downurl).read() open("./down/"+music+"."+type_t, "wb").write(music_stream) find = True print "download success: music:{0}, artist:{1}".format(music, artist) break except Exception as e: continue print e if not find: for m in 
ptr.finditer(htable): downurl = urllib.unquote(m.group(1)) music_t = removeotherword(removehtml(m.group(2))) artist_t = removeotherword(removehtml(m.group(3))) type_t = removeotherword(removehtml(m.group(4))) size_t = removeotherword(removehtml(m.group(5))) if matchmost(music_t, music) and matchmost(artist_t, artist) and float(size_t) > 1: print "download from :" + downurl + "" try: music_stream = urllib.urlopen(downurl).read() open("./down/"+music+"."+type_t, "wb").write(music_stream) find = True print "download success: music:{0}, artist:{1}".format(music, artist) break except Exception as e: continue print e if not find: print "download failed: music:{0}, artist:{1}".format(music, artist) # end downloadfromyahoo func musiclist = [] for i in range(17): f = open("{0}.html".format(i), "r") listp = f.read() musiclist += getmusiclist_perpag(listp) downloadfromyahoo(musiclist)
貌似豆瓣有屏蔽程序访问页面的措施,目前仍纠结于如何实现登录的部分,不能保证一定能实现自动登录豆瓣并抓取加心页面的功能,程序更新期限未知。