
Automatically downloading "hearted" songs from Douban FM (a Python implementation)

2013-09-18

 

I recently wrote a small program that automatically downloads the hearted ("liked") songs from my personal Douban FM station. Downloading basically works, but you first have to save every page of "http://douban.fm/mine?type=liked" by hand, which is a bit of a pain. Since the program cannot log into Douban yet, this is how it has to be used for now.
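In the meantime, one possible workaround for the manual step (not part of the script below) is to copy the session cookie from a browser that is already logged in and let a small helper save each page of the liked list as 0.html, 1.html, ..., which is exactly what the main script expects. This is only a sketch under assumptions: the cookie string is a placeholder, and the "start" pagination parameter is a guess rather than a documented douban.fm interface.

import urllib2

def save_liked_pages(cookie, pages=17, step=15):
	# cookie: the raw Cookie header value copied from a logged-in browser session
	# the "start" pagination parameter below is hypothetical
	for i in range(pages):
		url = "http://douban.fm/mine?type=liked&start={0}".format(i * step)
		req = urllib2.Request(url, headers={
			"Cookie": cookie,
			"User-agent": "Mozilla/5.0 (X11; Linux i686; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
		})
		html = urllib2.urlopen(req).read()
		open("{0}.html".format(i), "wb").write(html)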

#!/usr/bin/python2.7
# -*- coding: utf-8 -*-

import urllib
import re
import socket
import cookielib
import urllib2

socket.setdefaulttimeout(1)

def getpag(url):
	# fetch a url and return the page content, or "" on failure
	try:
		response = urllib.urlopen(url)
		pag = response.read()
		response.close()
		return pag
	except Exception as e:
		print "error in getpag({0})".format(url)
		return ""

def removehtml(s):
	# strip html tags and entities such as &nbsp;
	p = re.compile(r'(<.*?>)|(&.*?;)', re.S)
	return p.sub("", s)

def removeotherword(s):
	# strip bracketed annotations (including full-width parentheses, utf-8
	# bytes \xef\xbc\x88 / \xef\xbc\x89) and leading/trailing whitespace
	p = re.compile(r'((([\(\[{])|(\xef\xbc\x88)).*?(([\)\]}])|(\xef\xbc\x89)))|(^\s+)|(\s+$)')
	return p.sub("", s)
	


# login douban & save cookie
# todo: not wired up yet -- douban seems to block this plain form post
def logindouban():
	loginurl = "http://www.douban.com/accounts/login"
	data = urllib.urlencode({'source': 'simple', 'form_email': 'vodmaker@gmail.com', 'form_password': 'xxx', 'remember': 'on'})
	print data
	cj = cookielib.CookieJar()
	opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
	urllib2.install_opener(opener)
	opener.addheaders = [("User-agent", "Mozilla/5.0 (X11; Linux i686; rv:2.0.1) Gecko/20100101 Firefox/4.0.1")]
	req = urllib2.Request(loginurl, data)
	print urllib2.urlopen(req).geturl()
#



# True if pattern s1 (case-insensitive) is found in s2
def matchexact(s1, s2):
	ps1 = re.compile(s1, re.I)
	return ps1.search(s2) is not None
	

# looser match: whitespace in s1 is relaxed to \s* before searching s2
def matchmost(s1, s2):
	s1 = re.compile(r'\s').sub("\\s*", s1)
	ps1 = re.compile(s1, re.I)
	return ps1.search(s2) is not None
	

# parse one saved liked-list page into [[name, artist], ...]
def getmusiclist_perpag(list_pag):
	ptable = re.compile(r'<table\s*class="olts"\s*width="100%">.*?</table>', re.S)	
	htable = ptable.search(list_pag).group()
	ptbody = re.compile(r'<tbody>.*?</tbody>', re.S)
	htbody = ptbody.search(htable).group()
	ptr = re.compile(r'<tr>\s*<td>(.*?)</td>.*?<span>(.*?)</span>.*?</tr>', re.S)
	musiclist = []
	for m in ptr.finditer(htbody):
		name = removeotherword(m.group(1))
		artist = removeotherword(m.group(2))
		print "music:\t" + name + "\tartist:\t" + artist
		musiclist.append([name, artist])
	return musiclist


# download music from music.yahoo.cn search results
# parameter musiclist: [[name, artist], ...]
def downloadfromyahoo(musiclist):
	listurl = "http://music.yahoo.cn/s?q={0}&m=0"
	for ma in musiclist:
		music = ma[0]
		artist = ma[1]
		print "Music:\t" + music + "\tArtist:\t" + artist + "is Downloading..."
		u = listurl.format(urllib.quote_plus(music))		
		listpag = getpag(u)
		ptable = re.compile(r'<div class="yst-music">.*?</table>', re.S)
		if ptable.search(listpag) is None:
			print "No search result of {0} in yahoo.cn".format(music)
			continue
		htable = ptable.search(listpag).group()
		ptr = re.compile(r'<tr>\s*<td class="m_song">\s*<a href=".*?url=(.*?)"'
						 r'.*?>(.*?)</a>'
						 r'.*?<td class="m_singer">.*?>(.*?)</a>'
						 r'.*?<td.*?<td>(.*?)</td>'
						 r'.*?<td>(.*?)[mM][bB]'
						 r'.*?</tr>'
						 , re.S)
		find = False
		for m in ptr.finditer(htable):
			downurl = urllib.unquote(m.group(1))
			music_t = removeotherword(removehtml(m.group(2)))
			artist_t = removeotherword(removehtml(m.group(3)))
			type_t = removeotherword(removehtml(m.group(4)))
			size_t = removeotherword(removehtml(m.group(5)))
			if matchexact(music_t, music) and matchexact(artist_t, artist) and float(size_t) > 2:
				print "download from: " + downurl
				try:
					music_stream = urllib.urlopen(downurl).read()
					with open("./down/" + music + "." + type_t, "wb") as f:
						f.write(music_stream)
					find = True
					print "download success: music:{0}, artist:{1}".format(music, artist)
					break
				except Exception as e:
					print e
					continue
		if not find:
			for m in ptr.finditer(htable):
				downurl = urllib.unquote(m.group(1))
				music_t = removeotherword(removehtml(m.group(2)))
				artist_t = removeotherword(removehtml(m.group(3)))
				type_t = removeotherword(removehtml(m.group(4)))
				size_t = removeotherword(removehtml(m.group(5)))
				if matchmost(music_t, music) and matchmost(artist_t, artist) and float(size_t) > 1:
					print "download from: " + downurl
					try:
						music_stream = urllib.urlopen(downurl).read()
						with open("./down/" + music + "." + type_t, "wb") as f:
							f.write(music_stream)
						find = True
						print "download success: music:{0}, artist:{1}".format(music, artist)
						break
					except Exception as e:
						print e
						continue
		if not find:
			print "download failed: music:{0}, artist:{1}".format(music, artist) 
# end downloadfromyahoo func
	
# the pages of "http://douban.fm/mine?type=liked" are expected to have been saved
# manually as 0.html ... 16.html in the current directory, and the ./down
# directory must already exist
musiclist = []
for i in range(17):
	f = open("{0}.html".format(i), "r")
	listp = f.read()
	f.close()
	musiclist += getmusiclist_perpag(listp)
downloadfromyahoo(musiclist)

 

Douban appears to have measures that block programs from accessing its pages. I am still stuck on how to implement the login part, so I cannot promise that logging in and automatically fetching the hearted-songs pages will ever work; there is no timeline for updates.
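For reference, if the form login in logindouban() ever goes through, the follow-up would be straightforward, because urllib2.install_opener() makes every later urllib2.urlopen() call reuse the cookie-carrying opener. A minimal sketch, assuming the login succeeds (which, as noted above, Douban currently seems to prevent):

def fetchlikedpage():
	# relies on logindouban() having installed the cookie-carrying opener
	logindouban()
	return urllib2.urlopen("http://douban.fm/mine?type=liked").read()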
