python百度贴吧爬虫

现在的位置: 首页 > 综合 > 正文

python百度贴吧爬虫

2013年12月03日 ⁄ 综合 ⁄ 共 3396字 ⁄ 字号小中大 ⁄ 评论关闭

推荐，我的项目地址：https://github.com/youyudehexie/BaiduTiebaSpider

tiebaspider/__init__.py（爬虫模块，example.py 通过 `from tiebaspider import TiebaSpider` 导入）

#encoding=utf-8
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )

import urllib2
import urllib
import re
from BeautifulSoup import BeautifulSoup 

def _readtopics(filename='topics.txt'):
    """Return the whole content of the saved topic-link file.

    Args:
        filename: Path of the file to read. Defaults to 'topics.txt'.

    Returns:
        The file content as a string, or None when the file cannot be
        opened or read (a failure message is printed in that case).
    """
    try:
        # The with-block closes the handle even when read() fails.
        with open(filename, 'r') as fd:
            return fd.read()
    except (IOError, OSError):
        # Report the path that was actually used, not just the default.
        print('fail to read %s' % filename)
        return None

def _writetopics(hrefs, filename='topics.txt'):
    """Save topic links to a file, one link per line.

    Any previous content of the file is replaced.

    Args:
        hrefs: Iterable of link strings to store.
        filename: Path of the file to write. Defaults to 'topics.txt'.

    Returns:
        None. An I/O failure is reported with a printed message instead
        of an exception.
    """
    # Build the text first so a bad list cannot leave a truncated file.
    content = '\n'.join(hrefs)
    try:
        # The with-block flushes and closes the handle on every path.
        with open(filename, 'w') as fd:
            fd.write(content)
    except (IOError, OSError):
        print('fail to write %s' % filename)

def _writearticles(content, filename='articles.txt'):
    """Append one chunk of article text to the output file.

    Args:
        content: Text to append.
        filename: Path of the file to append to. Defaults to
            'articles.txt'.

    Returns:
        None. An I/O failure is reported with a printed message instead
        of an exception.
    """
    try:
        # Append mode keeps the articles written by earlier calls.
        with open(filename, 'a') as fd:
            fd.write(content)
    except (IOError, OSError):
        # The message used to name topics.txt, which this function never
        # touches; report the file that actually failed.
        print('fail to write %s' % filename)


def _download(url):
    """Fetch a page and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or None when the request or the decoding
        fails (the message 'urllib2 error' is printed in that case).
    """
    try:
        handle = urllib2.urlopen(url)
        try:
            raw = handle.read()
        finally:
            # Release the connection whether or not read() succeeded.
            handle.close()
        return raw.decode("gbk")
    except Exception:
        # Deliberately broad: one bad page must not stop the crawl.
        # Unlike a bare except, this lets KeyboardInterrupt and
        # SystemExit through so the crawl can still be aborted.
        print('urllib2 error')
        return None


#----------- Clean up the various tags found in a page -----------
class HTML_Tool:
    """Turn a fragment of post HTML into plain text.

    Replace_Char applies the class-level patterns in a fixed order and
    then decodes a small set of HTML character entities.
    """

    # Non-greedy: tabs, newlines, spaces, hyperlink tags and image tags
    # are removed outright.
    BgnCharToNoneRex = re.compile("(\t|\n| |<a.*?>|<img.*?>)")

    # Non-greedy: any <...> tag still present at the end is removed.
    EndCharToNoneRex = re.compile("<.*?>")

    # Non-greedy: an opening <p ...> tag starts an indented new line.
    BgnPartRex = re.compile("<p.*?>")
    # These tags become a line break.
    CharToNewLineRex = re.compile("(<br/>|</p>|<tr>|<div>|</div>)")
    # A table cell becomes a tab.
    CharToNextTabRex = re.compile("<td>")

    # HTML character entities and the characters they stand for.
    # "&amp;" is decoded last so that text such as "&amp;lt;" ends up as
    # the literal "&lt;" rather than being unescaped twice into "<".
    replaceTab = [
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&nbsp;", " "),
        ("&amp;", "&"),
    ]

    def Replace_Char(self,x):
        """Return x with its HTML markup converted to plain text.

        Args:
            x: HTML fragment as a string.

        Returns:
            The text with whitespace, links and images dropped, block
            tags turned into newlines or tabs, all other tags removed
            and the entities listed in replaceTab decoded.
        """
        x = self.BgnCharToNoneRex.sub("",x)
        x = self.BgnPartRex.sub("\n    ",x)
        x = self.CharToNewLineRex.sub("\n",x)
        x = self.CharToNextTabRex.sub("\t",x)
        x = self.EndCharToNoneRex.sub("",x)

        # Entities are decoded only after tag stripping, so a decoded
        # "<" or ">" is never mistaken for markup.
        for entity, char in self.replaceTab:
            x = x.replace(entity, char)
        return x


class TiebaSpider:
    """Crawl Baidu Tieba boards and save each thread's title and body.

    A normal run collects thread links from the board list pages, saves
    them with _writetopics, then downloads every thread and appends
    "title<TAB>body" lines with _writearticles. With review=True the
    list pages are skipped and the links saved by an earlier run are
    loaded with _readtopics instead.
    """

    # Compiled once for the class instead of on every parse call.
    # Value of an href="..." attribute inside a tag's markup.
    _HREF_PAT = re.compile(r'href=\"(.*?)\"')
    # Thread title as it appears in the page source: title:"...".
    _TITLE_PAT = re.compile(r'title\:\"(.*?)\"')
    # Post body between <cc> and </cc>. DOTALL lets a body that spans
    # several source lines match; without it such posts were skipped.
    _BODY_PAT = re.compile(r'<cc>(.*?)<\/cc>', re.S)

    def __init__(self, review=False):
        """Create a spider.

        Args:
            review: When true, start_request reuses the saved topic
                links instead of crawling the board list pages.
        """
        self.hrefs = []
        self.review = review
        self.myTool = HTML_Tool()

    def start_request(self, **kw):
        """Run the crawl.

        Keyword Args:
            tieba: List of board names. Required unless review is set.
            page: Number of list pages to read per board. Required
                unless review is set.

        Raises:
            KeyError: In normal (non-review) mode when 'tieba' or 'page'
                is missing.
        """
        if self.review:
            saved = _readtopics()
            if saved is None:
                # The saved list could not be read; nothing to crawl.
                return None
            # Blank lines would otherwise turn into bogus thread URLs.
            self.hrefs = [line for line in saved.split('\n') if line]
            return self.get_articles()

        # Only a fresh crawl needs the board names and the page count.
        return self.get_topics_list(kw['tieba'], kw['page'])

    def get_topics_list(self, tiebas, page):
        """Build the list-page URLs for every board and crawl them.

        Args:
            tiebas: Iterable of board names.
            page: Number of list pages to request per board.
        """
        urls = []
        for tieba in tiebas:
            if not isinstance(tieba, str):
                # urllib.quote needs a byte string for non-ASCII names.
                tieba = tieba.encode('utf-8')
            # Percent-encode the name so non-ASCII or reserved
            # characters cannot produce a malformed URL.
            name = urllib.quote(tieba)
            for p in range(page):
                # pn advances in steps of 50 (presumably the number of
                # threads shown per list page; confirm against the site).
                urls.append('http://tieba.baidu.com/f?kw=%s&pn=%s' % (name, 50 * p))

        # The link file is written by download_topic once links exist;
        # writing it here would only wipe the previous run's list.
        return self.download_topic(urls)

    def parse_topic(self, response):
        """Collect the thread links of one list page into self.hrefs.

        Args:
            response: HTML text of a board list page.
        """
        soup = BeautifulSoup(response)
        if soup.body is None:
            # Not a usable page (no <body>); skip it.
            return

        topics = soup.body.findAll("div", {"class" : "threadlist_text threadlist_title j_th_tit  notStarList "})
        for topic in topics:
            href = self._HREF_PAT.findall(str(topic))
            if href:
                self.hrefs.append(href[0])

    def download_topic(self, urls):
        """Download and parse each list page, save the links, then crawl them.

        Args:
            urls: Iterable of board list-page URLs.
        """
        for url in urls:
            response = _download(url)
            if response:
                self.parse_topic(response)

        _writetopics(hrefs=self.hrefs)

        return self.get_articles()

    def download_articles(self, urls):
        """Download and parse each thread page.

        Args:
            urls: Iterable of thread URLs.
        """
        for url in urls:
            response = _download(url)
            if response:
                self.parse_articles(response)

    def parse_articles(self, response):
        """Extract the title and body of one thread page and output them.

        Nothing is output when either the title or the body is missing.

        Args:
            response: HTML text of a thread page.
        """
        if not isinstance(response, str):
            # Work on UTF-8 bytes explicitly rather than relying on the
            # process-wide default encoding.
            response = response.encode('utf-8')

        title = self._TITLE_PAT.findall(response)
        body = self._BODY_PAT.findall(response)
        if not (title and body):
            return

        data = self.myTool.Replace_Char(body[0].replace("\n", ""))
        data = data.replace("\r", "")
        items = {
            'title': title[0],
            # NOTE(review): only the first line of the cleaned text is
            # kept, apparently so each record stays on one line; confirm
            # this truncation is intended.
            'body': data.split('\n')[0],
        }
        self.output(items)

    def output(self, items):
        """Print progress and append one "title<TAB>body" line to the articles file.

        Args:
            items: Dict with the keys 'title' and 'body'.
        """
        content = '%s\t%s\n' % (items['title'], items['body'])
        print('爬取 %s ' % items['title'])
        _writearticles(content)

    def get_articles(self):
        """Build a thread URL for every collected link and crawl them."""
        # see_lz=1 is passed through to the site as in the original
        # code; its meaning is not visible here.
        urls = ['http://tieba.baidu.com' + href + '?see_lz=1' for href in self.hrefs]
        self.download_articles(urls)

example.py（使用示例）

#encoding=utf-8
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
from tiebaspider import TiebaSpider

if __name__ == "__main__":
    # Boards to crawl; here the WOW board plus a second one. List
    # several names to crawl more than one board.
    boards = ['wow', '李毅']
    crawler = TiebaSpider()
    # tieba: list of board names; page: number of list pages per board.
    crawler.start_request(page=1, tieba=boards)

【上篇】海量数据面试题整理
【下篇】普元软件获一亿战略投资董事长刘亚东谈三大核心竞争优势

作者: dbtel

该日志由 dbtel 于10年前发表在综合分类下，最后更新于 2013年12月03日.
转载请注明: python百度贴吧爬虫 | 学步园 +复制链接

抱歉!评论已关闭.

学步园

python百度贴吧爬虫

作者: dbtel

书签

最新文章New

本站推荐

返回首页