推荐:我的项目地址为 https://github.com/youyudehexie/BaiduTiebaSpider
__init__.py
#encoding=utf-8 import sys reload(sys) sys.setdefaultencoding( "utf-8" ) import urllib2 import urllib import re from BeautifulSoup import BeautifulSoup def _readtopics(filename='topics.txt'): try: fd = open(filename, 'r') content = fd.read() return content except: print 'fail to read topics.txt' def _writetopics(hrefs, filename='topics.txt'): try: content = '\n'.join(hrefs) fd = open(filename, 'w') fd.write(content) except: print 'fail to write topics.txt' def _writearticles(content, filename='articles.txt'): try: fd = open(filename, 'a') fd.write(content) except: print 'fail to write topics.txt' def _download(url): try: response = urllib2.urlopen(url).read().decode("gbk") except: print 'urllib2 error' return return response #----------- 处理页面上的各种标签 ----------- class HTML_Tool: # 用非 贪婪模式 匹配 \t 或者 \n 或者 空格 或者 超链接 或者 图片 BgnCharToNoneRex = re.compile("(\t|\n| |<a.*?>|<img.*?>)") # 用非 贪婪模式 匹配 任意<>标签 EndCharToNoneRex = re.compile("<.*?>") # 用非 贪婪模式 匹配 任意<p>标签 BgnPartRex = re.compile("<p.*?>") CharToNewLineRex = re.compile("(<br/>|</p>|<tr>|<div>|</div>)") CharToNextTabRex = re.compile("<td>") # 将一些html的符号实体转变为原始符号 replaceTab = [("<","<"),(">",">"),("&","&"),("&","\""),(" "," ")] def Replace_Char(self,x): x = self.BgnCharToNoneRex.sub("",x) x = self.BgnPartRex.sub("\n ",x) x = self.CharToNewLineRex.sub("\n",x) x = self.CharToNextTabRex.sub("\t",x) x = self.EndCharToNoneRex.sub("",x) for t in self.replaceTab: x = x.replace(t[0],t[1]) return x class TiebaSpider: def __init__(self, review = False): self.hrefs = [] self.review = review self.myTool = HTML_Tool() def start_request(self, **kw): tiebas = kw['tieba'] page = kw['page'] if not self.review: return self.get_topics_list(tiebas, page) else: self.hrefs = _readtopics().split('\n') return self.get_articles() def get_topics_list(self, tiebas, page): urls = [] for tieba in tiebas: for p in range(page): pn = 50 * p url_template = 'http://tieba.baidu.com/f?kw=%s&pn=%s' % (tieba, pn) urls.append(url_template) 
_writetopics(hrefs=self.hrefs) return self.download_topic(urls) def parse_topic(self, response): hrefPat = re.compile(r'href=\"(.*?)\"') titlePat = re.compile(r'title=\"(.*?)\"') soup = BeautifulSoup(response) topics = soup.body.findAll("div", {"class" : "threadlist_text threadlist_title j_th_tit notStarList "}) for topic in topics: href = hrefPat.findall(str(topic)) if href: self.hrefs.append(href[0]) def download_topic(self, urls): for url in urls: response = _download(url) if response: self.parse_topic(response) else: continue _writetopics(hrefs=self.hrefs) return self.get_articles() def download_articles(self, urls): for url in urls: response = _download(url) if response: self.parse_articles(response) else: continue def parse_articles(self, response): titlePat = re.compile(r'title\:\"(.*?)\"') title = titlePat.findall(str(response)) bodyPat = re.compile(r'<cc>(.*?)<\/cc>') body = bodyPat.findall(str(response)) items = {} if title and body: data = self.myTool.Replace_Char(body[0].replace("\n","").encode('utf-8')) items['title'] = title[0] data = data.replace("\r", "") items['body'] = data.split('\n')[0] self.output(items) def output(self, items): content = '%s\t%s\n' % (items['title'], items['body']) print '爬取 %s ' % items['title'] _writearticles(content) def get_articles(self): hrefs = self.hrefs urls = [] for href in hrefs: url_template = 'http://tieba.baidu.com' + href + '?see_lz=1' urls.append(url_template) self.download_articles(urls)
example.py
#encoding=utf-8
# Example driver for TiebaSpider.
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

from tiebaspider import TiebaSpider

if __name__ == "__main__":
    # Forums to crawl; add more names to the list to crawl several tiebas.
    forums = ['wow', '李毅']
    crawler = TiebaSpider()
    # tieba: list of forum names; page: number of listing pages per forum.
    crawler.start_request(tieba=forums, page=1)