# -*- coding: utf-8 -*-
"""Scraper for the free-app listing pages of gao7.com."""

from urllib.parse import urljoin

from pyquery import PyQuery as pq

import CommonUtils

# Visible text of the pager's "next page" anchor; compared verbatim against
# the link text found on the page.
_NEXT_PAGE_TEXT = '下一页'


def _parseAppItem(item):
    """Collect the fields of one ``.app-list li`` element into a dict."""
    paragraphs = item('p')
    price_box = item('div.app-list-r')
    new_price = price_box('p span').text()
    return {
        'img': item('a img').attr('src'),
        'title': item('div.app-list-main')('h3').text(),
        'category': paragraphs.eq(0)('span').text(),
        'updtime': paragraphs.eq(1)('span.upd-time').text(),
        'comment': price_box('a').text(),
        'oldprice': price_box('p del').text(),
        'newprice': new_price,
        # Misspelled key from the first version of this scraper; kept so that
        # existing consumers reading 'newproce' continue to work.
        'newproce': new_price,
        'desc': paragraphs.eq(2).text(),
    }


def parseGao7(url='http://www.gao7.com/free/1-0-0-3-0-0-1'):
    """Scrape one listing page and return its app entries.

    Args:
        url: Address of the listing page. It is handed straight to PyQuery,
            which is expected to download and parse it.

    Returns:
        A list with one dict per ``.app-list li`` element on the page. Each
        dict has the keys ``img``, ``title``, ``category``, ``updtime``,
        ``comment``, ``oldprice``, ``newprice`` (also available under the
        legacy spelling ``newproce``) and ``desc``. The list is empty when
        the parsed page is empty or contains no such elements.
    """
    page = pq(url)
    if not page:
        return []
    return [_parseAppItem(item) for item in page('.app-list li').items()]


def getNextPageUrl(url, domain='http://www.gao7.com'):
    """Return the absolute URL of the page that follows ``url``.

    Args:
        url: Address of the current listing page.
        domain: Base URL that the pager's href is resolved against.

    Returns:
        The next page's URL, or ``None`` when the pager has no next-page
        link, the link has no href, or the page could not be fetched or
        parsed (in which case the error is printed).
    """
    try:
        content = pq(CommonUtils.getUrlContent(url))
        hrefs = [
            anchor.attr('href')
            for anchor in content('div.ui-page a').items()
            if anchor.text() == _NEXT_PAGE_TEXT
        ]
    except Exception as err:
        # Deliberately best-effort: a failed fetch or parse ends the crawl
        # instead of aborting the whole script.
        print(err)
        return None

    # Reaching the last page is a normal outcome, not an error.
    if not hrefs:
        return None

    # When several anchors match, the last one wins.
    href = hrefs[-1]
    if not href:
        return None

    # urljoin copes with root-relative, relative and already-absolute hrefs,
    # which plain string concatenation does not.
    return urljoin(domain, href)


if __name__ == '__main__':
    result = []
    seen = set()
    url = 'http://www.gao7.com/free/1-0-0-3-0-0-1'
    # Stop when there is no next page, or when the pager points back to a
    # page that was already crawled (which would otherwise loop forever).
    while url and url not in seen:
        seen.add(url)
        print(url)
        # extend, not append: keep one flat list of items across all pages.
        result.extend(parseGao7(url))
        url = getNextPageUrl(url)
    print(result)
# CommonUtils.py
# -*- coding: utf-8 -*-
"""Small general-purpose helpers: console printing and URL fetching."""

import gzip
import urllib.request
import zlib


def printSplitLine(dchar='*', dnum=30):
    """Print a separator line.

    Args:
        dchar: String to repeat.
        dnum: Number of repetitions.
    """
    print(dchar * dnum)


def printDict(dict):
    """Print every entry of a mapping as ``key=<key>; value=<value>``.

    Nothing is printed when the argument is empty or ``None``.

    Args:
        dict: Mapping to print. The name shadows the builtin but is kept so
            that callers passing it by keyword keep working.
    """
    if not dict:
        return
    for key, value in dict.items():
        print('key=%s; value=%s' % (key, value))


def printList(list):
    """Print every element of an iterable on its own line.

    Nothing is printed when the argument is empty or ``None``.

    Args:
        list: Iterable to print. The name shadows the builtin but is kept so
            that callers passing it by keyword keep working.
    """
    if not list:
        return
    for value in list:
        print(value)


def getUrlContent(url):
    """Fetch ``url`` and return its body decoded as UTF-8 text.

    Some servers send a gzip-compressed body, so decompression is attempted
    first and the raw bytes are used when the body is not valid gzip data.

    Args:
        url: Address to download.

    Returns:
        The response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the (decompressed) body is not valid UTF-8.
        Exception: Any error raised by ``urllib.request.urlopen`` propagates
            unchanged.
    """
    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(url) as response:
        raw = response.read()

    try:
        body = gzip.decompress(raw)
    except (OSError, EOFError, zlib.error):
        # Not gzip (or a damaged stream): fall back to the bytes as received.
        body = raw
    return body.decode("utf-8")


if __name__ == '__main__':
    # Manual smoke checks for the print helpers; uncomment as needed.
    # printSplitLine()
    # printDict({'a': 'aaa', 'b': 'bbb', 'c': 'ccc'})
    # printList(['1', '2', '3'])
    print(getUrlContent('http://www.app111.com/free/'))