#!/usr/bin/env python
#-*- coding: utf-8 -*-
import requests
import os,sys
import time
import MySQLdb
import re
# Running count of icon files handled by main(); main() increments and
# prints it for every icon it saves.
num = 0

# Detail-page links (href values) gathered by main() from the listing
# pages; main() keeps this list free of duplicates.
dataresult = list()
# --- crawl configuration ---------------------------------------------------
_BASE_URL = 'http://www.anzhi.com'
_FIRST_PAGE = 1         # listing pages are list_1_<k>_hot.html
_LAST_PAGE = 7772       # inclusive upper bound of the listing pages crawled
_HTTP_TIMEOUT = 30      # seconds before a stalled request is abandoned
_RETRY_DELAY = 30       # seconds to pause after a failed request
_ICON_DIR = 'anzhi'     # directory that receives the downloaded icon files
_COUNT_FILE = 'anzhi.txt'

# --- patterns, compiled once instead of once per page ----------------------
# NOTE(review): these are plain (byte) string patterns matched against
# Response.content, which assumes Python 2 str semantics.
_LINK_RE = re.compile('<span class="app_name"><a href="(.+?)">')
_NAME_RE = re.compile(r'<div class="detail_line">[\s\S]*?<h3>(.+?)</h3>')
_FIELD_RES = (
    re.compile('<span class="app_detail_version">(.+?)</span>'),  # version
    re.compile('开发者:(.+?)</span>'),                            # developer
    re.compile('发布时间:(.+?)</li>'),                            # publish time
    re.compile('文件大小:(.+?)</span></li>'),                     # file size
    re.compile('系统支持:(.+?)</li>'),                            # supported firmware
    re.compile('所属类别:(.+?)</li>'),                            # category
)
_INTRO_RE = re.compile(r'<div class="app_detail_infor">([\s\S]*?)</div>')
_ICON_RE = re.compile('appicon=(.+?)>安装到手机')

# The original statement named seven columns ("classifyintroduction" lacked a
# comma) but supplied eight values, so every insert was rejected.
# NOTE(review): assumes the table has separate `classify` and `introduction`
# columns -- confirm against the actual schema.
_INSERT_SQL = ("insert into anzhi(name,version,developer,pubtime,filesize,"
               "support,classify,introduction) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s)")


def _fetch(url):
    """Return the response body of *url*, or None when the request fails.

    A failure is printed and followed by a pause of _RETRY_DELAY seconds so
    the caller can simply skip the item instead of reusing stale data.
    """
    try:
        return requests.get(url, timeout=_HTTP_TIMEOUT).content
    except requests.RequestException as e:
        print(e)
        time.sleep(_RETRY_DELAY)
        return None


def _first(pattern, text):
    """Return the first captured group of *pattern* in *text*, or ''."""
    match = pattern.search(text)
    return match.group(1) if match else ''


def _store_rows(conn, cursor, name, fields, intros):
    """Insert one row per introduction block; report failures and go on."""
    for intro in intros:
        row = (name,) + tuple(fields) + (intro.replace('</p> <br />', ' '),)
        try:
            cursor.execute(_INSERT_SQL, row)
            conn.commit()
        except MySQLdb.Error as e:
            # Best effort, as before, but no longer silent.
            print(e)
            conn.rollback()


def _save_icons(page, file_name, count):
    """Download every icon referenced by *page*; return the updated count."""
    target = os.path.join(_ICON_DIR, file_name)
    for quoted in _ICON_RE.findall(page):
        print(quoted)
        # The captured value is wrapped in one delimiter character per side.
        icon = _fetch(quoted[1:-1])
        if icon is None:
            continue
        try:
            # Binary mode: the payload is raw image bytes.
            with open(target, 'wb') as out:
                out.write(icon)
        except (IOError, OSError) as e:
            print(e)
            continue
        count += 1
        print(count)
    return count


def main(first_page=_FIRST_PAGE, last_page=_LAST_PAGE):
    """Crawl the anzhi.com hot list, store app details and save app icons.

    Steps:
      1. Connect to the local MySQL database; exit with status 1 on failure.
      2. Fetch listing pages ``first_page``..``last_page`` (inclusive) and
         collect the unique detail-page links into the global ``dataresult``.
      3. Append the number of links to ``anzhi.txt``.
      4. For each link, parse the detail page, insert one row per
         introduction block, and download the icon(s) into ``anzhi/``.

    Pages that cannot be fetched, or that contain no app name, are skipped.
    Fields other than the name that are missing are stored as ''.

    Args:
        first_page: first listing page number to crawl (default 1).
        last_page: last listing page number to crawl, inclusive
            (default 7772).

    Side effects: updates the module globals ``dataresult`` and ``num``.
    """
    global dataresult, num

    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration or the environment.
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                               db='addressbookdb', charset="utf8")
        conn.query("set names utf8")
    except MySQLdb.Error as e:
        print(e)
        sys.exit(1)

    cursor = conn.cursor()
    try:
        # Phase 1: gather unique detail links from the listing pages.
        links = set(dataresult)
        for page_no in range(first_page, last_page + 1):
            url = "http://www.anzhi.com/list_1_" + str(page_no) + "_hot.html"
            print(url)
            listing = _fetch(url)
            if listing is None:
                continue
            links.update(_LINK_RE.findall(listing))
            print(len(links))
        dataresult = list(links)

        with open(_COUNT_FILE, "a") as counter:
            counter.write(str(len(dataresult)))
        print(len(dataresult))

        if not os.path.isdir(_ICON_DIR):
            os.makedirs(_ICON_DIR)

        # Phase 2: scrape every detail page.
        for link in dataresult:
            print(link)
            url = _BASE_URL + link
            print(url)
            page = _fetch(url)
            if page is None:
                continue

            name = _first(_NAME_RE, page)
            if not name:
                # Indexing the empty match list here used to abort the whole
                # crawl with IndexError.
                print('no app name found, skipping')
                continue
            print(name)

            fields = [_first(pattern, page) for pattern in _FIELD_RES]
            for value in fields:
                print(value)

            intros = _INTRO_RE.findall(page)
            for intro in intros:
                print(intro.replace('<br />', ' '))

            _store_rows(conn, cursor, name, fields, intros)
            # '/' would be read as a path separator in the icon file name.
            num = _save_icons(page, name.replace('/', ''), num)
    finally:
        # Release the database resources even if the crawl aborts.
        cursor.close()
        conn.close()
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()
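# Sample failure from an earlier run: the detail page below produced no match
# for the app-name pattern, so indexing the empty result list raised
# IndexError.
#
#   http://www.anzhi.com/soft_199360.html
#   Traceback (most recent call last):
#     File "anzhi84.py", line 111, in <module>
#       main()
#     File "anzhi84.py", line 55, in main
#       print data0[0]
#   IndexError: list index out of range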