现在的位置: 首页 > 综合 > 正文

python3.2 多线程小爬虫一只

2013年09月27日 ⁄ 综合 ⁄ 共 1687字 ⁄ 字号 评论关闭
# -*- coding:utf-8 -*-
"""
多线程抓取网页
"""

import gzip
from urllib import request as urllib2

import threading

from pyquery import PyQuery as pq

# NOTE(review): not referenced by any function visible in this file.
threads = []
# Default start page: the "python" tag listing on OSChina.
web_site_url = "http://www.oschina.net/question/tag/python"

def work(url):
    """
    Download one listing page, schedule the following page and print its
    questions.

    This is also the thread target that ``get_next_page`` starts for every
    subsequent page.

    :param url: URL of the page to fetch; any falsy value selects
        ``web_site_url``.
    """
    # The site answers "HTTP Error 403: Forbidden" to clients it takes for
    # crawlers, so the request carries a browser User-Agent header.
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    if not url:
        url = web_site_url
    req = urllib2.Request(url=url, headers=headers)
    # Close the HTTP response deterministically; every page runs in its own
    # thread, so leaked sockets would otherwise pile up.
    with urllib2.urlopen(req) as response:
        feed_data = response.read()
    feed_data = gzip_decode_content(feed_data)
    data = pq(feed_data)
    if not data:
        return
    # Kick off the next page first so it downloads while this one is parsed.
    get_next_page(data)
    data("ul li.question").each(parse_html)


def parse_html(i, element):
    """
    Print one question entry as a tab-separated line.

    Passed to PyQuery's ``each`` by ``work``.

    :param i: position of the element in the selection (unused).
    :param element: the matched ``li.question`` node.
    """
    item = pq(element)
    avatar_src = item("a.ShowUserOutline img").attr("src")
    # The same <img> also has a "title" attribute (presumably the user name);
    # it is not printed.
    title = item("div.qbody h2 a").text()
    # Keep only the text in front of the first comma of the date block.
    posted = item("div.qbody div.Date").text().split(",")[0].strip()
    print("%s\t%s\t%s" % (title, posted, avatar_src))

def get_next_page(data):
    """
    Start a worker thread for the page that follows ``data``, if any.

    :param data: parsed page (PyQuery document); falsy values are ignored.
    """
    if not data:
        return
    # The "next" link is looked up inside the second "ul.pager" element.
    page_li = data("ul.pager").eq(1).find("li.next")
    if not page_li:
        return
    page_params = page_li.find("a").attr("href")
    if not page_params:
        # No usable href: concatenating None would raise TypeError, and an
        # empty string would re-crawl the same URL forever.
        return
    next_page_url = web_site_url + page_params
    threading.Thread(target=work, args=(next_page_url, )).start()


def gzip_decode_content(doc=""):
    """
    Return ``doc`` as text, gunzipping it first when it is gzip-compressed.

    :param doc: raw response body as bytes; a ``str`` is returned unchanged.
    :return: the UTF-8 decoded text.
    :raises UnicodeDecodeError: if the payload is not valid UTF-8.
    """
    import zlib  # local import: only needed for the error type caught below

    if isinstance(doc, str):
        # Already text (this includes the "" default); nothing to decode.
        return doc

    raw = doc
    # Only attempt decompression when the gzip magic number is present
    # instead of relying on an exception for every plain response.
    if doc[:2] == b"\x1f\x8b":
        try:
            raw = gzip.decompress(doc)
        except (OSError, EOFError, zlib.error):
            # Corrupt or truncated stream: fall back to the raw bytes.
            raw = doc
    return raw.decode("utf-8")


def main():
    """Crawl starting from the default listing page."""
    # A falsy URL makes work() fall back to web_site_url; None states that
    # intent more clearly than an empty tuple.
    work(None)


if __name__ == "__main__":
    main()


# import urllib.request
#
# url = "http://www.oschina.net/"
# headers = ('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
#
# opener = urllib.request.build_opener()
# opener.addheaders = [headers]
# data = opener.open(url).read()
#
# print(data)

抱歉!评论已关闭.