Nothing to do this Saturday. I have always felt that the articles on huxiu.com are high quality, and I woke up with the sudden urge to pull them all down. Half an hour of coding later I had a small program: under JDK 6 it uses httpclient 3.1, a ThreadPoolExecutor thread pool, jsoup 1.7.1 and commons-io to crawl every article on the site with multiple threads and persist each one to disk as a text file. The speed is respectable: the whole site was done in about half an hour.
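Before the full program, here is the fetch-and-parse core in isolation, stripped of the pooling: a minimal sketch, where the class name FetchOneArticle and the hard-coded id 8000 are just illustrative:

import java.io.IOException;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class FetchOneArticle {
    public static void main(String[] args) throws IOException {
        HttpClient client = new HttpClient();
        GetMethod get = new GetMethod("http://www.huxiu.com/article/8000/1.html");
        try {
            // Execute the GET; 200 means the article page came back intact
            if (client.executeMethod(get) == HttpStatus.SC_OK) {
                // Stream the body into jsoup; the site serves UTF-8
                Document doc = Jsoup.parse(get.getResponseBodyAsStream(), "utf-8", "");
                System.out.println(doc.select("title").first().text());
            }
        } finally {
            get.releaseConnection(); // hand the connection back
        }
    }
}

The full crawler below wraps exactly this sequence in a Runnable and feeds it to a thread pool.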
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * @ClassName:   CrawlHuxiu
 * @Description: Crawls every article on huxiu.com under JDK 6 using
 *               httpclient 3.1, a ThreadPoolExecutor thread pool,
 *               jsoup 1.7.1 and commons-io, persisting each article
 *               to disk as a text file.
 * @author  Leon温陵
 * @version V1.0
 */
public class CrawlHuxiu {

    /**
     * @Title: main
     * @Description: crawl all of huxiu.com with multiple threads
     * @date 2012-12-15
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        CrawlHuxiu crawlHuxiu = new CrawlHuxiu();
        // One thread-safe HttpClient, shared by all pool tasks
        MultiThreadedHttpConnectionManager connectionManager =
                new MultiThreadedHttpConnectionManager();
        HttpClient httpClient = new HttpClient(connectionManager);
        crawlHuxiu.getHuxiu(httpClient, "http://www.huxiu.com/article/8000/1.html");
    }

    /**
     * Extracts an article's fields from the jsoup document and writes
     * them to disk with commons-io.
     *
     * @param doc the parsed article page
     * @param i   the numeric article id
     * @throws IOException
     */
    static void crawlHuxiu(Document doc, int i) throws IOException {
        List<String> yuliao = new ArrayList<String>();
        Elements title = doc.select("title"); // article title
        // Strip the site-name suffix that huxiu appends to every title
        String str = title.first().text().replace("-看点-@虎嗅网", "")
                .replace("-观点-@虎嗅网", "").replace("-读点-@虎嗅网", "");
        System.out.println("title: " + str);
        Elements userAndTime = doc.select(".author-name"); // author and post time
        Elements content = doc.select("#article_content"); // article body
        if (str.contains("提示信息 - 虎嗅网")) {
            System.out.println("article has been deleted");
            return;
        }
        String baseUrl = "http://www.huxiu.com/article/";
        yuliao.add(str);
        yuliao.add(baseUrl + i + "/1.html");
        yuliao.add(userAndTime.get(0).select(".fc1").text());
        yuliao.add(userAndTime.get(1).text());
        yuliao.add(content.get(0).text());
        File file = new File("c:\\huxiu\\" + str + ".txt");
        FileUtils.writeLines(file, yuliao);
    }

    /**
     * A pool task that fetches one article page and hands it to the parser.
     */
    static class ThreadPoolTask implements Runnable {

        HttpClient httpClient = null;
        HttpMethod getMethod = null;
        int i = 0;

        public ThreadPoolTask(HttpClient httpClient, HttpMethod getMethod, int i) {
            this.httpClient = httpClient;
            this.getMethod = getMethod;
            this.i = i;
        }

        public void run() {
            try {
                System.out.println("executing request " + getMethod.getURI());
                // Execute the GET and check the returned status code
                int status = httpClient.executeMethod(getMethod);
                System.out.println("status:" + status);
                if (HttpStatus.SC_OK == status) {
                    // Stream the response body into jsoup (the site serves UTF-8)
                    Document doc = Jsoup.parse(getMethod.getResponseBodyAsStream(),
                            "utf-8", "");
                    if (getMethod.getURI().getURI()
                            .startsWith("http://www.huxiu.com/article/")) {
                        crawlHuxiu(doc, i);
                    }
                }
            } catch (URIException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // Always return the connection to the manager
                getMethod.releaseConnection();
            }
        }
    }

    /**
     * Crawls all huxiu.com articles, walking the article id downward from
     * the id embedded in startUrl.
     *
     * @param httpClient a thread-safe client shared by all pool tasks
     * @param startUrl   the article URL carrying the highest known id
     * @throws InterruptedException
     */
    public void getHuxiu(HttpClient httpClient, String startUrl)
            throws InterruptedException {
        // Pool of 100 core / 500 max threads over a 10000-slot queue; when
        // the queue overflows, the oldest waiting task is discarded.
        ThreadPoolExecutor threadPool = new ThreadPoolExecutor(100, 500, 3,
                TimeUnit.SECONDS, new ArrayBlockingQueue<Runnable>(10000),
                new ThreadPoolExecutor.DiscardOldestPolicy());
        String baseUrl = "http://www.huxiu.com/article/";
        HttpMethod getMethod = null;
        String id = startUrl.split("/")[4]; // e.g. "8000" from .../article/8000/1.html
        Integer count = Integer.valueOf(id);
        for (int i = count; i > 100; i--) {
            // Article URLs follow the pattern <baseUrl><id>/1.html, so the
            // request URIs can be generated straight from the id counter.
            getMethod = new GetMethod(baseUrl + i + "/1.html");
            threadPool.execute(new ThreadPoolTask(httpClient, getMethod, i));
            Thread.sleep(10); // light throttle between submissions
        }
        // Poll until no worker is active any more, then shut the pool down
        while (true) {
            Thread.sleep(10);
            if (threadPool.getActiveCount() == 0) {
                threadPool.shutdown();
                break;
            }
        }
    }
}
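One caveat on the shutdown logic above: polling getActiveCount() ignores tasks still sitting in the queue, so in principle the loop can shut the pool down early. Because every task has already been submitted by the time the wait starts, the standard shutdown()/awaitTermination() idiom is a safer drop-in for the while loop. A sketch, with the 30-minute timeout chosen arbitrarily:

        // Replaces the polling loop: stop accepting new tasks, then block
        // until everything already queued has finished or the timeout expires.
        threadPool.shutdown();
        if (!threadPool.awaitTermination(30, TimeUnit.MINUTES)) {
            threadPool.shutdownNow(); // cancel whatever is still running
        }

This also removes the 10 ms busy-wait, since awaitTermination blocks until the pool actually drains.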