现在的位置: 首页 > 综合 > 正文

使用HttpClient登录Web站点，并使用Jericho HTML Parser和htmlparser抓取HTML

2013年02月03日 ⁄ 综合 ⁄ 共 2978字 ⁄ 字号 评论关闭

httpclient http://jakarta.apache.org/commons/httpclient/

Jericho HTML Parser http://jerichohtml.sourceforge.net/

htmlparser http://sourceforge.net/projects/htmlparser/

HttpClientLogin.java

 package web;

import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import au.id.jericho.lib.html.FormField;
import au.id.jericho.lib.html.FormFields;
import au.id.jericho.lib.html.Source;

public class HttpClientLogin{

 public static void main(String args[]) throws HttpException, IOException,
   ParserException {
  HttpClient client = new HttpClient();
  client.getParams().setContentCharset("utf-8");
  client.getHostConfiguration().setHost("127.0.0.1", 80, "http");
  client.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

  GetMethod authget = new GetMethod("/admin/login.php");
  client.executeMethod(authget);
  authget.releaseConnection();

  PostMethod authpost = new PostMethod("/admin/login.php");
  NameValuePair action = new NameValuePair("action",
    "http://127.0.0.1/admin/login.php");
  NameValuePair method = new NameValuePair("method", "post");
  NameValuePair userid = new NameValuePair("TextBox1", "kefu");
  NameValuePair password = new NameValuePair("TextBox2", "kefu");

  authpost.setRequestBody(new NameValuePair[] { method, action, userid,
    password, });
  client.executeMethod(authpost);
  authpost.releaseConnection();

  String anotherPage = "/admin/index.php";
  GetMethod anotherPageGet = new GetMethod(anotherPage);
  client.executeMethod(anotherPageGet);

  Source source = new Source(anotherPageGet.getResponseBodyAsStream());

  source.fullSequentialParse();
  FormFields formFields = source.findFormFields();

  String temp = "";
  for (Iterator i = formFields.iterator(); i.hasNext();) {
   FormField formField = (FormField) i.next();
   if (formField.getName() != null
     && formField.getName().equals("temp")) {
    temp = (String) formField.getFormControl().getAttributesMap()
      .get("value");
   }
  }// 抓取 temp filed 当访问该page产生的临时值

  NameValuePair temp_ = new NameValuePair("temp", temp);

  PostMethod anotherPagePost = new PostMethod(anotherPage);// 提交
  anotherPagePost.setRequestBody(new NameValuePair[] { temp_ });// 加入该查询条件

  client.executeMethod(anotherPagePost);// 提交查询条件

  String resultHtml = anotherPagePost.getResponseBodyAsString();
  Parser parser = Parser.createParser(new String(resultHtml.getBytes(),
    "8859_1"), "8859-1");

  String filterStr = "table";
  NodeFilter filter = new TagNameFilter(filterStr);
  NodeList nodes = parser.extractAllNodesThatMatch(filter);// 抓取查询结果
  Node node = nodes.elementAt(3);// 截取talbe

  String result = new String(node.toHtml().getBytes("8859_1"));
 }

}

 

抱歉!评论已关闭.