现在的位置: 首页 > 综合 > 正文

javax.swing.text.html 网页解析(一)

2017年12月06日 ⁄ 综合 ⁄ 共 1622字 ⁄ 字号 评论关闭

package test;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;

import javax.swing.text.AttributeSet;
import javax.swing.text.Document;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

 

/**
 * Small demo that parses an HTML page with Swing's {@link HTMLEditorKit} and
 * prints the {@code href} attribute of every anchor ({@code <a>}) tag it finds.
 */
public class javahtml {

    /** Page that is parsed when no location is given on the command line. */
    private static final String DEFAULT_URI = "http://hexun.com/kangojian/default.html";

    /**
     * Parses an HTML page and prints the {@code href} value of each anchor tag,
     * one per line. Anchors without an {@code href} are printed as {@code null}.
     *
     * @param args optional; when {@code args[0]} is present it is used as the
     *             URL or local file name to parse, otherwise a built-in
     *             default URL is used
     * @throws Exception if the page cannot be read or parsed
     */
    public static void main(String[] args)
            throws Exception {
        String uri = (args != null && args.length > 0) ? args[0] : DEFAULT_URI;

        EditorKit kit = new HTMLEditorKit();
        Document doc = kit.createDefaultDocument();

        // Ask the parser to ignore charset directives embedded in the page;
        // decoding has already been decided by the Reader we hand it.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

        // Parse the HTML; try-with-resources guarantees the stream/socket is
        // released even when parsing fails.
        try (Reader rd = getReader(uri)) {
            kit.read(rd, doc, 0);
        }

        // The HTML text is now stored in the document: walk every <a> tag.
        HTMLDocument.Iterator it = ((HTMLDocument) doc).getIterator(HTML.Tag.A);
        for (; it.isValid(); it.next()) {
            // Use the AttributeSet interface: the iterator does not promise a
            // SimpleAttributeSet, nor that the attribute value is a String.
            AttributeSet attrs = it.getAttributes();
            System.out.println(attrs.getAttribute(HTML.Attribute.HREF));
        }
    }

    /**
     * Returns a reader on the HTML data. If {@code uri} begins with
     * {@code http:} or {@code https:} (case-insensitive) it is treated as a
     * URL; otherwise it is assumed to be a local file name.
     *
     * <p>For URLs the charset announced in the response's {@code Content-Type}
     * header is used when present and supported; otherwise the platform
     * default charset is used. Local files are read with {@link FileReader}.
     * The caller is responsible for closing the returned reader.
     *
     * @param uri URL or local file name
     * @return a reader positioned at the start of the content
     * @throws IOException if the connection or the file cannot be opened
     */
    static Reader getReader(String uri)
            throws IOException {
        if (isHttpUri(uri)) {
            // Retrieve from Internet. HttpsURLConnection is a subclass of
            // HttpURLConnection, so the cast is valid for both schemes.
            HttpURLConnection conn = (HttpURLConnection) new URL(uri).openConnection();
            // getInputStream() is evaluated first, so connection failures
            // surface as IOException before the header is consulted.
            return new InputStreamReader(conn.getInputStream(), charsetOf(conn.getContentType()));
        }
        // Retrieve from file.
        return new FileReader(uri);
    }

    /** Tells whether {@code uri} starts with an http or https scheme. */
    private static boolean isHttpUri(String uri) {
        return uri.regionMatches(true, 0, "http:", 0, 5)
                || uri.regionMatches(true, 0, "https:", 0, 6);
    }

    /**
     * Extracts the charset from a Content-Type header value, falling back to
     * the platform default when it is missing, malformed or unsupported.
     */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String param = part.trim();
                if (param.regionMatches(true, 0, "charset=", 0, 8)) {
                    String name = param.substring(8).trim();
                    if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                        name = name.substring(1, name.length() - 1);
                    }
                    try {
                        return Charset.forName(name);
                    } catch (IllegalArgumentException ignored) {
                        // Illegal or unsupported charset name: use the default below.
                        break;
                    }
                }
            }
        }
        return Charset.defaultCharset();
    }
}

 

抱歉!评论已关闭.