首先读取网页头部信息，读到字符编码声明就停止；设定字符编码后，再重新打开网页并读取其内容。
package test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Locale;

/**
 * Downloads a web page and prints it to standard output, decoding it with the
 * character encoding that the page declares in its own text.
 *
 * <p>The page is read twice. The first pass scans line by line for the first
 * line mentioning {@code charset} and looks for one of the names in
 * {@link #coding} on that line. The second pass reopens the URL and prints the
 * whole page decoded with the encoding that was found, or with
 * {@link #FALLBACK_CODING} when none was found.
 */
public class Main {

    /** Encoding names (lower case) searched for on the line that mentions {@code charset}. */
    static String[] coding = {"utf-8", "gb2312", "gbk", "unicode"};

    /** Encoding used when the page declares no charset, or one that is not listed in {@link #coding}. */
    private static final String FALLBACK_CODING = "utf-8";

    /**
     * Fetches the hard-coded URL, prints its Content-Type header, detects the page
     * encoding and prints the page line by line.
     *
     * @param args ignored
     * @throws IOException if the page cannot be fetched or read, or the detected
     *                     encoding name is not supported by this JVM
     */
    public static void main(String args[]) throws IOException {
        String urlStr = "http://www.sina.com.cn";
        URL url = new URL(urlStr);
        URLConnection connection = url.openConnection();
        String ss = connection.getContentType();
        System.out.println(ss);

        String pageCoding = sniffPageCoding(connection);
        if (pageCoding == null) {
            // Still print the page instead of reading from a closed reader
            // (unknown charset) or printing nothing at all (no charset line).
            pageCoding = FALLBACK_CODING;
        } else {
            System.out.println(pageCoding);
        }

        // Declaring the raw stream as its own resource makes sure it is closed
        // even if the InputStreamReader constructor rejects the encoding name.
        try (InputStream in = url.openStream();
             BufferedReader br = new BufferedReader(new InputStreamReader(in, pageCoding))) {
            System.out.println("build connection2");
            String s;
            while ((s = br.readLine()) != null) {
                System.out.println(s);
            }
        }
    }

    /**
     * Scans the page for the first line mentioning {@code charset} and returns the
     * known encoding named on it. Reading stops at that line.
     *
     * @param connection the connection whose body is scanned; its stream is closed on return
     * @return the matching entry of {@link #coding}, or {@code null} if no line
     *         mentions {@code charset} or that line names no known encoding
     * @throws IOException if the body cannot be read
     */
    private static String sniffPageCoding(URLConnection connection) throws IOException {
        // ISO-8859-1 maps every byte to a character, so the ASCII tokens we search
        // for are found regardless of the platform default charset.
        try (InputStream in = connection.getInputStream();
             BufferedReader br = new BufferedReader(
                     new InputStreamReader(in, StandardCharsets.ISO_8859_1))) {
            System.out.println("build connection");
            String s;
            while ((s = br.readLine()) != null) {
                // Compare in lower case so "UTF-8" or "CHARSET=GBK" are recognised too.
                String lower = s.toLowerCase(Locale.ROOT);
                if (!lower.contains("charset")) {
                    continue;
                }
                String found = matchCoding(lower);
                if (found == null) {
                    System.out.println("error" + s);
                }
                return found;
            }
            return null;
        }
    }

    /**
     * Returns the first entry of {@link #coding} contained in the given line.
     *
     * @param lowerCaseLine a line of the page, already converted to lower case
     * @return the matching encoding name, or {@code null} if none is contained
     */
    private static String matchCoding(String lowerCaseLine) {
        for (int i = 0; i < coding.length; i++) {
            if (lowerCaseLine.contains(coding[i])) {
                return coding[i];
            }
        }
        return null;
    }
}