1. 从网页上抓取我们感兴趣的内容.
2. 得到网页的源代码, 通过正则表达式找到我们需要的信息, 并保存起来.
3. 代码实现
URL url = new URL(网页地址); HttpURLConnection connection = (HttpURLConnection) url.openConnection(); //设置代理,有些网页不允许Java访问. connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)"); connection.setDoOutput(true); //得到网页编码 String charset = getCharset(connection.getContentType()); //初始化输出流 BufferedReader br = new BufferedReader(new InputStreamReader( connection.getInputStream(), charset)); while ((str = br.readLine()) != null) { //邮箱正则表达式 Pattern pattern = Pattern .compile("[a-zA-Z0-9_.-]+@[a-zA-Z0-9-]+\\.[a-zA-Z]{2,4}"); Matcher matcher = pattern.matcher(str); //如果找到则输出 while (matcher.find()) { String reString = matcher.group(); System.out.println(reString); } }