如何将百度作为自己网站的搜索引擎,以百度新闻为例。
解决方案:读取页面+正则表达式
在百度新闻中搜索“糖尿病”,查看搜索结果页的源文件,可知结果的每一项以如下格式呈现(这里是两条数据):
<table cellspacing=0 cellpadding=2>
<tr>
<td class="text"><a href="http://news.163.com/10/0612/08/68VCT81V00014AED.html " mon="a=5&pn=1" target=_blank><span><b>高血压、心脑血管、<font color="#C60A00">糖尿病</font>市民见证全松食疗热</b></span></a> <font color=#6f6f6f> <nobr>网易 2010-6-12 08:21</nobr></font><br><font size=-1>我病比较多,主要是<font color="#C60A00">糖尿病</font>,现在眼吧都属于<font color="#C60A00">糖尿病</font>并发症,看东西特别模糊。你看我坐在外面看电视,就这么近那么大的字我看不清楚。自打喝了全松茶以后,最明显的就是眼睛,现在看电视一点问题都没有了,看的相当清楚。老伴看我喝着好,她也想试试,本来她失眠特别严重,...</font>
</td></tr></table><br>
<table cellspacing=0 cellpadding=2>
<tr>
<td class="text"><a href="http://news.163.com/10/0612/05/68V25R7P00014AED.html " mon="a=5&pn=2" target=_blank><span><b>“生命动力” 将病人往死里推</b></span></a> <font color=#6f6f6f> <nobr>网易 2010-6-12 05:13</nobr></font><br><font size=-1>前日,有专家称,让<font color="#C60A00">糖尿病</font>患者不忌口,是将病人往死里推! 尤先生患<font color="#C60A00">糖尿病</font>多年,半年多前,他听了一场讲座后,开始服用一种叫“生命动力”的药物,大半年花去近一万元。尤先生还按照当初讲座介绍的,吃东西不忌口。前几天他去医院检查发现,<font color="#C60A00">糖尿病</font>根本没治好,而且身体感觉更差了。...</font>
</td></tr></table><br>
自己程序读出来的结果:
img:null
imgHref:null
title:高血压、心脑血管、<font color="#C60A00">糖尿病</font>市民见证全松食疗热
from:网易 2010-6-12 08:21
href:http://news.163.com/10/0612/08/68VCT81V00014AED.html
text:<font size=-1>我病比较多,主要是<font color="#C60A00">糖尿病</font>,现在眼吧都属于<font color="#C60A00">糖尿病</font>并发症,看东西特别模糊。你看我坐在外面看电视,就这么近那么大的字我看不清楚。自打喝了全松茶以后,最明显的就是眼睛,现在看电视一点问题都没有了,看的相当清楚。老伴看我喝着好,她也想试试,本来她失眠特别严重,...</font>
otherMess:null
otherHref:null
-------------------------------------------
img:null
imgHref:null
title:“生命动力” 将病人往死里推
from:网易 2010-6-12 05:13
href:http://news.163.com/10/0612/05/68V25R7P00014AED.html
text:<font size=-1>前日,有专家称,让<font color="#C60A00">糖尿病</font>患者不忌口,是将病人往死里推! 尤先生患<font color="#C60A00">糖尿病</font>多年,半年多前,他听了一场讲座后,开始服用一种叫“生命动力”的药物,大半年花去近一万元。尤先生还按照当初讲座介绍的,吃东西不忌口。前几天他去医院检查发现,<font color="#C60A00">糖尿病</font>根本没治好,而且身体感觉更差了。...</font>
otherMess:null
otherHref:null
-------------------------------------------
代码实现:
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Value holder for one Baidu News search result parsed out of the result page.
 *
 * Fields are populated by {@code Baidu.BaiduFileMess}; any field that the
 * corresponding regex did not match stays {@code null} and prints as "null".
 */
class Mess {
    public String img;        // thumbnail image URL (from the src= attribute), if any
    public String imgHref;    // link wrapped around the thumbnail, if any
    public String title;      // headline text (may still contain <font> highlight tags)
    public String from;       // source + timestamp, e.g. "网易 2010-6-12 08:21"
    public String href;       // link to the full article
    public String text;       // summary snippet (raw HTML fragment)
    public String otherMess;  // extra trailing info block, if present
    public String otherHref;  // link inside that extra info block, if present

    /** Renders every field on its own line, followed by a dashed separator. */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder();
        out.append("img:").append(img).append('\n');
        out.append("imgHref:").append(imgHref).append('\n');
        out.append("title:").append(title).append('\n');
        out.append("from:").append(from).append('\n');
        out.append("href:").append(href).append('\n');
        out.append("text:").append(text).append('\n');
        out.append("otherMess:").append(otherMess).append('\n');
        out.append("otherHref:").append(otherHref);
        out.append("\n-------------------------------------------\n");
        return out.toString();
    }
}
}

/**
 * Scrapes a Baidu News search-result page with regular expressions and turns
 * each result &lt;table&gt; into a {@link Mess} record.
 *
 * NOTE(review): the patterns below target the legacy (circa-2010) Baidu News
 * markup shown at the top of this file; they will need updating if the page
 * layout changes.
 */
public class Baidu {
    // Patterns are compiled once and reused (Pattern is immutable and
    // thread-safe); the regex texts are unchanged from the original constants.

    /** One complete result table; the lookahead stops at the next <table>. */
    private static final Pattern TABLE_AREA =
            Pattern.compile("<table\\s+cellspacing[^>]*>(.(?!<table[^>]*>))*</table>");
    /** Thumbnail cell: group(2) = link href, group(4) = image src. */
    private static final Pattern THUMBNAIL =
            Pattern.compile("<td\\s+class\\s*\\=\\s*\"\\s*thumbnail\\s*\"\\s*>(.(?!href))*\\s+href\\=\"([^\"]+)\"(.(?!src))*\\s+src=\"([^\"]+)\"(.(?!<td))*</td>");
    /** The <td class="text"> cell holding title, source, and summary. */
    private static final Pattern TEXT_AREA =
            Pattern.compile("<td\\s+class\\s*\\=\\s*\"\\s*text\\s*\"\\s*>(.(?!<td))*</td>");
    /** First href attribute inside the text cell: group(1) = article URL. */
    private static final Pattern HREF = Pattern.compile("href\\s*=\\s*\"([^\"]+)\"");
    /** Source and timestamp: group(1) = e.g. "网易 2010-6-12 08:21". */
    private static final Pattern NOBR = Pattern.compile("<nobr>([^<]+)</nobr>");
    /** Headline between <span><b> and </b>: group(1) = title HTML. */
    private static final Pattern HIGH_LIGHT = Pattern.compile("<span><b>((.(?!/b>))*)");
    /** Summary snippet after </nobr>...<br>: group(2) = the <font size=...> HTML. */
    private static final Pattern TEXT =
            Pattern.compile("</nobr>(.(?!br>))*<br>(<font size\\=(.(?!(a\\s+)|(/td)))*)");
    /** Optional trailing block: group(1) = its href, group(2) = its HTML. */
    private static final Pattern OTHER_MESS =
            Pattern.compile("<a href\\s*=\\s*\"([^\"]+)\">(<font color=#008000>(.(?!(/td>)))*)</td>");

    /**
     * Parses a fetched Baidu News result page into a list of {@link Mess}
     * records, one per result table.
     *
     * @param sb the raw page HTML (as returned by {@link #readWebcontent})
     * @return one {@code Mess} per result; fields whose pattern did not match
     *         are left {@code null}. Never returns {@code null}.
     */
    public static List<Mess> BaiduFileMess(StringBuffer sb) {
        List<Mess> results = new ArrayList<Mess>();
        Matcher table = TABLE_AREA.matcher(sb);
        while (table.find()) {
            Mess mess = new Mess();
            String oneTable = table.group();
            Matcher thumb = THUMBNAIL.matcher(oneTable);
            if (thumb.find()) {
                mess.imgHref = thumb.group(2);
                mess.img = thumb.group(4);
            }
            Matcher cell = TEXT_AREA.matcher(oneTable);
            if (cell.find()) {
                String text = cell.group();
                Matcher m = HIGH_LIGHT.matcher(text);
                if (m.find()) mess.title = m.group(1);
                m = NOBR.matcher(text);
                if (m.find()) mess.from = m.group(1);
                m = HREF.matcher(text);
                if (m.find()) mess.href = m.group(1);
                m = TEXT.matcher(text);
                if (m.find()) mess.text = m.group(2);
                m = OTHER_MESS.matcher(text);
                // BUG FIX: the original called find() twice, so otherMess was
                // taken from a second (usually nonexistent) match and stayed
                // null. Both groups must come from the same single match.
                if (m.find()) {
                    mess.otherHref = m.group(1);
                    mess.otherMess = m.group(2);
                }
            }
            results.add(mess);
        }
        return results;
    }

    /**
     * Downloads the page at {@code urlvalue} and returns its body with line
     * breaks stripped (lines are concatenated without separators, which the
     * regexes above rely on).
     *
     * @param urlvalue absolute HTTP URL to fetch
     * @return the page content; empty on any error (the exception is printed)
     */
    public static StringBuffer readWebcontent(String urlvalue) {
        StringBuffer result = new StringBuffer();
        BufferedReader in = null;
        try {
            URL url = new URL(urlvalue);
            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            // The page is requested as GB2312 (ie=gb2312 in the query string);
            // the original relied on the platform default charset, which
            // garbles the Chinese text on UTF-8 machines.
            in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "GB2312"));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always release the stream, even when the download fails midway.
            if (in != null) {
                try {
                    in.close();
                } catch (Exception ignored) {
                    // best-effort close; nothing sensible to do here
                }
            }
        }
        return result;
    }

    /**
     * Demo entry point: fetches one page of Baidu News results for "糖尿病"
     * (diabetes) and prints the record count followed by every parsed record.
     */
    public static void main(String[] args) {
        String url = "http://news.baidu.com/ns?word=糖尿病&tn=news&from=news&ie=gb2312&bs=%CC%C7%C4%F2%B2%A1&sr=0&cl=2&rn=20&ct=1&prevct=no";
        /* Baidu paging parameters:
         * rn=20  number of records per page
         * pn=20  offset: start the rn records at this position, e.g.
         * String url="http://news.baidu.com/ns?word=糖尿病&tn=news&from=news&ie=gb2312&bs=%CC%C7%C4%F2%B2%A1&sr=0&cl=2&rn=20&ct=1&prevct=no&pn=20";
         */
        StringBuffer result = readWebcontent(url);
        List<Mess> records = BaiduFileMess(result);
        System.out.println(records.size());
        for (Mess mess : records) { // typed for-each instead of a raw Iterator
            System.out.println(mess);
        }
    }
}