【可搜索范围】
1. 正常网页搜索
数据:URL、标题,描述
2. 百科(百度,360)
数据:URL、标题,描述,作者,发布日期
3. 微博
数据:URL、标题,描述,作者,发布日期,微博类型(新浪、腾讯、163、搜狐),评论数,转发数
4. 新闻
数据:URL、标题,描述,发布日期,发布网站
5. 论坛
数据:URL、标题,描述,发布日期,发布网站
6. 博客
数据:URL、标题,描述,发布日期,发布网站
【原理】
1. 获取各大搜索引擎的 URL 规则及分页规则。
2. 结构化解析,参考:http://blog.csdn.net/a286352250/article/details/14520643
各大搜索引擎【URL规则配置,分页规则】最新更新:2013-11-08
/**
 * Describes how to build a paged search request against one search engine.
 *
 * <p>A configuration holds a base URL (ending in the engine's query parameter),
 * the name of the paging parameter, the search query and the current page index.
 * {@link #generateRequestUrl()} concatenates them as
 * {@code url + encode(query) + pageParam + pageIndex}, and {@link #addPageIndex()}
 * advances the page index by the step size used for the configured engine.
 *
 * <p>Instances are mutable and are normally obtained from one of the static
 * {@code generate...Configuration()} factories, after which the caller must set
 * the query via {@link #setQuery(String)}.
 *
 * <p>Note: some base URLs already contain the paging parameter (for example
 * {@code first=1} or {@code pn=0}); {@link #generateRequestUrl()} appends the
 * paging parameter again at the end of the URL.
 */
public class SeoConfiguration {

    // ------------------- engine names ---------------------------------------------------------------

    public final static String NAME_SOSO = "soso";
    public final static String NAME_BAIDU = "baidu";
    public final static String NAME_GOOGLE = "google";
    public final static String NAME_BING_WEB = "bingweb";
    public final static String NAME_SOGOU = "sogou";
    public final static String NAME_YOUDAO = "youdao";
    public final static String NAME_360 = "360";
    public final static String NAME_BAIDU_WEIBO = "baidu_weibo";
    public final static String NAME_BAIDU_BBS = "baidu_bbs";
    public final static String NAME_BAIDU_BLOG = "baidu_blog";
    public final static String NAME_BAIDU_NEWS = "baidu_news";
    public final static String NAME_360_NEWS = "360_news";
    public final static String NAME_BAIDU_BAIKE = "baidu_baike";
    public final static String NAME_360_BAIKE = "360_baike";
    public final static String NAME_GOOGLE_BLOG = "google_blog";
    public final static String NAME_BING_YINGXIANG = "bing_yingxiang";

    // ------------------- base URLs ------------------------------------------------------------------
    // Each comment lists: paging parameter, first page index, increment per page
    // (as used by the factories and addPageIndex() below).

    // pn, starts at 0, +10
    private final static String BAIDU_URL =
            "http://www.baidu.com/s?ie=utf-8&usm=6&rsv_page=1&wd=";
    // start, starts at 0, +10
    private final static String GOOGLE_URL =
            "http://ajax.googleapis.com/ajax/services/search/web?v=2.0&rsz=large&q=";
    // pg, starts at 1, +1
    private final static String SOSO_URL =
            "http://www.soso.com/q?sc=web&ch=w.uf&num=10&w=";
    // first, starts at 1, +10
    private final static String BING_WEB_URL =
            "http://cn.bing.com/search?go=&qs=bs&first=1&FORM=PORE&q=";
    // page, starts at 1, +1
    private final static String SOGOU_URL =
            "http://www.sogou.com/web?query=";
    // start, starts at 1, +10
    // The parameter after keyfrom is "timesort"; it must be preceded by a literal '&'.
    private final static String YOUDAO_URL =
            "http://www.youdao.com/search?ue=utf8&keyfrom=web.nextPage&timesort=0&q=";
    // pn, starts at 1, +1
    private final static String QIHU360_URL =
            "http://www.so.com/s?j=0&q=";
    // pn, starts at 0, +20
    private final static String BAIDU_WEIBO_URL =
            "http://www.baidu.com/s?cl=2&tn=baiduwb&rn=20&ie=utf-8&rtt=2&wd=";
    // pn, starts at 0, +10
    private final static String BAIDU_BBS_URL =
            "http://www.baidu.com/s?pbs=1&tn=baidurt&bsst=1&ie=utf-8&rtt=1&wd=";
    // pn, starts at 0, +10
    private final static String BAIDU_BLOG_URL =
            "http://www.baidu.com/s?tn=baidurt&rtt=1&pbl=1&pbs=0&bsst=1&ie=utf-8&wd=";
    // pn, starts at 0, +20
    private final static String BAIDU_NEWS_URL =
            "http://news.baidu.com/ns?bt=0&et=0&si=&rn=20&tn=news&ie=utf-8&ct=1&cl=2&word=";
    // pn, starts at 1, +1
    private final static String QIHU360_NEWS_URL =
            "http://news.so.com/ns?tn=news&rank=rank&q=";
    // pn, starts at 0, +20
    // NOTE(review): this URL carries rn=10 while the page step is 20 - confirm the intended page size.
    private final static String BAIDU_BAIKE_URL =
            "http://baike.baidu.com/search?type=0&pn=0&rn=10&submit=search&word=";
    // p, starts at 1, +1
    private final static String QIHU360_BAIKE_URL =
            "http://baike.so.com/search/?word=";
    // start, starts at 0, +10
    private final static String GOOGLE_BLOG_URL =
            "http://ajax.googleapis.com/ajax/services/search/blogs?v=2.0&rsz=large&q=";
    // first, starts at 1, +10
    private final static String BING_YINGXIANG_URL =
            "http://cn.bing.com/yingxiangli/search?qs=n&form=BSCTAB&scope=q&sc=0-0&sp=-1&sk=&q=";

    // ------------------- state ----------------------------------------------------------------------

    private Integer id;
    /** Engine name; one of the {@code NAME_*} constants for the built-in engines. */
    private String name;
    /** Base URL, ending with the engine's query parameter (e.g. {@code ...&wd=}). */
    private String url;
    /** Paging parameter including separators, e.g. {@code "&pn="}. */
    private String pageParam;
    /** Raw (un-encoded) search query. */
    private String query;
    /** Current page index / offset appended after {@link #pageParam}. */
    private Integer pageIndex;

    // ------------------- constructors ---------------------------------------------------------------

    public SeoConfiguration() {
    }

    public SeoConfiguration(String name, String url, String pageParam, String query, Integer pageIndex) {
        this.name = name;
        this.url = url;
        this.pageParam = pageParam;
        this.query = query;
        this.pageIndex = pageIndex;
    }

    public SeoConfiguration(Integer id, String name, String url, String pageParam, String query,
            Integer pageIndex) {
        this(name, url, pageParam, query, pageIndex);
        this.id = id;
    }

    // ------------------- factories ------------------------------------------------------------------
    // Every factory leaves the query null; callers set it with setQuery(...) before building a URL.

    public static SeoConfiguration generateBaiduConfiguration() {
        return new SeoConfiguration(NAME_BAIDU, BAIDU_URL, "&pn=", null, 0);
    }

    public static SeoConfiguration generateGoogleConfiguration() {
        return new SeoConfiguration(NAME_GOOGLE, GOOGLE_URL, "&start=", null, 0);
    }

    public static SeoConfiguration generateSosoConfiguration() {
        return new SeoConfiguration(NAME_SOSO, SOSO_URL, "&pg=", null, 1);
    }

    public static SeoConfiguration generateBingWebConfiguration() {
        return new SeoConfiguration(NAME_BING_WEB, BING_WEB_URL, "&first=", null, 1);
    }

    public static SeoConfiguration generateSogouConfiguration() {
        return new SeoConfiguration(NAME_SOGOU, SOGOU_URL, "&page=", null, 1);
    }

    public static SeoConfiguration generateYoudaoConfiguration() {
        return new SeoConfiguration(NAME_YOUDAO, YOUDAO_URL, "&start=", null, 1);
    }

    public static SeoConfiguration generate360Configuration() {
        return new SeoConfiguration(NAME_360, QIHU360_URL, "&pn=", null, 1);
    }

    public static SeoConfiguration generateBaiduWeiboConfiguration() {
        return new SeoConfiguration(NAME_BAIDU_WEIBO, BAIDU_WEIBO_URL, "&pn=", null, 0);
    }

    public static SeoConfiguration generateBaiduBBSConfiguration() {
        return new SeoConfiguration(NAME_BAIDU_BBS, BAIDU_BBS_URL, "&pn=", null, 0);
    }

    public static SeoConfiguration generateBaiduBlogConfiguration() {
        return new SeoConfiguration(NAME_BAIDU_BLOG, BAIDU_BLOG_URL, "&pn=", null, 0);
    }

    public static SeoConfiguration generateBaiduNewsConfiguration() {
        return new SeoConfiguration(NAME_BAIDU_NEWS, BAIDU_NEWS_URL, "&pn=", null, 0);
    }

    public static SeoConfiguration generate360NewsConfiguration() {
        return new SeoConfiguration(NAME_360_NEWS, QIHU360_NEWS_URL, "&pn=", null, 1);
    }

    public static SeoConfiguration generateBaiduBaikeConfiguration() {
        return new SeoConfiguration(NAME_BAIDU_BAIKE, BAIDU_BAIKE_URL, "&pn=", null, 0);
    }

    public static SeoConfiguration generate360BaikeConfiguration() {
        return new SeoConfiguration(NAME_360_BAIKE, QIHU360_BAIKE_URL, "&p=", null, 1);
    }

    /** Despite its name, this returns the configuration for {@link #NAME_GOOGLE_BLOG}. */
    public static SeoConfiguration generateGoogleBbsConfiguration() {
        return new SeoConfiguration(NAME_GOOGLE_BLOG, GOOGLE_BLOG_URL, "&start=", null, 0);
    }

    public static SeoConfiguration generateBingYingXiangConfiguration() {
        return new SeoConfiguration(NAME_BING_YINGXIANG, BING_YINGXIANG_URL, "&first=", null, 1);
    }

    // ------------------- paging and URL building ----------------------------------------------------

    /**
     * Advances {@code pageIndex} to the next page using the step size of the
     * configured engine. Does nothing when {@code name} is not one of the
     * {@code NAME_*} constants.
     */
    public void addPageIndex() {
        int step = pageStepFor(name);
        if (step > 0) {
            this.pageIndex += step;
        }
    }

    /**
     * Returns how much the page index grows per page for the given engine name,
     * or 0 when the name is not a known engine.
     */
    private static int pageStepFor(String engineName) {
        if (NAME_SOSO.equals(engineName)
                || NAME_SOGOU.equals(engineName)
                || NAME_360.equals(engineName)
                || NAME_360_NEWS.equals(engineName)
                || NAME_360_BAIKE.equals(engineName)) {
            return 1;
        }
        if (NAME_BAIDU_WEIBO.equals(engineName)
                || NAME_BAIDU_NEWS.equals(engineName)
                || NAME_BAIDU_BAIKE.equals(engineName)) {
            return 20;
        }
        if (NAME_BAIDU.equals(engineName)
                || NAME_GOOGLE.equals(engineName)
                || NAME_BING_WEB.equals(engineName)
                || NAME_YOUDAO.equals(engineName)
                || NAME_GOOGLE_BLOG.equals(engineName)
                || NAME_BAIDU_BLOG.equals(engineName)
                || NAME_BAIDU_BBS.equals(engineName)
                || NAME_BING_YINGXIANG.equals(engineName)) {
            return 10;
        }
        return 0;
    }

    /**
     * Builds the request URL for the current page, encoding the query as UTF-8.
     *
     * @return {@code url + encodedQuery + pageParam + pageIndex}
     * @throws NullPointerException if no query has been set
     */
    public String generateRequestUrl() {
        return url + generateURLEncoder() + pageParam + pageIndex;
    }

    /**
     * Builds the request URL for the current page, encoding the query with the given charset.
     *
     * @param enc name of the charset used to URL-encode the query
     * @return {@code url + encodedQuery + pageParam + pageIndex}
     * @throws NullPointerException if no query has been set
     * @throws RuntimeException wrapping {@link UnsupportedEncodingException} if {@code enc} is not supported
     */
    public String generateRequestUrl(String enc) {
        return url + generateURLEncoder(enc) + pageParam + pageIndex;
    }

    private String generateURLEncoder() {
        return generateURLEncoder("utf-8");
    }

    /** URL-encodes the query with the given charset name. */
    private String generateURLEncoder(String enc) {
        if (query == null) {
            // The factories create configurations without a query; fail with a clear message
            // instead of an anonymous NullPointerException from inside URLEncoder.
            throw new NullPointerException("query must be set before generating a request URL");
        }
        try {
            return URLEncoder.encode(query, enc);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    // ------------------- getter and setter ----------------------------------------------------------

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getQuery() {
        return query;
    }

    public void setQuery(String query) {
        this.query = query;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getPageParam() {
        return pageParam;
    }

    public void setPageParam(String pageParam) {
        this.pageParam = pageParam;
    }

    public Integer getPageIndex() {
        return pageIndex;
    }

    public void setPageIndex(Integer pageIndex) {
        this.pageIndex = pageIndex;
    }
}
【结构化对象】
/**
 * One structured search hit extracted from a result page.
 *
 * <p>The same class is used for every result category (web page, news, microblog,
 * forum, influence); each category has its own constructor that fills only the
 * fields relevant to it, leaving the others {@code null}.
 */
public class SeoResult {

    private Integer id;
    private String url;
    private String title;
    private String description;
    /** Source site / where the item was republished from. */
    private String gdSource;
    /** Publish time. */
    private Integer pubtime;
    /** Author. */
    private String author;
    /**
     * Microblog provider:
     * Sina = 1,
     * QQ = 2,
     * Sohu = 3,
     * 163 = 4.
     */
    private Integer weiboType;
    /** Total number of comments. */
    private Long pingTotal;
    /** Total number of reposts. */
    private Long transTotal;
    /** Influence. */
    private String influence;

    public SeoResult() {
    }

    /**
     * Web page result.
     */
    public SeoResult(String url, String title, String description) {
        this.url = url;
        this.title = title;
        this.description = description;
    }

    /**
     * News result.
     *
     * @param pubtime publish time; may be {@code null} when unknown. The value is
     *                narrowed to {@code int} because the field is an {@code Integer}.
     */
    public SeoResult(String url, String title, String description, String gdSource, Long pubtime) {
        this.url = url;
        this.title = title;
        this.description = description;
        this.gdSource = gdSource;
        // A missing publish time is stored as null rather than failing on unboxing.
        if (pubtime == null) {
            this.pubtime = null;
        } else {
            this.pubtime = Integer.valueOf(pubtime.intValue());
        }
    }

    /**
     * Microblog result.
     */
    public SeoResult(String url, String description, String author, Integer weiboType, Long pingTotal,
            Long transTotal) {
        this.url = url;
        this.description = description;
        this.author = author;
        this.weiboType = weiboType;
        this.pingTotal = pingTotal;
        this.transTotal = transTotal;
    }

    /**
     * Forum result.
     *
     * <p>NOTE(review): the {@code author} argument is stored in {@code title}, not in
     * {@code author} - confirm whether that is intended before changing it.
     */
    public SeoResult(String url, String description, String author, String BBSTypeStr) {
        this.url = url;
        this.description = description;
        this.title = author;
        this.gdSource = BBSTypeStr;
    }

    /**
     * Influence result.
     */
    public SeoResult(String url, String description, Integer weiboType, String influence, Integer pubtime) {
        this.url = url;
        this.description = description;
        this.weiboType = weiboType;
        this.influence = influence;
        this.pubtime = pubtime;
    }

    // ------------------------ getter and setter --------------------------------------------------------

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public Integer getWeiboType() {
        return weiboType;
    }

    public void setWeiboType(Integer weiboType) {
        this.weiboType = weiboType;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public Long getPingTotal() {
        return pingTotal;
    }

    public void setPingTotal(Long pingTotal) {
        this.pingTotal = pingTotal;
    }

    public Long getTransTotal() {
        return transTotal;
    }

    public void setTransTotal(Long transTotal) {
        this.transTotal = transTotal;
    }

    public String getGdSource() {
        return gdSource;
    }

    public void setGdSource(String gdSource) {
        this.gdSource = gdSource;
    }

    public Integer getPubtime() {
        return pubtime;
    }

    public void setPubtime(Integer pubtime) {
        this.pubtime = pubtime;
    }

    public String getInfluence() {
        return influence;
    }

    public void setInfluence(String influence) {
        this.influence = influence;
    }
}
【案例 -- 微博搜索】注:因页面太多,只提供微博搜索案例,其它请参考自行添加
/**
 * {@link SeoClientService} implementation that fetches a Baidu microblog search
 * page and converts each result item into a {@link SeoResult}.
 */
public class BaiduWeiboClientServiceImpl implements SeoClientService {

    /** Selects the {@code <li>} result items of the microblog result list. */
    private static final String ITEM_SELECTOR =
            "div[id=wrapper] div[id=main] div[class=content_bg] div[class=content] ol[id=weibo] li";

    /**
     * Fetches the page described by {@code configuration} and parses its result items.
     *
     * <p>Items whose markup cannot be parsed are skipped individually, so one
     * malformed entry does not discard the rest of the page. Items rejected by
     * {@code SeoResultFilter.filterSeoResult} are skipped as well.
     *
     * @param configuration supplies the request URL via {@code generateRequestUrl()}
     * @return the parsed results; an empty list if fetching or processing the page fails
     */
    public List<SeoResult> findResults(SeoConfiguration configuration) {
        try {
            List<SeoResult> seoResults = new ArrayList<SeoResult>();
            String requestUrl = configuration.generateRequestUrl();
            Document doc = Jsoup.connect(requestUrl).get();
            Elements items = doc.select(ITEM_SELECTOR);
            for (Element item : items) {
                SeoResult seoResult = parseItem(item);
                if (seoResult == null) {
                    // Unparseable entry: skip it, keep the others.
                    continue;
                }
                if (!SeoResultFilter.filterSeoResult(seoResult)) {
                    continue;
                }
                seoResults.add(seoResult);
            }
            return seoResults;
        } catch (Exception e) {
            // Best-effort lookup: report the failure and fall through to an empty result.
            e.printStackTrace();
        }
        return new ArrayList<SeoResult>();
    }

    /**
     * Converts one result item into a {@link SeoResult}, or returns {@code null}
     * when the item lacks the expected source text or comment/repost counters.
     */
    private static SeoResult parseItem(Element item) {
        Elements nick = item.select("div a[name=weibo_rootnick]");
        String url = nick.attr("href");
        String author = nick.text();
        String description = item.select("div").text();

        // The info text is split on spaces; the second token carries the provider name.
        String[] infoParts = item.select("div div[class=weibo_info] div[class=m] a").text().split(" ");
        if (infoParts.length < 2) {
            return null;
        }
        String provider = infoParts[1];
        // Keep only what follows the first '-' (the whole token if there is none).
        provider = provider.substring(provider.indexOf("-") + 1).trim();
        Integer weiboType = resolveWeiboType(provider);

        Long pingTotal = parseCount(
                item.select("div div[class=weibo_info] div[class=weibo_pz] a[name=weibo_ping]").text());
        Long transTotal = parseCount(
                item.select("div div[class=weibo_info] div[class=weibo_pz] a[name=weibo_trans]").text());
        if (pingTotal == null || transTotal == null) {
            return null;
        }

        return new SeoResult(url, description, author, weiboType, pingTotal, transTotal);
    }

    /**
     * Maps the provider label shown on the page to the numeric microblog type
     * (1 = Sina, 2 = Tencent, 3 = Sohu, 4 = NetEase), or {@code null} if unrecognised.
     */
    private static Integer resolveWeiboType(String provider) {
        if ("新浪微博".equals(provider)) {
            return 1;
        }
        if ("腾讯微博".equals(provider)) {
            return 2;
        }
        if ("搜狐微博".equals(provider)) {
            return 3;
        }
        if ("网易微博".equals(provider)) {
            return 4;
        }
        return null;
    }

    /**
     * Extracts the number written between the first '(' and the first ')' of
     * {@code text}, or returns {@code null} if the parentheses are missing,
     * out of order, or do not enclose a valid number.
     */
    private static Long parseCount(String text) {
        int open = text.indexOf("(");
        int close = text.indexOf(")");
        if (open < 0 || close <= open) {
            return null;
        }
        try {
            return Long.valueOf(text.substring(open + 1, close).trim());
        } catch (NumberFormatException e) {
            return null;
        }
    }
}