现在的位置: 首页 > 搜索技术 > 正文

搜索引擎信息采集

2014年08月16日 搜索技术 ⁄ 共 11943字 ⁄ 字号 评论关闭

【可搜索范围】

1. 正常网页搜索

数据:URL、标题,描述

2. 百科(百度,360)

数据:URL、标题,描述,作者,发布日期

3. 微博

数据:URL、标题,描述,作者,发布日期,微博类型(新浪、腾讯、163、搜狐),评论数,转发数

4. 新闻

数据:URL、标题,描述,发布日期,发布网站

5. 论坛

数据:URL、标题,描述,发布日期,发布网站

6. 博客

数据:URL、标题,描述,发布日期,发布网站

【原理】

1. 获取各大搜索引擎的 URL 规则及分页规则。

2. 结构化解析,参考:http://blog.csdn.net/a286352250/article/details/14520643

各大搜索引擎【URL规则配置,分页规则】最新更新:2013-11-08

/**
 * Describes how to build a paged search request for one search engine.
 *
 * <p>A request URL is assembled as
 * {@code url + URL-encoded query + pageParam + pageIndex}. The static
 * {@code generate...Configuration()} factories return a configuration for a
 * specific engine with its base URL, paging parameter and first page index
 * already filled in; the query is left {@code null} and must be supplied via
 * {@link #setQuery(String)} before a request URL is generated.
 */
public class SeoConfiguration {

	// ------------------- engine names ------------------------------------------------------------------------
	public static final String NAME_SOSO = "soso";
	public static final String NAME_BAIDU = "baidu";
	public static final String NAME_GOOGLE = "google";
	public static final String NAME_BING_WEB = "bingweb";
	public static final String NAME_SOGOU = "sogou";
	public static final String NAME_YOUDAO = "youdao";
	public static final String NAME_360 = "360";
	public static final String NAME_BAIDU_WEIBO = "baidu_weibo";
	public static final String NAME_BAIDU_BBS = "baidu_bbs";
	public static final String NAME_BAIDU_BLOG = "baidu_blog";
	public static final String NAME_BAIDU_NEWS = "baidu_news";
	public static final String NAME_360_NEWS = "360_news";
	public static final String NAME_BAIDU_BAIKE = "baidu_baike";
	public static final String NAME_360_BAIKE = "360_baike";
	public static final String NAME_GOOGLE_BLOG = "google_blog";
	public static final String NAME_BING_YINGXIANG = "bing_yingxiang";

	// ------------------- base URLs (the encoded query is appended directly after each of them) ---------------
	// paging: pn, step 10, starts at 0
	private static final String baidu_url = "http://www.baidu.com/s?ie=utf-8&usm=6&rsv_page=1&wd=";
	// paging: start, step 10, starts at 0
	private static final String google_url = "http://ajax.googleapis.com/ajax/services/search/web?v=2.0&rsz=large&q=";
	// paging: pg, step 1, starts at 1
	private static final String soso_url = "http://www.soso.com/q?sc=web&ch=w.uf&num=10&w=";
	// paging: first, step 10, starts at 1
	// NOTE(review): the base URL already carries first=1, so the appended &first= is a duplicate parameter.
	private static final String bing_web_url = "http://cn.bing.com/search?go=&qs=bs&first=1&FORM=PORE&q=";
	// paging: page, step 1, starts at 1
	private static final String sogou_url = "http://www.sogou.com/web?query=";
	// paging: start, step 10, starts at 1
	// The "&timesort" parameter had been corrupted to "×ort" by HTML entity decoding of "&times".
	private static final String youdao_url = "http://www.youdao.com/search?ue=utf8&keyfrom=web.nextPage&timesort=0&q=";
	// paging: pn, step 1, starts at 1
	private static final String qihu360_url = "http://www.so.com/s?j=0&q=";
	// paging: pn, step 20, starts at 0
	private static final String baidu_weibo_url = "http://www.baidu.com/s?cl=2&tn=baiduwb&rn=20&ie=utf-8&rtt=2&wd=";
	// paging: pn, step 10, starts at 0
	private static final String baidu_bbs_url = "http://www.baidu.com/s?pbs=1&tn=baidurt&bsst=1&ie=utf-8&rtt=1&wd=";
	// paging: pn, step 10, starts at 0
	private static final String baidu_blog_url = "http://www.baidu.com/s?tn=baidurt&rtt=1&pbl=1&pbs=0&bsst=1&ie=utf-8&wd=";
	// paging: pn, step 20, starts at 0
	private static final String baidu_news_url = "http://news.baidu.com/ns?bt=0&et=0&si=&rn=20&tn=news&ie=utf-8&ct=1&cl=2&word=";
	// paging: pn, step 1, starts at 1
	private static final String qihu360_news_url = "http://news.so.com/ns?tn=news&rank=rank&q=";
	// paging: pn, step 20, starts at 0
	// NOTE(review): the base URL hard-codes pn=0 and rn=10 while the step is 20 - confirm against the live site.
	private static final String baidu_baike_url = "http://baike.baidu.com/search?type=0&pn=0&rn=10&submit=search&word=";
	// paging: p, step 1, starts at 1
	private static final String qihu360_baike_url = "http://baike.so.com/search/?word=";
	// paging: start, step 10, starts at 0
	private static final String google_blog_url = "http://ajax.googleapis.com/ajax/services/search/blogs?v=2.0&rsz=large&q=";
	// paging: first, step 10, starts at 1
	private static final String bing_yingxiang_url = "http://cn.bing.com/yingxiangli/search?qs=n&form=BSCTAB&scope=q&sc=0-0&sp=-1&sk=&q=";

	private Integer id;
	private String name;
	private String url;
	private String pageParam;
	private String query;
	private Integer pageIndex;

	/** Creates an empty configuration; every field is {@code null} until set. */
	public SeoConfiguration() {
	}

	/**
	 * Creates a configuration without an id.
	 *
	 * @param name      engine name, normally one of the {@code NAME_*} constants
	 * @param url       base URL ending with the (empty) query parameter
	 * @param pageParam paging parameter fragment appended after the query, e.g. {@code "&pn="}
	 * @param query     raw (un-encoded) search text, may be {@code null} and set later
	 * @param pageIndex value appended after {@code pageParam} for the first page
	 */
	public SeoConfiguration(String name, String url, String pageParam, String query, Integer pageIndex) {
		this(null, name, url, pageParam, query, pageIndex);
	}

	/**
	 * Creates a fully specified configuration.
	 *
	 * @param id        identifier of this configuration
	 * @param name      engine name, normally one of the {@code NAME_*} constants
	 * @param url       base URL ending with the (empty) query parameter
	 * @param pageParam paging parameter fragment appended after the query, e.g. {@code "&pn="}
	 * @param query     raw (un-encoded) search text, may be {@code null} and set later
	 * @param pageIndex value appended after {@code pageParam} for the first page
	 */
	public SeoConfiguration(Integer id, String name, String url, String pageParam, String query, Integer pageIndex) {
		this.id = id;
		this.name = name;
		this.url = url;
		this.pageParam = pageParam;
		this.query = query;
		this.pageIndex = pageIndex;
	}

	// ------------------- factories ---------------------------------------------------------------------------
	public static SeoConfiguration generateBaiduConfiguration() {
		return new SeoConfiguration(NAME_BAIDU, baidu_url, "&pn=", null, 0);
	}

	public static SeoConfiguration generateGoogleConfiguration() {
		return new SeoConfiguration(NAME_GOOGLE, google_url, "&start=", null, 0);
	}

	public static SeoConfiguration generateSosoConfiguration() {
		return new SeoConfiguration(NAME_SOSO, soso_url, "&pg=", null, 1);
	}

	public static SeoConfiguration generateBingWebConfiguration() {
		return new SeoConfiguration(NAME_BING_WEB, bing_web_url, "&first=", null, 1);
	}

	public static SeoConfiguration generateSogouConfiguration() {
		return new SeoConfiguration(NAME_SOGOU, sogou_url, "&page=", null, 1);
	}

	public static SeoConfiguration generateYoudaoConfiguration() {
		return new SeoConfiguration(NAME_YOUDAO, youdao_url, "&start=", null, 1);
	}

	public static SeoConfiguration generate360Configuration() {
		return new SeoConfiguration(NAME_360, qihu360_url, "&pn=", null, 1);
	}

	public static SeoConfiguration generateBaiduWeiboConfiguration() {
		return new SeoConfiguration(NAME_BAIDU_WEIBO, baidu_weibo_url, "&pn=", null, 0);
	}

	public static SeoConfiguration generateBaiduBBSConfiguration() {
		return new SeoConfiguration(NAME_BAIDU_BBS, baidu_bbs_url, "&pn=", null, 0);
	}

	public static SeoConfiguration generateBaiduBlogConfiguration() {
		return new SeoConfiguration(NAME_BAIDU_BLOG, baidu_blog_url, "&pn=", null, 0);
	}

	public static SeoConfiguration generateBaiduNewsConfiguration() {
		return new SeoConfiguration(NAME_BAIDU_NEWS, baidu_news_url, "&pn=", null, 0);
	}

	public static SeoConfiguration generate360NewsConfiguration() {
		return new SeoConfiguration(NAME_360_NEWS, qihu360_news_url, "&pn=", null, 1);
	}

	public static SeoConfiguration generateBaiduBaikeConfiguration() {
		return new SeoConfiguration(NAME_BAIDU_BAIKE, baidu_baike_url, "&pn=", null, 0);
	}

	public static SeoConfiguration generate360BaikeConfiguration() {
		return new SeoConfiguration(NAME_360_BAIKE, qihu360_baike_url, "&p=", null, 1);
	}

	/**
	 * Returns the Google <em>blog</em> search configuration. The method name says
	 * "Bbs" for historical reasons and is kept so existing callers still compile.
	 */
	public static SeoConfiguration generateGoogleBbsConfiguration() {
		return new SeoConfiguration(NAME_GOOGLE_BLOG, google_blog_url, "&start=", null, 0);
	}

	public static SeoConfiguration generateBingYingXiangConfiguration() {
		return new SeoConfiguration(NAME_BING_YINGXIANG, bing_yingxiang_url, "&first=", null, 1);
	}

	// ------------------- paging and URL generation -----------------------------------------------------------
	/**
	 * Advances {@code pageIndex} to the next result page using the step that
	 * belongs to this configuration's engine name. Unknown names are left untouched.
	 */
	public void addPageIndex() {
		int step = pageStep();
		// Only touch pageIndex for known engines, so an unknown name stays a no-op
		// even when pageIndex has never been set.
		if (step != 0) {
			this.pageIndex += step;
		}
	}

	/** Returns how much {@code pageIndex} grows per page for this engine, or 0 for an unknown name. */
	private int pageStep() {
		if (NAME_SOSO.equals(name) || NAME_SOGOU.equals(name) || NAME_360.equals(name)
				|| NAME_360_NEWS.equals(name) || NAME_360_BAIKE.equals(name)) {
			return 1;
		}
		if (NAME_BAIDU_WEIBO.equals(name) || NAME_BAIDU_NEWS.equals(name) || NAME_BAIDU_BAIKE.equals(name)) {
			return 20;
		}
		if (NAME_BAIDU.equals(name) || NAME_GOOGLE.equals(name) || NAME_BING_WEB.equals(name)
				|| NAME_YOUDAO.equals(name) || NAME_GOOGLE_BLOG.equals(name) || NAME_BAIDU_BLOG.equals(name)
				|| NAME_BAIDU_BBS.equals(name) || NAME_BING_YINGXIANG.equals(name)) {
			return 10;
		}
		return 0;
	}

	/**
	 * Builds the request URL for the current page with the query encoded as UTF-8.
	 *
	 * @return base URL + encoded query + paging parameter + page index
	 * @throws NullPointerException if no query has been set
	 */
	public String generateRequestUrl() {
		return generateRequestUrl("utf-8");
	}

	/**
	 * Builds the request URL for the current page.
	 *
	 * @param enc name of the character encoding used to URL-encode the query
	 * @return base URL + encoded query + paging parameter + page index
	 * @throws NullPointerException if no query has been set
	 * @throws RuntimeException     wrapping {@code UnsupportedEncodingException} if {@code enc} is not supported
	 */
	public String generateRequestUrl(String enc) {
		return url + encodeQuery(enc) + pageParam + pageIndex;
	}

	/** URL-encodes the query with the given encoding, failing fast with a clear message when it is missing. */
	private String encodeQuery(String enc) {
		if (query == null) {
			// The factories leave the query null; same exception type as before, but with an explanation.
			throw new NullPointerException("query must be set before generating a request URL");
		}
		try {
			return URLEncoder.encode(query, enc);
		} catch (UnsupportedEncodingException e) {
			throw new RuntimeException(e);
		}
	}

	// ------------------- getter and setter -------------------------------------------------------------------
	public Integer getId() {
		return id;
	}

	public void setId(Integer id) {
		this.id = id;
	}

	public String getUrl() {
		return url;
	}

	public void setUrl(String url) {
		this.url = url;
	}

	public String getQuery() {
		return query;
	}

	public void setQuery(String query) {
		this.query = query;
	}

	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public String getPageParam() {
		return pageParam;
	}

	public void setPageParam(String pageParam) {
		this.pageParam = pageParam;
	}

	public Integer getPageIndex() {
		return pageIndex;
	}

	public void setPageIndex(Integer pageIndex) {
		this.pageIndex = pageIndex;
	}

}

【结构化对象】

/**
 * One structured search hit. The same class is used for web pages, news,
 * weibo posts, forum threads and "influence" results; each kind has its own
 * constructor and leaves the fields it does not use as {@code null}.
 */
public class SeoResult {

	private Integer id;
	private String url;
	private String title;
	private String description;

	/** Source of the item (publishing site, or where it was reposted from). */
	private String gdSource;

	/** Publication date. NOTE(review): the unit is not visible here - presumably epoch seconds; confirm. */
	private Integer pubtime;

	/** Author. */
	private String author;

	/**
	 * Weibo provider:
	 * Sina = 1,
	 * QQ   = 2,
	 * Sohu = 3,
	 * 163  = 4.
	 */
	private Integer weiboType;

	/** Total number of comments. */
	private Long pingTotal;

	/** Total number of reposts. */
	private Long transTotal;

	/** Influence. */
	private String influence;

	/** Creates an empty result; every field is {@code null} until set. */
	public SeoResult() {
	}

	/**
	 * Web page result.
	 *
	 * @param url         link of the hit
	 * @param title       title of the hit
	 * @param description summary text of the hit
	 */
	public SeoResult(String url, String title, String description) {
		this.url = url;
		this.title = title;
		this.description = description;
	}

	/**
	 * News result.
	 *
	 * @param url         link of the hit
	 * @param title       title of the hit
	 * @param description summary text of the hit
	 * @param gdSource    publishing site
	 * @param pubtime     publication time, or {@code null} when unknown; it is narrowed to
	 *                    {@code int}, so values outside the {@code int} range are truncated
	 */
	public SeoResult(String url, String title, String description, String gdSource, Long pubtime) {
		this.url = url;
		this.title = title;
		this.description = description;
		this.gdSource = gdSource;
		// An unknown publication time stays null instead of failing on unboxing.
		if (pubtime != null) {
			this.pubtime = Integer.valueOf(pubtime.intValue());
		}
	}

	/**
	 * Weibo result.
	 *
	 * @param url         link of the post
	 * @param description text of the post
	 * @param author      author of the post
	 * @param weiboType   provider code, see the {@code weiboType} field
	 * @param pingTotal   total number of comments
	 * @param transTotal  total number of reposts
	 */
	public SeoResult(String url, String description, String author, Integer weiboType, Long pingTotal, Long transTotal) {
		this.url = url;
		this.description = description;
		this.author = author;
		this.weiboType = weiboType;
		this.pingTotal = pingTotal;
		this.transTotal = transTotal;
	}

	/**
	 * Forum result.
	 *
	 * <p>NOTE(review): the third argument is named {@code author} but is stored in
	 * {@code title}; the behaviour is kept as is because callers may rely on it -
	 * confirm which one is intended.
	 *
	 * @param url         link of the thread
	 * @param description summary text of the thread
	 * @param author      value stored as the result's title
	 * @param BBSTypeStr  forum name, stored as the result's source
	 */
	public SeoResult(String url, String description, String author, String BBSTypeStr) {
		this.url = url;
		this.description = description;
		this.title = author;
		this.gdSource = BBSTypeStr;
	}

	/**
	 * Influence result.
	 *
	 * @param url         link of the hit
	 * @param description summary text of the hit
	 * @param weiboType   provider code, see the {@code weiboType} field
	 * @param influence   influence value
	 * @param pubtime     publication time
	 */
	public SeoResult(String url, String description, Integer weiboType, String influence, Integer pubtime) {
		this.url = url;
		this.description = description;
		this.weiboType = weiboType;
		this.influence = influence;
		this.pubtime = pubtime;
	}

	// ------------------------ getter and setter ------------------------------------------------------
	public Integer getId() {
		return id;
	}

	public void setId(Integer id) {
		this.id = id;
	}

	public String getUrl() {
		return url;
	}

	public void setUrl(String url) {
		this.url = url;
	}

	public String getTitle() {
		return title;
	}

	public void setTitle(String title) {
		this.title = title;
	}

	public String getDescription() {
		return description;
	}

	public void setDescription(String description) {
		this.description = description;
	}

	public Integer getWeiboType() {
		return weiboType;
	}

	public void setWeiboType(Integer weiboType) {
		this.weiboType = weiboType;
	}

	public String getAuthor() {
		return author;
	}

	public void setAuthor(String author) {
		this.author = author;
	}

	public Long getPingTotal() {
		return pingTotal;
	}

	public void setPingTotal(Long pingTotal) {
		this.pingTotal = pingTotal;
	}

	public Long getTransTotal() {
		return transTotal;
	}

	public void setTransTotal(Long transTotal) {
		this.transTotal = transTotal;
	}

	public String getGdSource() {
		return gdSource;
	}

	public void setGdSource(String gdSource) {
		this.gdSource = gdSource;
	}

	public Integer getPubtime() {
		return pubtime;
	}

	public void setPubtime(Integer pubtime) {
		this.pubtime = pubtime;
	}

	public String getInfluence() {
		return influence;
	}

	public void setInfluence(String influence) {
		this.influence = influence;
	}

}

【案例 -- 微博搜索】注:因页面太多,只提供微博搜索案例,其它请参考自行添加

/**
 * Fetches one page of Baidu weibo search results and converts every list
 * item of the result page into a {@link SeoResult}.
 */
public class BaiduWeiboClientServiceImpl implements SeoClientService {

	/** Selector for the list items of the result page; each item is read as one weibo post. */
	private static final String RESULT_ITEMS =
			"div[id=wrapper] div[id=main] div[class=content_bg] div[class=content] ol[id=weibo] li";

	/**
	 * Downloads the page described by {@code configuration} and parses its weibo entries.
	 *
	 * <p>Entries rejected by {@code SeoResultFilter.filterSeoResult} are skipped. An entry
	 * with a missing source name or a missing comment/repost counter is still returned
	 * (type {@code null}, counter 0) instead of aborting the whole page.
	 *
	 * @param configuration search configuration that produces the request URL
	 * @return the parsed results; an empty list if the page could not be fetched or parsed
	 */
	public List<SeoResult> findResults(SeoConfiguration configuration) {
		try {
			List<SeoResult> seoResults = new ArrayList<SeoResult>();
			String requestUrl = configuration.generateRequestUrl();
			System.out.println(requestUrl);
			Document doc = Jsoup.connect(requestUrl).get();
			for (Element element : doc.select(RESULT_ITEMS)) {
				Elements nickLink = element.select("div a[name=weibo_rootnick]");
				String url = nickLink.attr("href");
				String description = element.select("div").text();
				String author = nickLink.text();

				String infoText = element.select("div div[class=weibo_info] div[class=m] a").text();
				String sourceName = extractSourceName(infoText);
				System.out.println(sourceName);
				Integer weiboType = toWeiboType(sourceName);

				Long pingTotal = parseCount(
						element.select("div div[class=weibo_info] div[class=weibo_pz] a[name=weibo_ping]").text());
				Long transTotal = parseCount(
						element.select("div div[class=weibo_info] div[class=weibo_pz] a[name=weibo_trans]").text());

				SeoResult seoResult = new SeoResult(url, description, author, weiboType, pingTotal, transTotal);
				if (SeoResultFilter.filterSeoResult(seoResult)) {
					seoResults.add(seoResult);
				}
			}
			return seoResults;
		} catch (Exception e) {
			// Boundary catch: this method reports any failure as "no results".
			e.printStackTrace();
		}
		return new ArrayList<SeoResult>();
	}

	/**
	 * Extracts the provider name from the info line. The second space-separated token is
	 * used, and anything up to and including its first "-" is dropped.
	 * Returns an empty string when the line has no second token.
	 */
	private static String extractSourceName(String infoText) {
		String[] tokens = infoText.split(" ");
		if (tokens.length < 2) {
			return "";
		}
		String source = tokens[1];
		return source.substring(source.indexOf("-") + 1).trim();
	}

	/** Maps a provider name to its weibo type code (1-4), or {@code null} for an unknown name. */
	private static Integer toWeiboType(String sourceName) {
		if ("新浪微博".equals(sourceName)) {
			return 1;
		}
		if ("腾讯微博".equals(sourceName)) {
			return 2;
		}
		if ("搜狐微博".equals(sourceName)) {
			return 3;
		}
		if ("网易微博".equals(sourceName)) {
			return 4;
		}
		return null;
	}

	/**
	 * Reads the number between the first "(" and the first ")" of a counter label.
	 * A label without a parenthesised number is treated as a count of 0.
	 */
	private static Long parseCount(String label) {
		int open = label.indexOf("(");
		int close = label.indexOf(")");
		if (open < 0 || close <= open) {
			return Long.valueOf(0L);
		}
		try {
			return Long.valueOf(Long.parseLong(label.substring(open + 1, close).trim()));
		} catch (NumberFormatException ignored) {
			// Non-numeric counter text: fall back to 0 rather than dropping the whole page.
			return Long.valueOf(0L);
		}
	}

}

抱歉!评论已关闭.