现在的位置: 首页 > 综合 > 正文

从百度空间到CSDN——博客搬家源码

2013年06月03日 ⁄ 综合 ⁄ 共 11735字 ⁄ 字号 评论关闭
文章目录

注意:下面的方法在CSDN博客改版以后已无法使用,因为现在CSDN博客不再支持MetaWeblog API,不知道什么时候可以恢复支持。

1.原文链接

http://hi.baidu.com/cnjsp/blog/item/e175cf1b27bc6af6ae513335.html

2.心得

本方法我测试过,是可以用的。一则感觉思路挺新颖的,程序员自己写代码解决自己的事情;二则可以通过这个实例学习一下java,所以我贴出我修改后的java代码。

具体思路可以参见原文。

3.代码

CSDNPost.java

package cn.mingyuan.baidu2csdn.core;

import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.xmlrpc.XmlRpcException;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;

/**
 * csdn博文
 * 
 * @author mingyuanonline@gmail.com
 * 
 */
/**
 * A blog post to be published on CSDN through the MetaWeblog XML-RPC API.
 * <p>
 * Instances carry the post data (title, content, categories, creation date);
 * {@link #publish()} sends it to the server and appends the outcome to a log
 * file.
 */
public class CSDNPost {
	/**
	 * MetaWeblog endpoint. The path segment after the host is the CSDN user
	 * name; replace it with your own.
	 */
	private static final String SERVER_URL = "http://blog.csdn.net/xw13106209/services/metablogapi.aspx";
	/** First argument of metaWeblog.newPost (blog id); replace with your user name. */
	private static final String BLOG_ID = "xw13106209";
	/** Account user name; replace with your own. */
	private static final String USERNAME = "xw13106209";
	/** Account password; replace with your own. */
	private static final String PASSWORD = "password";
	/** File that {@link #writelog(String)} appends to, relative to the working directory. */
	private static final String LOG_FILE = "post.log";

	/** XML-RPC client configuration shared by all posts. */
	private static XmlRpcClientConfigImpl config;
	/** XML-RPC client shared by all posts. */
	private static XmlRpcClient client;

	static {
		config = new XmlRpcClientConfigImpl();
		try {
			config.setServerURL(new URL(SERVER_URL));
		} catch (MalformedURLException e) {
			System.out.println("请检查url");
		}
		client = new XmlRpcClient();
		client.setConfig(config);
	}

	/** Creation date of the post. */
	private Date dateCreated;
	/** Content of the post. */
	private String description;
	/** Title of the post. */
	private String title;
	/** Categories of the post. */
	private String[] categories;

	/** Creates an empty post; fill it in through the setters. */
	public CSDNPost() {

	}

	/**
	 * Creates a fully populated post.
	 *
	 * @param title
	 *            title of the post
	 * @param description
	 *            content of the post
	 * @param categories
	 *            categories of the post
	 * @param dateCreated
	 *            creation date of the post
	 */
	public CSDNPost(String title, String description, String[] categories,
			Date dateCreated) {
		this.dateCreated = dateCreated;
		this.description = description;
		this.title = title;
		this.categories = categories;
	}

	public Date getDateCreated() {
		return dateCreated;
	}

	public void setDateCreated(Date dateCreated) {
		this.dateCreated = dateCreated;
	}

	public String getDescription() {
		return description;
	}

	public void setDescription(String description) {
		this.description = description;
	}

	public String getTitle() {
		return title;
	}

	public void setTitle(String title) {
		this.title = title;
	}

	public String[] getCategories() {
		return categories;
	}

	public void setCategories(String[] categories) {
		this.categories = categories;
	}

	/**
	 * Appends one line to the log file.
	 * <p>
	 * A failure to write is reported on standard output and otherwise ignored,
	 * so logging problems never abort a migration.
	 *
	 * @param log
	 *            text of the log line (a CRLF is appended)
	 */
	private void writelog(String log) {
		FileOutputStream fos = null;
		try {
			fos = new FileOutputStream(LOG_FILE, true);
			// Platform default charset on purpose: DeletePostById reads the
			// log back with the platform default as well.
			fos.write((log + "\r\n").getBytes());
			fos.flush();
		} catch (IOException e) {
			System.out.println("写入日志错误:" + log);
		} finally {
			// Close in finally so the handle is released even when write() fails.
			if (fos != null) {
				try {
					fos.close();
				} catch (IOException ignored) {
					// The data has already been flushed; nothing useful to do here.
				}
			}
		}
	}

	/**
	 * Publishes this post by calling <code>metaWeblog.newPost</code>.
	 * <p>
	 * On success the id returned by the server is written to the log file and
	 * to standard output. On failure only the failure line is written; no
	 * "import finished" line is produced for a post that was not created.
	 */
	public void publish() {
		Map<String, Object> struct = new HashMap<String, Object>();
		struct.put("dateCreated", dateCreated);
		struct.put("description", description);
		struct.put("title", title);
		struct.put("categories", categories);

		// Argument order: blog id, user name, password, post struct, publish flag.
		Object[] params = new Object[] { BLOG_ID, USERNAME, PASSWORD, struct,
				true };

		Object result;
		try {
			result = client.execute("metaWeblog.newPost", params);
		} catch (XmlRpcException e) {
			String failure = "导入出现错误:title=" + title;
			writelog(failure);
			System.out.println(failure);
			return;
		}
		// String.valueOf instead of a cast: a non-String reply must not turn
		// a successful import into a ClassCastException.
		String blogid = String.valueOf(result);
		String success = title + ">> 导入完毕,生成博文id为>>" + blogid;
		writelog(success);
		System.out.println(success);
	}

	/**
	 * Publishes an empty post; usable as a quick connectivity check.
	 *
	 * @param args
	 *            unused
	 */
	public static void main(String[] args) {
		CSDNPost post = new CSDNPost();
		post.publish();
	}
}

BaiduHi

package cn.mingyuan.baidu2csdn.core;

import java.util.Date;

/**
 * 百度博客
 * 
 * @author mingyuanonline@gmail.com
 * 
 */
/**
 * Data holder for a single Baidu blog post: title, content, category and
 * publication date.
 */
public class BaiduHi {
	/** Title of the post. */
	private String title;
	/** Content of the post. */
	private String description;
	/** Category of the post. */
	private String categories;
	/** Publication date of the post. */
	private Date dateCreated;

	/** Creates an empty post whose fields are all {@code null}. */
	public BaiduHi() {
	}

	/**
	 * Creates a post with every field set.
	 *
	 * @param title
	 *            title of the post
	 * @param description
	 *            content of the post
	 * @param categories
	 *            category of the post
	 * @param dateCreated
	 *            publication date of the post
	 */
	public BaiduHi(String title, String description, String categories,
			Date dateCreated) {
		this.dateCreated = dateCreated;
		this.categories = categories;
		this.description = description;
		this.title = title;
	}

	/** @return title of the post */
	public String getTitle() {
		return this.title;
	}

	/** @param title title of the post */
	public void setTitle(String title) {
		this.title = title;
	}

	/** @return content of the post */
	public String getDescription() {
		return this.description;
	}

	/** @param description content of the post */
	public void setDescription(String description) {
		this.description = description;
	}

	/** @return category of the post */
	public String getCategories() {
		return this.categories;
	}

	/** @param categories category of the post */
	public void setCategories(String categories) {
		this.categories = categories;
	}

	/** @return publication date of the post */
	public Date getDateCreated() {
		return this.dateCreated;
	}

	/** @param dateCreated publication date of the post */
	public void setDateCreated(Date dateCreated) {
		this.dateCreated = dateCreated;
	}

	/**
	 * Entry point that does nothing.
	 *
	 * @param args
	 *            unused
	 */
	public static void main(String[] args) {
		// intentionally empty
	}
}

BaiduHiFetcher

package cn.mingyuan.baidu2csdn.core;  

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 百度博客数据抓取及解析
 * 
 * @author mingyuanonline@gmail.com
 * 
 */
/**
 * Downloads pages of a Baidu blog and extracts post links and post data from
 * the HTML with regular expressions.
 */
public class BaiduHiFetcher {
	/** Host prepended to the site-relative post links found in index pages. */
	private static final String BAIDU_HI_HOST = "http://hi.baidu.com";
	/** Charset used to decode every downloaded page. */
	private static final String PAGE_CHARSET = "gb2312";
	/**
	 * Highest index page fetched when the page count cannot be detected from
	 * the first page.
	 */
	private static final int DEFAULT_TOTAL_PAGES = 2;
	/** Matches the title div of one post on an index page. */
	private static final Pattern TITLE_DIV_PATTERN = Pattern
			.compile("<div[\\s]class=\"tit\"><a[\\s]href=[^<>]+?target=\"_blank\">.+?</div>");
	/**
	 * Matches the "last page" link for any user name and any number of
	 * digits; group 1 is the page number.
	 */
	private static final Pattern LAST_PAGE_PATTERN = Pattern
			.compile("<a[\\s]href=\"/[^/\"]+/blog/index/(\\d+)\">\\[尾页\\]</a>");

	/**
	 * Downloads a page and returns its text with line breaks removed.
	 * <p>
	 * Errors are reported on standard output; whatever was read before the
	 * error (possibly nothing) is returned.
	 *
	 * @param url
	 *            address of the page
	 * @return page source, lines concatenated without separators
	 */
	private String downloadPage(String url) {
		StringBuilder sb = new StringBuilder();
		InputStream in = null;
		try {
			URLConnection conn = new URL(url).openConnection();
			in = conn.getInputStream();
			BufferedReader reader = new BufferedReader(new InputStreamReader(
					in, PAGE_CHARSET));
			String line;
			// Lines are joined without a separator so the '.'-based regexes
			// used by the parsers can match across original line breaks.
			while ((line = reader.readLine()) != null) {
				sb.append(line);
			}
		} catch (MalformedURLException e) {
			System.out.println("请检查url是否规范");
		} catch (IOException e) {
			System.out.println("读取源码错误:" + url);
		} finally {
			// Closing the underlying stream releases the connection even when
			// reading failed half-way.
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// Nothing useful can be done about a failed close.
				}
			}
		}
		return sb.toString();
	}

	/**
	 * Extracts the post links contained in an index page.
	 *
	 * @param html
	 *            source of an index page
	 * @return absolute URLs of the posts, in page order
	 */
	private List<String> getPostLinks(String html) {
		List<String> posts = new ArrayList<String>();
		Matcher titleDivMatcher = TITLE_DIV_PATTERN.matcher(html);
		while (titleDivMatcher.find()) {
			String div = titleDivMatcher.group();
			int start = div.indexOf("/");
			int end = div.indexOf("\" target");
			// Skip a div whose href is not directly followed by the target
			// attribute instead of failing in substring().
			if (start < 0 || end < start) {
				continue;
			}
			posts.add(BAIDU_HI_HOST + div.substring(start, end));
		}
		return posts;
	}

	/**
	 * Reads the number of the last index page from the "[尾页]" (last page)
	 * link.
	 * <p>
	 * Blogs with few posts may not show that link; 0 is returned then.
	 *
	 * @param html
	 *            page source (preferably the first index page)
	 * @return number in the last-page link, or 0 when the link is absent
	 */
	private int getTotalPages(String html) {
		// Example of the link being matched:
		// <a href="/cnjsp/blog/index/16">[尾页]</a>
		Matcher pageMatcher = LAST_PAGE_PATTERN.matcher(html);
		if (pageMatcher.find()) {
			return Integer.parseInt(pageMatcher.group(1));
		}
		return 0;
	}

	/**
	 * Collects the addresses of the posts of a blog.
	 * <p>
	 * The URL is not encoded; encode it beforehand if it contains non-ASCII
	 * characters. Index page 0 is fetched first, then pages 1 up to the
	 * detected last page (or up to {@link #DEFAULT_TOTAL_PAGES} when the page
	 * count cannot be detected).
	 *
	 * @param blogUrl
	 *            address of the blog
	 * @return post addresses on a stack; links are pushed in page order, so
	 *         <code>pop()</code> yields the links found last first
	 */
	public Stack<String> getAllPostLink(String blogUrl) {
		Stack<String> posts = new Stack<String>();
		// 1. Download the first index page.
		String firstPageHtml = downloadPage(blogUrl + "/blog/index/0");
		// 2. Determine how many further index pages to fetch.
		int detectedPages = getTotalPages(firstPageHtml);
		int totalPages = detectedPages > 0 ? detectedPages
				: DEFAULT_TOTAL_PAGES;
		// 3. Collect the links of every index page.
		posts.addAll(getPostLinks(firstPageHtml));
		for (int i = 1; i <= totalPages; i++) {
			String page = downloadPage(blogUrl + "/blog/index/" + i);
			posts.addAll(getPostLinks(page));
		}
		return posts;
	}

	/**
	 * Downloads a post and extracts its title, date, content and category.
	 * <p>
	 * A field whose markup is not found in the page is left <code>null</code>.
	 *
	 * @param postUrl
	 *            address of the post
	 * @return a {@link BaiduHi} holding the extracted data
	 */
	public BaiduHi getBaiduHi(String postUrl) {
		String html = downloadPage(postUrl);

		String title = extract(
				html,
				"<div[\\s]id=\"m_blog\"[\\s]class=\"modbox\"[\\s]style=\"overflow-x:hidden;\"><div[\\s]class=\"tit\">",
				"</div><div[\\s]class=\"date\">");
		String dateStr = extract(html, "<div[\\s]class=\"date\">", "</div>");
		Date postDate = dateStr == null ? null : getDate(dateStr);
		String text = extract(html,
				"<div[\\s]id=\"blog_text\"[\\s]class=\"cnt\"[\\s]+>", "</div>");
		String categories = extract(html, "title=\"查看该分类中所有文章\">类别:",
				"</a>");

		BaiduHi hi = new BaiduHi();
		hi.setTitle(title);
		hi.setDescription(text);
		hi.setCategories(categories);
		hi.setDateCreated(postDate);
		return hi;
	}

	/**
	 * Finds the first occurrence of <code>prefix .+? suffix</code> and returns
	 * the match with prefix and suffix stripped and the remainder trimmed.
	 *
	 * @param html
	 *            text to search
	 * @param prefixRegex
	 *            regex of the opening markup
	 * @param suffixRegex
	 *            regex of the closing markup
	 * @return the trimmed text between the markers, or <code>null</code> when
	 *         there is no match
	 */
	private String extract(String html, String prefixRegex, String suffixRegex) {
		Matcher matcher = Pattern.compile(prefixRegex + ".+?" + suffixRegex)
				.matcher(html);
		if (!matcher.find()) {
			return null;
		}
		return matcher.group().replaceAll(prefixRegex, "")
				.replaceAll(suffixRegex, "").trim();
	}

	/**
	 * Parses the date shown on a post page.
	 * <p>
	 * Expected format: <code>2011年07月01日 星期五 下午 01:05</code>, i.e. a
	 * 12-hour clock with a 上午/下午 (AM/PM) marker. The result is in the
	 * default time zone with seconds and milliseconds set to zero.
	 *
	 * @param str
	 *            date text taken from the page
	 * @return the parsed date
	 * @throws IllegalArgumentException
	 *             if the text does not have the expected layout
	 */
	private Date getDate(String str) {
		int yearEnd = str.indexOf("年");
		int monthEnd = str.indexOf("月");
		int dayEnd = str.indexOf("日");
		int noonIdx = str.indexOf("午");
		if (yearEnd < 0 || monthEnd < yearEnd || dayEnd < monthEnd
				|| noonIdx < 0) {
			throw new IllegalArgumentException("Unrecognized date format: "
					+ str);
		}
		String[] hourAndMinute = str.substring(noonIdx + 1).trim().split(":");
		if (hourAndMinute.length < 2) {
			throw new IllegalArgumentException("Unrecognized date format: "
					+ str);
		}
		int year = Integer.parseInt(str.substring(0, yearEnd).trim());
		int month = Integer.parseInt(str.substring(yearEnd + 1, monthEnd)
				.trim());
		int day = Integer.parseInt(str.substring(monthEnd + 1, dayEnd).trim());
		int hour = Integer.parseInt(hourAndMinute[0].trim());
		int minute = Integer.parseInt(hourAndMinute[1].trim());

		// 12-hour to 24-hour: 12 AM is hour 0 and 12 PM stays 12, so reduce
		// modulo 12 before adding the afternoon offset.
		int hourOfDay = hour % 12 + (str.contains("下午") ? 12 : 0);

		// Set all fields at once on a cleared calendar. Mutating "now" field
		// by field can roll over into the next month when today's
		// day-of-month does not exist in the target month, and it would keep
		// the current seconds and milliseconds.
		Calendar calendar = Calendar.getInstance();
		calendar.clear();
		calendar.set(year, month - 1, day, hourOfDay, minute);
		return calendar.getTime();
	}
}

Transfer 

package cn.mingyuan.baidu2csdn.core;

import java.util.Stack;

/**
 * 搬家
 * 
 * @author mingyuanonline@gmail.com
 * 
 */
/**
 * Migration driver: fetches every post of a Baidu blog and publishes it to
 * CSDN, pausing between posts.
 */
public class Transfer {
	/** Blog migrated when no address is given on the command line. */
	private static final String DEFAULT_BLOG_URL = "http://hi.baidu.com/xwdreamer";
	/** Pause between two consecutive publications, in milliseconds. */
	private static final long PAUSE_BETWEEN_POSTS_MS = 5 * 1000;

	/**
	 * Runs the migration.
	 *
	 * @param args
	 *            optional; <code>args[0]</code> is the address of the Baidu
	 *            blog to migrate (defaults to {@link #DEFAULT_BLOG_URL})
	 */
	public static void main(String[] args) {
		String blogUrl = (args != null && args.length > 0) ? args[0]
				: DEFAULT_BLOG_URL;
		BaiduHiFetcher fetcher = new BaiduHiFetcher();
		Stack<String> urls = fetcher.getAllPostLink(blogUrl);
		while (!urls.isEmpty()) {
			BaiduHi hi = fetcher.getBaiduHi(urls.pop());

			CSDNPost post = new CSDNPost();
			post.setTitle(hi.getTitle());
			post.setDescription(hi.getDescription());
			// A post without a parsed category gets no categories rather than
			// an array holding a null element.
			String category = hi.getCategories();
			post.setCategories(category == null ? new String[0]
					: new String[] { category });
			post.setDateCreated(hi.getDateCreated());
			post.publish();

			if (urls.isEmpty()) {
				// Nothing left to publish, so no pause is needed.
				break;
			}
			try {
				Thread.sleep(PAUSE_BETWEEN_POSTS_MS);
			} catch (InterruptedException e) {
				System.out.println("休眠出错");
				// Restore the interrupt flag and stop: an interrupt is a
				// request to cancel, not something to ignore and continue.
				Thread.currentThread().interrupt();
				return;
			}
		}
	}
}

DeletePostById

package cn.mingyuan.baidu2csdn.core;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.xmlrpc.XmlRpcException;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;

/**
 * Deletes CSDN posts through the <code>blogger.deletePost</code> XML-RPC
 * method, taking the post ids from a log file.
 */
public class DeletePostById {
	/**
	 * XML-RPC endpoint. The path segment after the host is the CSDN user
	 * name; replace it with your own.
	 */
	private static final String SERVER_URL = "http://blog.csdn.net/telnetor/services/metablogapi.aspx";
	/** File the post ids are read from, relative to the working directory. */
	private static final String INPUT_FILE = "content";
	/**
	 * Separator between the descriptive text and the post id in a log line.
	 * Accepts both the ":" form and the ">>" form that CSDNPost writes.
	 */
	private static final String ID_SEPARATOR_REGEX = "生成博文id为(?::|>>)";
	/** Pause between two deletions, in milliseconds. */
	private static final long PAUSE_BETWEEN_DELETES_MS = 1000 * 10;

	private static XmlRpcClientConfigImpl config;
	private static XmlRpcClient client;
	static {
		config = new XmlRpcClientConfigImpl();
		try {
			config.setServerURL(new URL(SERVER_URL));
		} catch (MalformedURLException e) {
			System.out.println("请检查url");
		}
		client = new XmlRpcClient();
		client.setConfig(config);
	}

	/**
	 * Deletes a post.
	 * <p>
	 * The outcome is printed to standard output; the success message is
	 * printed only when the call did not fail.
	 *
	 * @param appkey
	 *            application key; documented as ignored by the server, a
	 *            placeholder is sent when it is <code>null</code>
	 * @param postid
	 *            id of the post
	 * @param username
	 *            user name
	 * @param password
	 *            password
	 * @param publish
	 *            whether the blog is republished after the post is deleted
	 */
	public static void delete(String appkey, String postid, String username,
			String password, boolean publish) {
		// Forward the caller's values instead of fixed ones so the appkey and
		// publish parameters actually take effect.
		Object[] params = new Object[] {
				appkey == null ? "ignored value" : appkey, postid, username,
				password, publish };
		try {
			client.execute("blogger.deletePost", params);
		} catch (XmlRpcException e) {
			System.out.println("删除出错,postid=" + postid);
			return;
		}
		System.out.println(postid + "删除完毕");
	}

	/**
	 * Extracts the post id from one log line.
	 *
	 * @param line
	 *            a line of the log file
	 * @return the id, or <code>null</code> when the line has no id (for
	 *         example a failure line, or an id logged as "null")
	 */
	private static String extractPostId(String line) {
		String[] parts = line.split(ID_SEPARATOR_REGEX);
		if (parts.length < 2) {
			return null;
		}
		String postid = parts[parts.length - 1].trim();
		if (postid.isEmpty() || "null".equals(postid)) {
			return null;
		}
		return postid;
	}

	/**
	 * Reads the input file line by line and deletes every post whose id can
	 * be extracted, pausing between deletions.
	 *
	 * @param args
	 *            unused
	 * @throws InterruptedException
	 *             if the pause between deletions is interrupted
	 */
	public static void main(String[] args) throws InterruptedException {
		BufferedReader reader = null;
		try {
			// Platform default charset, matching how CSDNPost writes its log.
			reader = new BufferedReader(new InputStreamReader(
					new FileInputStream(INPUT_FILE)));
			String line;
			while ((line = reader.readLine()) != null) {
				String postid = extractPostId(line);
				if (postid == null) {
					// Not a "post created" line; nothing to delete.
					continue;
				}
				delete("ignored", postid, "your username", "your password",
						true);
				Thread.sleep(PAUSE_BETWEEN_DELETES_MS);
			}
		} catch (FileNotFoundException e1) {
			System.out.println("文件没找到");
		} catch (IOException e) {
			System.out.println("读取文件失败");
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException ignored) {
					// Read-only stream; a failed close loses nothing.
				}
			}
		}
	}
}

抱歉!评论已关闭.