源于Stanford在Coursera上的Mining Massive Datasets课程。做Week 1里的作业中要求做一些和pagerank有关的计算。
#include <iostream> #include <iterator> #include <fstream> #include <memory> #include <stdexcept> #include <string> #include <sstream> #include <vector> #include <boost/numeric/ublas/io.hpp> #include <boost/numeric/ublas/vector.hpp> #include <boost/numeric/ublas/matrix.hpp> #include <boost/graph/adjacency_list.hpp> #include <boost/graph/graph_traits.hpp> const double CONVERGENCE_CRITERIA = 1e-6; double beta; int num_vertices; std::map<std::string, int> vertex_index; std::vector<std::string> index_vertex; using Graph = boost::adjacency_list<boost::vecS, boost::vecS, boost::directedS>; Graph connection_graph; boost::numeric::ublas::matrix<double> M; boost::numeric::ublas::vector<double> v, v1, vtax; void getBeta(const std::string& betaArg) { std::stringstream ss(betaArg); ss>>beta; } void mapVertexIndex(const std::string& vertices_line) { std::stringstream ss(vertices_line); std::string vertex; int index = 0; while(ss>>vertex) { vertex_index[vertex] = index++; index_vertex.push_back(vertex); boost::add_vertex(connection_graph); } num_vertices = index_vertex.size(); } void loadGraph(const std::string& fileName) { std::ifstream ifs(fileName); if (!ifs.is_open()) { throw std::runtime_error(std::string("Unable to open file ") + fileName); } std::string vertex_declaration; std::getline(ifs, vertex_declaration); mapVertexIndex(vertex_declaration); std::string v1, v2; while(ifs>>v1>>v2) { int vi1 = vertex_index[v1]; int vi2 = vertex_index[v2]; boost::add_edge(vi1, vi2, connection_graph); } } void displayInput() { std::cout<<"Beta: "<<beta<<std::endl; std::cout<<"Vertices: "; auto vertex_iters = boost::vertices(connection_graph); std::transform(vertex_iters.first, vertex_iters.second, std::ostream_iterator<std::string>( std::cout, " "), [](int index) { return index_vertex[index]; }); std::cout<<std::endl; std::cout<<"Edges: "<<std::endl; auto edge_iters = boost::edges(connection_graph); std::transform(edge_iters.first, edge_iters.second, std::ostream_iterator<std::string>( std::cout, "\n"), [](decltype(*edge_iters.first) edge) { return index_vertex[source(edge, connection_graph)] + " -> " + index_vertex[target(edge, connection_graph)]; }); } void generateMatrix() { M.resize(num_vertices, num_vertices); M.clear(); auto vertex_iters = boost::vertices(connection_graph); for(auto v0 = vertex_iters.first; v0 != vertex_iters.second; ++v0) { auto vertex = *v0; auto degree = boost::out_degree(vertex, connection_graph); if (degree == 0) { return; } double degree_weight = 1.0 / degree; auto adjacent_vertices_iter = boost::adjacent_vertices( vertex, connection_graph); for(auto v1 = adjacent_vertices_iter.first; v1 != adjacent_vertices_iter.second; ++v1) { auto target_vertex = *v1; M(target_vertex, vertex) = degree_weight; } } } void initialGuess() { v.resize(num_vertices); v1.resize(num_vertices); vtax.resize(num_vertices); std::fill(v.begin(), v.end(), 1.0 / num_vertices); std::fill(vtax.begin(), vtax.end(), (1 - beta) / num_vertices); } inline bool checkConvergence() { return boost::numeric::ublas::norm_1(v - v1) < CONVERGENCE_CRITERIA; } void solve() { int n_iter = 0; do { v1 = beta * boost::numeric::ublas::prod(M, v) + vtax; v.swap(v1); std::cout<<"Iteration "<<(++n_iter)<<": "<<v<<std::endl; } while(!checkConvergence()); } void displayPageRank() { auto vertex_iters = boost::vertices(connection_graph); auto index = 0; for(auto vi = vertex_iters.first; vi != vertex_iters.second; ++vi, ++index) { auto vertex_index = *vi; std::cout<<index_vertex[vertex_index]<<" = "<<v[index]<<std::endl; } } int main(int argc, char* argv[]) { if (argc != 3) { std::cerr<<"pagerank [beta] [graph file]"<<std::endl; return -1; } getBeta(argv[1]); loadGraph(argv[2]); displayInput(); generateMatrix(); initialGuess(); solve(); displayPageRank(); return 0; }
其中beta为tax的权重,graph file的结构,例如问题3里面的图:
a b c a b a c b c c a
第一行为所有的顶点名,第二行起为顶点->顶点的有向边。
运行例子:
~/P/m/pagerank> ./a.out 0.8 q3 Beta: 0.8 Vertices: a b c Edges: a -> b a -> c b -> c c -> a Iteration 1: [3](0.333333,0.2,0.466667) Iteration 2: [3](0.44,0.2,0.36) Iteration 3: [3](0.354667,0.242667,0.402667) Iteration 4: [3](0.3888,0.208533,0.402667) Iteration 5: [3](0.3888,0.222187,0.389013) Iteration 6: [3](0.377877,0.222187,0.399936) Iteration 7: [3](0.386615,0.217818,0.395567) Iteration 8: [3](0.38312,0.221313,0.395567) Iteration 9: [3](0.38312,0.219915,0.396965) Iteration 10: [3](0.384239,0.219915,0.395847) Iteration 11: [3](0.383344,0.220362,0.396294) Iteration 12: [3](0.383702,0.220004,0.396294) Iteration 13: [3](0.383702,0.220147,0.396151) Iteration 14: [3](0.383587,0.220147,0.396265) Iteration 15: [3](0.383679,0.220102,0.396219) Iteration 16: [3](0.383642,0.220138,0.396219) Iteration 17: [3](0.383642,0.220124,0.396234) Iteration 18: [3](0.383654,0.220124,0.396222) Iteration 19: [3](0.383645,0.220128,0.396227) Iteration 20: [3](0.383648,0.220125,0.396227) Iteration 21: [3](0.383648,0.220126,0.396226) Iteration 22: [3](0.383647,0.220126,0.396227) Iteration 23: [3](0.383648,0.220126,0.396226) Iteration 24: [3](0.383648,0.220126,0.396226) a = 0.383648 b = 0.220126 c = 0.396226
当然求解矩阵方程的方法实在是太多了,迭代未必是最好的。