现在的位置: 首页 > 综合 > 正文

PageRank的模拟计算

2018年03月31日 ⁄ 综合 ⁄ 共 4706字 ⁄ 字号 评论关闭

源于Stanford在Coursera上的Mining Massive Datasets课程。做Week 1里的作业中要求做一些和pagerank有关的计算。

#include <iostream>
#include <iterator>
#include <fstream>
#include <memory>
#include <stdexcept>
#include <string>
#include <sstream>
#include <vector>

#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/matrix.hpp>

#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/graph_traits.hpp>

const double CONVERGENCE_CRITERIA = 1e-6;

double beta;
int num_vertices;
std::map<std::string, int> vertex_index;
std::vector<std::string> index_vertex;

using Graph = boost::adjacency_list<boost::vecS, boost::vecS, boost::directedS>;
Graph connection_graph;

boost::numeric::ublas::matrix<double> M;
boost::numeric::ublas::vector<double> v, v1, vtax;

void getBeta(const std::string& betaArg) {
    std::stringstream ss(betaArg);
    ss>>beta;
}

void mapVertexIndex(const std::string& vertices_line) {
    std::stringstream ss(vertices_line);
    std::string vertex;
    int index = 0;
    while(ss>>vertex) {
        vertex_index[vertex] = index++;
        index_vertex.push_back(vertex);

        boost::add_vertex(connection_graph);
    }

    num_vertices = index_vertex.size();
}

void loadGraph(const std::string& fileName) {
    std::ifstream ifs(fileName);
    if (!ifs.is_open()) {
        throw std::runtime_error(std::string("Unable to open file ") + fileName);
    }

    std::string vertex_declaration;
    std::getline(ifs, vertex_declaration);
    mapVertexIndex(vertex_declaration);

    std::string v1, v2;
    while(ifs>>v1>>v2) {
        int vi1 = vertex_index[v1];
        int vi2 = vertex_index[v2];

        boost::add_edge(vi1, vi2, connection_graph);
    }
}

void displayInput() {
    std::cout<<"Beta: "<<beta<<std::endl;

    std::cout<<"Vertices: ";
    auto vertex_iters = boost::vertices(connection_graph);
    std::transform(vertex_iters.first, vertex_iters.second,
        std::ostream_iterator<std::string>(
            std::cout, " "),
        [](int index) { return index_vertex[index]; });
    std::cout<<std::endl;

    std::cout<<"Edges: "<<std::endl;
    auto edge_iters = boost::edges(connection_graph);
    std::transform(edge_iters.first, edge_iters.second,
        std::ostream_iterator<std::string>(
            std::cout, "\n"),
        [](decltype(*edge_iters.first) edge) {
            return index_vertex[source(edge, connection_graph)] + " -> " +
                index_vertex[target(edge, connection_graph)];
            });
}

void generateMatrix() {
    M.resize(num_vertices, num_vertices);
    M.clear();

    auto vertex_iters = boost::vertices(connection_graph);
    for(auto v0 = vertex_iters.first; v0 != vertex_iters.second; ++v0) {
        auto vertex = *v0;
        auto degree = boost::out_degree(vertex, connection_graph);
        if (degree == 0) {
            return;
        }

        double degree_weight = 1.0 / degree;
        auto adjacent_vertices_iter = boost::adjacent_vertices(
            vertex, connection_graph);

        for(auto v1 = adjacent_vertices_iter.first;
                v1 != adjacent_vertices_iter.second; ++v1) {
            auto target_vertex = *v1;
            M(target_vertex, vertex) = degree_weight;
        }
    }
}

void initialGuess() {
    v.resize(num_vertices);
    v1.resize(num_vertices);
    vtax.resize(num_vertices);

    std::fill(v.begin(), v.end(), 1.0 / num_vertices);
    std::fill(vtax.begin(), vtax.end(), (1 - beta) / num_vertices);
}

inline bool checkConvergence() {
    return boost::numeric::ublas::norm_1(v - v1) < CONVERGENCE_CRITERIA;
}

void solve() {
    int n_iter = 0;
    do {
        v1 = beta * boost::numeric::ublas::prod(M, v) + vtax;
        v.swap(v1);

        std::cout<<"Iteration "<<(++n_iter)<<": "<<v<<std::endl;
    } while(!checkConvergence());
}

void displayPageRank() {
    auto vertex_iters = boost::vertices(connection_graph);
    auto index = 0;
    for(auto vi = vertex_iters.first; vi != vertex_iters.second;
            ++vi, ++index) {
        auto vertex_index = *vi;
        std::cout<<index_vertex[vertex_index]<<"  =  "<<v[index]<<std::endl;
    }
}

int main(int argc, char* argv[]) {
    if (argc != 3) {
        std::cerr<<"pagerank [beta] [graph file]"<<std::endl;
        return -1;
    }

    getBeta(argv[1]);
    loadGraph(argv[2]);

    displayInput();

    generateMatrix();
    initialGuess();

    solve();

    displayPageRank();
    
    return 0;
}

其中beta为tax的权重,graph file的结构,例如问题3里面的图:

a b c
a b
a c
b c
c a

第一行为所有的顶点名,第二行起为顶点->顶点的有向边。

运行例子:

~/P/m/pagerank> ./a.out 0.8 q3
Beta: 0.8
Vertices: a b c 
Edges: 
a -> b
a -> c
b -> c
c -> a
Iteration 1: [3](0.333333,0.2,0.466667)
Iteration 2: [3](0.44,0.2,0.36)
Iteration 3: [3](0.354667,0.242667,0.402667)
Iteration 4: [3](0.3888,0.208533,0.402667)
Iteration 5: [3](0.3888,0.222187,0.389013)
Iteration 6: [3](0.377877,0.222187,0.399936)
Iteration 7: [3](0.386615,0.217818,0.395567)
Iteration 8: [3](0.38312,0.221313,0.395567)
Iteration 9: [3](0.38312,0.219915,0.396965)
Iteration 10: [3](0.384239,0.219915,0.395847)
Iteration 11: [3](0.383344,0.220362,0.396294)
Iteration 12: [3](0.383702,0.220004,0.396294)
Iteration 13: [3](0.383702,0.220147,0.396151)
Iteration 14: [3](0.383587,0.220147,0.396265)
Iteration 15: [3](0.383679,0.220102,0.396219)
Iteration 16: [3](0.383642,0.220138,0.396219)
Iteration 17: [3](0.383642,0.220124,0.396234)
Iteration 18: [3](0.383654,0.220124,0.396222)
Iteration 19: [3](0.383645,0.220128,0.396227)
Iteration 20: [3](0.383648,0.220125,0.396227)
Iteration 21: [3](0.383648,0.220126,0.396226)
Iteration 22: [3](0.383647,0.220126,0.396227)
Iteration 23: [3](0.383648,0.220126,0.396226)
Iteration 24: [3](0.383648,0.220126,0.396226)
a  =  0.383648
b  =  0.220126
c  =  0.396226

当然求解矩阵方程的方法实在是太多了,迭代未必是最好的。

抱歉!评论已关闭.