## package_rank.R
## cran各包的PackageRank计算过程,采用pagerank算法
## Licensed under the Apache License, Version 2.0
library(Matrix)
## 构造链接矩阵
load_data_pair <- function(){
data<-read.table('package_detail.txt', header=FALSE)
names(data) <- c('from', 'to', 'type')
fid <- unique(data$from)
tid <- unique(data$to)
uid <- sort(union(fid, tid))
type_weight <- list(c('depend', 'import', 'suggest', 'enhance'), c(1,1,0.5,0.5))
A <- Matrix(data=0, nrow=length(uid), ncol=length(uid))
for (type in type_weight[[1]]){
tdata <- data[data$type==type, 1:2]
fidx <- match(tdata[[1]], uid, nomatch=0)
tidx <- match(tdata[[2]], uid, nomatch=0)
idx <- matrix(c(fidx, tidx), ncol=2)
A[idx] <- type_weight[[2]][match(type, type_weight[[1]])]
}
return(list(A, uid))
}
## rank值调整到0-10
adjust_rank <- function(R, lt=0, ht=10){
MIN_NUM <- 0.0000001
if(min(R) == 0) R <- R+MIN_NUM
R <- log(R)
min_r <- min(R)
max_r <- max(R)
R <- (ht-lt)*(R - min_r)/(max_r-min_r) + lt
return(R)
}
## pagerank计算函数
pagerank <- function(A){
N <- nrow(A)
K <- Diagonal(x=1/rowSums(A))
M <- t(K %*% A) ## 用这种方式来替代矩阵除以向量的操作
d <- 0.85
C <- matrix((1-d)/N, nrow=N, ncol=1)
R0 <- matrix(1/N, nrow=N, ncol=1)
EPS <- 0.0001
MAXITER <- 100
for(i in 1:MAXITER){
R <- d * M %*% R0 + C
e <- sum(abs(R-R0))
cat(paste('iteration', i, 'error', e), '\n')
if(e < EPS) break
R0 <- R
}
R <- as.vector(R)
}
## 计算并输出Package Rank
ranking <- function(){
ret <- load_data_pair()
A <- ret[[1]]
uid <- ret[[2]]
pr <- pagerank(A)
pr <- adjust_rank(pr)
ix <- sort(pr, index.return=TRUE, decreasing=TRUE)$ix
out <- paste(seq(1, length(uid)), uid[ix], sprintf('%.4f',pr[ix]), sep=',')
write.table(out, file='package_rank.txt', quote=FALSE, col.names=FALSE, row.names=FALSE)
}
没有评论:
发表评论