ioftools / networkxMiCe / networkxmaster / networkx / algorithms / link_analysis / pagerank_alg.py @ 5cef0f13
History  View  Annotate  Download (16.8 KB)
1 
"""PageRank analysis of graph structure. """


2 
# Copyright (C) 2004-2019 by

3 
# Aric Hagberg <hagberg@lanl.gov>

4 
# Dan Schult <dschult@colgate.edu>

5 
# Pieter Swart <swart@lanl.gov>

6 
# All rights reserved.

7 
# BSD license.

8 
# NetworkX:http://networkx.github.io/

9 
import networkx as nx 
10 
from networkx.utils import not_implemented_for 
11 
__author__ = """\n""".join(["Aric Hagberg <aric.hagberg@gmail.com>", 
12 
"Brandon Liu <brandon.k.liu@gmail.com"])

13 
__all__ = ['pagerank', 'pagerank_numpy', 'pagerank_scipy', 'google_matrix'] 
14  
15  
16 
@not_implemented_for('multigraph')
def pagerank(G, alpha=0.85, personalization=None,
             max_iter=100, tol=1.0e-6, nstart=None, weight='weight',
             dangling=None):
    """Returns the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
      A NetworkX graph. Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float, optional
      Damping parameter for PageRank, default=0.85.

    personalization: dict, optional
      The "personalization vector" consisting of a dictionary with a
      key some subset of graph nodes and personalization value each of those.
      At least one personalization value must be non-zero.
      If not specified, a node's personalization value will be zero.
      By default, a uniform distribution is used.

    max_iter : integer, optional
      Maximum number of iterations in power method eigenvalue solver.

    tol : float, optional
      Error tolerance used to check convergence in power method solver.

    nstart : dictionary, optional
      Starting value of PageRank iteration for each node.

    weight : key, optional
      Edge data key to use as weight. If None weights are set to 1.

    dangling: dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
      any outedges. The dict key is the node the outedge points to and the dict
      value is the weight of that outedge. By default, dangling nodes are given
      outedges according to the personalization vector (uniform if not
      specified). This must be selected to result in an irreducible transition
      matrix (see notes under google_matrix). It may be common to have the
      dangling dict to be the same as the personalization dict.

    Returns
    -------
    pagerank : dictionary
       Dictionary of nodes with PageRank as value

    Examples
    --------
    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> pr = nx.pagerank(G, alpha=0.9)

    Notes
    -----
    The eigenvector calculation is done by the power iteration method
    and has no guarantee of convergence. The iteration will stop after
    an error tolerance of ``len(G) * tol`` has been reached. If the
    number of iterations exceed `max_iter`, a
    :exc:`networkx.exception.PowerIterationFailedConvergence` exception
    is raised.

    The PageRank algorithm was designed for directed graphs but this
    algorithm does not check if the input graph is directed and will
    execute on undirected graphs by converting each edge in the
    directed graph to two edges.

    See Also
    --------
    pagerank_numpy, pagerank_scipy, google_matrix

    Raises
    ------
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within the specified number of iterations of the power iteration
        method.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
       The PageRank citation ranking: Bringing order to the Web. 1999
       http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
    """
    if len(G) == 0:
        return {}

    if not G.is_directed():
        D = G.to_directed()
    else:
        D = G

    # Create a copy in (right) stochastic form
    W = nx.stochastic_graph(D, weight=weight)
    N = W.number_of_nodes()

    # Choose fixed starting vector if not given
    if nstart is None:
        x = dict.fromkeys(W, 1.0 / N)
    else:
        # Normalized nstart vector
        s = float(sum(nstart.values()))
        x = dict((k, v / s) for k, v in nstart.items())

    if personalization is None:
        # Assign uniform personalization vector if not given
        p = dict.fromkeys(W, 1.0 / N)
    else:
        s = float(sum(personalization.values()))
        p = dict((k, v / s) for k, v in personalization.items())

    if dangling is None:
        # Use personalization vector if dangling vector not specified
        dangling_weights = p
    else:
        s = float(sum(dangling.values()))
        dangling_weights = dict((k, v / s) for k, v in dangling.items())
    dangling_nodes = [n for n in W if W.out_degree(n, weight=weight) == 0.0]

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = dict.fromkeys(xlast.keys(), 0)
        # mass collected from dangling nodes is redistributed each sweep
        danglesum = alpha * sum(xlast[n] for n in dangling_nodes)
        for n in x:
            # this matrix multiply looks odd because it is
            # doing a left multiply x^T = xlast^T * W
            for nbr in W[n]:
                x[nbr] += alpha * xlast[n] * W[n][nbr][weight]
            x[n] += danglesum * dangling_weights.get(n, 0) + (1.0 - alpha) * p.get(n, 0)
        # check convergence, l1 norm
        err = sum([abs(x[n] - xlast[n]) for n in x])
        if err < N * tol:
            return x
    raise nx.PowerIterationFailedConvergence(max_iter)

159  
160  
161 
def google_matrix(G, alpha=0.85, personalization=None,
                  nodelist=None, weight='weight', dangling=None):
    """Returns the Google matrix of the graph.

    Parameters
    ----------
    G : graph
      A NetworkX graph. Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float
      The damping factor.

    personalization: dict, optional
      The "personalization vector" consisting of a dictionary with a
      key some subset of graph nodes and personalization value each of those.
      At least one personalization value must be non-zero.
      If not specified, a node's personalization value will be zero.
      By default, a uniform distribution is used.

    nodelist : list, optional
      The rows and columns are ordered according to the nodes in nodelist.
      If nodelist is None, then the ordering is produced by G.nodes().

    weight : key, optional
      Edge data key to use as weight. If None weights are set to 1.

    dangling: dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
      any outedges. The dict key is the node the outedge points to and the dict
      value is the weight of that outedge. By default, dangling nodes are given
      outedges according to the personalization vector (uniform if not
      specified) This must be selected to result in an irreducible transition
      matrix (see notes below). It may be common to have the dangling dict to
      be the same as the personalization dict.

    Returns
    -------
    A : NumPy matrix
       Google matrix of the graph

    Notes
    -----
    The matrix returned represents the transition matrix that describes the
    Markov chain used in PageRank. For PageRank to converge to a unique
    solution (i.e., a unique stationary distribution in a Markov chain), the
    transition matrix must be irreducible. In other words, it must be that
    there exists a path between every pair of nodes in the graph, or else there
    is the potential of "rank sinks."

    This implementation works with Multi(Di)Graphs. For multigraphs the
    weight between two nodes is set to be the sum of all edge weights
    between those nodes.

    See Also
    --------
    pagerank, pagerank_numpy, pagerank_scipy
    """
    import numpy as np

    if nodelist is None:
        nodelist = list(G)

    M = nx.to_numpy_matrix(G, nodelist=nodelist, weight=weight)
    N = len(G)
    if N == 0:
        # empty graph: return the (empty) adjacency matrix as-is
        return M

    # Personalization vector
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
        p /= p.sum()

    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist],
                                    dtype=float)
        dangling_weights /= dangling_weights.sum()
    dangling_nodes = np.where(M.sum(axis=1) == 0)[0]

    # Assign dangling_weights to any dangling nodes (nodes with no out links)
    for node in dangling_nodes:
        M[node] = dangling_weights

    M /= M.sum(axis=1)  # Normalize rows to sum to 1

    return alpha * M + (1 - alpha) * p
253  
254  
255 
def pagerank_numpy(G, alpha=0.85, personalization=None, weight='weight',
                   dangling=None):
    """Returns the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
      A NetworkX graph. Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float, optional
      Damping parameter for PageRank, default=0.85.

    personalization: dict, optional
      The "personalization vector" consisting of a dictionary with a
      key some subset of graph nodes and personalization value each of those.
      At least one personalization value must be non-zero.
      If not specified, a node's personalization value will be zero.
      By default, a uniform distribution is used.

    weight : key, optional
      Edge data key to use as weight. If None weights are set to 1.

    dangling: dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
      any outedges. The dict key is the node the outedge points to and the dict
      value is the weight of that outedge. By default, dangling nodes are given
      outedges according to the personalization vector (uniform if not
      specified) This must be selected to result in an irreducible transition
      matrix (see notes under google_matrix). It may be common to have the
      dangling dict to be the same as the personalization dict.

    Returns
    -------
    pagerank : dictionary
       Dictionary of nodes with PageRank as value.

    Examples
    --------
    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> pr = nx.pagerank_numpy(G, alpha=0.9)

    Notes
    -----
    The eigenvector calculation uses NumPy's interface to the LAPACK
    eigenvalue solvers. This will be the fastest and most accurate
    for small graphs.

    This implementation works with Multi(Di)Graphs. For multigraphs the
    weight between two nodes is set to be the sum of all edge weights
    between those nodes.

    See Also
    --------
    pagerank, pagerank_scipy, google_matrix

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
       The PageRank citation ranking: Bringing order to the Web. 1999
       http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
    """
    import numpy as np
    if len(G) == 0:
        return {}
    M = google_matrix(G, alpha, personalization=personalization,
                      weight=weight, dangling=dangling)
    # use numpy LAPACK solver; left eigenvector of M is a right
    # eigenvector of M.T, hence the transpose
    eigenvalues, eigenvectors = np.linalg.eig(M.T)
    ind = np.argmax(eigenvalues)
    # eigenvector of largest eigenvalue is at ind, normalized
    largest = np.array(eigenvectors[:, ind]).flatten().real
    norm = float(largest.sum())
    return dict(zip(G, map(float, largest / norm)))
336  
337  
338 
def pagerank_scipy(G, alpha=0.85, personalization=None,
                   max_iter=100, tol=1.0e-6, weight='weight',
                   dangling=None):
    """Returns the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
      A NetworkX graph. Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float, optional
      Damping parameter for PageRank, default=0.85.

    personalization: dict, optional
      The "personalization vector" consisting of a dictionary with a
      key some subset of graph nodes and personalization value each of those.
      At least one personalization value must be non-zero.
      If not specified, a node's personalization value will be zero.
      By default, a uniform distribution is used.

    max_iter : integer, optional
      Maximum number of iterations in power method eigenvalue solver.

    tol : float, optional
      Error tolerance used to check convergence in power method solver.

    weight : key, optional
      Edge data key to use as weight. If None weights are set to 1.

    dangling: dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
      any outedges. The dict key is the node the outedge points to and the dict
      value is the weight of that outedge. By default, dangling nodes are given
      outedges according to the personalization vector (uniform if not
      specified) This must be selected to result in an irreducible transition
      matrix (see notes under google_matrix). It may be common to have the
      dangling dict to be the same as the personalization dict.

    Returns
    -------
    pagerank : dictionary
       Dictionary of nodes with PageRank as value

    Examples
    --------
    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> pr = nx.pagerank_scipy(G, alpha=0.9)

    Notes
    -----
    The eigenvector calculation uses power iteration with a SciPy
    sparse matrix representation.

    This implementation works with Multi(Di)Graphs. For multigraphs the
    weight between two nodes is set to be the sum of all edge weights
    between those nodes.

    See Also
    --------
    pagerank, pagerank_numpy, google_matrix

    Raises
    ------
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within the specified number of iterations of the power iteration
        method.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
       The PageRank citation ranking: Bringing order to the Web. 1999
       http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
    """
    # NOTE: the scipy.* aliases for numpy functions (scipy.array, etc.)
    # are deprecated and removed in modern SciPy; use numpy directly.
    import numpy as np
    import scipy.sparse

    N = len(G)
    if N == 0:
        return {}

    nodelist = list(G)
    M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight,
                                  dtype=float)
    # Row-normalize M into a (right) stochastic matrix; rows of all
    # zeros (dangling nodes) are left as zeros and handled below.
    S = np.array(M.sum(axis=1)).flatten()
    S[S != 0] = 1.0 / S[S != 0]
    Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
    M = Q * M

    # initial vector
    x = np.repeat(1.0 / N, N)

    # Personalization vector
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
        p = p / p.sum()

    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist],
                                    dtype=float)
        dangling_weights /= dangling_weights.sum()
    is_dangling = np.where(S == 0)[0]

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + \
            (1 - alpha) * p
        # check convergence, l1 norm
        err = np.absolute(x - xlast).sum()
        if err < N * tol:
            return dict(zip(nodelist, map(float, x)))
    raise nx.PowerIterationFailedConvergence(max_iter)

464  
465  
466 
# fixture for nose tests

467 
# fixture for nose tests
def setup_module(module):
    """Skip this module's tests when NumPy or SciPy is unavailable.

    Narrow ``except ImportError`` (rather than a bare ``except:``) so that
    unrelated failures inside numpy/scipy import machinery are not
    silently converted into skips.
    """
    from nose import SkipTest
    try:
        import numpy
    except ImportError:
        raise SkipTest("NumPy not available")
    try:
        import scipy
    except ImportError:
        raise SkipTest("SciPy not available")