Statistics
| Branch: | Revision:

iof-tools / networkxMiCe / networkx-master / networkx / algorithms / link_analysis / pagerank_alg.py @ 5cef0f13

History | View | Annotate | Download (16.8 KB)

1
"""PageRank analysis of graph structure. """
2
#    Copyright (C) 2004-2019 by
3
#    Aric Hagberg <hagberg@lanl.gov>
4
#    Dan Schult <dschult@colgate.edu>
5
#    Pieter Swart <swart@lanl.gov>
6
#    All rights reserved.
7
#    BSD license.
8
#    NetworkX:http://networkx.github.io/
9
import networkx as nx
10
from networkx.utils import not_implemented_for
11
__author__ = """\n""".join(["Aric Hagberg <aric.hagberg@gmail.com>",
12
                            "Brandon Liu <brandon.k.liu@gmail.com"])
13
__all__ = ['pagerank', 'pagerank_numpy', 'pagerank_scipy', 'google_matrix']
14

    
15

    
16
@not_implemented_for('multigraph')
def pagerank(G, alpha=0.85, personalization=None,
             max_iter=100, tol=1.0e-6, nstart=None, weight='weight',
             dangling=None):
    """Returns the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
      A NetworkX graph.  Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float, optional
      Damping parameter for PageRank, default=0.85.

    personalization: dict, optional
      The "personalization vector" consisting of a dictionary with a
      key some subset of graph nodes and personalization value each of those.
      At least one personalization value must be non-zero.
      If not specified, a node's personalization value will be zero.
      By default, a uniform distribution is used.

    max_iter : integer, optional
      Maximum number of iterations in power method eigenvalue solver.

    tol : float, optional
      Error tolerance used to check convergence in power method solver.

    nstart : dictionary, optional
      Starting value of PageRank iteration for each node.

    weight : key, optional
      Edge data key to use as weight.  If None weights are set to 1.

    dangling: dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
      any outedges. The dict key is the node the outedge points to and the dict
      value is the weight of that outedge. By default, dangling nodes are given
      outedges according to the personalization vector (uniform if not
      specified). This must be selected to result in an irreducible transition
      matrix (see notes under google_matrix). It may be common to have the
      dangling dict to be the same as the personalization dict.

    Returns
    -------
    pagerank : dictionary
       Dictionary of nodes with PageRank as value

    Examples
    --------
    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> pr = nx.pagerank(G, alpha=0.9)

    Notes
    -----
    The eigenvector calculation is done by the power iteration method
    and has no guarantee of convergence.  The iteration will stop after
    an error tolerance of ``len(G) * tol`` has been reached. If the
    number of iterations exceed `max_iter`, a
    :exc:`networkx.exception.PowerIterationFailedConvergence` exception
    is raised.

    The PageRank algorithm was designed for directed graphs but this
    algorithm does not check if the input graph is directed and will
    execute on undirected graphs by converting each edge in the
    directed graph to two edges.

    See Also
    --------
    pagerank_numpy, pagerank_scipy, google_matrix

    Raises
    ------
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within the specified number of iterations of the power iteration
        method.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
       The PageRank citation ranking: Bringing order to the Web. 1999
       http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
    """
    if len(G) == 0:
        return {}

    # Work on a directed view; each undirected edge becomes two arcs.
    D = G if G.is_directed() else G.to_directed()

    # Right-stochastic copy: every node's out-edge weights sum to 1.
    W = nx.stochastic_graph(D, weight=weight)
    N = W.number_of_nodes()

    # Starting vector: uniform unless the caller supplied one (which is
    # normalized here so it sums to 1).
    if nstart is None:
        x = dict.fromkeys(W, 1.0 / N)
    else:
        total = float(sum(nstart.values()))
        x = {node: value / total for node, value in nstart.items()}

    # Personalization vector, normalized; uniform by default.
    if personalization is None:
        p = dict.fromkeys(W, 1.0 / N)
    else:
        total = float(sum(personalization.values()))
        p = {node: value / total for node, value in personalization.items()}

    # Redistribution weights for dangling nodes; the personalization
    # vector is used when no explicit dangling dict is given.
    if dangling is None:
        dangling_weights = p
    else:
        total = float(sum(dangling.values()))
        dangling_weights = {node: value / total
                            for node, value in dangling.items()}
    dangling_nodes = [node for node in W
                      if W.out_degree(node, weight=weight) == 0.0]

    # Power iteration: up to max_iter left-multiplications by the
    # (implicit) Google matrix.
    for _ in range(max_iter):
        xlast = x
        x = dict.fromkeys(xlast.keys(), 0)
        danglesum = alpha * sum(xlast[node] for node in dangling_nodes)
        for node in x:
            # Left multiply x^T = xlast^T * W: push this node's mass
            # along each of its out-edges.
            for nbr in W[node]:
                x[nbr] += alpha * xlast[node] * W[node][nbr][weight]
            x[node] += danglesum * dangling_weights.get(node, 0) + (1.0 - alpha) * p.get(node, 0)
        # Convergence check on the l1 norm of the change.
        err = sum(abs(x[node] - xlast[node]) for node in x)
        if err < N * tol:
            return x
    raise nx.PowerIterationFailedConvergence(max_iter)
159

    
160

    
161
def google_matrix(G, alpha=0.85, personalization=None,
                  nodelist=None, weight='weight', dangling=None):
    """Returns the Google matrix of the graph.

    Parameters
    ----------
    G : graph
      A NetworkX graph.  Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float
      The damping factor.

    personalization: dict, optional
      The "personalization vector" consisting of a dictionary with a
      key some subset of graph nodes and personalization value each of those.
      At least one personalization value must be non-zero.
      If not specified, a node's personalization value will be zero.
      By default, a uniform distribution is used.

    nodelist : list, optional
      The rows and columns are ordered according to the nodes in nodelist.
      If nodelist is None, then the ordering is produced by G.nodes().

    weight : key, optional
      Edge data key to use as weight.  If None weights are set to 1.

    dangling: dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
      any outedges. The dict key is the node the outedge points to and the dict
      value is the weight of that outedge. By default, dangling nodes are given
      outedges according to the personalization vector (uniform if not
      specified) This must be selected to result in an irreducible transition
      matrix (see notes below). It may be common to have the dangling dict to
      be the same as the personalization dict.

    Returns
    -------
    A : NumPy matrix
       Google matrix of the graph

    Notes
    -----
    The matrix returned represents the transition matrix that describes the
    Markov chain used in PageRank. For PageRank to converge to a unique
    solution (i.e., a unique stationary distribution in a Markov chain), the
    transition matrix must be irreducible. In other words, it must be that
    there exists a path between every pair of nodes in the graph, or else there
    is the potential of "rank sinks."

    This implementation works with Multi(Di)Graphs. For multigraphs the
    weight between two nodes is set to be the sum of all edge weights
    between those nodes.

    See Also
    --------
    pagerank, pagerank_numpy, pagerank_scipy
    """
    import numpy as np

    if nodelist is None:
        nodelist = list(G)

    A = nx.to_numpy_matrix(G, nodelist=nodelist, weight=weight)
    N = len(G)
    if N == 0:
        # Empty graph: nothing to normalize, return the empty matrix.
        return A

    # Personalization vector, normalized; uniform unless supplied.
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        p = np.array([personalization.get(n, 0) for n in nodelist],
                     dtype=float)
        p /= p.sum()

    # Weights that replace the rows of dangling (no out-edge) nodes;
    # fall back to the personalization vector when not given.
    if dangling is None:
        dangling_weights = p
    else:
        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist],
                                    dtype=float)
        dangling_weights /= dangling_weights.sum()

    # Rows that sum to zero belong to dangling nodes; overwrite each
    # with the dangling distribution so the chain has no dead ends.
    for row in np.where(A.sum(axis=1) == 0)[0]:
        A[row] = dangling_weights

    A /= A.sum(axis=1)  # make the matrix row-stochastic

    return alpha * A + (1 - alpha) * p
253

    
254

    
255
def pagerank_numpy(G, alpha=0.85, personalization=None, weight='weight',
                   dangling=None):
    """Returns the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
      A NetworkX graph.  Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float, optional
      Damping parameter for PageRank, default=0.85.

    personalization: dict, optional
      The "personalization vector" consisting of a dictionary with a
      key some subset of graph nodes and personalization value each of those.
      At least one personalization value must be non-zero.
      If not specified, a node's personalization value will be zero.
      By default, a uniform distribution is used.

    weight : key, optional
      Edge data key to use as weight.  If None weights are set to 1.

    dangling: dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
      any outedges. The dict key is the node the outedge points to and the dict
      value is the weight of that outedge. By default, dangling nodes are given
      outedges according to the personalization vector (uniform if not
      specified) This must be selected to result in an irreducible transition
      matrix (see notes under google_matrix). It may be common to have the
      dangling dict to be the same as the personalization dict.

    Returns
    -------
    pagerank : dictionary
       Dictionary of nodes with PageRank as value.

    Examples
    --------
    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> pr = nx.pagerank_numpy(G, alpha=0.9)

    Notes
    -----
    The eigenvector calculation uses NumPy's interface to the LAPACK
    eigenvalue solvers.  This will be the fastest and most accurate
    for small graphs.

    This implementation works with Multi(Di)Graphs. For multigraphs the
    weight between two nodes is set to be the sum of all edge weights
    between those nodes.

    See Also
    --------
    pagerank, pagerank_scipy, google_matrix

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
       The PageRank citation ranking: Bringing order to the Web. 1999
       http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
    """
    import numpy as np
    if len(G) == 0:
        return {}
    google = google_matrix(G, alpha, personalization=personalization,
                           weight=weight, dangling=dangling)
    # The PageRank vector is the left principal eigenvector of the
    # Google matrix, i.e. the right eigenvector of its transpose;
    # solve with NumPy's dense LAPACK interface.
    eigenvalues, eigenvectors = np.linalg.eig(google.T)
    dominant = np.argmax(eigenvalues)
    # Take the real part of the dominant eigenvector and normalize it
    # so the scores sum to 1.
    principal = np.array(eigenvectors[:, dominant]).flatten().real
    scale = float(principal.sum())
    return dict(zip(G, map(float, principal / scale)))
336

    
337

    
338
def pagerank_scipy(G, alpha=0.85, personalization=None,
                   max_iter=100, tol=1.0e-6, weight='weight',
                   dangling=None):
    """Returns the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
      A NetworkX graph.  Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float, optional
      Damping parameter for PageRank, default=0.85.

    personalization: dict, optional
      The "personalization vector" consisting of a dictionary with a
      key some subset of graph nodes and personalization value each of those.
      At least one personalization value must be non-zero.
      If not specified, a node's personalization value will be zero.
      By default, a uniform distribution is used.

    max_iter : integer, optional
      Maximum number of iterations in power method eigenvalue solver.

    tol : float, optional
      Error tolerance used to check convergence in power method solver.

    weight : key, optional
      Edge data key to use as weight.  If None weights are set to 1.

    dangling: dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
      any outedges. The dict key is the node the outedge points to and the dict
      value is the weight of that outedge. By default, dangling nodes are given
      outedges according to the personalization vector (uniform if not
      specified) This must be selected to result in an irreducible transition
      matrix (see notes under google_matrix). It may be common to have the
      dangling dict to be the same as the personalization dict.

    Returns
    -------
    pagerank : dictionary
       Dictionary of nodes with PageRank as value

    Examples
    --------
    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> pr = nx.pagerank_scipy(G, alpha=0.9)

    Notes
    -----
    The eigenvector calculation uses power iteration with a SciPy
    sparse matrix representation.

    This implementation works with Multi(Di)Graphs. For multigraphs the
    weight between two nodes is set to be the sum of all edge weights
    between those nodes.

    See Also
    --------
    pagerank, pagerank_numpy, google_matrix

    Raises
    ------
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within the specified number of iterations of the power iteration
        method.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
       The PageRank citation ranking: Bringing order to the Web. 1999
       http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
    """
    # NOTE: the array helpers (array, repeat, where, absolute) are used
    # from NumPy directly.  The old ``scipy.array`` etc. were deprecated
    # aliases that have been removed from the SciPy namespace, so relying
    # on them breaks under any modern SciPy release.
    import numpy as np
    import scipy.sparse

    N = len(G)
    if N == 0:
        return {}

    nodelist = list(G)
    M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight,
                                  dtype=float)
    # Row sums; invert the non-zero ones to build the row-normalizing
    # diagonal matrix Q (dangling rows keep a zero and are handled below).
    S = np.array(M.sum(axis=1)).flatten()
    S[S != 0] = 1.0 / S[S != 0]
    Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
    M = Q * M

    # Initial vector: uniform distribution over the nodes.
    x = np.repeat(1.0 / N, N)

    # Personalization vector, normalized; uniform by default.
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        p = np.array([personalization.get(n, 0) for n in nodelist],
                     dtype=float)
        p = p / p.sum()

    # Dangling-node weights; default to the personalization vector.
    if dangling is None:
        dangling_weights = p
    else:
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist],
                                    dtype=float)
        dangling_weights /= dangling_weights.sum()
    is_dangling = np.where(S == 0)[0]

    # Power iteration: make up to max_iter iterations.
    for _ in range(max_iter):
        xlast = x
        x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + \
            (1 - alpha) * p
        # Check convergence via the l1 norm of the change.
        err = np.absolute(x - xlast).sum()
        if err < N * tol:
            return dict(zip(nodelist, map(float, x)))
    raise nx.PowerIterationFailedConvergence(max_iter)
464

    
465

    
466
# fixture for nose tests
467
def setup_module(module):
    """Nose fixture: skip this module's tests when NumPy or SciPy is missing.

    The original version used bare ``except:`` clauses, which would also
    swallow ``KeyboardInterrupt``/``SystemExit`` and mask unrelated errors
    raised while importing the packages; catch ``ImportError`` only.
    """
    from nose import SkipTest
    try:
        import numpy
    except ImportError:
        raise SkipTest("NumPy not available")
    try:
        import scipy
    except ImportError:
        raise SkipTest("SciPy not available")