Statistics
| Branch: | Revision:

iof-tools / networkxMiCe / networkx-master / networkx / algorithms / similarity.py @ 5cef0f13

History | View | Annotate | Download (47.5 KB)

1
# -*- coding: utf-8 -*-
2
#    Copyright (C) 2010 by
3
#    Aric Hagberg <hagberg@lanl.gov>
4
#    Dan Schult <dschult@colgate.edu>
5
#    Pieter Swart <swart@lanl.gov>
6
#    All rights reserved.
7
#    BSD license.
8
#
9
# Author:  Andrey Paramonov <paramon@acdlabs.ru>
10
""" Functions measuring similarity using graph edit distance.
11

12
The graph edit distance is the number of edge/node changes needed
13
to make two graphs isomorphic.
14

15
The default algorithm/implementation is sub-optimal for some graphs.
16
The problem of finding the exact Graph Edit Distance (GED) is NP-hard
17
so it is often slow. If the simple interface `graph_edit_distance`
18
takes too long for your graph, try `optimize_graph_edit_distance`
19
and/or `optimize_edit_paths`.
20

21
At the same time, I encourage capable people to investigate
22
alternative GED algorithms, in order to improve the choices available.
23
"""
24
from __future__ import print_function, division
25
from itertools import product
26
import math
27
import networkx as nx
28
from operator import *
29
import sys
30

    
31
__author__ = 'Andrey Paramonov <paramon@acdlabs.ru>'
32

    
33
__all__ = [
34
    'graph_edit_distance',
35
    'optimal_edit_paths',
36
    'optimize_graph_edit_distance',
37
    'optimize_edit_paths',
38
    'simrank_similarity',
39
    'simrank_similarity_numpy',
40
]
41

    
42

    
43
def debug_print(*args, **kwargs):
44
    print(*args, **kwargs)
45

    
46

    
47
def graph_edit_distance(G1, G2, node_match=None, edge_match=None,
48
                        node_subst_cost=None, node_del_cost=None,
49
                        node_ins_cost=None,
50
                        edge_subst_cost=None, edge_del_cost=None,
51
                        edge_ins_cost=None,
52
                        upper_bound=None):
53
    """Returns GED (graph edit distance) between graphs G1 and G2.
54

55
    Graph edit distance is a graph similarity measure analogous to
56
    Levenshtein distance for strings.  It is defined as minimum cost
57
    of edit path (sequence of node and edge edit operations)
58
    transforming graph G1 to graph isomorphic to G2.
59

60
    Parameters
61
    ----------
62
    G1, G2: graphs
63
        The two graphs G1 and G2 must be of the same type.
64

65
    node_match : callable
66
        A function that returns True if node n1 in G1 and n2 in G2
67
        should be considered equal during matching.
68

69
        The function will be called like
70

71
           node_match(G1.nodes[n1], G2.nodes[n2]).
72

73
        That is, the function will receive the node attribute
74
        dictionaries for n1 and n2 as inputs.
75

76
        Ignored if node_subst_cost is specified.  If neither
77
        node_match nor node_subst_cost are specified then node
78
        attributes are not considered.
79

80
    edge_match : callable
81
        A function that returns True if the edge attribute dictionaries
82
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
83
        be considered equal during matching.
84

85
        The function will be called like
86

87
           edge_match(G1[u1][v1], G2[u2][v2]).
88

89
        That is, the function will receive the edge attribute
90
        dictionaries of the edges under consideration.
91

92
        Ignored if edge_subst_cost is specified.  If neither
93
        edge_match nor edge_subst_cost are specified then edge
94
        attributes are not considered.
95

96
    node_subst_cost, node_del_cost, node_ins_cost : callable
97
        Functions that return the costs of node substitution, node
98
        deletion, and node insertion, respectively.
99

100
        The functions will be called like
101

102
           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
103
           node_del_cost(G1.nodes[n1]),
104
           node_ins_cost(G2.nodes[n2]).
105

106
        That is, the functions will receive the node attribute
107
        dictionaries as inputs.  The functions are expected to return
108
        positive numeric values.
109

110
        Function node_subst_cost overrides node_match if specified.
111
        If neither node_match nor node_subst_cost are specified then
112
        default node substitution cost of 0 is used (node attributes
113
        are not considered during matching).
114

115
        If node_del_cost is not specified then default node deletion
116
        cost of 1 is used.  If node_ins_cost is not specified then
117
        default node insertion cost of 1 is used.
118

119
    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
120
        Functions that return the costs of edge substitution, edge
121
        deletion, and edge insertion, respectively.
122

123
        The functions will be called like
124

125
           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
126
           edge_del_cost(G1[u1][v1]),
127
           edge_ins_cost(G2[u2][v2]).
128

129
        That is, the functions will receive the edge attribute
130
        dictionaries as inputs.  The functions are expected to return
131
        positive numeric values.
132

133
        Function edge_subst_cost overrides edge_match if specified.
134
        If neither edge_match nor edge_subst_cost are specified then
135
        default edge substitution cost of 0 is used (edge attributes
136
        are not considered during matching).
137

138
        If edge_del_cost is not specified then default edge deletion
139
        cost of 1 is used.  If edge_ins_cost is not specified then
140
        default edge insertion cost of 1 is used.
141

142
    upper_bound : numeric
143
        Maximum edit distance to consider.  Return None if no edit
144
        distance under or equal to upper_bound exists.
145

146
    Examples
147
    --------
148
    >>> G1 = nx.cycle_graph(6)
149
    >>> G2 = nx.wheel_graph(7)
150
    >>> nx.graph_edit_distance(G1, G2)
151
    7.0
152

153
    See Also
154
    --------
155
    optimal_edit_paths, optimize_graph_edit_distance,
156

157
    is_isomorphic (test for graph edit distance of 0)
158

159
    References
160
    ----------
161
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
162
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
163
       Pattern Recognition Problems. 4th International Conference on
164
       Pattern Recognition Applications and Methods 2015, Jan 2015,
165
       Lisbon, Portugal. 2015,
166
       <10.5220/0005209202710278>. <hal-01168816>
167
       https://hal.archives-ouvertes.fr/hal-01168816
168

169
    """
170
    bestcost = None
171
    for vertex_path, edge_path, cost in \
172
        optimize_edit_paths(G1, G2, node_match, edge_match,
173
                            node_subst_cost, node_del_cost, node_ins_cost,
174
                            edge_subst_cost, edge_del_cost, edge_ins_cost,
175
                            upper_bound, True):
176
        #assert bestcost is None or cost < bestcost
177
        bestcost = cost
178
    return bestcost
179

    
180

    
181
def optimal_edit_paths(G1, G2, node_match=None, edge_match=None,
182
                       node_subst_cost=None, node_del_cost=None,
183
                       node_ins_cost=None,
184
                       edge_subst_cost=None, edge_del_cost=None,
185
                       edge_ins_cost=None,
186
                       upper_bound=None):
187
    """Returns all minimum-cost edit paths transforming G1 to G2.
188

189
    Graph edit path is a sequence of node and edge edit operations
190
    transforming graph G1 to graph isomorphic to G2.  Edit operations
191
    include substitutions, deletions, and insertions.
192

193
    Parameters
194
    ----------
195
    G1, G2: graphs
196
        The two graphs G1 and G2 must be of the same type.
197

198
    node_match : callable
199
        A function that returns True if node n1 in G1 and n2 in G2
200
        should be considered equal during matching.
201

202
        The function will be called like
203

204
           node_match(G1.nodes[n1], G2.nodes[n2]).
205

206
        That is, the function will receive the node attribute
207
        dictionaries for n1 and n2 as inputs.
208

209
        Ignored if node_subst_cost is specified.  If neither
210
        node_match nor node_subst_cost are specified then node
211
        attributes are not considered.
212

213
    edge_match : callable
214
        A function that returns True if the edge attribute dictionaries
215
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
216
        be considered equal during matching.
217

218
        The function will be called like
219

220
           edge_match(G1[u1][v1], G2[u2][v2]).
221

222
        That is, the function will receive the edge attribute
223
        dictionaries of the edges under consideration.
224

225
        Ignored if edge_subst_cost is specified.  If neither
226
        edge_match nor edge_subst_cost are specified then edge
227
        attributes are not considered.
228

229
    node_subst_cost, node_del_cost, node_ins_cost : callable
230
        Functions that return the costs of node substitution, node
231
        deletion, and node insertion, respectively.
232

233
        The functions will be called like
234

235
           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
236
           node_del_cost(G1.nodes[n1]),
237
           node_ins_cost(G2.nodes[n2]).
238

239
        That is, the functions will receive the node attribute
240
        dictionaries as inputs.  The functions are expected to return
241
        positive numeric values.
242

243
        Function node_subst_cost overrides node_match if specified.
244
        If neither node_match nor node_subst_cost are specified then
245
        default node substitution cost of 0 is used (node attributes
246
        are not considered during matching).
247

248
        If node_del_cost is not specified then default node deletion
249
        cost of 1 is used.  If node_ins_cost is not specified then
250
        default node insertion cost of 1 is used.
251

252
    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
253
        Functions that return the costs of edge substitution, edge
254
        deletion, and edge insertion, respectively.
255

256
        The functions will be called like
257

258
           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
259
           edge_del_cost(G1[u1][v1]),
260
           edge_ins_cost(G2[u2][v2]).
261

262
        That is, the functions will receive the edge attribute
263
        dictionaries as inputs.  The functions are expected to return
264
        positive numeric values.
265

266
        Function edge_subst_cost overrides edge_match if specified.
267
        If neither edge_match nor edge_subst_cost are specified then
268
        default edge substitution cost of 0 is used (edge attributes
269
        are not considered during matching).
270

271
        If edge_del_cost is not specified then default edge deletion
272
        cost of 1 is used.  If edge_ins_cost is not specified then
273
        default edge insertion cost of 1 is used.
274

275
    upper_bound : numeric
276
        Maximum edit distance to consider.
277

278
    Returns
279
    -------
280
    edit_paths : list of tuples (node_edit_path, edge_edit_path)
281
        node_edit_path : list of tuples (u, v)
282
        edge_edit_path : list of tuples ((u1, v1), (u2, v2))
283

284
    cost : numeric
285
        Optimal edit path cost (graph edit distance).
286

287
    Examples
288
    --------
289
    >>> G1 = nx.cycle_graph(6)
290
    >>> G2 = nx.wheel_graph(7)
291
    >>> paths, cost = nx.optimal_edit_paths(G1, G2)
292
    >>> len(paths)
293
    84
294
    >>> cost
295
    7.0
296

297
    See Also
298
    --------
299
    graph_edit_distance, optimize_edit_paths
300

301
    References
302
    ----------
303
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
304
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
305
       Pattern Recognition Problems. 4th International Conference on
306
       Pattern Recognition Applications and Methods 2015, Jan 2015,
307
       Lisbon, Portugal. 2015,
308
       <10.5220/0005209202710278>. <hal-01168816>
309
       https://hal.archives-ouvertes.fr/hal-01168816
310

311
    """
312
    paths = list()
313
    bestcost = None
314
    for vertex_path, edge_path, cost in \
315
        optimize_edit_paths(G1, G2, node_match, edge_match,
316
                            node_subst_cost, node_del_cost, node_ins_cost,
317
                            edge_subst_cost, edge_del_cost, edge_ins_cost,
318
                            upper_bound, False):
319
        #assert bestcost is None or cost <= bestcost
320
        if bestcost is not None and cost < bestcost:
321
            paths = list()
322
        paths.append((vertex_path, edge_path))
323
        bestcost = cost
324
    return paths, bestcost
325

    
326

    
327
def optimize_graph_edit_distance(G1, G2, node_match=None, edge_match=None,
328
                                 node_subst_cost=None, node_del_cost=None,
329
                                 node_ins_cost=None,
330
                                 edge_subst_cost=None, edge_del_cost=None,
331
                                 edge_ins_cost=None,
332
                                 upper_bound=None):
333
    """Returns consecutive approximations of GED (graph edit distance)
334
    between graphs G1 and G2.
335

336
    Graph edit distance is a graph similarity measure analogous to
337
    Levenshtein distance for strings.  It is defined as minimum cost
338
    of edit path (sequence of node and edge edit operations)
339
    transforming graph G1 to graph isomorphic to G2.
340

341
    Parameters
342
    ----------
343
    G1, G2: graphs
344
        The two graphs G1 and G2 must be of the same type.
345

346
    node_match : callable
347
        A function that returns True if node n1 in G1 and n2 in G2
348
        should be considered equal during matching.
349

350
        The function will be called like
351

352
           node_match(G1.nodes[n1], G2.nodes[n2]).
353

354
        That is, the function will receive the node attribute
355
        dictionaries for n1 and n2 as inputs.
356

357
        Ignored if node_subst_cost is specified.  If neither
358
        node_match nor node_subst_cost are specified then node
359
        attributes are not considered.
360

361
    edge_match : callable
362
        A function that returns True if the edge attribute dictionaries
363
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
364
        be considered equal during matching.
365

366
        The function will be called like
367

368
           edge_match(G1[u1][v1], G2[u2][v2]).
369

370
        That is, the function will receive the edge attribute
371
        dictionaries of the edges under consideration.
372

373
        Ignored if edge_subst_cost is specified.  If neither
374
        edge_match nor edge_subst_cost are specified then edge
375
        attributes are not considered.
376

377
    node_subst_cost, node_del_cost, node_ins_cost : callable
378
        Functions that return the costs of node substitution, node
379
        deletion, and node insertion, respectively.
380

381
        The functions will be called like
382

383
           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
384
           node_del_cost(G1.nodes[n1]),
385
           node_ins_cost(G2.nodes[n2]).
386

387
        That is, the functions will receive the node attribute
388
        dictionaries as inputs.  The functions are expected to return
389
        positive numeric values.
390

391
        Function node_subst_cost overrides node_match if specified.
392
        If neither node_match nor node_subst_cost are specified then
393
        default node substitution cost of 0 is used (node attributes
394
        are not considered during matching).
395

396
        If node_del_cost is not specified then default node deletion
397
        cost of 1 is used.  If node_ins_cost is not specified then
398
        default node insertion cost of 1 is used.
399

400
    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
401
        Functions that return the costs of edge substitution, edge
402
        deletion, and edge insertion, respectively.
403

404
        The functions will be called like
405

406
           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
407
           edge_del_cost(G1[u1][v1]),
408
           edge_ins_cost(G2[u2][v2]).
409

410
        That is, the functions will receive the edge attribute
411
        dictionaries as inputs.  The functions are expected to return
412
        positive numeric values.
413

414
        Function edge_subst_cost overrides edge_match if specified.
415
        If neither edge_match nor edge_subst_cost are specified then
416
        default edge substitution cost of 0 is used (edge attributes
417
        are not considered during matching).
418

419
        If edge_del_cost is not specified then default edge deletion
420
        cost of 1 is used.  If edge_ins_cost is not specified then
421
        default edge insertion cost of 1 is used.
422

423
    upper_bound : numeric
424
        Maximum edit distance to consider.
425

426
    Returns
427
    -------
428
    Generator of consecutive approximations of graph edit distance.
429

430
    Examples
431
    --------
432
    >>> G1 = nx.cycle_graph(6)
433
    >>> G2 = nx.wheel_graph(7)
434
    >>> for v in nx.optimize_graph_edit_distance(G1, G2):
435
    ...     minv = v
436
    >>> minv
437
    7.0
438

439
    See Also
440
    --------
441
    graph_edit_distance, optimize_edit_paths
442

443
    References
444
    ----------
445
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
446
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
447
       Pattern Recognition Problems. 4th International Conference on
448
       Pattern Recognition Applications and Methods 2015, Jan 2015,
449
       Lisbon, Portugal. 2015,
450
       <10.5220/0005209202710278>. <hal-01168816>
451
       https://hal.archives-ouvertes.fr/hal-01168816
452
    """
453
    for vertex_path, edge_path, cost in \
454
        optimize_edit_paths(G1, G2, node_match, edge_match,
455
                            node_subst_cost, node_del_cost, node_ins_cost,
456
                            edge_subst_cost, edge_del_cost, edge_ins_cost,
457
                            upper_bound, True):
458
        yield cost
459

    
460

    
461
def optimize_edit_paths(G1, G2, node_match=None, edge_match=None,
462
                        node_subst_cost=None, node_del_cost=None,
463
                        node_ins_cost=None,
464
                        edge_subst_cost=None, edge_del_cost=None,
465
                        edge_ins_cost=None,
466
                        upper_bound=None, strictly_decreasing=True):
467
    """GED (graph edit distance) calculation: advanced interface.
468

469
    Graph edit path is a sequence of node and edge edit operations
470
    transforming graph G1 to graph isomorphic to G2.  Edit operations
471
    include substitutions, deletions, and insertions.
472

473
    Graph edit distance is defined as minimum cost of edit path.
474

475
    Parameters
476
    ----------
477
    G1, G2: graphs
478
        The two graphs G1 and G2 must be of the same type.
479

480
    node_match : callable
481
        A function that returns True if node n1 in G1 and n2 in G2
482
        should be considered equal during matching.
483

484
        The function will be called like
485

486
           node_match(G1.nodes[n1], G2.nodes[n2]).
487

488
        That is, the function will receive the node attribute
489
        dictionaries for n1 and n2 as inputs.
490

491
        Ignored if node_subst_cost is specified.  If neither
492
        node_match nor node_subst_cost are specified then node
493
        attributes are not considered.
494

495
    edge_match : callable
496
        A function that returns True if the edge attribute dictionaries
497
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
498
        be considered equal during matching.
499

500
        The function will be called like
501

502
           edge_match(G1[u1][v1], G2[u2][v2]).
503

504
        That is, the function will receive the edge attribute
505
        dictionaries of the edges under consideration.
506

507
        Ignored if edge_subst_cost is specified.  If neither
508
        edge_match nor edge_subst_cost are specified then edge
509
        attributes are not considered.
510

511
    node_subst_cost, node_del_cost, node_ins_cost : callable
512
        Functions that return the costs of node substitution, node
513
        deletion, and node insertion, respectively.
514

515
        The functions will be called like
516

517
           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
518
           node_del_cost(G1.nodes[n1]),
519
           node_ins_cost(G2.nodes[n2]).
520

521
        That is, the functions will receive the node attribute
522
        dictionaries as inputs.  The functions are expected to return
523
        positive numeric values.
524

525
        Function node_subst_cost overrides node_match if specified.
526
        If neither node_match nor node_subst_cost are specified then
527
        default node substitution cost of 0 is used (node attributes
528
        are not considered during matching).
529

530
        If node_del_cost is not specified then default node deletion
531
        cost of 1 is used.  If node_ins_cost is not specified then
532
        default node insertion cost of 1 is used.
533

534
    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
535
        Functions that return the costs of edge substitution, edge
536
        deletion, and edge insertion, respectively.
537

538
        The functions will be called like
539

540
           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
541
           edge_del_cost(G1[u1][v1]),
542
           edge_ins_cost(G2[u2][v2]).
543

544
        That is, the functions will receive the edge attribute
545
        dictionaries as inputs.  The functions are expected to return
546
        positive numeric values.
547

548
        Function edge_subst_cost overrides edge_match if specified.
549
        If neither edge_match nor edge_subst_cost are specified then
550
        default edge substitution cost of 0 is used (edge attributes
551
        are not considered during matching).
552

553
        If edge_del_cost is not specified then default edge deletion
554
        cost of 1 is used.  If edge_ins_cost is not specified then
555
        default edge insertion cost of 1 is used.
556

557
    upper_bound : numeric
558
        Maximum edit distance to consider.
559

560
    strictly_decreasing : bool
561
        If True, return consecutive approximations of strictly
562
        decreasing cost.  Otherwise, return all edit paths of cost
563
        less than or equal to the previous minimum cost.
564

565
    Returns
566
    -------
567
    Generator of tuples (node_edit_path, edge_edit_path, cost)
568
        node_edit_path : list of tuples (u, v)
569
        edge_edit_path : list of tuples ((u1, v1), (u2, v2))
570
        cost : numeric
571

572
    See Also
573
    --------
574
    graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths
575

576
    References
577
    ----------
578
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
579
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
580
       Pattern Recognition Problems. 4th International Conference on
581
       Pattern Recognition Applications and Methods 2015, Jan 2015,
582
       Lisbon, Portugal. 2015,
583
       <10.5220/0005209202710278>. <hal-01168816>
584
       https://hal.archives-ouvertes.fr/hal-01168816
585

586
    """
587
    # TODO: support DiGraph
588

    
589
    import numpy as np
590
    from scipy.optimize import linear_sum_assignment
591

    
592
    class CostMatrix:
593
        def __init__(self, C, lsa_row_ind, lsa_col_ind, ls):
594
            #assert C.shape[0] == len(lsa_row_ind)
595
            #assert C.shape[1] == len(lsa_col_ind)
596
            #assert len(lsa_row_ind) == len(lsa_col_ind)
597
            #assert set(lsa_row_ind) == set(range(len(lsa_row_ind)))
598
            #assert set(lsa_col_ind) == set(range(len(lsa_col_ind)))
599
            #assert ls == C[lsa_row_ind, lsa_col_ind].sum()
600
            self.C = C
601
            self.lsa_row_ind = lsa_row_ind
602
            self.lsa_col_ind = lsa_col_ind
603
            self.ls = ls
604

    
605
    def make_CostMatrix(C, m, n):
606
        #assert(C.shape == (m + n, m + n))
607
        lsa_row_ind, lsa_col_ind = linear_sum_assignment(C)
608

    
609
        # Fixup dummy assignments:
610
        # each substitution i<->j should have dummy assignment m+j<->n+i
611
        # NOTE: fast reduce of Cv relies on it
612
        #assert len(lsa_row_ind) == len(lsa_col_ind)
613
        indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind)
614
        subst_ind = list(k for k, i, j in indexes if i < m and j < n)
615
        indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind)
616
        dummy_ind = list(k for k, i, j in indexes if i >= m and j >= n)
617
        #assert len(subst_ind) == len(dummy_ind)
618
        lsa_row_ind[dummy_ind] = lsa_col_ind[subst_ind] + m
619
        lsa_col_ind[dummy_ind] = lsa_row_ind[subst_ind] + n
620

    
621
        return CostMatrix(C, lsa_row_ind, lsa_col_ind,
622
                          C[lsa_row_ind, lsa_col_ind].sum())
623

    
624
    def extract_C(C, i, j, m, n):
625
        #assert(C.shape == (m + n, m + n))
626
        row_ind = [k in i or k - m in j for k in range(m + n)]
627
        col_ind = [k in j or k - n in i for k in range(m + n)]
628
        return C[row_ind, :][:, col_ind]
629

    
630
    def reduce_C(C, i, j, m, n):
631
        #assert(C.shape == (m + n, m + n))
632
        row_ind = [k not in i and k - m not in j for k in range(m + n)]
633
        col_ind = [k not in j and k - n not in i for k in range(m + n)]
634
        return C[row_ind, :][:, col_ind]
635

    
636
    def reduce_ind(ind, i):
637
        #assert set(ind) == set(range(len(ind)))
638
        rind = ind[[k not in i for k in ind]]
639
        for k in set(i):
640
            rind[rind >= k] -= 1
641
        return rind
642

    
643
    def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=[]):
644
        """
645
        Parameters:
646
            u, v: matched vertices, u=None or v=None for
647
               deletion/insertion
648
            pending_g, pending_h: lists of edges not yet mapped
649
            Ce: CostMatrix of pending edge mappings
650
            matched_uv: partial vertex edit path
651
                list of tuples (u, v) of previously matched vertex
652
                    mappings u<->v, u=None or v=None for
653
                    deletion/insertion
654

655
        Returns:
656
            list of (i, j): indices of edge mappings g<->h
657
            localCe: local CostMatrix of edge mappings
658
                (basically submatrix of Ce at cross of rows i, cols j)
659
        """
660
        M = len(pending_g)
661
        N = len(pending_h)
662
        #assert Ce.C.shape == (M + N, M + N)
663

    
664
        g_ind = [i for i in range(M) if pending_g[i][:2] == (u, u) or
665
                 any(pending_g[i][:2] in ((p, u), (u, p))
666
                     for p, q in matched_uv)]
667
        h_ind = [j for j in range(N) if pending_h[j][:2] == (v, v) or
668
                 any(pending_h[j][:2] in ((q, v), (v, q))
669
                     for p, q in matched_uv)]
670
        m = len(g_ind)
671
        n = len(h_ind)
672

    
673
        if m or n:
674
            C = extract_C(Ce.C, g_ind, h_ind, M, N)
675
            #assert C.shape == (m + n, m + n)
676

    
677
            # Forbid structurally invalid matches
678
            # NOTE: inf remembered from Ce construction
679
            for k, i in zip(range(m), g_ind):
680
                g = pending_g[i][:2]
681
                for l, j in zip(range(n), h_ind):
682
                    h = pending_h[j][:2]
683
                    if nx.is_directed(G1) or nx.is_directed(G2):
684
                        if any(g == (p, u) and h == (q, v) or
685
                               g == (u, p) and h == (v, q)
686
                               for p, q in matched_uv):
687
                            continue
688
                    else:
689
                        if any(g in ((p, u), (u, p)) and h in ((q, v), (v, q))
690
                               for p, q in matched_uv):
691
                            continue
692
                    if g == (u, u):
693
                        continue
694
                    if h == (v, v):
695
                        continue
696
                    C[k, l] = inf
697

    
698
            localCe = make_CostMatrix(C, m, n)
699
            ij = list((g_ind[k] if k < m else M + h_ind[l],
700
                       h_ind[l] if l < n else N + g_ind[k])
701
                      for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind)
702
                      if k < m or l < n)
703

    
704
        else:
705
            ij = []
706
            localCe = CostMatrix(np.empty((0, 0)), [], [], 0)
707

    
708
        return ij, localCe
709

    
710
    def reduce_Ce(Ce, ij, m, n):
711
        if len(ij):
712
            i, j = zip(*ij)
713
            m_i = m - sum(1 for t in i if t < m)
714
            n_j = n - sum(1 for t in j if t < n)
715
            return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j)
716
        else:
717
            return Ce
718

    
719
    def get_edit_ops(matched_uv, pending_u, pending_v, Cv,
720
                     pending_g, pending_h, Ce, matched_cost):
721
        """
722
        Parameters:
723
            matched_uv: partial vertex edit path
724
                list of tuples (u, v) of vertex mappings u<->v,
725
                u=None or v=None for deletion/insertion
726
            pending_u, pending_v: lists of vertices not yet mapped
727
            Cv: CostMatrix of pending vertex mappings
728
            pending_g, pending_h: lists of edges not yet mapped
729
            Ce: CostMatrix of pending edge mappings
730
            matched_cost: cost of partial edit path
731

732
        Returns:
733
            sequence of
734
                (i, j): indices of vertex mapping u<->v
735
                Cv_ij: reduced CostMatrix of pending vertex mappings
736
                    (basically Cv with row i, col j removed)
737
                list of (x, y): indices of edge mappings g<->h
738
                Ce_xy: reduced CostMatrix of pending edge mappings
739
                    (basically Ce with rows x, cols y removed)
740
                cost: total cost of edit operation
741
            NOTE: most promising ops first
742
        """
743
        m = len(pending_u)
744
        n = len(pending_v)
745
        #assert Cv.C.shape == (m + n, m + n)
746

    
747
        # 1) a vertex mapping from optimal linear sum assignment
748
        i, j = min((k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind)
749
                   if k < m or l < n)
750
        xy, localCe = match_edges(pending_u[i] if i < m else None,
751
                                  pending_v[j] if j < n else None,
752
                                  pending_g, pending_h, Ce, matched_uv)
753
        Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
754
        #assert Ce.ls <= localCe.ls + Ce_xy.ls
755
        if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls):
756
            pass
757
        else:
758
            # get reduced Cv efficiently
759
            Cv_ij = CostMatrix(reduce_C(Cv.C, (i,), (j,), m, n),
760
                               reduce_ind(Cv.lsa_row_ind, (i, m + j)),
761
                               reduce_ind(Cv.lsa_col_ind, (j, n + i)),
762
                               Cv.ls - Cv.C[i, j])
763
            yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls
764

    
765
        # 2) other candidates, sorted by lower-bound cost estimate
766
        other = list()
767
        fixed_i, fixed_j = i, j
768
        if m <= n:
769
            candidates = ((t, fixed_j) for t in range(m + n)
770
                          if t != fixed_i and (t < m or t == m + fixed_j))
771
        else:
772
            candidates = ((fixed_i, t) for t in range(m + n)
773
                          if t != fixed_j and (t < n or t == n + fixed_i))
774
        for i, j in candidates:
775
            if prune(matched_cost + Cv.C[i, j] + Ce.ls):
776
                continue
777
            Cv_ij = make_CostMatrix(reduce_C(Cv.C, (i,), (j,), m, n),
778
                                    m - 1 if i < m else m,
779
                                    n - 1 if j < n else n)
780
            #assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls
781
            if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls):
782
                continue
783
            xy, localCe = match_edges(pending_u[i] if i < m else None,
784
                                      pending_v[j] if j < n else None,
785
                                      pending_g, pending_h, Ce, matched_uv)
786
            if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls):
787
                continue
788
            Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
789
            #assert Ce.ls <= localCe.ls + Ce_xy.ls
790
            if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls +
791
                     Ce_xy.ls):
792
                continue
793
            other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls))
794

    
795
        # yield from
796
        for t in sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls):
797
            yield t
798

    
799
    def get_edit_paths(matched_uv, pending_u, pending_v, Cv,
800
                       matched_gh, pending_g, pending_h, Ce, matched_cost):
801
        """
802
        Parameters:
803
            matched_uv: partial vertex edit path
804
                list of tuples (u, v) of vertex mappings u<->v,
805
                u=None or v=None for deletion/insertion
806
            pending_u, pending_v: lists of vertices not yet mapped
807
            Cv: CostMatrix of pending vertex mappings
808
            matched_gh: partial edge edit path
809
                list of tuples (g, h) of edge mappings g<->h,
810
                g=None or h=None for deletion/insertion
811
            pending_g, pending_h: lists of edges not yet mapped
812
            Ce: CostMatrix of pending edge mappings
813
            matched_cost: cost of partial edit path
814

815
        Returns:
816
            sequence of (vertex_path, edge_path, cost)
817
                vertex_path: complete vertex edit path
818
                    list of tuples (u, v) of vertex mappings u<->v,
819
                    u=None or v=None for deletion/insertion
820
                edge_path: complete edge edit path
821
                    list of tuples (g, h) of edge mappings g<->h,
822
                    g=None or h=None for deletion/insertion
823
                cost: total cost of edit path
824
            NOTE: path costs are non-increasing
825
        """
826
        #debug_print('matched-uv:', matched_uv)
827
        #debug_print('matched-gh:', matched_gh)
828
        #debug_print('matched-cost:', matched_cost)
829
        #debug_print('pending-u:', pending_u)
830
        #debug_print('pending-v:', pending_v)
831
        #debug_print(Cv.C)
832
        #assert list(sorted(G1.nodes)) == list(sorted(list(u for u, v in matched_uv if u is not None) + pending_u))
833
        #assert list(sorted(G2.nodes)) == list(sorted(list(v for u, v in matched_uv if v is not None) + pending_v))
834
        #debug_print('pending-g:', pending_g)
835
        #debug_print('pending-h:', pending_h)
836
        #debug_print(Ce.C)
837
        #assert list(sorted(G1.edges)) == list(sorted(list(g for g, h in matched_gh if g is not None) + pending_g))
838
        #assert list(sorted(G2.edges)) == list(sorted(list(h for g, h in matched_gh if h is not None) + pending_h))
839
        #debug_print()
840

    
841
        if prune(matched_cost + Cv.ls + Ce.ls):
842
            return
843

    
844
        if not max(len(pending_u), len(pending_v)):
845
            #assert not len(pending_g)
846
            #assert not len(pending_h)
847
            # path completed!
848
            #assert matched_cost <= maxcost.value
849
            maxcost.value = min(maxcost.value, matched_cost)
850
            yield matched_uv, matched_gh, matched_cost
851

    
852
        else:
853
            edit_ops = get_edit_ops(matched_uv, pending_u, pending_v, Cv,
854
                                    pending_g, pending_h, Ce, matched_cost)
855
            for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops:
856
                i, j = ij
857
                #assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost
858
                if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls):
859
                    continue
860

    
861
                # dive deeper
862
                u = pending_u.pop(i) if i < len(pending_u) else None
863
                v = pending_v.pop(j) if j < len(pending_v) else None
864
                matched_uv.append((u, v))
865
                for x, y in xy:
866
                    len_g = len(pending_g)
867
                    len_h = len(pending_h)
868
                    matched_gh.append((pending_g[x] if x < len_g else None,
869
                                       pending_h[y] if y < len_h else None))
870
                sortedx = list(sorted(x for x, y in xy))
871
                sortedy = list(sorted(y for x, y in xy))
872
                G = list((pending_g.pop(x) if x < len(pending_g) else None)
873
                         for x in reversed(sortedx))
874
                H = list((pending_h.pop(y) if y < len(pending_h) else None)
875
                         for y in reversed(sortedy))
876

    
877
                # yield from
878
                for t in get_edit_paths(matched_uv, pending_u, pending_v,
879
                                        Cv_ij,
880
                                        matched_gh, pending_g, pending_h,
881
                                        Ce_xy,
882
                                        matched_cost + edit_cost):
883
                    yield t
884

    
885
                # backtrack
886
                if u is not None:
887
                    pending_u.insert(i, u)
888
                if v is not None:
889
                    pending_v.insert(j, v)
890
                matched_uv.pop()
891
                for x, g in zip(sortedx, reversed(G)):
892
                    if g is not None:
893
                        pending_g.insert(x, g)
894
                for y, h in zip(sortedy, reversed(H)):
895
                    if h is not None:
896
                        pending_h.insert(y, h)
897
                for t in xy:
898
                    matched_gh.pop()
899

    
900
    # Initialization
901

    
902
    pending_u = list(G1.nodes)
903
    pending_v = list(G2.nodes)
904

    
905
    # cost matrix of vertex mappings
906
    m = len(pending_u)
907
    n = len(pending_v)
908
    C = np.zeros((m + n, m + n))
909
    if node_subst_cost:
910
        C[0:m, 0:n] = np.array([node_subst_cost(G1.nodes[u], G2.nodes[v])
911
                                for u in pending_u for v in pending_v]
912
                               ).reshape(m, n)
913
    elif node_match:
914
        C[0:m, 0:n] = np.array([1 - int(node_match(G1.nodes[u], G2.nodes[v]))
915
                                for u in pending_u for v in pending_v]
916
                               ).reshape(m, n)
917
    else:
918
        # all zeroes
919
        pass
920
    #assert not min(m, n) or C[0:m, 0:n].min() >= 0
921
    if node_del_cost:
922
        del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u]
923
    else:
924
        del_costs = [1] * len(pending_u)
925
    #assert not m or min(del_costs) >= 0
926
    if node_ins_cost:
927
        ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v]
928
    else:
929
        ins_costs = [1] * len(pending_v)
930
    #assert not n or min(ins_costs) >= 0
931
    inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
932
    C[0:m, n:n + m] = np.array([del_costs[i] if i == j else inf
933
                                for i in range(m) for j in range(m)]
934
                               ).reshape(m, m)
935
    C[m:m + n, 0:n] = np.array([ins_costs[i] if i == j else inf
936
                                for i in range(n) for j in range(n)]
937
                               ).reshape(n, n)
938
    Cv = make_CostMatrix(C, m, n)
939
    #debug_print('Cv: {} x {}'.format(m, n))
940
    #debug_print(Cv.C)
941

    
942
    pending_g = list(G1.edges)
943
    pending_h = list(G2.edges)
944

    
945
    # cost matrix of edge mappings
946
    m = len(pending_g)
947
    n = len(pending_h)
948
    C = np.zeros((m + n, m + n))
949
    if edge_subst_cost:
950
        C[0:m, 0:n] = np.array([edge_subst_cost(G1.edges[g], G2.edges[h])
951
                                for g in pending_g for h in pending_h]
952
                               ).reshape(m, n)
953
    elif edge_match:
954
        C[0:m, 0:n] = np.array([1 - int(edge_match(G1.edges[g], G2.edges[h]))
955
                                for g in pending_g for h in pending_h]
956
                               ).reshape(m, n)
957
    else:
958
        # all zeroes
959
        pass
960
    #assert not min(m, n) or C[0:m, 0:n].min() >= 0
961
    if edge_del_cost:
962
        del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g]
963
    else:
964
        del_costs = [1] * len(pending_g)
965
    #assert not m or min(del_costs) >= 0
966
    if edge_ins_cost:
967
        ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h]
968
    else:
969
        ins_costs = [1] * len(pending_h)
970
    #assert not n or min(ins_costs) >= 0
971
    inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
972
    C[0:m, n:n + m] = np.array([del_costs[i] if i == j else inf
973
                                for i in range(m) for j in range(m)]
974
                               ).reshape(m, m)
975
    C[m:m + n, 0:n] = np.array([ins_costs[i] if i == j else inf
976
                                for i in range(n) for j in range(n)]
977
                               ).reshape(n, n)
978
    Ce = make_CostMatrix(C, m, n)
979
    #debug_print('Ce: {} x {}'.format(m, n))
980
    #debug_print(Ce.C)
981
    #debug_print()
982

    
983
    class MaxCost:
984
        def __init__(self):
985
            # initial upper-bound estimate
986
            # NOTE: should work for empty graph
987
            self.value = Cv.C.sum() + Ce.C.sum() + 1
988
    maxcost = MaxCost()
989

    
990
    def prune(cost):
991
        if upper_bound is not None:
992
            if cost > upper_bound:
993
                return True
994
        if cost > maxcost.value:
995
            return True
996
        elif strictly_decreasing and cost >= maxcost.value:
997
            return True
998

    
999
    # Now go!
1000

    
1001
    for vertex_path, edge_path, cost in \
1002
        get_edit_paths([], pending_u, pending_v, Cv,
1003
                       [], pending_g, pending_h, Ce, 0):
1004
        #assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None)
1005
        #assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None)
1006
        #assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None)
1007
        #assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None)
1008
        #print(vertex_path, edge_path, cost, file = sys.stderr)
1009
        #assert cost == maxcost.value
1010
        yield list(vertex_path), list(edge_path), cost
1011

    
1012

    
1013
def _is_close(d1, d2, atolerance=0, rtolerance=0):
1014
    """Determines whether two adjacency matrices are within
1015
    a provided tolerance.
1016
    
1017
    Parameters
1018
    ----------
1019
    d1 : dict
1020
        Adjacency dictionary
1021
    
1022
    d2 : dict
1023
        Adjacency dictionary
1024
    
1025
    atolerance : float
1026
        Some scalar tolerance value to determine closeness
1027
    
1028
    rtolerance : float
1029
        A scalar tolerance value that will be some proportion
1030
        of ``d2``'s value
1031
    
1032
    Returns
1033
    -------
1034
    closeness : bool
1035
        If all of the nodes within ``d1`` and ``d2`` are within
1036
        a predefined tolerance, they are considered "close" and
1037
        this method will return True. Otherwise, this method will
1038
        return False.
1039
    
1040
    """
1041
    # Pre-condition: d1 and d2 have the same keys at each level if they
1042
    # are dictionaries.
1043
    if not isinstance(d1, dict) and not isinstance(d2, dict):
1044
        return abs(d1 - d2) <= atolerance + rtolerance * abs(d2)
1045
    return all(all(_is_close(d1[u][v], d2[u][v]) for v in d1[u]) for u in d1)
1046

    
1047

    
1048
def simrank_similarity(G, source=None, target=None, importance_factor=0.9,
1049
                       max_iterations=100, tolerance=1e-4):
1050
    """Returns the SimRank similarity of nodes in the graph ``G``.
1051

1052
    SimRank is a similarity metric that says "two objects are considered
1053
    to be similar if they are referenced by similar objects." [1]_.
1054
    
1055
    The pseudo-code definition from the paper is:
1056

1057
        def simrank(G, u, v):
1058
            in_neighbors_u = G.predecessors(u)
1059
            in_neighbors_v = G.predecessors(v)
1060
            scale = C / (len(in_neighbors_u) * len(in_neighbors_v))
1061
            return scale * sum(simrank(G, w, x)
1062
                               for w, x in product(in_neighbors_u,
1063
                                                   in_neighbors_v))
1064
    
1065
    where ``G`` is the graph, ``u`` is the source, ``v`` is the target,
1066
    and ``C`` is a float decay or importance factor between 0 and 1.
1067
    
1068
    The SimRank algorithm for determining node similarity is defined in
1069
    [2]_.
1070

1071
    Parameters
1072
    ----------
1073
    G : NetworkX graph
1074
        A NetworkX graph
1075

1076
    source : node
1077
        If this is specified, the returned dictionary maps each node
1078
        ``v`` in the graph to the similarity between ``source`` and
1079
        ``v``.
1080

1081
    target : node
1082
        If both ``source`` and ``target`` are specified, the similarity
1083
        value between ``source`` and ``target`` is returned. If
1084
        ``target`` is specified but ``source`` is not, this argument is
1085
        ignored.
1086

1087
    importance_factor : float
1088
        The relative importance of indirect neighbors with respect to
1089
        direct neighbors.
1090

1091
    max_iterations : integer
1092
        Maximum number of iterations.
1093

1094
    tolerance : float
1095
        Error tolerance used to check convergence. When an iteration of
1096
        the algorithm finds that no similarity value changes more than
1097
        this amount, the algorithm halts.
1098

1099
    Returns
1100
    -------
1101
    similarity : dictionary or float
1102
        If ``source`` and ``target`` are both ``None``, this returns a
1103
        dictionary of dictionaries, where keys are node pairs and value
1104
        are similarity of the pair of nodes.
1105

1106
        If ``source`` is not ``None`` but ``target`` is, this returns a
1107
        dictionary mapping node to the similarity of ``source`` and that
1108
        node.
1109

1110
        If neither ``source`` nor ``target`` is ``None``, this returns
1111
        the similarity value for the given pair of nodes.
1112

1113
    Examples
1114
    --------
1115
    If the nodes of the graph are numbered from zero to *n - 1*, where *n*
1116
    is the number of nodes in the graph, you can create a SimRank matrix
1117
    from the return value of this function where the node numbers are
1118
    the row and column indices of the matrix::
1119

1120
        >>> import networkx as nx
1121
        >>> from numpy import array
1122
        >>> G = nx.cycle_graph(4)
1123
        >>> sim = nx.simrank_similarity(G)
1124
        >>> lol = [[sim[u][v] for v in sorted(sim[u])] for u in sorted(sim)]
1125
        >>> sim_array = array(lol)
1126

1127
    References
1128
    ----------
1129
    .. [1] https://en.wikipedia.org/wiki/SimRank
1130
    .. [2] G. Jeh and J. Widom.
1131
           "SimRank: a measure of structural-context similarity",
1132
           In KDD'02: Proceedings of the Eighth ACM SIGKDD
1133
           International Conference on Knowledge Discovery and Data Mining,
1134
           pp. 538--543. ACM Press, 2002.
1135
    """
1136
    prevsim = None
1137

    
1138
    # build up our similarity adjacency dictionary output
1139
    newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G}
1140

    
1141
    # These functions compute the update to the similarity value of the nodes
1142
    # `u` and `v` with respect to the previous similarity values.
1143
    avg_sim = lambda s: sum(newsim[w][x] for (w, x) in s) / len(s)
1144
    sim = lambda u, v: importance_factor * avg_sim(list(product(G[u], G[v])))
1145

    
1146
    for _ in range(max_iterations):
1147
        if prevsim and _is_close(prevsim, newsim, tolerance):
1148
            break
1149
        prevsim = newsim
1150
        newsim = {u: {v: sim(u, v) if u is not v else 1
1151
                      for v in newsim[u]} for u in newsim}
1152

    
1153
    if source is not None and target is not None:
1154
        return newsim[source][target]
1155
    if source is not None:
1156
        return newsim[source]
1157
    return newsim
1158

    
1159

    
1160
def simrank_similarity_numpy(G, source=None, target=None, importance_factor=0.9,
1161
                             max_iterations=100, tolerance=1e-4):
1162
    """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.
1163

1164
    The SimRank algorithm for determining node similarity is defined in
1165
    [1]_.
1166

1167
    Parameters
1168
    ----------
1169
    G : NetworkX graph
1170
        A NetworkX graph
1171

1172
    source : node
1173
        If this is specified, the returned dictionary maps each node
1174
        ``v`` in the graph to the similarity between ``source`` and
1175
        ``v``.
1176

1177
    target : node
1178
        If both ``source`` and ``target`` are specified, the similarity
1179
        value between ``source`` and ``target`` is returned. If
1180
        ``target`` is specified but ``source`` is not, this argument is
1181
        ignored.
1182

1183
    importance_factor : float
1184
        The relative importance of indirect neighbors with respect to
1185
        direct neighbors.
1186

1187
    max_iterations : integer
1188
        Maximum number of iterations.
1189

1190
    tolerance : float
1191
        Error tolerance used to check convergence. When an iteration of
1192
        the algorithm finds that no similarity value changes more than
1193
        this amount, the algorithm halts.
1194

1195
    Returns
1196
    -------
1197
    similarity : dictionary or float
1198
        If ``source`` and ``target`` are both ``None``, this returns a
1199
        dictionary of dictionaries, where keys are node pairs and value
1200
        are similarity of the pair of nodes.
1201

1202
        If ``source`` is not ``None`` but ``target`` is, this returns a
1203
        dictionary mapping node to the similarity of ``source`` and that
1204
        node.
1205

1206
        If neither ``source`` nor ``target`` is ``None``, this returns
1207
        the similarity value for the given pair of nodes.
1208

1209
    Examples
1210
    --------
1211
        >>> import networkx as nx
1212
        >>> from numpy import array
1213
        >>> G = nx.cycle_graph(4)
1214
        >>> sim = nx.simrank_similarity_numpy(G)
1215

1216
    References
1217
    ----------
1218
    .. [1] G. Jeh and J. Widom.
1219
           "SimRank: a measure of structural-context similarity",
1220
           In KDD'02: Proceedings of the Eighth ACM SIGKDD
1221
           International Conference on Knowledge Discovery and Data Mining,
1222
           pp. 538--543. ACM Press, 2002.
1223
    """
1224
    # This algorithm follows roughly
1225
    #
1226
    #     S = max{C * (A.T * S * A), I}
1227
    #
1228
    # where C is the importance factor, A is the column normalized
1229
    # adjacency matrix, and I is the identity matrix.
1230
    import numpy as np
1231
    adjacency_matrix = nx.to_numpy_array(G)
1232

    
1233
    # column-normalize the ``adjacency_matrix``
1234
    adjacency_matrix /= adjacency_matrix.sum(axis=0)
1235

    
1236
    newsim = np.eye(adjacency_matrix.shape[0], dtype=np.float64)
1237
    for _ in range(max_iterations):
1238
        prevsim = np.copy(newsim)
1239
        newsim = importance_factor * np.matmul(
1240
            np.matmul(adjacency_matrix.T, prevsim), adjacency_matrix)
1241
        np.fill_diagonal(newsim, 1.0)
1242

    
1243
        if np.allclose(prevsim, newsim, atol=tolerance):
1244
            break
1245

    
1246
    if source is not None and target is not None:
1247
        return newsim[source, target]
1248
    if source is not None:
1249
        return newsim[source]
1250
    return newsim
1251

    
1252

    
1253
def setup_module(module):
1254
    """Fixture for nose tests."""
1255
    from nose import SkipTest
1256
    try:
1257
        import numpy
1258
    except:
1259
        raise SkipTest("NumPy not available")
1260
    try:
1261
        import scipy
1262
    except:
1263
        raise SkipTest("SciPy not available")