# modularity_max.py - functions for finding communities based on modularity
#
# Copyright 2018 Edward L. Platt
#
# This file is part of NetworkX
#
# NetworkX is distributed under a BSD license; see LICENSE.txt for more
# information.
#
# Authors:
#   Edward L. Platt <ed@elplatt.com>
#
# TODO:
#   - Alter equations for weighted case
#   - Write tests for weighted case
"""Functions for detecting communities based on modularity.
"""
from __future__ import division

import networkx as nx
from networkx.algorithms.community.quality import modularity

from networkx.utils.mapped_queue import MappedQueue

__all__ = [
    'greedy_modularity_communities',
    '_naive_greedy_modularity_communities']


def greedy_modularity_communities(G, weight=None):
    """Find communities in graph using Clauset-Newman-Moore greedy modularity
    maximization. This method currently supports the Graph class and does not
    consider edge weights.

    Greedy modularity maximization begins with each node in its own community
    and repeatedly joins the pair of communities whose merge most increases
    modularity, until no such pair exists.

    Parameters
    ----------
    G : NetworkX graph

    Returns
    -------
    A list of frozensets of nodes, one for each community, sorted in
    decreasing order of size.

    Examples
    --------
    >>> from networkx.algorithms.community import greedy_modularity_communities
    >>> G = nx.karate_club_graph()
    >>> c = list(greedy_modularity_communities(G))
    >>> sorted(c[0])
    [8, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]

    References
    ----------
    .. [1] M. E. J. Newman "Networks: An Introduction", page 224,
       Oxford University Press 2011.
    .. [2] Clauset, A., Newman, M. E., & Moore, C.
       "Finding community structure in very large networks."
       Physical Review E 70(6), 2004.
    """

    # Count nodes and edges; interpret weights the same way as the degree
    # calculation below so that m and k are consistent
    N = len(G.nodes())
    m = G.size(weight=weight)
    q0 = 1.0 / (2.0*m)

    # Map node labels to contiguous integers
    label_for_node = dict((i, v) for i, v in enumerate(G.nodes()))
    node_for_label = dict((label_for_node[i], i) for i in range(N))

    # Calculate degrees
    k_for_label = G.degree(G.nodes(), weight=weight)
    k = [k_for_label[label_for_node[i]] for i in range(N)]

    # Initialize community and merge lists
    communities = dict((i, frozenset([i])) for i in range(N))
    merges = []

    # Initial modularity
    partition = [[label_for_node[x] for x in c] for c in communities.values()]
    q_cnm = modularity(G, partition)

    # Initialize data structures
    # CNM Eq 8-9 (Eq 8 was missing a factor of 2 from A_ij + A_ji)
    # a[i]: fraction of edge endpoints attached to community i (k_i / 2m)
    # dq_dict[i][j]: dQ for merging community i, j
    # dq_heap[i][n]: (-dq, i, j) for community i's nth largest dQ
    # H[n]: (-dq, i, j) for community with nth largest max_j(dQ_ij)
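    # For singleton communities i and j, the gain from merging them is
    #     dQ_ij = (A_ij + A_ji) / (2m) - 2 * k_i * k_j / (2m)^2
    #           = 2*q0 - 2*k[i]*k[j]*q0*q0    for adjacent i, j,
    # which is the expression used to fill dq_dict below.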
    a = [k[i]*q0 for i in range(N)]
    dq_dict = dict(
        (i, dict(
            (j, 2*q0 - 2*k[i]*k[j]*q0*q0)
            for j in [
                node_for_label[u]
                for u in G.neighbors(label_for_node[i])]
            if j != i))
        for i in range(N))
    dq_heap = [
        MappedQueue([
            (-dq, i, j)
            for j, dq in dq_dict[i].items()])
        for i in range(N)]
    H = MappedQueue([
        dq_heap[i].h[0]
        for i in range(N)
        if len(dq_heap[i]) > 0])
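
    # Worked check (illustrative example, not from the original source): for
    # an unweighted triangle, m = 3, q0 = 1/6, and all degrees are 2, so each
    # initial entry is dQ = 2*(1/6) - 2*2*2*(1/6)**2 = 1/9, which matches the
    # modularity gain of merging any two of the three singleton communities.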

    # Merge communities until we can't improve modularity
    while len(H) > 1:
        # Find best merge
        # Remove from heap of row maxes
        # Ties will be broken by choosing the pair with lowest min community id
        try:
            dq, i, j = H.pop()
        except IndexError:
            break
        dq = -dq
        # Remove best merge from row i heap
        dq_heap[i].pop()
        # Push new row max onto H
        if len(dq_heap[i]) > 0:
            H.push(dq_heap[i].h[0])
        # If this element was also at the root of row j, we need to remove the
        # duplicate entry from H
        if dq_heap[j].h[0] == (-dq, j, i):
            H.remove((-dq, j, i))
            # Remove best merge from row j heap
            dq_heap[j].remove((-dq, j, i))
            # Push new row max onto H
            if len(dq_heap[j]) > 0:
                H.push(dq_heap[j].h[0])
        else:
            # Duplicate wasn't in H, just remove from row j heap
            dq_heap[j].remove((-dq, j, i))
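        # dQ is stored symmetrically (dq_dict[i][j] == dq_dict[j][i]), so the
        # chosen merge appears in both row i's and row j's heaps; the mirrored
        # (j, i) entry has to be cleaned up alongside (i, j), as done above.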
        # Stop when change is non-positive
        if dq <= 0:
            break

        # Perform merge
        communities[j] = frozenset(communities[i] | communities[j])
        del communities[i]
        merges.append((i, j, dq))
        # New modularity
        q_cnm += dq
        # Get list of communities connected to merged communities
        i_set = set(dq_dict[i].keys())
        j_set = set(dq_dict[j].keys())
        all_set = (i_set | j_set) - set([i, j])
        both_set = i_set & j_set
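        # Only communities adjacent to i or j can change dQ, so the update
        # below touches just those rows instead of all N communities.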
        # Merge i into j and update dQ
        for k in all_set:
            # Calculate new dq value
            if k in both_set:
                dq_jk = dq_dict[j][k] + dq_dict[i][k]
            elif k in j_set:
                dq_jk = dq_dict[j][k] - 2.0*a[i]*a[k]
            else:
                # k in i_set
                dq_jk = dq_dict[i][k] - 2.0*a[j]*a[k]
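            # These three cases follow CNM Eq 10a-10c: when k was adjacent to
            # only one of i and j, the expected-edges term for the missing
            # pair (-2*a*a) is introduced by hand.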
            # Update rows j and k
            for row, col in [(j, k), (k, j)]:
                # Save old value for finding heap index
                if k in j_set:
                    d_old = (-dq_dict[row][col], row, col)
                else:
                    d_old = None
                # Update dict for j,k only (i is removed below)
                dq_dict[row][col] = dq_jk
                # Save old max of per-row heap
                if len(dq_heap[row]) > 0:
                    d_oldmax = dq_heap[row].h[0]
                else:
                    d_oldmax = None
                # Add/update heaps
                d = (-dq_jk, row, col)
                if d_old is None:
                    # We're creating a new nonzero element, add to heap
                    dq_heap[row].push(d)
                else:
                    # Update existing element in per-row heap
                    dq_heap[row].update(d_old, d)
                # Update heap of row maxes if necessary
                if d_oldmax is None:
                    # No entries previously in this row, push new max
                    H.push(d)
                else:
                    # We've updated an entry in this row, has the max changed?
                    if dq_heap[row].h[0] != d_oldmax:
                        H.update(d_oldmax, dq_heap[row].h[0])

        # Remove row/col i from matrix
        i_neighbors = dq_dict[i].keys()
        for k in i_neighbors:
            # Remove from dict
            dq_old = dq_dict[k][i]
            del dq_dict[k][i]
            # Remove from heaps if we haven't already
            if k != j:
                # Remove both row and column
                for row, col in [(k, i), (i, k)]:
                    # Check if replaced dq is row max
                    d_old = (-dq_old, row, col)
                    if dq_heap[row].h[0] == d_old:
                        # Update per-row heap and heap of row maxes
                        dq_heap[row].remove(d_old)
                        H.remove(d_old)
                        # Update row max
                        if len(dq_heap[row]) > 0:
                            H.push(dq_heap[row].h[0])
                    else:
                        # Only update per-row heap
                        dq_heap[row].remove(d_old)

        del dq_dict[i]
        # Mark row i as deleted, but keep placeholder
        dq_heap[i] = MappedQueue()
        # Merge i into j and update a
        a[j] += a[i]
        a[i] = 0

    communities = [
        frozenset([label_for_node[i] for i in c])
        for c in communities.values()]
    return sorted(communities, key=len, reverse=True)


def _naive_greedy_modularity_communities(G):
    """Find communities in graph using greedy modularity maximization.
    This implementation is O(n^4), much slower than alternatives, but it is
    provided as an easy-to-understand reference implementation.
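
    Examples
    --------
    A brief usage sketch (illustrative); communities are yielded from
    largest to smallest:

    >>> G = nx.karate_club_graph()
    >>> communities = list(_naive_greedy_modularity_communities(G))
    >>> len(communities[0]) >= len(communities[-1])
    True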
    """
    # First create one community for each node
    communities = list([frozenset([u]) for u in G.nodes()])
    # Track merges
    merges = []
    # Greedily merge communities until no improvement is possible
    old_modularity = None
    new_modularity = modularity(G, communities)
    while old_modularity is None or new_modularity > old_modularity:
        # Save modularity for comparison
        old_modularity = new_modularity
        # Find best pair to merge
        trial_communities = list(communities)
        to_merge = None
        for i, u in enumerate(communities):
            for j, v in enumerate(communities):
                # Skip i=j and empty communities
                if j <= i or len(u) == 0 or len(v) == 0:
                    continue
                # Merge communities u and v
                trial_communities[j] = u | v
                trial_communities[i] = frozenset([])
                trial_modularity = modularity(G, trial_communities)
                if trial_modularity >= new_modularity:
                    # Check if strictly better or tie
                    if trial_modularity > new_modularity:
                        # Found new best, save modularity and group indexes
                        new_modularity = trial_modularity
                        to_merge = (i, j, new_modularity - old_modularity)
                    elif (
                        to_merge and
                        min(i, j) < min(to_merge[0], to_merge[1])
                    ):
                        # Break ties by choosing pair with lowest min id
                        new_modularity = trial_modularity
                        to_merge = (i, j, new_modularity - old_modularity)
                # Un-merge
                trial_communities[i] = u
                trial_communities[j] = v
        if to_merge is not None:
            # If the best merge improves modularity, use it
            merges.append(to_merge)
            i, j, dq = to_merge
            u, v = communities[i], communities[j]
            communities[j] = u | v
            communities[i] = frozenset([])
    # Remove empty communities and sort
    communities = [c for c in communities if len(c) > 0]
    for com in sorted(communities, key=lambda x: len(x), reverse=True):
        yield com
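

# Minimal usage sketch (an illustrative addition): run both implementations
# on the karate club graph and print the communities each one finds.
if __name__ == "__main__":
    G = nx.karate_club_graph()
    fast = greedy_modularity_communities(G)
    naive = list(_naive_greedy_modularity_communities(G))
    print("CNM communities:", [sorted(c) for c in fast])
    print("naive communities:", [sorted(c) for c in naive])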