Revision fac6e5a4

View differences:

globecomm/analyse_snapshot.py
1
import os
2 1
import sys
3 2
from collections import defaultdict
4 3
import numpy as np
5 4
import scipy
6 5
from scipy import stats
7 6
import matplotlib.pyplot as plt
7
import utility
8 8

  
9 9
import metrics
10
import custom_plot
11

  
10 12

  
11 13
from pdb import set_trace as debugger
12 14

  
13 15
class Experiment():
16
    """This class analyzes the betweenness centrality across multiple snapshots
17
    of community networks.
18

  
19
    We want to know how often the BC score changes.
20
    """
14 21
    def __init__(self, num_of_nodes, num_of_snapshots):
15
        self.scores = np.zeros((num_of_snapshots, num_of_nodes))
22
        self.scores = np.empty((num_of_snapshots, num_of_nodes))
23
        self.scores.fill(np.nan)
16 24
        self.index_of_snapshot = 0
17 25
        self.num_of_nodes = num_of_nodes
18 26
        self.num_of_snapshots = num_of_snapshots
27
        self.node_index_map = dict()
28
        self.current_max_node_index = 0
29

  
30
    def index_of_node(self, node_id):
31
        if node_id in self.node_index_map:
32
            # print self.node_index_map[node_id]
33
            return self.node_index_map[node_id]
34
        else:
35
            print "ERROR: node doesn't exist"
36
            sys.exit()
19 37

  
20 38
    def add_new_result(self, in_filepath):
21 39
        if self.index_of_snapshot == self.num_of_snapshots:
......
24 42

  
25 43
        data = np.loadtxt(in_filepath, delimiter=',', dtype={
26 44
                'names': ('node_id', 'score'),
27
                'formats': ('i4', 'f4')
45
                'formats': ('a100', 'f4')
28 46
            })
29 47

  
30 48
        for row in data:
......
33 51
        self.index_of_snapshot += 1
34 52

  
35 53
    def update_score(self, node_id, score):
36
        self.scores[self.index_of_snapshot][node_id] = score
54
        if node_id not in self.node_index_map:
55
            self.node_index_map[node_id] = self.current_max_node_index
56
            self.current_max_node_index += 1
57

  
58
        node_index = self.index_of_node(node_id)
59
        self.scores[self.index_of_snapshot][node_index] = score
37 60

  
38 61
    def summarize(self):
39 62
        np.average(self.scores, axis=1)
40 63

  
41
    def spearman_rank_correlation_coef(self):
42
        time_diff = defaultdict(list)
43
        for j in range(self.num_of_snapshots):
44
            for i in range(j, self.num_of_snapshots):
45
                diff = scipy.stats.spearmanr(self.scores[j], self.scores[i])
46
                time_diff[i-j].append(diff[0])
47
        self._plot_time_diff(time_diff,
48
                        title='FFGraz',
49
                        xlabel='time diff (?)',
50
                        ylabel='Spearman rank correlation coefficient')
51

  
52
    def percentage_overlap(self, top_k=20):
64
    #########
65
    # METRICS
66
    #########
67
    def percentage_overlap(self, top_k=20, time_window=1):
68
        """Draws a graph in ./output/overlap*.png to represent the proportion
69
        of nodes remain in the top-k nodes with the highest BC score over different
70
        time window
53 71
        """
54
        """
55
        time_diff = defaultdict(list)
56
        for j in range(self.num_of_snapshots):
57
            for i in range(j, self.num_of_snapshots):
58
                diff = metrics.percentage_overlap(self.scores[j], self.scores[i], top_k)
59
                time_diff[i-j].append(diff)
60

  
61
        out_filepath = 'output/overlap_%s.png' % top_k
62
        fig = self._plot_time_diff(time_diff,
72
        time_diff = list()
73
        for j in range(self.num_of_snapshots - time_window):
74
            diff = metrics.percentage_overlap(self.scores[j:j+time_window+1,:], top_k)
75
            time_diff.append(diff)
76

  
77
        return time_diff
78

  
79
    def plot_percentage_overlap(self, time_diff, top_k, time_window):
80
        out_filepath = './output/percentage_overlap/top_k_%s_window_%s.png' % (top_k, time_window)
81
        fig = custom_plot.plot_time_diff(time_diff,
63 82
                        title='FFGraz',
64 83
                        xlabel='time_diff',
65 84
                        ylabel='Percentage overlap for top-k = %s' % top_k,
66 85
                        ylim=(40, 101),
67 86
                        out_filepath=out_filepath)
68 87

  
88
    def filtered_node_indices(self, cutoff_value=0):
89
        """Return a list of node indices, such that each node has the maximum BC score > 0.05 in at least one snapshot
90
        """
91
        max_values = np.nanmax(self.scores, axis=0)
69 92

  
70
    def _plot_time_diff(self, time_diff, title='', xlabel='', ylabel='', ylim=None, out_filepath=''):
71
        max_key = max(time_diff.keys()) + 1
72
        min_diff = [0 for i in range(max_key)]
73
        max_diff = [0 for i in range(max_key)]
74
        mean_diff = [0 for i in range(max_key)]
75

  
76
        for i, value in time_diff.iteritems():
77
            min_diff[i] = np.min(value)
78
            mean_diff[i] = np.mean(value)
79
            max_diff[i] = np.max(value)
80

  
81
        fig = plt.figure()
82
        # Plot
83
        x_range = sorted(time_diff.keys())
84
        plt.plot(x_range, min_diff, label='min')
85
        plt.plot(x_range, mean_diff, label='mean')
86
        plt.plot(x_range, max_diff, label='max')
87

  
88
        plt.ylabel(ylabel)
89
        plt.xlabel(xlabel)
90
        if ylim:
91
            plt.ylim(ylim)
92
        plt.legend()
93
        plt.title(title)
94

  
95
        if out_filepath:
93
        return [i for i in range(len(max_values)) if max_values[i] > cutoff_value]
94

  
95
    def plot_bc_score(self):
96
        node_indices = self.filtered_node_indices(cutoff_value=0.05)
97
        output_base = './output/bc_score/node_id_%s.png'
98

  
99
        x_range = range(self.num_of_snapshots)
100
        for n_index in node_indices:
101
            out_filepath = output_base % n_index
102
            plt.plot(x_range, self.scores[:,n_index])
103
            plt.title("Node id = %s" % n_index)
104
            # plt.title("Node name = %s" % self.node_index_map[n_index])
105
            plt.ylim([0, 0.6])
96 106
            plt.savefig(out_filepath)
97
        else:
98
            plt.show()
107
            plt.close()
108
            # plt.show()
109

  
110

  
111
def experiment_1(exp):
112
    """Shows the percentage over lap for multiple snapshots, with different time
113
    window
114
    """
115
    percentages = [i/10. for i in range(1, 6)]
116
    top_ks = [int(p*exp.num_of_nodes) for p in percentages]
117

  
118
    # Comparing percentage_overlap for different time window values
119
    time_windows = [1, 10, 20, 30, 40, 50]
120
    for k in top_ks:
121
        results = dict()
122
        for tw in time_windows:
123
            time_diff = exp.percentage_overlap(top_k=k, time_window=tw)
124

  
125
            results[tw] = time_diff
126

  
127
        out_filepath = './output/percentage_overlap/top_k_%s.png' % k
128
        custom_plot.plot_time_diff(results, ylim=(50, 102), out_filepath=out_filepath)
129

  
130
    # Comparing percentage overlap for different p-value
131
    results = dict()
132
    for k in top_ks:
133
        time_diff = exp.percentage_overlap(top_k=k, time_window=1)
134
        results[k] = time_diff
135

  
136
    out_filepath = './output/percentage_overlap/percentage_overlap_with_different_top_k_scatter.png'
137
    custom_plot.plot_time_diff(results, scatter=True, ylim=(50, 102), title="Percentage overlap for different top-k",
138
        out_filepath=out_filepath)
139

  
140
    out_filepath = './output/percentage_overlap/percentage_overlap_with_different_top_k_line.png'
141
    custom_plot.plot_time_diff(results, ylim=(50, 102), title="Percentage overlap for different top-k",
142
        out_filepath=out_filepath)
143

  
144
    for k, vals in results.iteritems():
145
        out_filepath = './output/percentage_overlap/percentage_overlap_with_different_top_k_%s_line.png' % k
146
        custom_plot.plot_single_time_diff(vals, ylim=(50, 102), title="Percentage overlap with top-k",
147
            out_filepath=out_filepath)
148

  
149

  
150
def experiment_2(exp):
151
    out_filepath = './output/histogram_bc_score.png'
152
    custom_plot.scatter_histogram(exp.scores, out_filepath)
99 153

  
100
def all_files_for_network(network_name, dir):
101
    files = []
102
    for file in os.listdir(dir):
103
        prefix = file.split('_')[0]
104
        if prefix == network_name:
105
            files.append(os.path.join(dir, file))
106 154

  
107
    return files
155
def experiment_3(exp, k_max=1, slack_var=0):
156
    """Analyses the "inter-change" distribution
157

  
158
    slack_var: is used to treat BC scores between 2 different
159
    snapshot as they are the same when the difference is small
160
    """
161
    node_indices = exp.filtered_node_indices(cutoff_value=0.05)
162

  
163
    rows = len(node_indices)
164
    mapped_node_indices = {node_indices[i]: i for i in range(rows)}
165

  
166
    cols = k_max + 1
167
    T = np.zeros((rows, cols))
168

  
169
    s_range = max(0, exp.num_of_snapshots - k_max)
170
    # print s_range
171
    # print exp.num_of_snapshots
172
    # k_range = 2
173

  
174
    # print "s_range %s" % range(s_range)
175
    # print "k_range %s" % range(k_max)
176

  
177
    # print "T cols = %s" % len(T[0])
178
    for n_original_index in node_indices:
179
        n_index = mapped_node_indices[n_original_index]
180
        s = 0
181
        while (s < s_range):
182
            # print s
183
            old_score = exp.scores[s][n_original_index]
184
            for k in range(k_max):
185
                # print "%i | %i" % (s, s + k)
186
                new_score = exp.scores[s + k][n_original_index]
187
                if abs(old_score - new_score) > slack_var:
188
                    T[n_index][k] += 1
189
                    s += k - 1
190
                    break;
191

  
192
                T[n_index][k_max] += 1
193

  
194
            s += 1
195

  
196
    #####
197
    # OUTPUT
198
    #####
199

  
200
    # plot scatter
201
    x_range = range(1, k_max)
202
    # for i in range(rows):
203
    #     plt.scatter(x_range, T[i][1:-1])
204

  
205
    # plot histogram
206
    considered_data = T[:,1:-r1]
207
    y_range = np.sum(considered_data, axis=0)
208
    y_range_total = np.sum(considered_data)
209

  
210
    # count non-zero
211
    y_count_non_zero = []
212
    for c in range(len(y_range)):
213
        y_count_non_zero.append(considered_data[:,c])
214
    # y_count_non_zero = np.count_nonzero(considered_data, axis=0)
215
    y_range_normalized = y_range / y_range_total
216
    if y_range_total != np.sum(y_range):
217
        print "ERRROR XXX\n"
218

  
219
    if np.sum(y_range_normalized) != 1:
220
        "--- ERROR sum Normalized values is not equal to 1: %s" % y_range_normalized
221

  
222
    plt.plot(x_range, y_range_normalized)
223

  
224
    plt.ylabel('(?) Probability that BC score is changed')
225
    plt.xlabel('k')
226
    output_basename = './output/histogram_interchange_k_%i_slack_%.3f' % (k_max, slack_var)
227
    plt.text(0, 0.8, y_range_total)
228
    plt.ylim(0, 1.1)
229
    plt.title(output_basename)
230
    plt.savefig(output_basename + '.png')
231
    plt.close()
232

  
233
    # save to the text format
234
    np.savetxt(output_basename + '.out', T, '%i')
235

  
236
    # plot cumulative distribution
237
    cumm = [sum(y_range_normalized[:i+1]) for i in range(len(y_range_normalized))]
238
    plt.plot(x_range, cumm)
239
    output_basename = './output/cumulatie_interchange_k_%i_slack_%.3f' % (k_max, slack_var)
240
    out_filepath = output_basename + '.png'
241
    plt.ylim(0, 1.1)
242
    plt.title(output_basename)
243
    plt.savefig(out_filepath)
244
    plt.close()
245

  
246

  
247
def run_experiment_3(exp):
248
    # # For testing
249
    # experiment_3(exp, k_max=5) # running with default slack_var = 0
250
    # experiment_3(exp, k_max=5, slack_var=0.01)
251
    # experiment_3(exp, k_max=50, slack_var=0.001)
252
#    For real run
253
    k_max = [50, 100, 114]
254
    for k in k_max:
255
        slack_vars = [i * 0.001 for i in range(11)]
256
        for slack_var in slack_vars:
257
            experiment_3(exp, k_max=k, slack_var=slack_var)
258

  
259
        slack_vars = [i * 0.02 for i in range(1, 10)]
260
        for slack_var in slack_vars:
261
            experiment_3(exp, k_max=k, slack_var=slack_var)
262

  
108 263

  
109 264
def main():
110
    dir = 'output'
265
    # INPUT_DIR = 'output'
266
    if len(sys.argv) == 2:
267
        INPUT_DIR = sys.argv[1]
268
    else:
269
        INPUT_DIR = 'output2'
270

  
111 271
    network = 'FFGraz'
112
    files = all_files_for_network(network, dir)
272
    files = utility.all_files_for_network(network, INPUT_DIR)
273

  
113 274
    num_of_snapshots = len(files)
114 275
    num_of_nodes = 200
115 276
    exp = Experiment(num_of_nodes, num_of_snapshots)
......
118 279

  
119 280
    exp.summarize()
120 281

  
121
    # Show the percentage over lap for multiple snapshots
122
    percentages = [i/10. for i in range(1, 6)]
123
    top_ks = [int(p*num_of_nodes) for p in percentages]
124
    for k in top_ks:
125
        exp.percentage_overlap(k)
282
    # experiment_1(exp)
283

  
284
    # experiment_2(exp)
285

  
286
    # run_experiment_3(exp)
287

  
126 288

  
127 289
if __name__ == '__main__':
128 290
    main()
globecomm/custom_plot.py
1
import numpy as np
2
import matplotlib.pyplot as plt
3

  
4
def plot_time_diff(time_diff, scatter=False, title='', xlabel='', ylabel='', ylim=None, out_filepath=''):
5
    """Helper function for percentage_overlap()
6
    """
7
    tw_keys = sorted(time_diff.keys())
8

  
9
    arr = np.linspace(0, 1, len(tw_keys))
10
    colors = get_colors_for_array(arr, 'hsv')
11

  
12
    for index, k in enumerate(tw_keys):
13
        values = time_diff[k]
14
        if scatter:
15
            plt.scatter(range(len(values)), values, color=colors[index], s=2, label=k)
16
        else:
17
            plt.plot(range(len(values)), values, label=k)
18

  
19
    plt.ylabel(ylabel)
20
    plt.xlabel(xlabel)
21
    if ylim:
22
        plt.ylim(ylim)
23
    plt.legend()
24
    plt.title(title)
25

  
26
    if out_filepath:
27
        plt.savefig(out_filepath)
28
        plt.close()
29
    else:
30
        plt.show()
31

  
32
def plot_single_time_diff(time_diff, scatter=False, title='', xlabel='', ylabel='', ylim=None, out_filepath=''):
33
    if scatter:
34
        plt.scatter(range(len(time_diff)), values, s=2)
35
    else:
36
        plt.plot(range(len(time_diff)), time_diff)
37

  
38
    plt.ylabel(ylabel)
39
    plt.xlabel(xlabel)
40
    if ylim:
41
        plt.ylim(ylim)
42
    plt.legend()
43
    plt.title(title)
44

  
45
    if out_filepath:
46
        plt.savefig(out_filepath)
47
        plt.close()
48
    else:
49
        plt.show()
50

  
51

  
52
def plot_time_diff_window(time_diff, window=1, title='', xlabel='', ylabel='', ylim=None, out_filepath=''):
53
    """Plots only one row of time_diff, coressponding to window = 1.
54

  
55
    It shows the fluctation with times
56
    """
57
    data = time_diff[window]
58
    x_range = range(len(data))
59
    plt.scatter(x_range, data)
60
    if ylim:
61
        plt.ylim(ylim)
62

  
63
    if out_filepath:
64
        plt.savefig(out_filepath)
65
        plt.close()
66
    else:
67
        plt.show()
68

  
69
def plot_time_diff_evolvement(time_diff, window=1, title='', xlabel='', ylabel='', ylim=None, out_filepath=''):
70
    """Plot shows the percentage_overlap of between the first snapshot and other subsequent snapshots
71
    """
72
    plot_data = [0 for i in range(len(time_diff.keys()))]
73
    for k, values in time_diff.iteritems():
74
        plot_data[k-1] = values[window-1]
75

  
76
    x_range = range(1, len(plot_data) + 1)
77
    plt.scatter(x_range, plot_data)
78

  
79
    if ylim:
80
        plt.ylim(ylim)
81

  
82
    if out_filepath:
83
        plt.savefig(out_filepath)
84
        plt.close()
85
    else:
86
        plt.show()
87

  
88

  
89
def scatter_histogram(values, out_filepath):
90
    """Shows (or saves the figure) of  the scater plot of the BC scores
91
    for each node.
92
    X-axis the is node
93
    Each dot on the graph is the BC score of one snapshots.
94
    """
95
    num_of_rows = len(values)
96
    num_of_cols = len(values[0])
97

  
98
    bc_min = np.nanmin(values, axis=0)
99
    bc_max = np.nanmax(values, axis=0)
100
    bc_mean = np.nanmean(values, axis=0)
101

  
102
    sorted_indices = sorted(range(num_of_cols), key = lambda k : bc_max[k])
103

  
104
    x_range = range(num_of_cols)
105
    for i in range(num_of_rows):
106
        plt.scatter(x_range, [values[i][j] for j in sorted_indices], s=1)
107

  
108
    # plt.plot(x_range, [bc_min[j] for j in sorted_indices])
109
    plt.plot(x_range, [bc_mean[j] for j in sorted_indices])
110
    # plt.plot(x_range, [bc_max[j] for j in sorted_indices])
111
    if out_filepath:
112
        plt.savefig(out_filepath)
113
    else:
114
        plt.show()
115

  
116
def get_colors_for_array(arr, color_map_name):
117
    return plt.get_cmap(color_map_name)(arr)
globecomm/heuristic_bc/utility.py
62 62
    return graph
63 63

  
64 64

  
65
def plot_graph(output_filepath, graph, title=""):
65
def plot_graph(out_filepath, graph, title=""):
66 66
    plt.figure(1, figsize=(15, 10))
67 67
    plt.title(title)
68 68
    pos=nx.spring_layout(graph)
69 69
    colors=range(30)
70 70
    nx.draw(graph, pos, node_size=1000, with_label=True)
71 71
    nx.draw_networkx_labels(graph,pos,font_size=20,font_family='sans-serif')
72
    plt.savefig(output_filepath)
72
    plt.savefig(out_filepath)
73 73
    plt.close()
74 74
    # plt.show()
75 75

  
globecomm/main.py
2 2
from os.path import isfile, join, splitext, basename
3 3
import sys
4 4
import networkx as nx
5
import numpy
5 6

  
6 7
sys.path.append("heuristic_bc/")
7 8
from heuristic_betweenness_centrality import HeuristicBetweennessCentrality as HBC
8 9

  
9 10
from pdb import set_trace as debugger
10 11

  
11

  
12
GRAPH_DIRS='/home/quynh/Thesis/quynhnguyen-ms/experiment_input_graphs/CNGraphs/snapshots'
13 12
def get_all_files_from_dir(dir):
14 13
    return [join(dir, f) for f in listdir(dir) if isfile(join(dir, f))]
15 14

  
16 15

  
17 16
if __name__ == '__main__':
18
    onlyfiles = get_all_files_from_dir(GRAPH_DIRS)
17
    input_dir = ''
18
    # Default input directory
19
    GRAPH_DIRS='/home/quynh/Thesis/quynhnguyen-ms/experiment_input_graphs/CNGraphs/snapshots/small_samples'
20

  
21
    if len(sys.argv) > 1:
22
        input_dir = sys.argv[1]
23
    else:
24
        input_dir = GRAPH_DIRS
25

  
26
    onlyfiles = get_all_files_from_dir(input_dir)
19 27

  
20 28
    for in_filepath in onlyfiles:
21 29
        filename = splitext(basename(in_filepath))[0]
22 30
        graph = nx.read_weighted_edgelist(in_filepath)
23
        hbc = HBC(graph)
24 31
        out_filepath = join('output/', filename + '.hbcout')
25
        hbc.write(out_filepath)
32

  
33
        # Brandes BC
34
        bbc = nx.betweenness_centrality(graph, weight=True, normalized=True, endpoints=True)
35
        out_data = [[k, v] for (k, v) in bbc.iteritems()]
36
        numpy.savetxt(out_filepath, out_data, delimiter=',', fmt='%s')
37
        print "Finish calculating BC for %s" % filename
38

  
39
        # # Heuristic BC
40
        # hbc = HBC(graph)
41
        # hbc.write(out_filepath)
26 42

  
27 43

  
28 44

  
globecomm/metrics/__init__.py
1
from .percentage_overlap import *
globecomm/metrics/percentage_overlap.py
1
from __future__ import division
2
from pdb import set_trace as debugger
3

  
4
__all__ = ['percentage_overlap']
5

  
6
def percentage_overlap(matrices, top_k=1):
7
    num_of_rows = len(matrices)
8
    # debugger()
9
    if num_of_rows < 2:
10
        return 100
11

  
12
    x_sorted_indices = _sort_with_index(matrices[0])
13

  
14
    intersection_set = set(x_sorted_indices[:top_k])
15
    for r in range(1, num_of_rows):
16
        y_sorted_indices = _sort_with_index(matrices[r])
17

  
18
        # get top k
19
        y_top_k = set(y_sorted_indices[:top_k])
20

  
21
        # debugger()
22
        intersection_set = intersection_set.intersection(y_top_k)
23

  
24
    return len(intersection_set) * 100. / top_k
25

  
26
def _sort_with_index(arr):
27
    return sorted(range(len(arr)), key=lambda k: arr[k])
28

  
globecomm/script.sh
1
python main.py
2
python analyse_snapshot.py
1
########
2
# STEPS 1
3
# calculating the BC scores for every *.edges files in a directory.
4
# The result is saved in ./output directory
5
########
6
python main.py /home/quynh/Thesis/quynhnguyen-ms/experiment_input_graphs/CNGraphs/snapshots
7

  
8
########
9
# STEPS 2
10
# analyze the BC scores for multiple snapshots given the folder containing all the BC scores
11
# You can modify which experiment you want to run in the main() function
12
# Experiment 1 plots the percentage overlap
13
# Experiment 3 plots
14
#     - the inter-change histogram for BC scores across snapshot.
15
#     - the cumulative distribution for inter-change
16
########
17
python analyse_snapshot.py ./bc_output
globecomm/utility.py
1
import os
2

  
3
def get_int(name):
4
    """Helper function of all_files_for_network()
5
    """
6
    basename = name.partition('.')[0]
7
    # print basename
8
    alpha, num = basename.split('_')
9
    return int(num)
10

  
11

  
12
def all_files_for_network(network_name, dir):
13
    """Returns the all the files started with the <network_name>
14
    in the sorted order in the <dir> directory
15

  
16
    Sample output: [FFGraz_1.edges, FFGraz_2.edges, ...., FFGraz_10.edges, ...]
17
    """
18
    files = []
19
    for file in os.listdir(dir):
20
        prefix = file.split('_')[0]
21
        if prefix == network_name:
22
            files.append(os.path.join(dir, file))
23

  
24
    files.sort(key=get_int)
25

  
26
    return files

Also available in: Unified diff