Statistics
| Branch: | Revision:

root / globecomm / analyse_snapshot.py @ fac6e5a4

History | View | Annotate | Download (9.37 KB)

1 bd3d6dca Quynh PX Nguyen
import sys
2
from collections import defaultdict
3
import numpy as np
4
import scipy
5
from scipy import stats
6
import matplotlib.pyplot as plt
7 fac6e5a4 Quynh PX Nguyen
import utility
8 bd3d6dca Quynh PX Nguyen
9 5f27ee90 Quynh PX Nguyen
import metrics
10 fac6e5a4 Quynh PX Nguyen
import custom_plot
11
12 5f27ee90 Quynh PX Nguyen
13 bd3d6dca Quynh PX Nguyen
from pdb import set_trace as debugger
14
15
class Experiment():
    """Analyze betweenness centrality (BC) scores across network snapshots.

    Scores live in ``self.scores``, a ``(num_of_snapshots, num_of_nodes)``
    array pre-filled with NaN: one row per snapshot, one column per node.
    Node ids are mapped to column indices in the order they are first seen.

    We want to know how often the BC score changes.
    """

    def __init__(self, num_of_nodes, num_of_snapshots):
        """Allocate storage for ``num_of_snapshots`` x ``num_of_nodes`` scores."""
        # NaN marks "no score recorded for this node in this snapshot".
        self.scores = np.full((num_of_snapshots, num_of_nodes), np.nan)
        # Row of ``self.scores`` that the next loaded snapshot will fill.
        self.index_of_snapshot = 0
        self.num_of_nodes = num_of_nodes
        self.num_of_snapshots = num_of_snapshots
        # node id -> column index in ``self.scores``.
        self.node_index_map = dict()
        # Next free column index.
        self.current_max_node_index = 0

    def index_of_node(self, node_id):
        """Return the column index assigned to ``node_id``.

        Terminates the program (SystemExit, non-zero status) when the node
        has never been registered.
        """
        try:
            return self.node_index_map[node_id]
        except KeyError:
            # Passing the message makes the process exit with status 1
            # instead of reporting success.
            sys.exit("ERROR: node doesn't exist")

    def add_new_result(self, in_filepath):
        """Load one snapshot file and store it as the next row of scores.

        The file is read as comma-separated ``node_id,score`` rows.
        Terminates the program (SystemExit, non-zero status) when every
        snapshot slot is already filled.
        """
        if self.index_of_snapshot >= self.num_of_snapshots:
            sys.exit("ERROR: the number of snapshots provided is less than the input files")

        data = np.loadtxt(in_filepath, delimiter=',', dtype={
            'names': ('node_id', 'score'),
            # 'S100' is the same fixed-width byte string as the deprecated
            # 'a100' alias.
            'formats': ('S100', 'f4'),
        })

        # loadtxt returns a 0-d array for a single-row file; atleast_1d makes
        # it iterable like the multi-row case.
        for row in np.atleast_1d(data):
            self.update_score(row[0], row[1])

        self.index_of_snapshot += 1

    def update_score(self, node_id, score):
        """Record ``score`` for ``node_id`` in the current snapshot row.

        Unknown node ids are registered on first use.

        Raises:
            IndexError: if registering the node would exceed ``num_of_nodes``.
        """
        if node_id not in self.node_index_map:
            # Check capacity before registering so the map is never left
            # pointing at a column that does not exist.
            if self.current_max_node_index >= self.num_of_nodes:
                raise IndexError(
                    "cannot register node %r: capacity of %d nodes exceeded"
                    % (node_id, self.num_of_nodes))
            self.node_index_map[node_id] = self.current_max_node_index
            self.current_max_node_index += 1

        node_index = self.index_of_node(node_id)
        self.scores[self.index_of_snapshot][node_index] = score

    def summarize(self):
        """Return the average score of each snapshot (one value per row).

        A row containing NaN (a node without a score) averages to NaN.
        """
        return np.average(self.scores, axis=1)

    #########
    # METRICS
    #########
    def percentage_overlap(self, top_k=20, time_window=1):
        """Compute the top-k overlap for every window of snapshots.

        For each start snapshot ``j`` the rows ``j .. j + time_window`` are
        passed to ``metrics.percentage_overlap`` together with ``top_k``.

        Returns:
            list with one result per window start, in snapshot order.
        """
        return [
            metrics.percentage_overlap(self.scores[j:j+time_window+1,:], top_k)
            for j in range(self.num_of_snapshots - time_window)
        ]

    def plot_percentage_overlap(self, time_diff, top_k, time_window):
        """Plot ``time_diff`` via ``custom_plot.plot_time_diff``.

        The output path is under ./output/percentage_overlap/ and is named
        after ``top_k`` and ``time_window``.
        """
        out_filepath = './output/percentage_overlap/top_k_%s_window_%s.png' % (top_k, time_window)
        custom_plot.plot_time_diff(time_diff,
                        title='FFGraz',
                        xlabel='time_diff',
                        ylabel='Percentage overlap for top-k = %s' % top_k,
                        ylim=(40, 101),
                        out_filepath=out_filepath)

    def filtered_node_indices(self, cutoff_value=0):
        """Return indices of nodes whose maximum score exceeds ``cutoff_value``.

        The maximum is taken over all snapshots and ignores NaN entries;
        a node with no recorded score at all is never selected.

        Returns:
            list of int column indices, in ascending order.
        """
        if self.scores.shape[0] == 0:
            # No snapshots: nothing can exceed the cutoff.
            return []

        # Replace NaN by -inf so unused columns do not trigger NumPy's
        # "All-NaN slice" warning and can never pass the comparison.
        filled = np.where(np.isnan(self.scores), -np.inf, self.scores)
        max_values = filled.max(axis=0)
        return np.flatnonzero(max_values > cutoff_value).tolist()

    def plot_bc_score(self):
        """Save one score-over-snapshots plot per node whose max score > 0.05.

        Files are written to ./output/bc_score/node_id_<index>.png.
        """
        node_indices = self.filtered_node_indices(cutoff_value=0.05)
        output_base = './output/bc_score/node_id_%s.png'

        x_range = list(range(self.num_of_snapshots))
        for n_index in node_indices:
            out_filepath = output_base % n_index
            plt.plot(x_range, self.scores[:,n_index])
            plt.title("Node id = %s" % n_index)
            plt.ylim([0, 0.6])
            plt.savefig(out_filepath)
            # Close the figure so successive nodes do not draw on top of
            # each other.
            plt.close()
109
110
111
def experiment_1(exp):
    """Plot the percentage overlap over snapshots for several settings.

    top-k values are 10%..50% of ``exp.num_of_nodes``. Three groups of
    figures are written under ./output/percentage_overlap/:

    * per top-k, one figure comparing several time windows;
    * one scatter and one line figure comparing all top-k values at
      time_window=1;
    * one line figure per top-k value at time_window=1.
    """
    percentages = [i/10. for i in range(1, 6)]
    top_ks = [int(p*exp.num_of_nodes) for p in percentages]

    # Comparing percentage_overlap for different time window values
    time_windows = [1, 10, 20, 30, 40, 50]
    for k in top_ks:
        results = {
            tw: exp.percentage_overlap(top_k=k, time_window=tw)
            for tw in time_windows
        }

        out_filepath = './output/percentage_overlap/top_k_%s.png' % k
        custom_plot.plot_time_diff(results, ylim=(50, 102), out_filepath=out_filepath)

    # Comparing percentage overlap for different top-k values
    results = {
        k: exp.percentage_overlap(top_k=k, time_window=1)
        for k in top_ks
    }

    out_filepath = './output/percentage_overlap/percentage_overlap_with_different_top_k_scatter.png'
    custom_plot.plot_time_diff(results, scatter=True, ylim=(50, 102), title="Percentage overlap for different top-k",
        out_filepath=out_filepath)

    out_filepath = './output/percentage_overlap/percentage_overlap_with_different_top_k_line.png'
    custom_plot.plot_time_diff(results, ylim=(50, 102), title="Percentage overlap for different top-k",
        out_filepath=out_filepath)

    # items() works on both Python 2 and 3; iteritems() exists only on 2.
    for k, vals in results.items():
        out_filepath = './output/percentage_overlap/percentage_overlap_with_different_top_k_%s_line.png' % k
        custom_plot.plot_single_time_diff(vals, ylim=(50, 102), title="Percentage overlap with top-k",
            out_filepath=out_filepath)
148
149
150
def experiment_2(exp):
    """Hand all recorded scores to ``custom_plot.scatter_histogram``.

    The output path passed along is ./output/histogram_bc_score.png.
    """
    custom_plot.scatter_histogram(exp.scores, './output/histogram_bc_score.png')
153 bd3d6dca Quynh PX Nguyen
154
155 fac6e5a4 Quynh PX Nguyen
def experiment_3(exp, k_max=1, slack_var=0):
    """Analyses the "inter-change" distribution.

    For every node whose maximum score exceeds 0.05, walk through the
    snapshots and count after how many snapshots (offset ``k``) the score
    first differs from the reference snapshot by more than ``slack_var``.
    Counts are collected in a table ``T`` with one row per node and
    ``k_max + 1`` columns: column ``k`` holds the number of changes seen at
    offset ``k``, the last column holds the number of comparisons that
    showed no change.

    Outputs, all under ./output/:

    * histogram_interchange_k_<k_max>_slack_<slack_var>.png -- normalized
      counts for offsets 1 .. k_max - 1;
    * histogram_interchange_k_<k_max>_slack_<slack_var>.out -- table ``T``;
    * cumulatie_interchange_k_<k_max>_slack_<slack_var>.png -- running sum
      of the normalized counts.

    k_max: largest offset (in snapshots) that is examined.
    slack_var: is used to treat BC scores between 2 different
    snapshots as the same when the difference is small. Must be >= 0.

    Raises:
        ValueError: if ``slack_var`` is negative.
    """
    if slack_var < 0:
        # With a negative tolerance every score "differs" from itself at
        # offset 0, so the scan below would never advance.
        raise ValueError("slack_var must be non-negative, got %r" % (slack_var,))

    node_indices = exp.filtered_node_indices(cutoff_value=0.05)

    rows = len(node_indices)
    cols = k_max + 1
    T = np.zeros((rows, cols))

    # Last reference snapshot for which all k_max offsets stay in range.
    s_range = max(0, exp.num_of_snapshots - k_max)

    for n_index, n_original_index in enumerate(node_indices):
        s = 0
        while s < s_range:
            old_score = exp.scores[s][n_original_index]
            for k in range(k_max):
                new_score = exp.scores[s + k][n_original_index]
                if abs(old_score - new_score) > slack_var:
                    T[n_index][k] += 1
                    # Combined with the "s += 1" below this moves the
                    # reference to the snapshot where the change was seen.
                    s += k - 1
                    break

                # NOTE(review): this runs once per unchanged comparison, not
                # once per window without any change (which would be a
                # for/else) -- confirm which count is intended.
                T[n_index][k_max] += 1

            s += 1

    #####
    # OUTPUT
    #####

    x_range = list(range(1, k_max))

    # plot histogram
    # Drop column 0 (offset 0 compares a snapshot with itself) and the last
    # column (the "no change" counter).
    considered_data = T[:,1:-1]
    y_range = np.sum(considered_data, axis=0)
    y_range_total = np.sum(considered_data)

    if y_range_total != np.sum(y_range):
        print("ERRROR XXX\n")

    if y_range_total > 0:
        y_range_normalized = y_range / y_range_total
        # Compare with a tolerance: the normalized floats rarely sum to
        # exactly 1.
        if not np.isclose(np.sum(y_range_normalized), 1):
            print("--- ERROR sum Normalized values is not equal to 1: %s" % (y_range_normalized,))
    else:
        # No change was counted at all; avoid dividing by zero.
        y_range_normalized = np.zeros_like(y_range)

    plt.plot(x_range, y_range_normalized)

    plt.ylabel('(?) Probability that BC score is changed')
    plt.xlabel('k')
    output_basename = './output/histogram_interchange_k_%i_slack_%.3f' % (k_max, slack_var)
    plt.text(0, 0.8, y_range_total)
    plt.ylim(0, 1.1)
    plt.title(output_basename)
    plt.savefig(output_basename + '.png')
    plt.close()

    # save to the text format
    np.savetxt(output_basename + '.out', T, '%i')

    # plot cumulative distribution
    cumm = np.cumsum(y_range_normalized)
    plt.plot(x_range, cumm)
    output_basename = './output/cumulatie_interchange_k_%i_slack_%.3f' % (k_max, slack_var)
    out_filepath = output_basename + '.png'
    plt.ylim(0, 1.1)
    plt.title(output_basename)
    plt.savefig(out_filepath)
    plt.close()
245
246
247
def run_experiment_3(exp):
    """Run ``experiment_3`` over a grid of ``k_max`` and ``slack_var`` values.

    For each k_max in (50, 100, 114) the slack values 0.000 .. 0.010
    (step 0.001) are tried first, followed by 0.02 .. 0.18 (step 0.02).
    """
    # Fine-grained tolerances first, then the coarse ones; the list is the
    # same for every k_max, so build it once.
    fine_slacks = [i * 0.001 for i in range(11)]
    coarse_slacks = [i * 0.02 for i in range(1, 10)]
    all_slacks = fine_slacks + coarse_slacks

    for k in [50, 100, 114]:
        for slack_var in all_slacks:
            experiment_3(exp, k_max=k, slack_var=slack_var)
262
263 bd3d6dca Quynh PX Nguyen
264
def main():
    """Load every snapshot of the 'FFGraz' network and summarize it.

    The input directory is the single command-line argument when exactly
    one is given, otherwise 'output2'. The experiment is sized for 200
    nodes and one snapshot per input file.
    """
    input_dir = sys.argv[1] if len(sys.argv) == 2 else 'output2'

    network = 'FFGraz'
    # presumably the snapshot file paths of this network -- verify against
    # utility.all_files_for_network
    files = utility.all_files_for_network(network, input_dir)

    exp = Experiment(num_of_nodes=200, num_of_snapshots=len(files))
    for snapshot_path in files:
        exp.add_new_result(snapshot_path)

    exp.summarize()

    # Uncomment the experiment(s) to run:
    # experiment_1(exp)

    # experiment_2(exp)

    # run_experiment_3(exp)
287
288 bd3d6dca Quynh PX Nguyen
289
if __name__ == "__main__":
    # Run the analysis only when executed as a script, not on import.
    main()