Statistics
| Branch: | Revision:

root / globecomm / analyse_snapshot.py @ fac6e5a4

History | View | Annotate | Download (9.37 KB)

1
import sys
2
from collections import defaultdict
3
import numpy as np
4
import scipy
5
from scipy import stats
6
import matplotlib.pyplot as plt
7
import utility
8

    
9
import metrics
10
import custom_plot
11

    
12

    
13
from pdb import set_trace as debugger
14

    
15
class Experiment(object):
    """Collects betweenness-centrality (BC) scores across multiple snapshots
    of community networks.

    We want to know how often the BC score changes.

    ``self.scores`` is a ``(num_of_snapshots, num_of_nodes)`` array that
    starts filled with NaN.  Row ``i`` holds the scores loaded for the i-th
    snapshot; each node gets a column, assigned in order of first appearance
    and recorded in ``self.node_index_map``.
    """

    def __init__(self, num_of_nodes, num_of_snapshots):
        """Allocate storage for ``num_of_snapshots`` x ``num_of_nodes`` scores."""
        self.scores = np.empty((num_of_snapshots, num_of_nodes))
        self.scores.fill(np.nan)
        self.index_of_snapshot = 0
        self.num_of_nodes = num_of_nodes
        self.num_of_snapshots = num_of_snapshots
        self.node_index_map = dict()
        self.current_max_node_index = 0

    def index_of_node(self, node_id):
        """Return the column index assigned to ``node_id``.

        Exits the program (SystemExit with an error message, hence a non-zero
        exit status) when the node has never been registered.
        """
        try:
            return self.node_index_map[node_id]
        except KeyError:
            # Passing the message to sys.exit() reports it on stderr and makes
            # the exit status non-zero, so callers/shell scripts see a failure.
            sys.exit("ERROR: node doesn't exist")

    def add_new_result(self, in_filepath):
        """Load one snapshot file and store it as the next row of scores.

        The file is read with ``np.loadtxt`` as comma-separated
        ``node_id,score`` records.  Exits the program when more files are
        added than snapshots were allocated.
        """
        if self.index_of_snapshot >= self.num_of_snapshots:
            sys.exit("ERROR: the number of snapshots provided is less than the input files")

        data = np.loadtxt(in_filepath, delimiter=',', dtype={
            'names': ('node_id', 'score'),
            # 'S100' is the portable spelling of the deprecated 'a100' alias.
            'formats': ('S100', 'f4'),
        })

        # loadtxt returns a 0-d array for a single-row file, which cannot be
        # iterated; atleast_1d makes one-row and many-row files look alike.
        for row in np.atleast_1d(data):
            self.update_score(row[0], row[1])

        self.index_of_snapshot += 1

    def update_score(self, node_id, score):
        """Store ``score`` for ``node_id`` in the current snapshot row.

        Unknown nodes are assigned the next free column.  Raises IndexError
        when a new node would not fit into the ``num_of_nodes`` columns.
        """
        node_index = self.node_index_map.get(node_id)
        if node_index is None:
            # Check capacity before registering, so a failed insert does not
            # leave a dangling entry in the node map.
            if self.current_max_node_index >= self.num_of_nodes:
                raise IndexError(
                    "cannot register node %r: all %d node slots are in use"
                    % (node_id, self.num_of_nodes))
            node_index = self.current_max_node_index
            self.node_index_map[node_id] = node_index
            self.current_max_node_index += 1

        self.scores[self.index_of_snapshot][node_index] = score

    def summarize(self):
        """Return the mean score of every snapshot (one value per row).

        Unused node columns hold NaN, so the mean ignores NaN entries; a
        snapshot row without any score yields NaN.
        """
        return np.nanmean(self.scores, axis=1)

    #########
    # METRICS
    #########
    def percentage_overlap(self, top_k=20, time_window=1):
        """Return the list of ``metrics.percentage_overlap`` results over
        sliding windows of snapshots.

        For every start index ``j`` the rows ``j .. j + time_window`` of the
        score matrix are passed to ``metrics.percentage_overlap`` together
        with ``top_k``.  Presumably that is the proportion of nodes remaining
        in the top-k by BC score across the window -- confirm in ``metrics``.
        """
        return [
            metrics.percentage_overlap(self.scores[j:j + time_window + 1, :], top_k)
            for j in range(self.num_of_snapshots - time_window)
        ]

    def plot_percentage_overlap(self, time_diff, top_k, time_window):
        """Plot ``time_diff`` through ``custom_plot.plot_time_diff``, passing
        a path under ./output/percentage_overlap/ as ``out_filepath``.
        """
        out_filepath = './output/percentage_overlap/top_k_%s_window_%s.png' % (top_k, time_window)
        custom_plot.plot_time_diff(time_diff,
                        title='FFGraz',
                        xlabel='time_diff',
                        ylabel='Percentage overlap for top-k = %s' % top_k,
                        ylim=(40, 101),
                        out_filepath=out_filepath)

    def filtered_node_indices(self, cutoff_value=0):
        """Return the list of node indices whose maximum BC score over all
        snapshots is strictly greater than ``cutoff_value``.

        Columns that contain only NaN are never selected, because a NaN
        maximum does not compare greater than the cutoff.
        """
        max_values = np.nanmax(self.scores, axis=0)

        return [i for i, value in enumerate(max_values) if value > cutoff_value]

    def plot_bc_score(self):
        """Save one BC-score-over-snapshots plot per node whose maximum score
        exceeds 0.05, as ./output/bc_score/node_id_<index>.png.
        """
        node_indices = self.filtered_node_indices(cutoff_value=0.05)
        output_base = './output/bc_score/node_id_%s.png'

        x_range = range(self.num_of_snapshots)
        for n_index in node_indices:
            out_filepath = output_base % n_index
            plt.plot(x_range, self.scores[:, n_index])
            plt.title("Node id = %s" % n_index)
            plt.ylim([0, 0.6])
            plt.savefig(out_filepath)
            # Close the figure so curves do not accumulate across nodes.
            plt.close()
109

    
110

    
111
def experiment_1(exp):
    """Plot the percentage overlap across snapshots for several top-k sizes
    and time windows.

    The top-k sizes are 10%..50% of ``exp.num_of_nodes``.  All figures are
    produced through ``custom_plot`` with paths under
    ./output/percentage_overlap/.
    """
    percentages = [i / 10. for i in range(1, 6)]
    top_ks = [int(p * exp.num_of_nodes) for p in percentages]

    # Comparing percentage_overlap for different time window values
    time_windows = [1, 10, 20, 30, 40, 50]
    for k in top_ks:
        results = {
            tw: exp.percentage_overlap(top_k=k, time_window=tw)
            for tw in time_windows
        }

        out_filepath = './output/percentage_overlap/top_k_%s.png' % k
        custom_plot.plot_time_diff(results, ylim=(50, 102), out_filepath=out_filepath)

    # Comparing percentage overlap for different top-k values (window of 1)
    results = {k: exp.percentage_overlap(top_k=k, time_window=1) for k in top_ks}

    out_filepath = './output/percentage_overlap/percentage_overlap_with_different_top_k_scatter.png'
    custom_plot.plot_time_diff(results, scatter=True, ylim=(50, 102), title="Percentage overlap for different top-k",
        out_filepath=out_filepath)

    out_filepath = './output/percentage_overlap/percentage_overlap_with_different_top_k_line.png'
    custom_plot.plot_time_diff(results, ylim=(50, 102), title="Percentage overlap for different top-k",
        out_filepath=out_filepath)

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for k, vals in results.items():
        out_filepath = './output/percentage_overlap/percentage_overlap_with_different_top_k_%s_line.png' % k
        custom_plot.plot_single_time_diff(vals, ylim=(50, 102), title="Percentage overlap with top-k",
            out_filepath=out_filepath)
148

    
149

    
150
def experiment_2(exp):
    """Hand the full score matrix to ``custom_plot.scatter_histogram``,
    passing ./output/histogram_bc_score.png as the output path.
    """
    custom_plot.scatter_histogram(exp.scores, './output/histogram_bc_score.png')
153

    
154

    
155
def _interchange_counts(scores, node_indices, num_of_snapshots, k_max, slack_var):
    """Count, per node, at which snapshot offset its score first changes.

    Returns a ``(len(node_indices), k_max + 1)`` array ``T``.  For a start
    snapshot ``s`` the score is compared with the scores at offsets
    ``0 .. k_max - 1``: ``T[row][k]`` is incremented when offset ``k`` is the
    first one differing by more than ``slack_var``, and ``T[row][k_max]`` is
    incremented once for every offset examined without such a difference.
    """
    T = np.zeros((len(node_indices), k_max + 1))
    s_range = max(0, num_of_snapshots - k_max)

    for row, n_original_index in enumerate(node_indices):
        s = 0
        while s < s_range:
            old_score = scores[s][n_original_index]
            for k in range(k_max):
                new_score = scores[s + k][n_original_index]
                if abs(old_score - new_score) > slack_var:
                    T[row][k] += 1
                    # Together with the "s += 1" below, the next window
                    # starts at the snapshot where the change was seen.
                    s += k - 1
                    break

                T[row][k_max] += 1

            s += 1

    return T


def experiment_3(exp, k_max=1, slack_var=0):
    """Analyses the "inter-change" distribution

    slack_var: is used to treat BC scores between 2 different
    snapshot as they are the same when the difference is small

    Only nodes whose maximum BC score exceeds 0.05 are considered.  Writes
    a histogram plot (.png), the raw count matrix (.out) and a cumulative
    plot (.png) under ./output/.

    Raises ValueError when ``slack_var`` is negative: offset 0 compares a
    score with itself, so a negative threshold would always trigger there
    and the scan would never advance.
    """
    if slack_var < 0:
        raise ValueError("slack_var must be non-negative, got %s" % slack_var)

    node_indices = exp.filtered_node_indices(cutoff_value=0.05)
    T = _interchange_counts(exp.scores, node_indices, exp.num_of_snapshots,
                            k_max, slack_var)

    #####
    # OUTPUT
    #####

    x_range = list(range(1, k_max))

    # plot histogram
    # Drop column 0 (offset 0 never registers a change) and the last column
    # (the "no change" counter); k_max - 1 columns remain, matching x_range.
    considered_data = T[:, 1:-1]
    y_range = np.sum(considered_data, axis=0)
    y_range_total = np.sum(considered_data)

    y_range_normalized = y_range / y_range_total
    if y_range_total != np.sum(y_range):
        print("ERRROR XXX\n")

    # Floating-point sums rarely hit 1 exactly, so compare with a tolerance.
    if not np.isclose(np.sum(y_range_normalized), 1):
        print("--- ERROR sum Normalized values is not equal to 1: %s" % y_range_normalized)

    plt.plot(x_range, y_range_normalized)

    plt.ylabel('(?) Probability that BC score is changed')
    plt.xlabel('k')
    output_basename = './output/histogram_interchange_k_%i_slack_%.3f' % (k_max, slack_var)
    plt.text(0, 0.8, y_range_total)
    plt.ylim(0, 1.1)
    plt.title(output_basename)
    plt.savefig(output_basename + '.png')
    plt.close()

    # save to the text format
    np.savetxt(output_basename + '.out', T, '%i')

    # plot cumulative distribution
    cumm = np.cumsum(y_range_normalized)
    plt.plot(x_range, cumm)
    output_basename = './output/cumulatie_interchange_k_%i_slack_%.3f' % (k_max, slack_var)
    out_filepath = output_basename + '.png'
    plt.ylim(0, 1.1)
    plt.title(output_basename)
    plt.savefig(out_filepath)
    plt.close()
245

    
246

    
247
def run_experiment_3(exp):
    """Run experiment_3 for every combination of k_max and slack_var.

    For each k_max the fine-grained slack values (0 .. 0.010 in steps of
    0.001) are run first, followed by the coarse ones (0.02 .. 0.18 in steps
    of 0.02).
    """
    # For a quick smoke test, call experiment_3 directly instead, e.g.
    # experiment_3(exp, k_max=5) or experiment_3(exp, k_max=5, slack_var=0.01)
    fine_slacks = [step * 0.001 for step in range(11)]
    coarse_slacks = [step * 0.02 for step in range(1, 10)]

    for window in (50, 100, 114):
        for slack in fine_slacks + coarse_slacks:
            experiment_3(exp, k_max=window, slack_var=slack)
262

    
263

    
264
def main():
    """Load every snapshot file of the FFGraz network and summarize it.

    The input directory is the single command-line argument when exactly one
    is given, otherwise 'output2'.  The experiments are toggled by
    uncommenting the calls at the end.
    """
    input_dir = sys.argv[1] if len(sys.argv) == 2 else 'output2'
    snapshot_files = utility.all_files_for_network('FFGraz', input_dir)

    # One row per snapshot file; room for up to 200 distinct nodes.
    exp = Experiment(200, len(snapshot_files))
    for snapshot_path in snapshot_files:
        exp.add_new_result(snapshot_path)

    exp.summarize()

    # experiment_1(exp)

    # experiment_2(exp)

    # run_experiment_3(exp)
287

    
288

    
289
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()