Statistics
| Branch: | Revision:

## root / globecomm / analyse_snapshot.py @ fac6e5a4

 1 ```import sys ``` ```from collections import defaultdict ``` ```import numpy as np ``` ```import scipy ``` ```from scipy import stats ``` ```import matplotlib.pyplot as plt ``` ```import utility ``` ```import metrics ``` ```import custom_plot ``` ```from pdb import set_trace as debugger ``` ```class Experiment(): ``` ``` """This class analyzes the betweenness centrality across multiple snapshots ``` ``` of community networks. ``` ``` ``` ``` We want to know how often the BC score changes. ``` ``` """ ``` ``` def __init__(self, num_of_nodes, num_of_snapshots): ``` ``` self.scores = np.empty((num_of_snapshots, num_of_nodes)) ``` ``` self.scores.fill(np.nan) ``` ``` self.index_of_snapshot = 0 ``` ``` self.num_of_nodes = num_of_nodes ``` ``` self.num_of_snapshots = num_of_snapshots ``` ``` self.node_index_map = dict() ``` ``` self.current_max_node_index = 0 ``` ``` def index_of_node(self, node_id): ``` ``` if node_id in self.node_index_map: ``` ``` # print self.node_index_map[node_id] ``` ``` return self.node_index_map[node_id] ``` ``` else: ``` ``` print "ERROR: node doesn't exist" ``` ``` sys.exit() ``` ``` def add_new_result(self, in_filepath): ``` ``` if self.index_of_snapshot == self.num_of_snapshots: ``` ``` print "ERROR: the number of snapshots provided is less than the input files" ``` ``` sys.exit() ``` ``` data = np.loadtxt(in_filepath, delimiter=',', dtype={ ``` ``` 'names': ('node_id', 'score'), ``` ``` 'formats': ('a100', 'f4') ``` ``` }) ``` ``` for row in data: ``` ``` self.update_score(row[0], row[1]) ``` ``` self.index_of_snapshot += 1 ``` ``` def update_score(self, node_id, score): ``` ``` if node_id not in self.node_index_map: ``` ``` self.node_index_map[node_id] = self.current_max_node_index ``` ``` self.current_max_node_index += 1 ``` ``` node_index = self.index_of_node(node_id) ``` ``` self.scores[self.index_of_snapshot][node_index] = score ``` ``` def summarize(self): ``` ``` np.average(self.scores, axis=1) ``` ``` ######### ``` ``` # METRICS ``` ``` ######### ``` ``` def percentage_overlap(self, top_k=20, time_window=1): ``` ``` """Draws a graph in ./output/overlap*.png to represent the proportion ``` ``` of nodes remain in the top-k nodes with the highest BC score over different ``` ``` time window ``` ``` """ ``` ``` time_diff = list() ``` ``` for j in range(self.num_of_snapshots - time_window): ``` ``` diff = metrics.percentage_overlap(self.scores[j:j+time_window+1,:], top_k) ``` ``` time_diff.append(diff) ``` ``` return time_diff ``` ``` def plot_percentage_overlap(self, time_diff, top_k, time_window): ``` ``` out_filepath = './output/percentage_overlap/top_k_%s_window_%s.png' % (top_k, time_window) ``` ``` fig = custom_plot.plot_time_diff(time_diff, ``` ``` title='FFGraz', ``` ``` xlabel='time_diff', ``` ``` ylabel='Percentage overlap for top-k = %s' % top_k, ``` ``` ylim=(40, 101), ``` ``` out_filepath=out_filepath) ``` ``` def filtered_node_indices(self, cutoff_value=0): ``` ``` """Return a list of node indices, such that each node has the maximum BC score > 0.05 in at least one snapshot ``` ``` """ ``` ``` max_values = np.nanmax(self.scores, axis=0) ``` ``` return [i for i in range(len(max_values)) if max_values[i] > cutoff_value] ``` ``` def plot_bc_score(self): ``` ``` node_indices = self.filtered_node_indices(cutoff_value=0.05) ``` ``` output_base = './output/bc_score/node_id_%s.png' ``` ``` x_range = range(self.num_of_snapshots) ``` ``` for n_index in node_indices: ``` ``` out_filepath = output_base % n_index ``` ``` plt.plot(x_range, self.scores[:,n_index]) ``` ``` plt.title("Node id = %s" % n_index) ``` ``` # plt.title("Node name = %s" % self.node_index_map[n_index]) ``` ``` plt.ylim([0, 0.6]) ``` ``` plt.savefig(out_filepath) ``` ``` plt.close() ``` ``` # plt.show() ``` ```def experiment_1(exp): ``` ``` """Shows the percentage over lap for multiple snapshots, with different time ``` ``` window ``` ``` """ ``` ``` percentages = [i/10. for i in range(1, 6)] ``` ``` top_ks = [int(p*exp.num_of_nodes) for p in percentages] ``` ``` # Comparing percentage_overlap for different time window values ``` ``` time_windows = [1, 10, 20, 30, 40, 50] ``` ``` for k in top_ks: ``` ``` results = dict() ``` ``` for tw in time_windows: ``` ``` time_diff = exp.percentage_overlap(top_k=k, time_window=tw) ``` ``` results[tw] = time_diff ``` ``` out_filepath = './output/percentage_overlap/top_k_%s.png' % k ``` ``` custom_plot.plot_time_diff(results, ylim=(50, 102), out_filepath=out_filepath) ``` ``` # Comparing percentage overlap for different p-value ``` ``` results = dict() ``` ``` for k in top_ks: ``` ``` time_diff = exp.percentage_overlap(top_k=k, time_window=1) ``` ``` results[k] = time_diff ``` ``` out_filepath = './output/percentage_overlap/percentage_overlap_with_different_top_k_scatter.png' ``` ``` custom_plot.plot_time_diff(results, scatter=True, ylim=(50, 102), title="Percentage overlap for different top-k", ``` ``` out_filepath=out_filepath) ``` ``` out_filepath = './output/percentage_overlap/percentage_overlap_with_different_top_k_line.png' ``` ``` custom_plot.plot_time_diff(results, ylim=(50, 102), title="Percentage overlap for different top-k", ``` ``` out_filepath=out_filepath) ``` ``` for k, vals in results.iteritems(): ``` ``` out_filepath = './output/percentage_overlap/percentage_overlap_with_different_top_k_%s_line.png' % k ``` ``` custom_plot.plot_single_time_diff(vals, ylim=(50, 102), title="Percentage overlap with top-k", ``` ``` out_filepath=out_filepath) ``` ```def experiment_2(exp): ``` ``` out_filepath = './output/histogram_bc_score.png' ``` ``` custom_plot.scatter_histogram(exp.scores, out_filepath) ``` ```def experiment_3(exp, k_max=1, slack_var=0): ``` ``` """Analyses the "inter-change" distribution ``` ``` ``` ``` slack_var: is used to treat BC scores between 2 different ``` ``` snapshot as they are the same when the difference is small ``` ``` """ ``` ``` node_indices = exp.filtered_node_indices(cutoff_value=0.05) ``` ``` rows = len(node_indices) ``` ``` mapped_node_indices = {node_indices[i]: i for i in range(rows)} ``` ``` cols = k_max + 1 ``` ``` T = np.zeros((rows, cols)) ``` ``` s_range = max(0, exp.num_of_snapshots - k_max) ``` ``` # print s_range ``` ``` # print exp.num_of_snapshots ``` ``` # k_range = 2 ``` ``` # print "s_range %s" % range(s_range) ``` ``` # print "k_range %s" % range(k_max) ``` ``` # print "T cols = %s" % len(T[0]) ``` ``` for n_original_index in node_indices: ``` ``` n_index = mapped_node_indices[n_original_index] ``` ``` s = 0 ``` ``` while (s < s_range): ``` ``` # print s ``` ``` old_score = exp.scores[s][n_original_index] ``` ``` for k in range(k_max): ``` ``` # print "%i | %i" % (s, s + k) ``` ``` new_score = exp.scores[s + k][n_original_index] ``` ``` if abs(old_score - new_score) > slack_var: ``` ``` T[n_index][k] += 1 ``` ``` s += k - 1 ``` ``` break; ``` ``` T[n_index][k_max] += 1 ``` ``` s += 1 ``` ``` ##### ``` ``` # OUTPUT ``` ``` ##### ``` ``` # plot scatter ``` ``` x_range = range(1, k_max) ``` ``` # for i in range(rows): ``` ``` # plt.scatter(x_range, T[i][1:-1]) ``` ``` # plot histogram ``` ``` considered_data = T[:,1:-r1] ``` ``` y_range = np.sum(considered_data, axis=0) ``` ``` y_range_total = np.sum(considered_data) ``` ``` # count non-zero ``` ``` y_count_non_zero = [] ``` ``` for c in range(len(y_range)): ``` ``` y_count_non_zero.append(considered_data[:,c]) ``` ``` # y_count_non_zero = np.count_nonzero(considered_data, axis=0) ``` ``` y_range_normalized = y_range / y_range_total ``` ``` if y_range_total != np.sum(y_range): ``` ``` print "ERRROR XXX\n" ``` ``` if np.sum(y_range_normalized) != 1: ``` ``` "--- ERROR sum Normalized values is not equal to 1: %s" % y_range_normalized ``` ``` plt.plot(x_range, y_range_normalized) ``` ``` plt.ylabel('(?) Probability that BC score is changed') ``` ``` plt.xlabel('k') ``` ``` output_basename = './output/histogram_interchange_k_%i_slack_%.3f' % (k_max, slack_var) ``` ``` plt.text(0, 0.8, y_range_total) ``` ``` plt.ylim(0, 1.1) ``` ``` plt.title(output_basename) ``` ``` plt.savefig(output_basename + '.png') ``` ``` plt.close() ``` ``` # save to the text format ``` ``` np.savetxt(output_basename + '.out', T, '%i') ``` ``` # plot cumulative distribution ``` ``` cumm = [sum(y_range_normalized[:i+1]) for i in range(len(y_range_normalized))] ``` ``` plt.plot(x_range, cumm) ``` ``` output_basename = './output/cumulatie_interchange_k_%i_slack_%.3f' % (k_max, slack_var) ``` ``` out_filepath = output_basename + '.png' ``` ``` plt.ylim(0, 1.1) ``` ``` plt.title(output_basename) ``` ``` plt.savefig(out_filepath) ``` ``` plt.close() ``` ```def run_experiment_3(exp): ``` ``` # # For testing ``` ``` # experiment_3(exp, k_max=5) # running with default slack_var = 0 ``` ``` # experiment_3(exp, k_max=5, slack_var=0.01) ``` ``` # experiment_3(exp, k_max=50, slack_var=0.001) ``` ```# For real run ``` ``` k_max = [50, 100, 114] ``` ``` for k in k_max: ``` ``` slack_vars = [i * 0.001 for i in range(11)] ``` ``` for slack_var in slack_vars: ``` ``` experiment_3(exp, k_max=k, slack_var=slack_var) ``` ``` slack_vars = [i * 0.02 for i in range(1, 10)] ``` ``` for slack_var in slack_vars: ``` ``` experiment_3(exp, k_max=k, slack_var=slack_var) ``` ```def main(): ``` ``` # INPUT_DIR = 'output' ``` ``` if len(sys.argv) == 2: ``` ``` INPUT_DIR = sys.argv[1] ``` ``` else: ``` ``` INPUT_DIR = 'output2' ``` ``` network = 'FFGraz' ``` ``` files = utility.all_files_for_network(network, INPUT_DIR) ``` ``` num_of_snapshots = len(files) ``` ``` num_of_nodes = 200 ``` ``` exp = Experiment(num_of_nodes, num_of_snapshots) ``` ``` for file in files: ``` ``` exp.add_new_result(file) ``` ``` exp.summarize() ``` ``` # experiment_1(exp) ``` ``` # experiment_2(exp) ``` ``` # run_experiment_3(exp) ``` ```if __name__ == '__main__': ``` ``` main() ```