root / globecomm / analyse_snapshot.py @ 5f27ee90
History  View  Annotate  Download (3.96 KB)
1 
import os 

2 
import sys 
3 
from collections import defaultdict 
4 
import numpy as np 
5 
import scipy 
6 
from scipy import stats 
7 
import matplotlib.pyplot as plt 
8  
9 
import metrics 
10  
11 
from pdb import set_trace as debugger 
12  
13 
class Experiment(object):
    """Collect per-node scores for a series of snapshots and plot how a
    pairwise metric between snapshots varies with their distance.

    Attributes:
        scores: float array of shape (num_of_snapshots, num_of_nodes); row
            ``s`` holds the scores read from the ``s``-th result file.
        index_of_snapshot: row of ``scores`` that the next call to
            ``add_new_result`` will fill.
        num_of_nodes: number of columns of ``scores``.
        num_of_snapshots: number of rows of ``scores``.
    """

    def __init__(self, num_of_nodes, num_of_snapshots):
        """Allocate a zero-filled score matrix for the given dimensions."""
        self.scores = np.zeros((num_of_snapshots, num_of_nodes))
        self.index_of_snapshot = 0
        self.num_of_nodes = num_of_nodes
        self.num_of_snapshots = num_of_snapshots

    def add_new_result(self, in_filepath):
        """Load one result file into the next free snapshot row.

        The file is read as comma-separated ``node_id,score`` lines
        (int32, float32). Each ``node_id`` is used directly as the column
        index into ``scores``.

        Raises:
            SystemExit: if every snapshot row has already been filled. The
                message is passed to ``sys.exit`` so it goes to stderr and
                the process exits with a non-zero status.
        """
        if self.index_of_snapshot >= self.num_of_snapshots:
            sys.exit("ERROR: the number of snapshots provided is less than the input files")

        data = np.loadtxt(in_filepath, delimiter=',', dtype={
            'names': ('node_id', 'score'),
            'formats': ('i4', 'f4')
        })

        # np.loadtxt returns a 0-d array for a one-line file, which cannot be
        # iterated; promote it so single-node files load like any other.
        for row in np.atleast_1d(data):
            self.update_score(row[0], row[1])

        self.index_of_snapshot += 1

    def update_score(self, node_id, score):
        """Store ``score`` for ``node_id`` in the snapshot currently being filled."""
        self.scores[self.index_of_snapshot][node_id] = score

    def summarize(self):
        """Return the average score of each snapshot (one value per row)."""
        return np.average(self.scores, axis=1)

    def _pairwise_by_time_diff(self, metric):
        """Group ``metric(scores[j], scores[i])`` by the distance ``i - j``.

        Every ordered pair of snapshots with ``j <= i`` is evaluated,
        including each snapshot against itself (distance 0).

        Returns:
            defaultdict(list) mapping distance -> list of metric values.
        """
        time_diff = defaultdict(list)
        for j in range(self.num_of_snapshots):
            for i in range(j, self.num_of_snapshots):
                time_diff[i - j].append(metric(self.scores[j], self.scores[i]))
        return time_diff

    def spearman_rank_correlation_coef(self):
        """Plot min/mean/max Spearman rank correlation against snapshot distance.

        No output path is given to the plot helper, so the figure is shown
        interactively rather than saved.
        """
        time_diff = self._pairwise_by_time_diff(
            # spearmanr returns (correlation, p-value); keep the correlation.
            lambda earlier, later: scipy.stats.spearmanr(earlier, later)[0])
        self._plot_time_diff(time_diff,
                             title='FFGraz',
                             xlabel='time diff (?)',
                             ylabel='Spearman rank correlation coefficient')

    def percentage_overlap(self, top_k=20):
        """Plot min/mean/max of ``metrics.percentage_overlap`` against
        snapshot distance and save it to ``output/overlap_<top_k>.png``.

        Args:
            top_k: forwarded unchanged as the third argument of
                ``metrics.percentage_overlap``.
                NOTE(review): presumably the size of the top-ranked node set
                being compared -- confirm against the ``metrics`` module.
        """
        time_diff = self._pairwise_by_time_diff(
            lambda earlier, later: metrics.percentage_overlap(earlier, later, top_k))

        out_filepath = 'output/overlap_%s.png' % top_k
        self._plot_time_diff(time_diff,
                             title='FFGraz',
                             xlabel='time_diff',
                             ylabel='Percentage overlap for topk = %s' % top_k,
                             # The y axis is fixed to this window; values
                             # below 40 fall outside the visible area.
                             ylim=(40, 101),
                             out_filepath=out_filepath)

    @staticmethod
    def _summarize_time_diff(time_diff):
        """Reduce a distance -> values mapping to aligned plotting series.

        Returns:
            ``(x_range, min_diff, mean_diff, max_diff)`` where ``x_range`` is
            the sorted list of distances and the other three lists hold the
            min / mean / max of the values at the matching position.

        Raises:
            ValueError: if ``time_diff`` is empty.
        """
        if not time_diff:
            raise ValueError("no time differences to plot (were any snapshots loaded?)")

        # Build every series from the same sorted keys so x and y always
        # have equal length, even when the distances are not contiguous.
        x_range = sorted(time_diff)
        min_diff = [np.min(time_diff[key]) for key in x_range]
        mean_diff = [np.mean(time_diff[key]) for key in x_range]
        max_diff = [np.max(time_diff[key]) for key in x_range]
        return x_range, min_diff, mean_diff, max_diff

    def _plot_time_diff(self, time_diff, title='', xlabel='', ylabel='', ylim=None, out_filepath=''):
        """Draw min, mean and max curves of ``time_diff`` on one figure.

        Args:
            time_diff: mapping of distance -> list of metric values.
            title, xlabel, ylabel: text for the figure title and axis labels.
            ylim: optional limits for the y axis, passed to ``set_ylim``.
            out_filepath: when non-empty the figure is saved there and then
                closed; otherwise it is shown interactively.

        Raises:
            ValueError: if ``time_diff`` is empty.
        """
        x_range, min_diff, mean_diff, max_diff = self._summarize_time_diff(time_diff)

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(x_range, min_diff, label='min')
        ax.plot(x_range, mean_diff, label='mean')
        ax.plot(x_range, max_diff, label='max')

        ax.set_ylabel(ylabel)
        ax.set_xlabel(xlabel)
        if ylim:
            ax.set_ylim(ylim)
        ax.legend()
        ax.set_title(title)

        if out_filepath:
            fig.savefig(out_filepath)
            # pyplot keeps every figure alive until it is closed; release it
            # so repeated calls do not accumulate open figures.
            plt.close(fig)
        else:
            plt.show()
99  
100 
def all_files_for_network(network_name, dir):
    """Return the paths of the entries in ``dir`` that belong to a network.

    An entry belongs to ``network_name`` when the part of its name before
    the first underscore equals ``network_name`` exactly.

    Args:
        network_name: prefix to match.
        dir: directory to scan (not recursive).

    Returns:
        List of ``os.path.join(dir, entry)`` paths in ascending name order.
    """
    # os.listdir returns entries in arbitrary order; sort so the result is
    # deterministic across runs and platforms.
    # NOTE(review): name order equals snapshot order only if the file names
    # sort chronologically (e.g. zero-padded indices) -- confirm.
    return [
        os.path.join(dir, entry)
        for entry in sorted(os.listdir(dir))
        if entry.split('_')[0] == network_name
    ]

108  
109 
def main():
    """Load every 'FFGraz' result file from 'output' and save one overlap
    plot per top-k size, where the sizes are 10% to 50% of the node count.
    """
    network = 'FFGraz'
    snapshot_dir = 'output'
    node_count = 200

    snapshot_files = all_files_for_network(network, snapshot_dir)
    exp = Experiment(node_count, len(snapshot_files))
    for snapshot_path in snapshot_files:
        exp.add_new_result(snapshot_path)

    exp.summarize()

    # Show the percentage overlap for multiple top-k sizes.
    fractions = [step / 10. for step in range(1, 6)]
    for top_k in [int(fraction * node_count) for fraction in fractions]:
        exp.percentage_overlap(top_k)
126  
127 
# Run the analysis only when executed as a script, not when imported.
if __name__ == '__main__':
    main()