Statistics
| Branch: | Revision:

root / globecomm / analyse_snapshot.py @ 5f27ee90

History | View | Annotate | Download (3.96 KB)

1
import os
2
import sys
3
from collections import defaultdict
4
import numpy as np
5
import scipy
6
from scipy import stats
7
import matplotlib.pyplot as plt
8

    
9
import metrics
10

    
11
from pdb import set_trace as debugger
12

    
13
class Experiment(object):
    """Collect per-node scores over a series of snapshots and compare them.

    Scores live in ``self.scores``, a float array of shape
    ``(num_of_snapshots, num_of_nodes)``. Row ``t`` holds the scores loaded
    by the ``t``-th call to ``add_new_result``; nodes that a file does not
    mention keep the initial score of 0.
    """

    def __init__(self, num_of_nodes, num_of_snapshots):
        """Allocate a zero-filled score table for the given dimensions."""
        self.scores = np.zeros((num_of_snapshots, num_of_nodes))
        # Row of ``self.scores`` that the next add_new_result() call fills.
        self.index_of_snapshot = 0
        self.num_of_nodes = num_of_nodes
        self.num_of_snapshots = num_of_snapshots

    def add_new_result(self, in_filepath):
        """Load one snapshot file into the next free row of ``self.scores``.

        The file is read as comma-separated text with two columns:
        an integer node id and a float score.

        Raises:
            SystemExit: if all ``num_of_snapshots`` rows are already filled.
                The process exits with a non-zero status and the message is
                written to stderr.
        """
        if self.index_of_snapshot == self.num_of_snapshots:
            # Passing the message to sys.exit() yields exit status 1, so
            # calling scripts can detect the failure.
            sys.exit("ERROR: the number of snapshots provided is less than the input files")

        data = np.loadtxt(in_filepath, delimiter=',', dtype={
            'names': ('node_id', 'score'),
            'formats': ('i4', 'f4'),
        })

        # loadtxt returns a 0-d array for a single-row file, which cannot be
        # iterated; promote it so one-row snapshots load like any other.
        for row in np.atleast_1d(data):
            self.update_score(row['node_id'], row['score'])

        self.index_of_snapshot += 1

    def update_score(self, node_id, score):
        """Store ``score`` for ``node_id`` in the snapshot currently loading."""
        self.scores[self.index_of_snapshot][node_id] = score

    def summarize(self):
        """Return the average score of each snapshot, taken over all nodes.

        Returns:
            A 1-D array with one value per snapshot.
        """
        return np.average(self.scores, axis=1)

    def spearman_rank_correlation_coef(self):
        """Plot Spearman rank correlation between snapshots, grouped by lag.

        Every pair of snapshots ``(j, i)`` with ``i >= j`` is compared and
        the coefficient is filed under the lag ``i - j``. A snapshot is also
        paired with itself, which is what fills lag 0. The plot is shown
        interactively because no output path is passed.
        """
        time_diff = defaultdict(list)
        for j in range(self.num_of_snapshots):
            for i in range(j, self.num_of_snapshots):
                diff = scipy.stats.spearmanr(self.scores[j], self.scores[i])
                # spearmanr returns (correlation, p-value); keep the former.
                time_diff[i - j].append(diff[0])
        self._plot_time_diff(time_diff,
                             title='FFGraz',
                             xlabel='time diff (?)',
                             ylabel='Spearman rank correlation coefficient')

    def percentage_overlap(self, top_k=20):
        """Plot ``metrics.percentage_overlap`` between snapshots by lag.

        Every pair of snapshots ``(j, i)`` with ``i >= j`` is passed to
        ``metrics.percentage_overlap`` together with ``top_k``, and the
        result is filed under the lag ``i - j``. The plot is saved to
        ``output/overlap_<top_k>.png``.

        NOTE(review): presumably the metric is the overlap of the two
        snapshots' top-k nodes, expressed in percent (the y-axis is clamped
        to 40..101) -- confirm against the ``metrics`` module.
        """
        time_diff = defaultdict(list)
        for j in range(self.num_of_snapshots):
            for i in range(j, self.num_of_snapshots):
                diff = metrics.percentage_overlap(self.scores[j], self.scores[i], top_k)
                time_diff[i - j].append(diff)

        out_filepath = 'output/overlap_%s.png' % top_k
        self._plot_time_diff(time_diff,
                             title='FFGraz',
                             xlabel='time_diff',
                             ylabel='Percentage overlap for top-k = %s' % top_k,
                             ylim=(40, 101),
                             out_filepath=out_filepath)

    def _plot_time_diff(self, time_diff, title='', xlabel='', ylabel='', ylim=None, out_filepath=''):
        """Plot the min, mean and max of the values collected for each lag.

        Args:
            time_diff: mapping from lag to a non-empty list of values.
            title, xlabel, ylabel: labels applied to the figure.
            ylim: optional y-axis limits passed to ``plt.ylim``.
            out_filepath: where to save the figure; when empty the figure is
                shown interactively instead.

        Raises:
            ValueError: if ``time_diff`` is empty.
        """
        if not time_diff:
            raise ValueError("time_diff is empty; nothing to plot")

        # Derive every series from the same sorted keys so that x and y stay
        # aligned even when the lags are not a contiguous 0..n range.
        x_range = sorted(time_diff)
        min_diff = [np.min(time_diff[lag]) for lag in x_range]
        mean_diff = [np.mean(time_diff[lag]) for lag in x_range]
        max_diff = [np.max(time_diff[lag]) for lag in x_range]

        fig = plt.figure()
        plt.plot(x_range, min_diff, label='min')
        plt.plot(x_range, mean_diff, label='mean')
        plt.plot(x_range, max_diff, label='max')

        plt.ylabel(ylabel)
        plt.xlabel(xlabel)
        if ylim:
            plt.ylim(ylim)
        plt.legend()
        plt.title(title)

        if out_filepath:
            plt.savefig(out_filepath)
            # Saved figures are never displayed; release them so repeated
            # calls do not accumulate open figures.
            plt.close(fig)
        else:
            plt.show()
99

    
100
def all_files_for_network(network_name, dir):
    """Return the paths of the files in ``dir`` that belong to a network.

    A file belongs to ``network_name`` when the part of its name before the
    first underscore equals ``network_name`` exactly.

    Args:
        network_name: prefix to match, e.g. ``'FFGraz'``.
        dir: directory to scan (not recursive).

    Returns:
        The matching paths, each joined with ``dir``, sorted by file name.
        ``os.listdir`` yields entries in arbitrary order, so sorting keeps
        the result deterministic across runs and platforms.
        NOTE(review): the sort is lexicographic; it matches chronological
        snapshot order only if the file names are zero-padded or otherwise
        sort correctly as text -- confirm against the naming scheme.
    """
    return [
        os.path.join(dir, filename)
        for filename in sorted(os.listdir(dir))
        if filename.split('_')[0] == network_name
    ]
108

    
109
def main():
    """Load every FFGraz snapshot from ``output`` and plot top-k overlaps.

    One overlap plot is produced for each top-k size of 10%, 20%, 30%, 40%
    and 50% of the (hard-coded) node count.

    Raises:
        SystemExit: if no snapshot file for the network is found.
    """
    input_dir = 'output'
    network = 'FFGraz'
    files = all_files_for_network(network, input_dir)
    if not files:
        # Without snapshots the plotting code would fail later with an
        # opaque "max() arg is an empty sequence"; stop here with a reason.
        sys.exit("ERROR: no snapshot files found for network '%s' in '%s'"
                 % (network, input_dir))

    num_of_nodes = 200
    exp = Experiment(num_of_nodes, len(files))
    for filepath in files:
        exp.add_new_result(filepath)

    exp.summarize()

    # Show the percentage overlap for multiple snapshots.
    # Integer arithmetic avoids float truncation when taking 10%..50%.
    top_ks = [num_of_nodes * tenths // 10 for tenths in range(1, 6)]
    for k in top_ks:
        exp.percentage_overlap(k)
126

    
127
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()