mobicen / sampleACF.py @ 05763cfb
1
from __future__ import print_function

import code  # code.interact(local=dict(globals(), **locals()))
import glob
import operator
import os
import sys
from collections import defaultdict, deque
from multiprocessing import Pool
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import acf
from tqdm import tqdm
17

    
18

    
19
# mys.rank(method='first', ascending=False)
20

    
21
# CLI: sampleACF.py <folder> [num_workers] [lags]
folder = sys.argv[1]
num_workers = int(sys.argv[2]) if len(sys.argv) > 2 else 1
lags = int(sys.argv[3]) if len(sys.argv) > 3 else 100

# Run nickname, e.g. ".../<nick>_xyz/" -> "<nick>"
# (assumes `folder` ends with a trailing slash — TODO confirm).
nick = folder.split('/')[-2].split('_')[0]
os.chdir(folder)
30

    
31

    
32
'''def nistRH(v, h):
33
    N, mu, var = len(v), np.mean(v), np.var(v)
34
    Ch = 0.0
35
    for t in range(0, N-h):
36
        Ch += (v[t]-mu)*(v[t+h]-mu)
37
    return (Ch/(N-h))/var
38

39

40
def sampleACF(params):
41
    v = params['v']
42
    nlags = params['nlags']
43
    timeDepth = params['timeDepth']
44
    v = v[:timeDepth].to_list()
45
    assert len(v) == timeDepth
46
    acf_t = defaultdict(list)
47
    for tau in range(0, nlags):
48
        for k in range(0, timeDepth-tau):
49
            if (v[k] > 0 and v[k+tau] > 0):
50
                acf_kt = v[k] * v[k+tau] / (np.mean([v[k], v[k+tau]])**2)
51
                acf_t[tau].append(acf_kt)
52
    return map(np.mean, acf_t.values())
53

54

55
def sampleACF2(v, nlags=40, timeDepth=500):
56
    v = v[:timeDepth].to_list()
57
    return pd.Series(acf(v, nlags=nlags, fft=True, unbiased=True))'''
58

    
59

    
60
# Per-metric DataFrames: one ROW per time instant, one COLUMN per node.
# Column order follows the lexicographic order of the stats* filenames.
bcdf = pd.DataFrame()     # betweenness centrality
degdf = pd.DataFrame()    # degree
kcoredf = pd.DataFrame()  # k-core index
print("Loading data from", folder, "...")
# Collect the per-node columns first and concatenate once at the end:
# concatenating inside the loop re-copies the frame per file (quadratic).
bc_cols, deg_cols, kcore_cols = [], [], []
for snap in sorted(glob.glob('./stats*')):
    # Each ./statsN.csv holds one node's time series.
    df = pd.read_csv(snap, names=['time', 'bc', 'deg', 'kcore'], skiprows=1)
    bc_cols.append(df['bc'])
    deg_cols.append(df['deg'])
    kcore_cols.append(df['kcore'])
if bc_cols:
    bcdf = pd.concat(bc_cols, axis=1)
    degdf = pd.concat(deg_cols, axis=1)
    kcoredf = pd.concat(kcore_cols, axis=1)
71

    
72
# rankbcdf: same shape as bcdf, but each row holds the per-instant ranking
# of the nodes by BC (rank 1 == highest BC; ties broken by column order).
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so
# collect the ranked rows in a list and build the frame once.
ranked_rows = []
for t in range(0, len(bcdf)):
    r = bcdf.iloc[t].rank(
        method='first', ascending=False).reset_index(drop=True)
    ranked_rows.append(r)
rankbcdf = pd.DataFrame(ranked_rows).reset_index(drop=True)

# Node identifiers are simply the column positions 0..N-1.
nodes = range(len(bcdf.columns))
80

    
81

    
82
# All figures for this run go under ./plots<nick>/
plot_dir = "plots" + nick
if not os.path.exists(plot_dir):
    os.makedirs(plot_dir)
os.chdir(plot_dir)
86
# Plotting
87

    
88

    
89
def jaccard_similarity(x, y):
    """Jaccard index of two iterables: |x ∩ y| / |x ∪ y| as a float.

    Raises ZeroDivisionError when both inputs are empty.
    """
    sx, sy = set(x), set(y)
    return len(sx & sy) / float(len(sx | sy))
93

    
94

    
95
def topNodes(t, perc):
    """Column indices of the top `perc`% nodes by BC at time instant t.

    Ties keep column order (Python's sort is stable, and reverse=True
    does not reorder equal elements).
    """
    bc_at_t = bcdf.iloc[t].reset_index(drop=True).to_dict()
    ranking = sorted(bc_at_t.items(), key=operator.itemgetter(1), reverse=True)
    cutoff = int(len(ranking) * (perc/100.0))
    return [int(node) for node, _ in ranking[:cutoff]]
101

    
102

    
103
def tailNodes(t, perc):
    """Column indices of the bottom `perc`% nodes by BC at time instant t.

    Mirror image of topNodes: ascending sort, same stable tie-breaking.
    """
    bc_at_t = bcdf.iloc[t].reset_index(drop=True).to_dict()
    ranking = sorted(bc_at_t.items(), key=operator.itemgetter(1))
    cutoff = int(len(ranking) * (perc/100.0))
    return [int(node) for node, _ in ranking[:cutoff]]
109

    
110

    
111
def jaccard_CF(start=0, maxt=100, nlags=20, perc=5, top=True):
    """Jaccard 'autocorrelation' of the top (or tail) node set.

    For each lag tau in [0, nlags], averages the Jaccard similarity between
    the perc% set at instant start+t and at start+t+tau over the window of
    maxt instants. Returns the list of nlags+1 averages.
    """
    pick = topNodes if top else tailNodes
    # Memoize the node set at every instant of the window.
    window = {i: pick(start + i, perc) for i in range(maxt)}
    curve = []
    for tau in range(nlags + 1):
        total = 0.0
        for t in range(maxt - tau):
            total += jaccard_similarity(window[t], window[t + tau])
        curve.append(total / (maxt - tau))
    return curve
127

    
128

    
129
#code.interact(local=dict(globals(), **locals()))
130

    
131
# Jaccard autocorrelation of the top-5% set for growing window sizes N.
colors = iter(['k', 'r', 'b', 'g', 'y', 'c'])
styles = iter(['s', 'o', '^', '*', 'p'])
p = 5
for klen in [25, 50, 100, 200, 500]:
    curve = pd.Series(jaccard_CF(start=0, maxt=klen, perc=p, top=True))
    curve.plot.line(label='N=' + str(klen),
                    style=next(colors) + next(styles) + '-')

plt.legend()
plt.xticks(range(0, 21))
plt.yticks(np.arange(0, 1.0, 0.1))
plt.grid()
plt.ylim(0, 1.0)
plt.title("start=t0")
plt.xlabel('Tau')
plt.ylabel('Jaccard Correlation')
plt.savefig(nick+"jaccard_ACF_changingN.pdf", format='pdf')
plt.clf()
148
#plt.show()
149

    
150
# Same curve, fixed window N=50, but varying the start instant.
colors = iter(['k', 'r', 'b', 'g', 'y', 'c'])
styles = iter(['s', 'o', '^', '*', 'p'])
p = 5
for s in [0, 50, 100, 200, 500]:
    curve = pd.Series(jaccard_CF(start=s, maxt=50, perc=p, top=True))
    curve.plot.line(label='start=' + str(s),
                    style=next(colors) + next(styles) + '-')

plt.legend()
plt.xticks(range(0, 21))
plt.yticks(np.arange(0, 1.0, 0.1))
plt.grid()
plt.title("N=50")
plt.xlabel('Tau')
plt.ylabel('Jaccard Correlation')
plt.ylim(0, 1.0)
plt.savefig(nick+"jaccard_ACF_changingSTART.pdf", format='pdf')
plt.clf()
167

    
168
# Decay of similarity to a FIXED reference set: compare the top-perc% set at
# a reference instant against every one of the next 69 instants.
perc = 20
taus = range(1, 70)


def _jaccard_vs_start(start):
    # Reference set at `start`, plus its similarity curve over later instants.
    ref = topNodes(start, perc)
    sims = [jaccard_similarity(ref, topNodes(start + tau, perc)) for tau in taus]
    return ref, sims


t0, l0 = _jaccard_vs_start(0)
t100, l100 = _jaccard_vs_start(100)
t500, l500 = _jaccard_vs_start(500)
t750, l750 = _jaccard_vs_start(750)

for tag, sims in [("t0", l0), ("t100", l100), ("t500", l500), ("t750", l750)]:
    pd.Series(sims).plot(label="start = " + tag, linewidth=1.5)
plt.ylabel("Jaccard Similarity")
plt.xlabel("tau")
plt.title("Top Set size = "+str(len(t0))+", "+str(perc)+"% of nodes")
plt.legend()
plt.xticks(range(0, 70, 5))
plt.yticks(np.arange(0, 1.1, 0.1))
plt.ylim(-0.02, 1.02)
plt.grid()
plt.savefig(nick+"jaccard_scaletta.pdf", format='pdf')
plt.clf()
192
#t0 = [jaccard_similarity(topNodes(t, perc), topNodes(t+1, perc)) for t in range(0,100)]
193

    
194

    
195
'''p = 80
196
pd.Series(jaccard_top_CF(perc=p)).plot(label='Top')
197
pd.Series(jaccard_tail_CF(perc=p)).plot(label='Tail')
198
plt.ylabel("Jaccard Similarity")
199
plt.legend()
200
plt.savefig(nick+"Jaccard-tau1-perc="+str(p)+".pdf", format='pdf')
201
plt.clf()
202

203
jtop1 = [] 
204

205
for t in range(0,100):
206
    x,y=topNodes(t,15), topNodes(t+1,15)
207
    jtop1.append(jaccard_similarity(x,y))
208

209
jtail1 = [] 
210

211
for t in range(0,100):
212
    x,y=tailNodes(t,15), tailNodes(t+1,15)
213
    jtail1.append(jaccard_similarity(x,y))'''
214

    
215

    
216
'''for i in range(k, k+memoryMax):
217
            bcktop, bcitop = bcdf.iloc[k, top], bcdf.iloc[i, top]
218
            acTop[i-k].append(bcktop * bcitop / (np.mean([bcktop, bcitop])**2))
219
for i in range(k, k+memoryMax):
220
            bcktail, bcitail = bcdf.iloc[k, tail], bcdf.iloc[i, tail]
221
            if (bcitail > 0 and bcktail > 0):
222
                acTail[i-k].append(bcktail*bcitail /
223
                                   (np.mean([bcktail, bcitail])**2))'''
224

    
225

    
226
perc = 5        # percentile defining the top/tail sets
memoryMax = 70  # number of ACF lags
klim = 200      # length of the window fed to acf()

# One ACF curve per sampled start instant, keyed by that instant.
acTop = {}
acTail = {}
acTopRank = {}
acTailRank = {}

for t in tqdm(range(0, 900, 10)):
    # Single strongest / weakest node at instant t.
    topn = topNodes(t, perc)[0]
    tailn = tailNodes(t, perc)[0]

    # Raw-BC and rank trajectories over [t, t+klim).
    # NOTE(review): `unbiased=` was renamed `adjusted=` in newer statsmodels.
    topSeries = bcdf.iloc[:, topn][t:t+klim]
    tailSeries = bcdf.iloc[:, tailn][t:t+klim]
    topRSeries = rankbcdf.iloc[:, topn][t:t+klim]
    tailRSeries = rankbcdf.iloc[:, tailn][t:t+klim]

    acTop[t] = acf(topSeries, nlags=memoryMax, unbiased=True, fft=True)
    acTail[t] = acf(tailSeries, nlags=memoryMax, unbiased=True, fft=True)
    acTopRank[t] = acf(topRSeries, nlags=memoryMax, unbiased=True, fft=True)
    acTailRank[t] = acf(tailRSeries, nlags=memoryMax, unbiased=True, fft=True)
256

    
257
# columns = start instants, rows = lags
acTopDF = pd.DataFrame(acTop)
acTailDF = pd.DataFrame(acTail)

# Mean ACF across start instants, lag by lag: top vs tail nodes.
for lbl, acdf in [('Top', acTopDF), ('Tail', acTailDF)]:
    acdf.T.mean().plot(label=lbl)
plt.ylim(-1, 1)
plt.ylabel('ACF...')
plt.xlabel('Tau')
plt.legend()
plt.grid()
plt.savefig(nick+"ACFtopVSTail.pdf", format='pdf')
plt.clf()
# plt.show()
269
# plt.show()
270

    
271
# Same comparison, but on the rank trajectories instead of raw BC.
acTopRDF = pd.DataFrame(acTopRank)
acTailRDF = pd.DataFrame(acTailRank)

for lbl, acdf in [('TopR', acTopRDF), ('TailR', acTailRDF)]:
    acdf.T.mean().plot(label=lbl)
plt.ylim(-1, 1)
plt.ylabel('ACF_rank...')
plt.xlabel('Tau')
plt.legend()
plt.grid()
plt.savefig(nick+"ACFrank_topVSTail.pdf", format='pdf')
plt.clf()
283

    
284

    
285
topSeries.plot(label='Top')
286
tailSeries.plot(label='Tail')
287
plt.legend()
288
plt.grid()
289
plt.ylabel('BC')
290
plt.xlabel('Time')
291
plt.savefig(nick+"topVStailExample.pdf", format='pdf')
292
plt.clf()
293
# plt.show()
294

    
295
rankbcdf.iloc[:, topn][t:t+klim].plot(label='TopRank')
296
rankbcdf.iloc[:, tailn][t:t+klim].plot(label='TailRank')
297
plt.legend()
298
plt.ylabel('Rank (1==HighestBC)')
299
plt.xlabel('Time')
300
plt.grid()
301
plt.savefig(nick+"topVStailRankExample.pdf", format='pdf')
302
plt.clf()
303
# plt.show()
304

    
305

    
306
time2top = {}
307
time2tail = {}
308

    
309

    
310
'''# Per tanti istanti di inizio detti k
311
for k in tqdm(range(1, klim)):
312
    # Prendi i top e tail nodi a quell'istante k
313
    topn = topNodes(k, perc)
314
    tailn = tailNodes(k, perc)
315
    # Calcola la Normalized Istantaneous Correlation,
316
    p = Pool(num_workers)
317
    timeSeries = []
318
    # code.interact(local=dict(globals(), **locals()))
319
    for top in topn:
320
        params = {'v': bcdf.iloc[:, top][k:],
321
                  'nlags': memoryMax, 'timeDepth': klim}
322
        timeSeries.append(params)
323
        # tacf = sampleACF(bcdf.iloc[:,top][k:], nlags=memoryMax, timeDepth=100)
324
    res = p.map(sampleACF, timeSeries)
325
    for r in res:
326
        for lag in range(0, len(r)):
327
            acTop[lag].append(r[lag])
328

329
    for tail in tailn:
330
        params = {'v': bcdf.iloc[:, tail][k:],
331
                  'nlags': memoryMax, 'timeDepth': klim}
332
        timeSeries.append(params)
333
        # tacf = sampleACF(bcdf.iloc[:,top][k:], nlags=memoryMax, timeDepth=100)
334
    res = p.map(sampleACF, timeSeries)
335
    for r in res:
336
        for lag in range(0, len(r)):
337
            acTail[lag].append(r[lag])
338
    p.close()
339

340
pd.Series(map(np.mean, acTop.values())).plot(label='Top-'+str(perc)+'%')
341
pd.Series(map(np.mean, acTail.values())).plot(label='Tail-'+str(perc)+'%')
342
plt.legend()
343
plt.xlabel('Tau')
344
plt.ylabel('Sample ACF')
345
plt.savefig(nick+"sampleACF.pdf", format='pdf')'''