read_data.py
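# Reads the raw CSV, derives per-identity time-series metrics (lifetime,
# inter-arrival times and their quantiles) and wraps every non-flat metric
# in a Feature object keyed by name.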
from __future__ import print_function

import sys

import pandas as pd

from data import Feature
from feature_file import *
from helper import *
def read_data(args):
    """Read the raw CSV, apply the configured transformations and return a dict of Feature objects."""
    print("\nFilename = ", end=''); cprint(args.datafile, OKGREEN)

    # Read data from csv file
    cprint("Reading File")
    data = pd.read_csv(args.datafolder + args.datafile, delimiter=args.data_delimiter)  # Be patient, takes some time
    print_ok("CSV File Read Complete")

    """ TRANSFORMATIONS """
    cprint("Transformations")

    # Group by identity field
    print("\t-> Group by identity field: " + identity_field)

    if entry_limit:
        # Drop identities that have fewer than entry_limit rows
        print("\t-> Removing Unwanted Rows")
        data = data[data.groupby(identity_field)[identity_field].transform(len) >= entry_limit]
    if time_series_data:
        # Calculate time series metrics: first/last timestamp and lifetime per identity
        data.insert(0, 'FIRST_TRANSMISSION_DATE', data.groupby(identity_field)[timestamp_field].transform(lambda x: min(x)))
        data.insert(1, 'LAST_TRANSMISSION_DATE', data.groupby(identity_field)[timestamp_field].transform(lambda x: max(x)))
        data['LIFETIME'] = data['LAST_TRANSMISSION_DATE'] - data['FIRST_TRANSMISSION_DATE']  # Add column for user lifetime
        data = data[data.LIFETIME > 0]

        # Add IAT (inter-arrival time) information
        print("\t-> Adding IAT Information")
        data = data.sort_values([identity_field, timestamp_field])
        data['NEXT_TIMESTAMP'] = data.groupby(identity_field)[timestamp_field].shift(-1)
        data['IAT'] = data['NEXT_TIMESTAMP'] - data[timestamp_field]  # use the configured timestamp column, not a hard-coded 'TIMESTAMP'
        data.insert(0, 'MEAN_IAT', data.groupby(identity_field)['IAT'].transform(lambda x: mean(x)))
        data.IAT.fillna(data.MEAN_IAT, inplace=True)  # last row per identity has no next timestamp; fill with that identity's mean IAT
        # Add quantile information based on each identity's IAT distribution
        print("\t-> Adding IAT Quantile Information")
        update_progress(0, 11)
        quant_list = []
        for i in range(11):  # deciles QUANTILE_0, QUANTILE_10, ..., QUANTILE_100
            data.insert(i, 'QUANTILE_' + str(10 * i), data.groupby(identity_field).IAT.transform(lambda x: quantile(x)[i]))
            quant_list.append('QUANTILE_' + str(10 * i))
            update_progress(i + 1, 11)
        data['IAT_VAR_MEAN'] = data[quant_list].var(axis=1)  # variance across the IAT quantiles
        data['MEDIAN_IAT'] = data[quant_list[5]]  # QUANTILE_50

    print_ok("Transformations Complete")

    """ Create Feature Objects """
    cprint("Creating Features")
    features = {}

    # Get ids mapping to identities
    print("\t-> Getting Object Ids")
    print("\t\t-> ", end='')
    objects = data.groupby(identity_field)
    ids = objects.groups.keys()
    stats = objects.size().reset_index(name='counts')

    COUNT = stats['counts'].values.tolist()
    if data_not_flat(COUNT):
        features['COUNT'] = Feature('COUNT', COUNT, ids)
        cprint("COUNT", OKBLUE)
    else:
        cprint('COUNT', FAIL)
    if time_series_data:  # Add all time series metric features
        print("\t-> Creating Temporal Features")
        print("\t\t-> ", end='')
        LIFETIME = objects['LIFETIME'].first().values.tolist()
        if data_not_flat(LIFETIME):
            features['LIFETIME'] = Feature('LIFETIME', LIFETIME, ids)
            cprint('LIFETIME', OKBLUE, end=' ')
        else:
            cprint('LIFETIME', FAIL, end=' ')
        IAT_VAR_MEAN = objects['IAT_VAR_MEAN'].first().values.tolist()
        if data_not_flat(IAT_VAR_MEAN):
            features['IAT_VAR_MEAN'] = Feature('IAT_VAR_MEAN', IAT_VAR_MEAN, ids)
            cprint('IAT_VAR_MEAN', OKBLUE, end=' ')
        else:
            cprint('IAT_VAR_MEAN', FAIL, end=' ')
        MEAN_IAT = objects['MEAN_IAT'].first().values.tolist()
        if data_not_flat(MEAN_IAT):
            features['MEAN_IAT'] = Feature('MEAN_IAT', MEAN_IAT, ids)
            cprint('MEAN_IAT', OKBLUE, end=' ')
        else:
            cprint('MEAN_IAT', FAIL, end=' ')
        MEDIAN_IAT = objects['MEDIAN_IAT'].first().values.tolist()
        if data_not_flat(MEDIAN_IAT):
            features['MEDIAN_IAT'] = Feature('MEDIAN_IAT', MEDIAN_IAT, ids)
            cprint('MEDIAN_IAT', OKBLUE, end=' ')
        else:
            cprint('MEDIAN_IAT', FAIL, end=' ')
    # Create Feature Object for individual data fields
    print("\n\t-> Creating Aggregate Features")
    for field in aggregate_fields:
        print("\t\t-> ", end='')
        values = objects[field].sum().values.tolist()
        if data_not_flat(values):
            features[field] = Feature(field, values, ids)
            cprint(field, OKBLUE, end=' ')
            if time_series_data:  # Calculate extra metrics
                values = objects[field].mean().values.tolist()
                name = field + '_MEAN'
                if data_not_flat(values):
                    features[name] = Feature(name, values, ids)
                    cprint(name, OKBLUE, end=' ')
                else:
                    cprint(name, FAIL, end=' ')
                values = objects[field].var().values.tolist()
                name = field + '_VAR'
                if data_not_flat(values):
                    features[name] = Feature(name, values, ids)
                    cprint(name, OKBLUE, end=' ')
                else:
                    cprint(name, FAIL, end=' ')
        else:
            cprint(field, FAIL, end=' ')
    # Create Feature Object for individual object fields
    print("\n\t-> Creating Object Features")
    print("\t\t-> ", end='')
    for field in object_fields:
        values = objects[field].nunique().values.tolist()
        if data_not_flat(values):
            features[field] = Feature(field, values, ids)
            cprint(field, OKBLUE, end=' ')
        else:
            cprint(field, FAIL, end=' ')
    print()

    if len(features) != 0:
        print_ok("Feature Creation Complete")
        return features
    else:
        print_fail("No Features were Created. Exiting...")
        sys.exit()
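

# --- Minimal usage sketch (an assumption, not part of the original module) ---
# read_data() only needs an object exposing `datafolder`, `datafile` and
# `data_delimiter`; the argparse wiring and default values below are a
# hypothetical example of how such an `args` object could be built.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Build features from a raw CSV file.')
    parser.add_argument('--datafolder', default='data/', help='folder containing the CSV file')
    parser.add_argument('--datafile', default='example.csv', help='name of the CSV file to read')
    parser.add_argument('--data_delimiter', default=',', help='CSV field delimiter')
    example_args = parser.parse_args()
    features = read_data(example_args)
    print('Created %d features: %s' % (len(features), ', '.join(sorted(features))))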