[d2c46b]: / diff_sex / DataReader.py

Download this file

99 lines (75 with data), 3.1 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pandas as pd
class FeatureDictionary(object):
    """Assign a global feature index to every feature found in train + test data.

    After construction:

    * ``feat_dict`` maps each non-ignored column name either to a single
      integer index (numeric columns) or to a ``{value: index}`` dict with
      one index per distinct value (all other columns).
    * ``feat_dim`` is the total number of indices handed out.
    """

    def __init__(self, trainfile=None, testfile=None,
                 dfTrain=None, dfTest=None, numeric_cols=None,
                 ignore_cols=None):
        """Store the data sources and build the feature dictionary.

        Exactly one of ``trainfile`` / ``dfTrain`` and exactly one of
        ``testfile`` / ``dfTest`` must be given.

        Args:
            trainfile: Path of a CSV file readable by ``pd.read_csv``.
            testfile: Path of a CSV file readable by ``pd.read_csv``.
            dfTrain: Already loaded training DataFrame.
            dfTest: Already loaded test DataFrame.
            numeric_cols: Column names that get one shared index each.
                Defaults to an empty list.
            ignore_cols: Column names that are skipped entirely.
                Defaults to an empty list.

        Raises:
            AssertionError: If a source is missing or given twice.
        """
        assert not ((trainfile is None) and (dfTrain is None)), "trainfile or dfTrain at least one is set"
        assert not ((trainfile is not None) and (dfTrain is not None)), "only one can be set"
        assert not ((testfile is None) and (dfTest is None)), "testfile or dfTest at least one is set"
        assert not ((testfile is not None) and (dfTest is not None)), "only one can be set"
        self.trainfile = trainfile
        self.testfile = testfile
        self.dfTrain = dfTrain
        self.dfTest = dfTest
        # A fresh list per instance: a literal [] default would be shared by
        # every FeatureDictionary created without these arguments.
        self.numeric_cols = [] if numeric_cols is None else numeric_cols
        self.ignore_cols = [] if ignore_cols is None else ignore_cols
        self.gen_feat_dict()

    def gen_feat_dict(self):
        """Build ``self.feat_dict`` and ``self.feat_dim`` from train + test data."""
        if self.dfTrain is None:
            dfTrain = pd.read_csv(self.trainfile)
        else:
            dfTrain = self.dfTrain
        if self.dfTest is None:
            dfTest = pd.read_csv(self.testfile)
        else:
            dfTest = self.dfTest

        # Index over the union of both sets so that values occurring only in
        # the test data still receive an index.
        df = pd.concat([dfTrain, dfTest])

        self.feat_dict = {}
        tc = 0  # next free feature index
        for col in df.columns:
            if col in self.ignore_cols:
                continue
            if col in self.numeric_cols:
                # A numeric column occupies exactly one index.
                self.feat_dict[col] = tc
                tc += 1
            else:
                # Any other column gets one index per distinct value.
                us = df[col].unique()
                self.feat_dict[col] = dict(zip(us, range(tc, len(us) + tc)))
                tc += len(us)
        self.feat_dim = tc
class DataParser(object):
    """Turn raw rows into feature-index and feature-value lists.

    ``feat_dict`` is an object exposing ``feat_dict`` (column -> index, or
    column -> ``{value: index}``), ``numeric_cols`` and ``ignore_cols``.
    """

    def __init__(self, feat_dict):
        """Keep a reference to the feature dictionary used for index lookups."""
        self.feat_dict = feat_dict

    def parse(self, infile=None, df=None, has_label=False):
        """Parse a CSV file or DataFrame into index and value lists.

        Exactly one of ``infile`` / ``df`` must be given. A given ``df`` is
        copied first, so the caller's frame is not modified.

        Args:
            infile: Path of a CSV file readable by ``pd.read_csv``.
            df: DataFrame to parse.
            has_label: If true, the data must contain ``'ID'`` and
                ``'target'`` columns and the targets are returned;
                otherwise it must contain ``'ID'`` and the IDs are returned.

        Returns:
            ``(xi, xv, y)`` when ``has_label`` is true, else ``(xi, xv, ids)``.
            ``xi`` holds, per row, the feature index of every kept column;
            ``xv`` holds, per row, the raw value of every kept column.

        Raises:
            AssertionError: If neither or both of ``infile`` / ``df`` are set.
        """
        assert not ((infile is None) and (df is None)), "infile or df at least one is set"
        assert not ((infile is not None) and (df is not None)), "only one can be set"
        if infile is None:
            dfi = df.copy()
        else:
            dfi = pd.read_csv(infile)

        if has_label:
            y = dfi['target'].values.tolist()
            dfi.drop(["ID", 'target'], axis=1, inplace=True)
        else:
            ids = dfi['ID'].values.tolist()
            dfi.drop(['ID'], axis=1, inplace=True)

        # Remove all ignored columns in one step, before the value frame is
        # copied, so neither frame ever carries them.
        ignore_cols = self.feat_dict.ignore_cols
        ignored = [col for col in dfi.columns if col in ignore_cols]
        if ignored:
            dfi.drop(ignored, axis=1, inplace=True)

        # dfi for feature index
        # dfv for feature value; it keeps the raw column values, including
        # the raw category values of non-numeric columns.
        dfv = dfi.copy()

        feat_index = self.feat_dict.feat_dict
        numeric_cols = self.feat_dict.numeric_cols
        for col in dfi.columns:
            if col in numeric_cols:
                # Every row of a numeric column shares the column's one index.
                dfi[col] = feat_index[col]
            else:
                # Values missing from the mapping become NaN (Series.map).
                dfi[col] = dfi[col].map(feat_index[col])

        xi = dfi.values.tolist()
        xv = dfv.values.tolist()
        if has_label:
            return xi, xv, y
        else:
            return xi, xv, ids