Diff of /diff_sex/DataReader.py [000000] .. [d2c46b]

Switch to unified view

a b/diff_sex/DataReader.py
1
import pandas as pd
2
3
class FeatureDictionary(object):
    """Assign a global integer index to every feature seen in train + test.

    Exactly one of ``trainfile`` / ``dfTrain`` and exactly one of
    ``testfile`` / ``dfTest`` must be given.  File arguments are read with
    ``pd.read_csv``; DataFrame arguments are used as-is.

    After construction the instance exposes:

    * ``feat_dict`` -- maps each kept column name to either a single int
      (columns listed in ``numeric_cols``) or a ``{value: int}`` dict with
      one entry per distinct value of the column (all other columns).
    * ``feat_dim`` -- total number of indices handed out.
    * ``numeric_cols`` / ``ignore_cols`` -- the collections passed in
      (empty lists when omitted).
    """

    def __init__(self, trainfile=None, testfile=None,
                 dfTrain=None, dfTest=None, numeric_cols=None,
                 ignore_cols=None):
        """Store the inputs and immediately build the feature dictionary.

        Args:
            trainfile: path of the training CSV, or None if ``dfTrain`` is given.
            testfile: path of the test CSV, or None if ``dfTest`` is given.
            dfTrain: training DataFrame, or None if ``trainfile`` is given.
            dfTest: test DataFrame, or None if ``testfile`` is given.
            numeric_cols: column names that receive one index each.
                Defaults to an empty list.
            ignore_cols: column names that receive no index at all.
                Defaults to an empty list.

        Raises:
            AssertionError: if neither or both sources are supplied for the
                training data or for the test data.
        """
        assert not ((trainfile is None) and (dfTrain is None)), "trainfile or dfTrain at least one is set"
        assert not ((trainfile is not None) and (dfTrain is not None)), "only one can be set"
        assert not ((testfile is None) and (dfTest is None)), "testfile or dfTest at least one is set"
        assert not ((testfile is not None) and (dfTest is not None)), "only one can be set"

        self.trainfile = trainfile
        self.testfile = testfile
        self.dfTrain = dfTrain
        self.dfTest = dfTest
        # A fresh list per instance: a literal ``[]`` default would be one
        # object shared by every instance constructed without the argument.
        self.numeric_cols = [] if numeric_cols is None else numeric_cols
        self.ignore_cols = [] if ignore_cols is None else ignore_cols
        self.gen_feat_dict()

    def gen_feat_dict(self):
        """Populate ``self.feat_dict`` and ``self.feat_dim``.

        Train and test rows are concatenated so that values occurring only
        in the test data still get an index.  Columns are visited in the
        order of the concatenated frame and indices are handed out
        consecutively starting at 0.
        """
        if self.dfTrain is None:
            dfTrain = pd.read_csv(self.trainfile)
        else:
            dfTrain = self.dfTrain

        if self.dfTest is None:
            dfTest = pd.read_csv(self.testfile)
        else:
            dfTest = self.dfTest

        df = pd.concat([dfTrain, dfTest])

        self.feat_dict = {}
        tc = 0  # next free feature index
        for col in df.columns:
            if col in self.ignore_cols:
                continue
            if col in self.numeric_cols:
                # The whole column shares a single index.
                self.feat_dict[col] = tc
                tc += 1
            else:
                # One index per distinct value, in order of first appearance.
                us = df[col].unique()
                self.feat_dict[col] = dict(zip(us, range(tc, len(us) + tc)))
                tc += len(us)

        self.feat_dim = tc
54
55
56
class DataParser(object):
    """Turn a raw table into parallel feature-index / feature-value lists.

    ``feat_dict`` is an object exposing ``feat_dict`` (column -> index or
    ``{value: index}`` mapping), ``numeric_cols`` and ``ignore_cols``; those
    three attributes are the only ones read here.
    """

    def __init__(self, feat_dict):
        """Keep a reference to the feature dictionary used by ``parse``."""
        self.feat_dict = feat_dict

    def parse(self, infile=None, df=None, has_label=False,
              id_col="ID", target_col="target"):
        """Convert one dataset into index and value matrices.

        Args:
            infile: path of a CSV to read, or None if ``df`` is given.
            df: DataFrame to parse, or None if ``infile`` is given.  It is
                copied first, so the caller's frame is not modified.
            has_label: when True the ``target_col`` column is returned as
                the label list; when False the ``id_col`` column is
                returned instead.
            id_col: name of the identifier column (always removed from the
                features).
            target_col: name of the label column (removed from the features
                only when ``has_label`` is True).

        Returns:
            ``(xi, xv, y)`` when ``has_label`` is True, otherwise
            ``(xi, xv, ids)``.  ``xi`` and ``xv`` are row-wise nested lists
            with one entry per kept column: ``xi`` holds feature indices,
            ``xv`` holds the original cell values.

        Raises:
            AssertionError: if neither or both of ``infile`` / ``df`` are set.
        """
        assert not ((infile is None) and (df is None)), "infile or df at least one is set"
        assert not ((infile is not None) and (df is not None)), "only one can be set"

        if infile is None:
            dfi = df.copy()
        else:
            dfi = pd.read_csv(infile)

        if has_label:
            y = dfi[target_col].values.tolist()
            dfi.drop([id_col, target_col], axis=1, inplace=True)
        else:
            ids = dfi[id_col].values.tolist()
            dfi.drop([id_col], axis=1, inplace=True)

        # Remove every ignored column in one call, before the per-column
        # pass, so the frame is never reshaped while it is being walked.
        ignored = [col for col in dfi.columns
                   if col in self.feat_dict.ignore_cols]
        if ignored:
            dfi.drop(ignored, axis=1, inplace=True)

        # dfi ends up holding feature indices; dfv keeps the feature values.
        dfv = dfi.copy()
        for col in list(dfi.columns):
            if col in self.feat_dict.numeric_cols:
                # Numeric column: every row gets the column's single index.
                dfi[col] = self.feat_dict.feat_dict[col]
            else:
                # Categorical column: look up the index of each cell value.
                dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])
                # NOTE(review): dfv is left untouched here, so xv carries the
                # raw category value rather than 1.0 -- confirm this is what
                # downstream consumers expect.

        xi = dfi.values.tolist()
        xv = dfv.values.tolist()

        if has_label:
            return xi, xv, y
        else:
            return xi, xv, ids
97
98