Diff of /diff_sex/DataReader.py [000000] .. [d2c46b]

Switch to side-by-side view

--- a
+++ b/diff_sex/DataReader.py
@@ -0,0 +1,98 @@
+import pandas as pd
+
class FeatureDictionary(object):
    """Assign a global integer index to every feature in train + test data.

    Exactly one source must be supplied for each split: a CSV path
    (``trainfile`` / ``testfile``) or an in-memory DataFrame (``dfTrain`` /
    ``dfTest``).  The index is built eagerly by the constructor.

    Attributes set on the instance:
        feat_dict: maps a column name either to a single int (columns listed
            in ``numeric_cols``) or to a ``{value: int}`` dict (every other
            non-ignored column, one index per distinct value).
        feat_dim: total number of indices handed out.

    Note that only ``ignore_cols`` is skipped; any other column present in
    the data (identifier or label columns included) is indexed.
    """

    def __init__(self, trainfile=None, testfile=None,
                 dfTrain=None, dfTest=None, numeric_cols=None,
                 ignore_cols=None):
        """Validate the inputs, store them, and build the feature index.

        Args:
            trainfile: path to the training CSV, or None if ``dfTrain`` is given.
            testfile: path to the test CSV, or None if ``dfTest`` is given.
            dfTrain: training DataFrame, or None if ``trainfile`` is given.
            dfTest: test DataFrame, or None if ``testfile`` is given.
            numeric_cols: column names that receive one index each.
                Defaults to an empty list.
            ignore_cols: column names left out of the index entirely.
                Defaults to an empty list.

        Raises:
            AssertionError: if neither or both sources are given for a split.
        """
        assert not ((trainfile is None) and (dfTrain is None)), "trainfile or dfTrain at least one is set"
        assert not ((trainfile is not None) and (dfTrain is not None)), "only one can be set"
        assert not ((testfile is None) and (dfTest is None)), "testfile or dfTest at least one is set"
        assert not ((testfile is not None) and (dfTest is not None)), "only one can be set"

        self.trainfile = trainfile
        self.testfile = testfile
        self.dfTrain = dfTrain
        self.dfTest = dfTest
        # A fresh list per instance: a shared mutable default would leak
        # modifications from one FeatureDictionary into the next.
        self.numeric_cols = [] if numeric_cols is None else numeric_cols
        self.ignore_cols = [] if ignore_cols is None else ignore_cols
        self.gen_feat_dict()

    def gen_feat_dict(self):
        """Build ``self.feat_dict`` and ``self.feat_dim`` from train + test.

        Columns are visited in the order of the concatenated frame.  A
        numeric column consumes one index; any other column consumes one
        index per distinct value observed across both splits.
        """
        dfTrain = pd.read_csv(self.trainfile) if self.dfTrain is None else self.dfTrain
        dfTest = pd.read_csv(self.testfile) if self.dfTest is None else self.dfTest

        # Index over both splits so values that only occur in the test data
        # still receive an index.
        df = pd.concat([dfTrain, dfTest])

        self.feat_dict = {}
        next_index = 0
        for col in df.columns:
            if col in self.ignore_cols:
                continue
            if col in self.numeric_cols:
                self.feat_dict[col] = next_index
                next_index += 1
            else:
                distinct = df[col].unique()
                self.feat_dict[col] = {
                    value: next_index + offset
                    for offset, value in enumerate(distinct)
                }
                next_index += len(distinct)

        self.feat_dim = next_index
+
+
class DataParser(object):
    """Turn a raw table into parallel feature-index and feature-value lists.

    ``feat_dict`` is an object exposing ``feat_dict`` (column -> int or
    column -> ``{value: int}``), ``numeric_cols`` and ``ignore_cols``.
    """

    def __init__(self, feat_dict):
        self.feat_dict = feat_dict

    def parse(self, infile=None, df=None, has_label=False):
        """Convert one data set into index rows and value rows.

        Args:
            infile: path to a CSV file, or None if ``df`` is given.
            df: DataFrame to parse, or None if ``infile`` is given.  It is
                copied first, so the caller's frame is not modified.
            has_label: when True the ``target`` column is read as the label
                and returned; otherwise the ``ID`` column is returned.

        Returns:
            ``(xi, xv, y)`` when ``has_label`` is true, else ``(xi, xv, ids)``.
            ``xi`` holds the feature index per cell and ``xv`` the cell
            values, both as row-major nested lists over the remaining
            columns.

        Raises:
            AssertionError: if neither or both of ``infile`` / ``df`` are given.
        """
        assert not ((infile is None) and (df is None)), "infile or df at least one is set"
        assert not ((infile is not None) and (df is not None)), "only one can be set"

        dfi = df.copy() if infile is None else pd.read_csv(infile)

        if has_label:
            y = dfi['target'].values.tolist()
            dfi.drop(["ID", 'target'], axis=1, inplace=True)
        else:
            ids = dfi['ID'].values.tolist()
            dfi.drop(['ID'], axis=1, inplace=True)

        # Remove ignored columns in one step before the loop, so the frame
        # is never mutated while its columns are being iterated.
        ignored = [col for col in dfi.columns if col in self.feat_dict.ignore_cols]
        if ignored:
            dfi.drop(ignored, axis=1, inplace=True)

        # dfi becomes the feature-index table, dfv keeps the feature values.
        dfv = dfi.copy()
        for col in dfi.columns:
            if col in self.feat_dict.numeric_cols:
                # A numeric column has a single index shared by every row.
                dfi[col] = self.feat_dict.feat_dict[col]
            else:
                # Values missing from the dictionary map to NaN.
                dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])
                # NOTE(review): dfv keeps the raw categorical value here; a
                # previous `dfv[col] = 1.` assignment was disabled in the
                # original code - confirm the raw value is what callers expect.

        xi = dfi.values.tolist()
        xv = dfv.values.tolist()

        if has_label:
            return xi, xv, y
        return xi, xv, ids
+
+