pharmahacks_multi_disease / Git / Diff of /data

Models:
Mathew-S/
pharmahacks_multi_disease
Downloads: 1
Diff of /data_generator.py [000000] .. [11ca2d]
Switch to side-by-side view

--- a
+++ b/data_generator.py
@@ -0,0 +1,76 @@
+import pandas
+import numpy as np
+from tensorflow.keras.utils import to_categorical
+from sklearn.model_selection import train_test_split
+from sklearn.utils.class_weight import compute_class_weight
+
+
+def data_generator(csv='data/challenge_1_gut_microbiome_data.csv'):
+    # reading in the csv file
+    df = pandas.read_csv(csv)
+    bact = df[[i for i in df.columns if i != 'Sample' and i != 'disease']]
+
+    # mean normalization
+    mean = bact.mean()
+    std = bact.std()
+    mean_normalized_df = (bact-mean) / std
+
+    # min max normalization
+    min = bact.min()
+    max = bact.max()
+    min_max_normalized_df = (bact-min) / (max-min)
+
+    # column names
+    mndf_cols = mean_normalized_df.columns
+
+    # removes the crazy outliers
+    count = 0
+    for i, col in enumerate(mndf_cols):
+        rows = mean_normalized_df[col]
+        for j, row in enumerate(rows):
+            if row < -5:
+                mean_normalized_df.at[j, col] = -5
+            elif row > 5:
+                mean_normalized_df.at[j, col] = 5
+
+    # getting the labels
+    labels = df['disease']
+    possible_labels = ['Disease-1', 'Disease-3', 'Disease-2', 'Healthy']
+    for i, label in enumerate(labels):
+        if label == possible_labels[0]:
+            labels[i] = 0
+        elif label == possible_labels[1]:
+            labels[i] = 1
+        elif label == possible_labels[2]:
+            labels[i] = 2
+        elif label == possible_labels[3]:
+            labels[i] = 3
+
+    # splitting the data
+    mean_normalized_df = np.asarray(mean_normalized_df).astype('float32')
+    labels = np.asarray(labels).astype('float32')
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        mean_normalized_df, 
+        labels, 
+        test_size=0.2, 
+        random_state=3
+    )
+
+    # categorizing the data
+    y_train_categorical = to_categorical(y_train, num_classes=4)
+    y_test_categorical = to_categorical(y_test, num_classes=4)
+
+    # compute class weights
+    class_weight = compute_class_weight(
+        class_weight='balanced',
+        classes = [0,1,2,3], 
+        y=labels
+    )
+    sum = 0
+    for i in class_weight:
+        sum += i
+    class_weight = class_weight / sum
+
+    # return
+    return X_train, X_test, y_train, y_test, y_train_categorical, y_test_categorical, class_weight
\ No newline at end of file