a b/data_generator.py
1
import pandas
2
import numpy as np
3
from tensorflow.keras.utils import to_categorical
4
from sklearn.model_selection import train_test_split
5
from sklearn.utils.class_weight import compute_class_weight
6
7
8
def data_generator(csv='data/challenge_1_gut_microbiome_data.csv'):
    """Load the microbiome CSV and build train/test arrays for a 4-class model.

    Every column except ``'Sample'`` and ``'disease'`` is treated as a feature.
    Features are z-score normalised per column and clamped to ``[-5, 5]``; the
    ``'disease'`` column is encoded to integer class ids and one-hot encoded.

    Args:
        csv: Path to the CSV file. It must contain a ``'disease'`` column whose
            values are 'Disease-1', 'Disease-3', 'Disease-2' or 'Healthy'.

    Returns:
        A 7-tuple ``(X_train, X_test, y_train, y_test, y_train_categorical,
        y_test_categorical, class_weight)`` where

        * ``X_train`` / ``X_test`` are float32 feature arrays (80/20 split,
          ``random_state=3``),
        * ``y_train`` / ``y_test`` are float32 class ids,
        * ``y_*_categorical`` are the one-hot encodings with 4 classes,
        * ``class_weight`` holds the 'balanced' class weights computed over
          the whole data set (not only the training split), rescaled so that
          they sum to 1, ordered by class id.

    Raises:
        ValueError: If the ``'disease'`` column contains a missing value or a
            label that is not one of the four known classes.
    """
    # Class ids follow the order used by the original encoding: note that
    # 'Disease-3' maps to 1 and 'Disease-2' maps to 2.
    label_codes = {'Disease-1': 0, 'Disease-3': 1, 'Disease-2': 2, 'Healthy': 3}
    n_classes = len(label_codes)

    # reading in the csv file
    df = pandas.read_csv(csv)
    bact = df.drop(columns=['Sample', 'disease'], errors='ignore')

    # mean normalization; a zero-variance column would give 0/0 = NaN, so its
    # divisor is replaced by 1 and the column becomes all zeros instead.
    col_mean = bact.mean()
    col_std = bact.std().replace(0, 1)
    mean_normalized_df = (bact - col_mean) / col_std

    # removes the crazy outliers: clamp everything beyond 5 standard deviations
    mean_normalized_df = mean_normalized_df.clip(lower=-5, upper=5)

    # getting the labels; map() builds a new Series instead of writing ints
    # into the string column of df one element at a time.
    codes = df['disease'].map(label_codes)
    unmapped = codes.isna()
    if unmapped.any():
        bad_labels = sorted({repr(v) for v in df.loc[unmapped, 'disease']})
        raise ValueError(
            "unexpected value(s) in 'disease' column: " + ", ".join(bad_labels)
        )

    # splitting the data
    features = np.asarray(mean_normalized_df).astype('float32')
    labels = np.asarray(codes).astype('float32')

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=3,
    )

    # categorizing the data
    y_train_categorical = to_categorical(y_train, num_classes=n_classes)
    y_test_categorical = to_categorical(y_test, num_classes=n_classes)

    # compute class weights; `classes` is passed as an ndarray because that is
    # the type scikit-learn documents for this parameter.
    class_weight = compute_class_weight(
        class_weight='balanced',
        classes=np.arange(n_classes),
        y=labels,
    )
    # rescale so the weights sum to 1
    class_weight = class_weight / class_weight.sum()

    # return
    return (
        X_train,
        X_test,
        y_train,
        y_test,
        y_train_categorical,
        y_test_categorical,
        class_weight,
    )