|
a |
|
b/data_generator.py |
|
|
1 |
import pandas |
|
|
2 |
import numpy as np |
|
|
3 |
from tensorflow.keras.utils import to_categorical |
|
|
4 |
from sklearn.model_selection import train_test_split |
|
|
5 |
from sklearn.utils.class_weight import compute_class_weight |
|
|
6 |
|
|
|
7 |
|
|
|
8 |
def data_generator(csv='data/challenge_1_gut_microbiome_data.csv'):
    """Load the gut-microbiome CSV and build train/test data for a classifier.

    Parameters
    ----------
    csv : str
        Path to the input CSV. Expected to contain a 'Sample' id column,
        a 'disease' label column, and numeric bacteria-abundance columns.
        -- assumes labels are exactly the four strings mapped below; TODO confirm.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test,
         y_train_categorical, y_test_categorical, class_weight)
        X_* are float32 arrays of z-scored features clipped to [-5, 5];
        y_* are float32 class ids in {0, 1, 2, 3}; the *_categorical pair
        is the one-hot encoding; class_weight is a length-4 array of
        balanced class weights normalized to sum to 1.
    """
    # Read the raw table and keep only the feature (bacteria) columns.
    df = pandas.read_csv(csv)
    bact = df[[col for col in df.columns if col not in ('Sample', 'disease')]]

    # Z-score normalization, then clip extreme outliers to +/-5 sigma.
    # (Replaces the original O(rows*cols) Python loop with a vectorized clip;
    # the loop also mixed a positional index into label-based .at[] access.)
    mean_normalized_df = (bact - bact.mean()) / bact.std()
    mean_normalized_df = mean_normalized_df.clip(lower=-5, upper=5)

    # Map label strings to integer class ids. The order is fixed so the
    # one-hot columns below keep the same meaning across runs; Series.map
    # avoids the original in-place mutation of df['disease'].
    label_to_id = {'Disease-1': 0, 'Disease-3': 1, 'Disease-2': 2, 'Healthy': 3}
    labels = df['disease'].map(label_to_id)

    # Convert to float32 arrays for the downstream Keras model.
    features = np.asarray(mean_normalized_df).astype('float32')
    targets = np.asarray(labels).astype('float32')

    # Fixed random_state keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        targets,
        test_size=0.2,
        random_state=3
    )

    # One-hot encode the targets for categorical-crossentropy training.
    y_train_categorical = to_categorical(y_train, num_classes=4)
    y_test_categorical = to_categorical(y_test, num_classes=4)

    # Balanced class weights over the full label set, normalized to sum to 1.
    # (ndarray.sum() replaces the original manual loop that shadowed the
    # builtin `sum`; `min`/`max` shadowing from the dead min-max block is
    # gone along with the unused min_max_normalized_df.)
    class_weight = compute_class_weight(
        class_weight='balanced',
        classes=np.array([0, 1, 2, 3]),
        y=targets
    )
    class_weight = class_weight / class_weight.sum()

    return X_train, X_test, y_train, y_test, y_train_categorical, y_test_categorical, class_weight