[11ca2d]: / data_generator.py

Download this file

76 lines (64 with data), 2.3 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
def data_generator(csv='data/challenge_1_gut_microbiome_data.csv'):
# reading in the csv file
df = pandas.read_csv(csv)
bact = df[[i for i in df.columns if i != 'Sample' and i != 'disease']]
# mean normalization
mean = bact.mean()
std = bact.std()
mean_normalized_df = (bact-mean) / std
# min max normalization
min = bact.min()
max = bact.max()
min_max_normalized_df = (bact-min) / (max-min)
# column names
mndf_cols = mean_normalized_df.columns
# removes the crazy outliers
count = 0
for i, col in enumerate(mndf_cols):
rows = mean_normalized_df[col]
for j, row in enumerate(rows):
if row < -5:
mean_normalized_df.at[j, col] = -5
elif row > 5:
mean_normalized_df.at[j, col] = 5
# getting the labels
labels = df['disease']
possible_labels = ['Disease-1', 'Disease-3', 'Disease-2', 'Healthy']
for i, label in enumerate(labels):
if label == possible_labels[0]:
labels[i] = 0
elif label == possible_labels[1]:
labels[i] = 1
elif label == possible_labels[2]:
labels[i] = 2
elif label == possible_labels[3]:
labels[i] = 3
# splitting the data
mean_normalized_df = np.asarray(mean_normalized_df).astype('float32')
labels = np.asarray(labels).astype('float32')
X_train, X_test, y_train, y_test = train_test_split(
mean_normalized_df,
labels,
test_size=0.2,
random_state=3
)
# categorizing the data
y_train_categorical = to_categorical(y_train, num_classes=4)
y_test_categorical = to_categorical(y_test, num_classes=4)
# compute class weights
class_weight = compute_class_weight(
class_weight='balanced',
classes = [0,1,2,3],
y=labels
)
sum = 0
for i in class_weight:
sum += i
class_weight = class_weight / sum
# return
return X_train, X_test, y_train, y_test, y_train_categorical, y_test_categorical, class_weight