|
a |
|
b/app.py |
|
|
1 |
import pandas as pd |
|
|
2 |
import numpy as np |
|
|
3 |
|
|
|
4 |
# Load the raw dataset and discard the 'id' column.
data = pd.read_csv("kidney_disease.csv").drop(columns='id')

# Descriptive names for the 25 remaining columns. The assignment is
# positional, so the order here must match the column order of the CSV.
# NOTE(review): 'peda_edema' and 'aanemia' look misspelled but are kept as-is;
# anything that consumes the trained model may depend on these feature names.
data.columns = [
    'age',
    'blood_pressure',
    'specific_gravity',
    'albumin',
    'sugar',
    'red_blood_cells',
    'pus_cell',
    'pus_cell_clumps',
    'bacteria',
    'blood_glucose_random',
    'blood_urea',
    'serum_creatinine',
    'sodium',
    'potassium',
    'haemoglobin',
    'packed_cell_volume',
    'white_blood_cell_count',
    'red_blood_cell_count',
    'hypertension',
    'diabetes_mellitus',
    'coronary_artery_disease',
    'appetite',
    'peda_edema',
    'aanemia',
    'class',
]
|
|
12 |
|
|
|
13 |
# Convert these three columns to numbers. errors='coerce' turns any entry
# that cannot be parsed into NaN instead of raising.
for col in ('packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count'):
    data[col] = pd.to_numeric(data[col], errors='coerce')
|
|
16 |
|
|
|
17 |
# Split the columns by dtype into categorical (non-numeric) and numeric lists.
# is_numeric_dtype is used instead of comparing the dtype against 'object' so
# that text columns stored with a pandas string dtype are still classified as
# categorical rather than silently landing in num_cols.
# NOTE: 'class' has not been mapped to integers yet at this point, so it is
# part of cat_cols.
cat_cols = [col for col in data.columns if not pd.api.types.is_numeric_dtype(data[col])]
num_cols = [col for col in data.columns if pd.api.types.is_numeric_dtype(data[col])]
|
|
19 |
|
|
|
20 |
# Normalise label variants that carry stray tabs or spaces.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data[...]: that in-place call operates on a
# temporary Series and does not update `data` when pandas copy-on-write is
# active (and triggers a chained-assignment warning on recent pandas 2.x).
data['diabetes_mellitus'] = data['diabetes_mellitus'].replace(
    to_replace={'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'}
)
data['coronary_artery_disease'] = data['coronary_artery_disease'].replace(
    to_replace='\tno', value='no'
)

# Clean the target labels, then encode them: 'ckd' -> 0, 'not ckd' -> 1.
# Any label outside the mapping becomes NaN.
data['class'] = data['class'].replace(to_replace={'ckd\t': 'ckd', 'notckd': 'not ckd'})
data['class'] = data['class'].map({'ckd': 0, 'not ckd': 1})
data['class'] = pd.to_numeric(data['class'], errors='coerce')
|
|
26 |
|
|
|
27 |
def random_sampling(feature, random_state=None):
    """Fill missing values of ``data[feature]`` with randomly drawn observed values.

    The module-level ``data`` frame is modified in place.

    Args:
        feature: Name of the column to impute.
        random_state: Optional seed forwarded to ``Series.sample`` so the
            imputation can be reproduced. ``None`` keeps the draw unseeded.

    Raises:
        ValueError: If the column has missing values but no observed value
            to draw from.
    """
    missing = data[feature].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        # Nothing to impute.
        return

    observed = data[feature].dropna()
    if observed.empty:
        raise ValueError(f"Cannot impute {feature!r}: the column has no observed values")

    # Sampling without replacement fails when more values are missing than
    # observed, so fall back to sampling with replacement in that case only.
    drawn = observed.sample(
        n_missing,
        replace=n_missing > len(observed),
        random_state=random_state,
    )
    # Assign by position; the drawn values keep the index labels of their
    # source rows, which must not be used for alignment.
    data.loc[missing, feature] = drawn.to_numpy()
|
|
31 |
|
|
|
32 |
def impute_mode(feature):
    """Replace missing entries of ``data[feature]`` with the column's mode.

    When several values tie for most frequent, the first one returned by
    ``Series.mode`` is used. The module-level ``data`` frame is updated.
    """
    column = data[feature]
    most_frequent = column.mode()[0]
    data[feature] = column.fillna(most_frequent)
|
|
35 |
|
|
|
36 |
# Numeric columns: fill gaps with randomly drawn observed values.
for col in num_cols:
    random_sampling(col)

# These two categorical columns are also imputed by random sampling.
for col in ('red_blood_cells', 'pus_cell'):
    random_sampling(col)

# Whatever is still missing in the categorical columns gets the column mode.
for col in cat_cols:
    impute_mode(col)
|
|
43 |
|
|
|
44 |
from sklearn.preprocessing import LabelEncoder

# Turn every categorical column into integer codes. One encoder instance is
# re-fitted per column, so after the loop `le` only remembers the classes of
# the last column it processed.
le = LabelEncoder()
for col in cat_cols:
    data[col] = le.fit_transform(data[col])
|
|
48 |
|
|
|
49 |
from sklearn.model_selection import train_test_split

# Target is the 'class' column; features are every other column.
Y = data['class']
X = data.drop(columns='class')

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=40,
)
print(Y_test)
|
|
55 |
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): no random_state is passed to the classifier, so the fitted
# model (and the scores below) can differ between runs.
rand_clf = RandomForestClassifier(
    criterion="gini",
    max_depth=10,
    max_features="sqrt",
    min_samples_leaf=1,
    min_samples_split=7,
    n_estimators=400,
)
rand_clf.fit(X_train, Y_train)

# Predict once per split and reuse the results for every metric below,
# instead of re-running the 400-tree forest for each report line.
train_pred = rand_clf.predict(X_train)
test_pred = rand_clf.predict(X_test)
rand_clf_acc = accuracy_score(Y_test, test_pred)

print(f"Training Accuracy of Random Forest is {accuracy_score(Y_train, train_pred)}")
print(f"Testing Accuracy of Random Forest is {rand_clf_acc}")
print(f"Confusion Matrix of Random Forest is \n {confusion_matrix(Y_test, test_pred)}\n")
print(f"Classification Report of Random Forest is \n{classification_report(Y_test, test_pred)}")
|
|
64 |
|
|
|
65 |
import joblib

# Persist the fitted classifier to disk in the current working directory.
joblib.dump(rand_clf, 'kidney_disease_model.pkl')