|
a |
|
b/Models/decisiontrees.py |
|
|
1 |
# Importing the libraries |
|
|
2 |
import numpy as np |
|
|
3 |
import matplotlib.pyplot as plt |
|
|
4 |
import pandas as pd |
|
|
5 |
from sklearn.model_selection import GridSearchCV |
|
|
6 |
from sklearn.tree import DecisionTreeClassifier |
|
|
7 |
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve |
|
|
8 |
|
|
|
9 |
# Importing the dataset
# The path is relative to the current working directory.
dataset = pd.read_csv('../Dataset/diabetes.csv')

# Features are every column except the last one; the target is the last
# column. Both slices are expressed relative to the end of the frame so they
# cannot drift apart if a column is added or removed (the target used to be
# hard-coded as column index 8).
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
|
|
13 |
|
|
|
14 |
# Splitting the dataset into the Training set and Test set
# A quarter of the rows is held out for testing; the fixed seed makes the
# split reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
|
|
17 |
|
|
|
18 |
# Feature Scaling
# The scaler learns its statistics from the training split only and the same
# fitted transform is then applied to both splits, so nothing about the test
# rows leaks into the preprocessing.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
|
|
23 |
|
|
|
24 |
# Parameter evaluation
# Exhaustive search over 4 x 4 x 4 = 64 decision-tree configurations, ranked
# by cross-validated ROC AUC.
treeclf = DecisionTreeClassifier(random_state=42)
parameters = {
    'max_depth': [6, 7, 8, 9],
    'min_samples_split': [2, 3, 4, 5],
    'max_features': [1, 2, 3, 4],
}

# NOTE(review): cv=100 produces very small validation folds, which makes the
# per-fold ROC AUC noisy and costs 64 x 100 fits -- consider a smaller cv.
gridsearch = GridSearchCV(treeclf, parameters, cv=100, scoring='roc_auc')

# Search on the training split only. Fitting on the full X, y would let the
# held-out test rows influence which hyper-parameters are selected, so the
# test-set scores reported later would be optimistically biased.
gridsearch.fit(X_train, y_train)

print(gridsearch.best_params_)
print(gridsearch.best_score_)
|
|
34 |
|
|
|
35 |
# Fitting the final decision tree
# NOTE(review): these hyper-parameters look like the output of an earlier
# grid-search run -- confirm they still match gridsearch.best_params_.
tree = DecisionTreeClassifier(
    max_depth=6,
    max_features=4,
    min_samples_split=5,
    random_state=42,
)

# Reuse the train/test arrays created earlier in the script. Splitting X, y a
# second time here silently replaced the standardized arrays with raw ones and
# left two split calls that had to be kept in sync by hand.
tree.fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
|
|
43 |
|
|
|
44 |
# Predicting the Test set results
y_pred = tree.predict(X_test)
# Score for the positive class (second predict_proba column). ROC AUC ranks
# samples by score; feeding it hard 0/1 labels collapses the curve to a
# single operating point.
y_score = tree.predict_proba(X_test)[:, 1]

# Making the Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix

# Rows are the true classes and columns the predicted classes, so for a
# binary target: cm[0, 0] = TN, cm[0, 1] = FP, cm[1, 0] = FN, cm[1, 1] = TP.
cm = confusion_matrix(y_test, y_pred)

# The first label used to read "TP - True Negative"; cm[0, 0] is the TN count.
print('TN - True Negative {}'.format(cm[0, 0]))
print('FP - False Positive {}'.format(cm[0, 1]))
print('FN - False Negative {}'.format(cm[1, 0]))
print('TP - True Positive {}'.format(cm[1, 1]))

total = np.sum(cm)
correct = np.sum([cm[0, 0], cm[1, 1]])
wrong = np.sum([cm[0, 1], cm[1, 0]])
print('Accuracy Rate: {}'.format(np.divide(correct, total)))
print('Misclassification Rate: {}'.format(np.divide(wrong, total)))

# The rounded AUC was previously a bare expression, so running the file as a
# script discarded it; print it like the other metrics.
print('ROC AUC: {}'.format(round(roc_auc_score(y_test, y_score), 5)))