In [1]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, IsolationForest, AdaBoostClassifier, StackingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, BaggingClassifier
from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import CategoricalNB, BernoulliNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate

from time import time 
import warnings
import pandas as pd
import joblib

warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('../../../data/input/train.csv')

In [3]:
feature = data.drop('LUNG_CANCER', axis='columns')
label = data['LUNG_CANCER']

In [4]:
# Train every classifiers
estimators = {
    'logreg': LogisticRegression(solver='lbfgs', max_iter=200),
    'ridge': RidgeClassifier(),
    'passive_agressive': PassiveAggressiveClassifier(),
    'sgd': SGDClassifier(),
    'tree': DecisionTreeClassifier(),
    'extra_tree': ExtraTreesClassifier(),
    'random_forest': RandomForestClassifier(),
    'isolation_forest': IsolationForest(),
    'adaboost': AdaBoostClassifier(),
    'stacking': StackingClassifier(estimators=[('tree', DecisionTreeClassifier())]), 
    'gradient_boosting': GradientBoostingClassifier(), 
    'hist_gradient_boosting': HistGradientBoostingClassifier(), 
    'voting': VotingClassifier(estimators=[('tree', DecisionTreeClassifier())]), 
    'bagging': BaggingClassifier(),
    'radius_neighbors': RadiusNeighborsClassifier(), 
    'kneighbors': KNeighborsClassifier(),
    'svm': SVC(),
    'linear_svm': LinearSVC(),
    'categorical': CategoricalNB(),
    'bernoulli': BernoulliNB(), 
    'gaussian': GaussianNB(),
    'neural_net': MLPClassifier()
}

best_accuracy = 0
best_precision = 0
best_recall = 0
best_f1 = 0
best_model = None
model_name = None

for estimator_name, estimator in estimators.items():
    # Iterate & validate the model
    cv = cross_validate(estimator, feature, label, scoring=['accuracy', 'precision', 'recall', 'f1'], cv=10)
    
    accuracy = cv['test_accuracy'].mean()
    precision = cv['test_precision'].mean()
    recall = cv['test_recall'].mean()
    f1 = cv['test_f1'].mean()
 
    # Select the performing model
    if accuracy > best_accuracy:
       best_accuracy = accuracy
       best_precision = precision
       best_recall = recall
       best_f1 = f1
       model_name = estimator_name
       best_model = estimator

    print(f'{estimator_name} -> ( Accuracy: {(accuracy * 100):.0f}%) - (Precision: {(precision * 100):.0f}%) - (Recall: {(recall * 100):.0f}%) - (Measure: {(f1 * 100):.0f}%)')

print(f'\nBest Model: {model_name}')
print(f'Accuracy: {best_accuracy}')
print(f'Precision: {best_precision}')
print(f'Recall: {best_recall}')
print(f'Measure: {best_f1}')

# Re-train the performing model
best_model.fit(feature, label)

# Save the trained model
model = joblib.dump(best_model, f'../../../models/{model_name}.joblib')
print(f'Model Saved: {model}')

logreg -> ( Accuracy: 92%) - (Precision: 91%) - (Recall: 94%) - (Measure: 93%)
ridge -> ( Accuracy: 90%) - (Precision: 93%) - (Recall: 89%) - (Measure: 91%)
passive_agressive -> ( Accuracy: 52%) - (Precision: 38%) - (Recall: 70%) - (Measure: 49%)
sgd -> ( Accuracy: 69%) - (Precision: 68%) - (Recall: 93%) - (Measure: 77%)
tree -> ( Accuracy: 89%) - (Precision: 91%) - (Recall: 88%) - (Measure: 90%)
extra_tree -> ( Accuracy: 92%) - (Precision: 93%) - (Recall: 92%) - (Measure: 92%)
random_forest -> ( Accuracy: 92%) - (Precision: 93%) - (Recall: 93%) - (Measure: 93%)
isolation_forest -> ( Accuracy: 4%) - (Precision: nan%) - (Recall: nan%) - (Measure: nan%)
adaboost -> ( Accuracy: 91%) - (Precision: 91%) - (Recall: 93%) - (Measure: 92%)
stacking -> ( Accuracy: 89%) - (Precision: 90%) - (Recall: 89%) - (Measure: 89%)
gradient_boosting -> ( Accuracy: 93%) - (Precision: 94%) - (Recall: 93%) - (Measure: 93%)
hist_gradient_boosting -> ( Accuracy: 91%) - (Precision: 93%) - (Recall: 90%) - (Measure