{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Wczytanie danych" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "X = pd.read_csv(\"../transformed_data/X_train.csv\")\n", "y = pd.read_csv(\"../transformed_data/y_train.csv\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Procalcitonin [Mass/volume] in Serum or Plasma | \n", "Creatine kinase [Enzymatic activity/volume] in Serum or Plasma | \n", "C reactive protein [Mass/volume] in Serum or Plasma | \n", "White Blood Cell (Elevated) | \n", "RBC Distribution Width | \n", "Monocytes/100 leukocytes in Blood by Automated count | \n", "pH of Arterial blood | \n", "Fibrin D-dimer FEU [Mass/volume] in Platelet poor plasma | \n", "Basophils/100 leukocytes in Blood by Automated count | \n", "Anion Gap | \n", "... | \n", "Ketones [Presence] in Urine by Test strip_3+ | \n", "Ketones [Presence] in Urine by Test strip_trace | \n", "Clarity of Urine_cloudy | \n", "Clarity of Urine_translucent | \n", "Stage group.clinical Cancer_earlystage | \n", "Stage group.clinical Cancer_latestage | \n", "Rhinovirus RNA [Presence] in Respiratory specimen by NAA with probe detection_negative | \n", "HER2 [Presence] in Breast cancer specimen by Immune stain_negative | \n", "HER2 [Presence] in Breast cancer specimen by Immune stain_positive | \n", "Smokes tobacco daily_True | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.1 | \n", "33.91 | \n", "10.11 | \n", "13.4 | \n", "13.1 | \n", "10.025 | \n", "7.06 | \n", "0.43 | \n", "3.005 | \n", "8.2 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
1 | \n", "0.1 | \n", "33.91 | \n", "10.11 | \n", "13.4 | \n", "13.1 | \n", "10.025 | \n", "7.06 | \n", "0.43 | \n", "3.005 | \n", "8.2 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
2 | \n", "0.1 | \n", "33.91 | \n", "10.11 | \n", "13.4 | \n", "13.1 | \n", "10.025 | \n", "7.06 | \n", "0.43 | \n", "3.005 | \n", "8.2 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
3 | \n", "0.1 | \n", "33.91 | \n", "10.11 | \n", "13.4 | \n", "13.1 | \n", "10.025 | \n", "7.06 | \n", "0.43 | \n", "3.005 | \n", "8.2 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
4 | \n", "0.1 | \n", "33.91 | \n", "10.11 | \n", "13.4 | \n", "13.1 | \n", "10.025 | \n", "7.06 | \n", "0.43 | \n", "3.005 | \n", "8.2 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
5 rows × 222 columns
\n", "\n", " | Procalcitonin [Mass/volume] in Serum or Plasma | \n", "Creatine kinase [Enzymatic activity/volume] in Serum or Plasma | \n", "C reactive protein [Mass/volume] in Serum or Plasma | \n", "White Blood Cell (Elevated) | \n", "RBC Distribution Width | \n", "Monocytes/100 leukocytes in Blood by Automated count | \n", "pH of Arterial blood | \n", "Fibrin D-dimer FEU [Mass/volume] in Platelet poor plasma | \n", "Basophils/100 leukocytes in Blood by Automated count | \n", "Anion Gap | \n", "... | \n", "Ketones [Presence] in Urine by Test strip_3+ | \n", "Ketones [Presence] in Urine by Test strip_trace | \n", "Clarity of Urine_cloudy | \n", "Clarity of Urine_translucent | \n", "Stage group.clinical Cancer_earlystage | \n", "Stage group.clinical Cancer_latestage | \n", "Rhinovirus RNA [Presence] in Respiratory specimen by NAA with probe detection_negative | \n", "HER2 [Presence] in Breast cancer specimen by Immune stain_negative | \n", "HER2 [Presence] in Breast cancer specimen by Immune stain_positive | \n", "Smokes tobacco daily_True | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.1 | \n", "33.91 | \n", "10.11 | \n", "13.4 | \n", "13.1 | \n", "10.025 | \n", "7.06 | \n", "0.43 | \n", "3.005 | \n", "8.2 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
1 | \n", "0.1 | \n", "33.91 | \n", "10.11 | \n", "13.4 | \n", "13.1 | \n", "10.025 | \n", "7.06 | \n", "0.43 | \n", "3.005 | \n", "8.2 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
2 | \n", "0.1 | \n", "33.91 | \n", "10.11 | \n", "13.4 | \n", "13.1 | \n", "10.025 | \n", "7.06 | \n", "0.43 | \n", "3.005 | \n", "8.2 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
3 | \n", "0.1 | \n", "33.91 | \n", "10.11 | \n", "13.4 | \n", "13.1 | \n", "10.025 | \n", "7.06 | \n", "0.43 | \n", "3.005 | \n", "8.2 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
4 | \n", "0.1 | \n", "33.91 | \n", "10.11 | \n", "13.4 | \n", "13.1 | \n", "10.025 | \n", "7.06 | \n", "0.43 | \n", "3.005 | \n", "8.2 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
5 rows × 222 columns
\n", "\n", " | Procalcitonin [Mass/volume] in Serum or Plasma | \n", "Creatine kinase [Enzymatic activity/volume] in Serum or Plasma | \n", "C reactive protein [Mass/volume] in Serum or Plasma | \n", "White Blood Cell (Elevated) | \n", "RBC Distribution Width | \n", "Monocytes/100 leukocytes in Blood by Automated count | \n", "pH of Arterial blood | \n", "Fibrin D-dimer FEU [Mass/volume] in Platelet poor plasma | \n", "Basophils/100 leukocytes in Blood by Automated count | \n", "Anion Gap | \n", "... | \n", "Ketones [Presence] in Urine by Test strip_3+ | \n", "Ketones [Presence] in Urine by Test strip_trace | \n", "Clarity of Urine_cloudy | \n", "Clarity of Urine_translucent | \n", "Stage group.clinical Cancer_earlystage | \n", "Stage group.clinical Cancer_latestage | \n", "Rhinovirus RNA [Presence] in Respiratory specimen by NAA with probe detection_negative | \n", "HER2 [Presence] in Breast cancer specimen by Immune stain_negative | \n", "HER2 [Presence] in Breast cancer specimen by Immune stain_positive | \n", "Smokes tobacco daily_True | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "... | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "9048.000000 | \n", "
mean | \n", "0.100612 | \n", "34.221747 | \n", "10.119776 | \n", "13.396253 | \n", "13.098442 | \n", "10.025219 | \n", "7.060000 | \n", "0.462666 | \n", "3.005082 | \n", "8.202321 | \n", "... | \n", "0.149425 | \n", "0.034483 | \n", "0.136715 | \n", "0.053050 | \n", "0.011826 | \n", "0.024204 | \n", "0.032604 | \n", "0.029730 | \n", "0.006300 | \n", "0.038904 | \n", "
std | \n", "0.011126 | \n", "5.500458 | \n", "0.209493 | \n", "0.160186 | \n", "0.116452 | \n", "0.129218 | \n", "0.005288 | \n", "0.562014 | \n", "0.029159 | \n", "0.452099 | \n", "... | \n", "0.356527 | \n", "0.182476 | \n", "0.343566 | \n", "0.224146 | \n", "0.108108 | \n", "0.153691 | \n", "0.177607 | \n", "0.169852 | \n", "0.079125 | \n", "0.193376 | \n", "
min | \n", "0.020000 | \n", "19.720000 | \n", "8.910000 | \n", "11.100000 | \n", "11.600000 | \n", "7.910000 | \n", "6.960000 | \n", "0.300000 | \n", "2.560000 | \n", "2.000000 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
25% | \n", "0.100000 | \n", "33.910000 | \n", "10.110000 | \n", "13.400000 | \n", "13.100000 | \n", "10.025000 | \n", "7.060000 | \n", "0.430000 | \n", "3.005000 | \n", "8.200000 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
50% | \n", "0.100000 | \n", "33.910000 | \n", "10.110000 | \n", "13.400000 | \n", "13.100000 | \n", "10.025000 | \n", "7.060000 | \n", "0.430000 | \n", "3.005000 | \n", "8.200000 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
75% | \n", "0.100000 | \n", "33.910000 | \n", "10.110000 | \n", "13.400000 | \n", "13.100000 | \n", "10.025000 | \n", "7.060000 | \n", "0.430000 | \n", "3.005000 | \n", "8.200000 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
max | \n", "0.370000 | \n", "161.340000 | \n", "15.400000 | \n", "15.000000 | \n", "14.600000 | \n", "12.270000 | \n", "7.170000 | \n", "14.050000 | \n", "3.490000 | \n", "14.900000 | \n", "... | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "
8 rows × 222 columns
\n", "TPOTClassifier(generations=10, n_jobs=-1, offspring_size=50, population_size=50,\n", " scoring='recall', verbosity=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
TPOTClassifier(generations=10, n_jobs=-1, offspring_size=50, population_size=50,\n", " scoring='recall', verbosity=2)
Pipeline(steps=[('mlp', MLPTransformer()),\n", " ('extra_trees',\n", " CustomClassifierTransformer(classifier=ExtraTreesClassifier(max_features=0.6,\n", " min_samples_leaf=12,\n", " min_samples_split=3,\n", " random_state=42))),\n", " ('gaussian_nb', GaussianNB())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('mlp', MLPTransformer()),\n", " ('extra_trees',\n", " CustomClassifierTransformer(classifier=ExtraTreesClassifier(max_features=0.6,\n", " min_samples_leaf=12,\n", " min_samples_split=3,\n", " random_state=42))),\n", " ('gaussian_nb', GaussianNB())])
MLPTransformer()
CustomClassifierTransformer(classifier=ExtraTreesClassifier(max_features=0.6,\n", " min_samples_leaf=12,\n", " min_samples_split=3,\n", " random_state=42))
ExtraTreesClassifier(max_features=0.6, min_samples_leaf=12, min_samples_split=3,\n", " random_state=42)
ExtraTreesClassifier(max_features=0.6, min_samples_leaf=12, min_samples_split=3,\n", " random_state=42)
GaussianNB()