--- a
+++ b/notebooks/data/preparation.ipynb
@@ -0,0 +1,175 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data preparation\n",
+    "\n",
+    "Merges the processed original and synthetic datasets, keeps the modelling columns with `LUNG_CANCER` as the label, and writes stratified train/test hold-out sets to `data/input/`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relative paths resolve against the kernel's working directory\n",
+    "# (normally the folder that contains this notebook).\n",
+    "PROCESSED_DIR = Path('../../data/processed')\n",
+    "INPUT_DIR = Path('../../data/input')\n",
+    "\n",
+    "FEATURE_COLUMNS = [\n",
+    "    'GENDER',\n",
+    "    'AGE',\n",
+    "    'SMOKING',\n",
+    "    'YELLOW_FINGERS',\n",
+    "    'FATIGUE',\n",
+    "    'WHEEZING',\n",
+    "    'COUGHING',\n",
+    "    'SHORTNESS OF BREATH',\n",
+    "    'SWALLOWING DIFFICULTY',\n",
+    "    'CHEST PAIN',\n",
+    "    'CHRONIC DISEASE',\n",
+    "]\n",
+    "LABEL_COLUMN = 'LUNG_CANCER'\n",
+    "\n",
+    "TEST_SIZE = 0.2  # fraction of rows held out for testing\n",
+    "SEED = 42  # fixed so the split is reproducible"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load and merge"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "original_raw = pd.read_csv(PROCESSED_DIR / 'original.csv')\n",
+    "synthetic_raw = pd.read_csv(PROCESSED_DIR / 'synthetic.csv')\n",
+    "\n",
+    "# keys tags each row's source in the outer index level (1 = original, 2 = synthetic).\n",
+    "combined = (\n",
+    "    pd.concat([original_raw, synthetic_raw], keys=[1, 2])\n",
+    "    .drop_duplicates()\n",
+    "    .dropna()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Select modelling columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "selected_columns = FEATURE_COLUMNS + [LABEL_COLUMN]\n",
+    "missing_columns = sorted(set(selected_columns) - set(combined.columns))\n",
+    "assert not missing_columns, f'missing expected columns: {missing_columns}'\n",
+    "\n",
+    "# Narrowing the columns can make previously distinct rows identical, so\n",
+    "# de-duplicate again; otherwise the same row could land in both train and test.\n",
+    "dataset = combined[selected_columns].drop_duplicates().dropna()\n",
+    "\n",
+    "features = dataset[FEATURE_COLUMNS]\n",
+    "labels = dataset[LABEL_COLUMN]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Stratified train/test split\n",
+    "\n",
+    "The previously committed run reported a training set of shape `(353, 11)` and a testing set of shape `(89, 11)`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    features,\n",
+    "    labels,\n",
+    "    test_size=TEST_SIZE,\n",
+    "    random_state=SEED,\n",
+    "    stratify=labels,\n",
+    ")\n",
+    "assert len(X_train) + len(X_test) == len(dataset)\n",
+    "\n",
+    "print(f'Training set: {X_train.shape}')\n",
+    "print(f'Testing set: {X_test.shape}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Save hold-out sets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_data = pd.concat([X_train, y_train], axis=1)\n",
+    "test_data = pd.concat([X_test, y_test], axis=1)\n",
+    "\n",
+    "# to_csv does not create missing parent directories, so create the target first.\n",
+    "INPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
+    "train_data.to_csv(INPUT_DIR / 'train.csv', index=False)\n",
+    "test_data.to_csv(INPUT_DIR / 'test.csv', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}