# %% [markdown]
# # Data processing
#
# Cleans the raw `original` and `synthetic` datasets (drops duplicate rows and
# rows with missing values), encodes the `GENDER` and `LUNG_CANCER` labels as
# integers, and writes the results to `data/processed/`.

# %%
from pathlib import Path

import pandas as pd

# NOTE: no blanket `warnings.filterwarnings('ignore')` here. Silencing every
# warning hid pandas deprecation notices about the encoding step; the encoding
# below is written so that it does not need any warning suppressed.

# %%
# Configuration. Paths are relative to the directory this notebook runs from.
RAW_DIR = Path('../../data/raw')
PROCESSED_DIR = Path('../../data/processed')

# One mapping per column, so a label is only ever translated inside the
# column it belongs to.
LABEL_CODES = {
    'GENDER': {'M': 1, 'F': 2},
    'LUNG_CANCER': {'YES': 1, 'NO': 0},
}


# %%
def clean_dataset(raw: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame without duplicate rows or rows holding missing values.

    The input frame is left untouched, so the raw data stays available for
    inspection after cleaning.
    """
    return raw.drop_duplicates().dropna()


def encode_labels(clean: pd.DataFrame, label_codes=None) -> pd.DataFrame:
    """Return a copy of ``clean`` with categorical labels replaced by integer codes.

    Parameters
    ----------
    clean:
        Frame containing every column named in ``label_codes``.
    label_codes:
        ``{column: {label: code}}``. Defaults to ``LABEL_CODES``.

    Returns
    -------
    pd.DataFrame
        New frame; the listed columns hold the mapped codes. Values that are
        not keys of the column's mapping pass through unchanged, so running
        the function on already-encoded data is a no-op.

    Raises
    ------
    KeyError
        If a column named in ``label_codes`` is missing from ``clean``.
    """
    if label_codes is None:
        label_codes = LABEL_CODES

    encoded = clean.copy()
    for column, codes in label_codes.items():
        # `Series.map` with a callable infers the result dtype from the mapped
        # values, so fully mapped columns come out as integers without relying
        # on the deprecated silent downcast performed by `replace`.
        encoded[column] = encoded[column].map(
            lambda value, codes=codes: codes.get(value, value)
        )
    return encoded


def process_dataset(raw_path, processed_path) -> pd.DataFrame:
    """Load one raw CSV, clean and encode it, and write it to ``processed_path``.

    The destination directory is created when it does not exist yet, so the
    notebook also works on a fresh checkout. Returns the processed frame.
    """
    raw = pd.read_csv(raw_path)
    processed = encode_labels(clean_dataset(raw))

    processed_path = Path(processed_path)
    processed_path.parent.mkdir(parents=True, exist_ok=True)
    processed.to_csv(processed_path, index=False)
    return processed


# %%
# Run the pipeline for both datasets and save the results.
# `__name__` is '__main__' inside a Jupyter kernel and when run as a script,
# so this cell executes there; the guard only keeps the file I/O from running
# when the helpers above are imported from another module.
if __name__ == '__main__':
    original = process_dataset(RAW_DIR / 'original.csv', PROCESSED_DIR / 'original.csv')
    synthetic = process_dataset(RAW_DIR / 'synthetic.csv', PROCESSED_DIR / 'synthetic.csv')

    print('Dataset processed.')