92 lines (91 with data), 2.1 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"import pandas as pd\n",
"\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the noisy dataset.\n",
"original = pd.read_csv('../../data/raw/original.csv')\n",
"synthetic = pd.read_csv('../../data/raw/synthetic.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Process the the datasets.\n",
"original = original.drop_duplicates().dropna()\n",
"synthetic = synthetic.drop_duplicates().dropna()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Feature engineering\n",
"original[['GENDER', 'LUNG_CANCER']] = original[['GENDER', 'LUNG_CANCER']].replace({'M': 1, 'F': 2, 'YES': 1, 'NO': 0})\n",
"synthetic[['GENDER', 'LUNG_CANCER']] = synthetic[['GENDER', 'LUNG_CANCER']].replace({'M': 1, 'F': 2, 'YES': 1, 'NO': 0})"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset processed.\n"
]
}
],
"source": [
"# Saved dataset\n",
"original.to_csv('../../data/processed/original.csv', index=False)\n",
"synthetic.to_csv('../../data/processed/synthetic.csv', index=False)\n",
"\n",
"print('Dataset processed.')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}