Switch to unified view

a b/notebooks/data/preparation.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 6,
6
   "metadata": {},
7
   "outputs": [],
8
   "source": [
9
    "from sklearn.model_selection import train_test_split\n",
10
    "import pandas as pd"
11
   ]
12
  },
13
  {
14
   "cell_type": "code",
15
   "execution_count": 7,
16
   "metadata": {},
17
   "outputs": [],
18
   "source": [
19
    "# Load processed datasets\n",
20
    "original = pd.read_csv(\"../../data/processed/original.csv\")\n",
21
    "synthetic = pd.read_csv(\"../../data/processed/synthetic.csv\")\n",
22
    "\n",
23
    "# Combine the original and synthetic sets, then drop duplicate rows and rows with missing values\n",
24
    "data = pd.concat([original, synthetic], keys=[1, 2]).drop_duplicates().dropna()"
25
   ]
26
  },
27
  {
28
   "cell_type": "code",
29
   "execution_count": 8,
30
   "metadata": {},
31
   "outputs": [],
32
   "source": [
33
    "# Select the relevant feature columns and the LUNG_CANCER label\n",
34
    "feature = data[['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'FATIGUE', 'WHEEZING', 'COUGHING', 'SHORTNESS OF BREATH', 'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'CHRONIC DISEASE']]\n",
35
    "label = data['LUNG_CANCER']\n",
36
    "\n",
37
    "data = pd.concat([feature, label], axis=1).drop_duplicates().dropna()"
38
   ]
39
  },
40
  {
41
   "cell_type": "code",
42
   "execution_count": 9,
43
   "metadata": {},
44
   "outputs": [],
45
   "source": [
46
    "feature = data.drop('LUNG_CANCER', axis='columns')\n",
47
    "label = data['LUNG_CANCER']"
48
   ]
49
  },
50
  {
51
   "cell_type": "code",
52
   "execution_count": 10,
53
   "metadata": {},
54
   "outputs": [
55
    {
56
     "name": "stdout",
57
     "output_type": "stream",
58
     "text": [
59
      "Training set: (353, 11)\n",
60
      "Testing set: (89, 11)\n"
61
     ]
62
    }
63
   ],
64
   "source": [
65
    "# Stratified 80/20 train/test holdout split (fixed random_state for reproducibility)\n",
66
    "X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0.2, random_state=42, stratify=label)\n",
67
    "\n",
68
    "print(f\"Training set: {X_train.shape}\")\n",
69
    "print(f\"Testing set: {X_test.shape}\")\n",
70
    "\n",
71
    "train_data = pd.concat([X_train, y_train], axis=1)\n",
72
    "test_data = pd.concat([X_test , y_test], axis=1)\n",
73
    "\n",
74
    "train_data.to_csv('../../data/input/train.csv', index=False)\n",
75
    "test_data.to_csv('../../data/input/test.csv', index=False)\n"
76
   ]
77
  }
78
 ],
79
 "metadata": {
80
  "kernelspec": {
81
   "display_name": "Python 3",
82
   "language": "python",
83
   "name": "python3"
84
  },
85
  "language_info": {
86
   "codemirror_mode": {
87
    "name": "ipython",
88
    "version": 3
89
   },
90
   "file_extension": ".py",
91
   "mimetype": "text/x-python",
92
   "name": "python",
93
   "nbconvert_exporter": "python",
94
   "pygments_lexer": "ipython3",
95
   "version": "3.12.2"
96
  }
97
 },
98
 "nbformat": 4,
99
 "nbformat_minor": 2
100
}