|
a |
|
b/scripts/data/preparation.py |
|
|
1 |
from sklearn.model_selection import train_test_split |
|
|
2 |
import warnings |
|
|
3 |
import pandas as pd |
|
|
4 |
import os |
|
|
5 |
|
|
|
6 |
# Silence every warning emitted from here on (pandas / scikit-learn included).
warnings.filterwarnings('ignore')

# Load processed datasets
# The processed CSVs are located relative to this script, so loading does not
# depend on the directory the script is launched from.
_PROCESSED_DIR = os.path.join(os.path.dirname(__file__), '../../data/processed')

try:
    # Context managers close the file handles even if parsing fails.
    with open(os.path.join(_PROCESSED_DIR, 'original.csv'), 'r') as _original_file:
        original = pd.read_csv(_original_file)
    with open(os.path.join(_PROCESSED_DIR, 'synthetic.csv'), 'r') as _synthetic_file:
        synthetic = pd.read_csv(_synthetic_file)
except FileNotFoundError as err:
    print(f'An error occurred: {err}')
    # Both inputs are required below; re-raise so the failure points at the
    # missing file instead of a later NameError on `original`/`synthetic`.
    raise

# mix both original and synthetic sets
# keys=[1, 2] adds an outer index level tagging each row's source
# (1 = original, 2 = synthetic); exact duplicate rows and rows with any
# missing value are then removed.
data = pd.concat([original, synthetic], keys=[1, 2]).drop_duplicates().dropna()
|
|
17 |
|
|
|
18 |
# Select relevant features
def feature_selection(dataset: 'pd.DataFrame | None' = None) -> 'pd.DataFrame':
    """Return the modelling columns (features plus label) as one clean frame.

    Args:
        dataset: Frame to select from. When omitted, the module-level
            ``data`` frame (the merged original + synthetic set) is used,
            which is the behaviour existing callers rely on.

    Returns:
        A DataFrame holding the eleven feature columns followed by
        ``LUNG_CANCER``. Rows that become duplicates once restricted to
        these columns, and rows with any missing value, are dropped.

    Raises:
        KeyError: If any of the expected columns is absent from the frame.
    """
    if dataset is None:
        dataset = data

    feature = dataset[[
        'GENDER',
        'AGE',
        'SMOKING',
        'YELLOW_FINGERS',
        'FATIGUE',
        'WHEEZING',
        'COUGHING',
        'SHORTNESS OF BREATH',
        'SWALLOWING DIFFICULTY',
        'CHEST PAIN',
        'CHRONIC DISEASE',
    ]]
    label = dataset['LUNG_CANCER']

    # De-duplicate again: rows that differed only in the discarded columns
    # collapse into duplicates after the selection.
    return pd.concat([feature, label], axis=1).drop_duplicates().dropna()
|
|
24 |
|
|
|
25 |
# Split train/test holdout sets
def data_split(feature, label, output_dir=None):
    """Split the data into stratified train/test sets and save them as CSV.

    Args:
        feature: Feature columns (DataFrame) to split.
        label: Target column aligned with ``feature``; also used to stratify
            the split so both sets keep the same class proportions.
        output_dir: Directory that receives ``train.csv`` and ``test.csv``.
            Defaults to ``data/input`` two levels above this script, resolved
            from the script location rather than the working directory.

    Returns:
        A summary string with the number of training and testing rows.
    """
    if output_dir is None:
        # Anchor to this file, the same way the processed inputs are located,
        # so the output lands in the same place from any working directory.
        # normpath removes the '..' segments, which os.makedirs handles poorly.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        output_dir = os.path.normpath(os.path.join(script_dir, '..', '..', 'data', 'input'))

    # 80/20 hold-out split; the fixed seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        feature, label, test_size=0.2, random_state=42, stratify=label
    )

    # Re-attach the label so each CSV is self-contained.
    train_data = pd.concat([X_train, y_train], axis=1)
    test_data = pd.concat([X_test, y_test], axis=1)

    # Create the target directory on first run instead of failing in to_csv.
    os.makedirs(output_dir, exist_ok=True)
    train_data.to_csv(os.path.join(output_dir, 'train.csv'), index=False)
    test_data.to_csv(os.path.join(output_dir, 'test.csv'), index=False)

    return f'Training set: {X_train.shape[0]} and Testing set: {X_test.shape[0]} save.'
|
|
36 |
|
|
|
37 |
if __name__ == '__main__':
    # Build the selected frame once and derive both the feature matrix and the
    # label from it; a second feature_selection() call would repeat the same
    # selection, de-duplication and NaN filtering for an identical result.
    selected = feature_selection()
    print(data_split(selected.drop('LUNG_CANCER', axis='columns'), selected['LUNG_CANCER']))