
scripts/data/preparation.py
from sklearn.model_selection import train_test_split
import warnings
import pandas as pd
import os

warnings.filterwarnings('ignore')

# Load processed datasets
try:
    original = pd.read_csv(os.path.join(os.path.dirname(__file__), '../../data/processed/original.csv'))
    synthetic = pd.read_csv(os.path.join(os.path.dirname(__file__), '../../data/processed/synthetic.csv'))
except FileNotFoundError as err:
    raise SystemExit(f'An error occurred: {err}')

# Mix both original and synthetic sets
data = pd.concat([original, synthetic], keys=[1, 2]).drop_duplicates().dropna()

# Select relevant features
def feature_selection():
    feature = data[['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'FATIGUE', 'WHEEZING', 'COUGHING', 'SHORTNESS OF BREATH', 'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'CHRONIC DISEASE']]
    label = data['LUNG_CANCER']

    return pd.concat([feature, label], axis=1).drop_duplicates().dropna()

# Split train/test holdout sets
def data_split(feature, label):
    X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0.2, random_state=42, stratify=label)

    train_data = pd.concat([X_train, y_train], axis=1)
    test_data = pd.concat([X_test, y_test], axis=1)

    train_data.to_csv(os.path.join(os.path.dirname(__file__), '../../data/input/train.csv'), index=False)
    test_data.to_csv(os.path.join(os.path.dirname(__file__), '../../data/input/test.csv'), index=False)

    return f'Training set: {X_train.shape[0]} and Testing set: {X_test.shape[0]} saved.'


if __name__ == '__main__':
    selected = feature_selection()
    print(data_split(selected.drop('LUNG_CANCER', axis='columns'), selected['LUNG_CANCER']))
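
# Optional follow-up (a sketch, not part of the script above): after running the
# module, one can confirm that the stratified split kept the LUNG_CANCER class
# proportions roughly equal in both output files. Paths assume the check is run
# from the repository root.
#
#   import pandas as pd
#   train = pd.read_csv('data/input/train.csv')
#   test = pd.read_csv('data/input/test.csv')
#   print(train['LUNG_CANCER'].value_counts(normalize=True))
#   print(test['LUNG_CANCER'].value_counts(normalize=True))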