"""Prepare the train/test hold-out sets (``scripts/data/preparation.py``).

Loads the processed ``original.csv`` and ``synthetic.csv`` datasets, merges
them, keeps the selected feature columns plus the ``LUNG_CANCER`` label, and
writes a stratified 80/20 split to ``data/input/train.csv`` and
``data/input/test.csv``.
"""
import os
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

# Resolve every path from this file's location so the script behaves the same
# regardless of the working directory it is launched from.
_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
_PROCESSED_DIR = os.path.join(_BASE_DIR, '..', '..', 'data', 'processed')
_INPUT_DIR = os.path.join(_BASE_DIR, '..', '..', 'data', 'input')

# Columns used as model inputs, and the target column.
FEATURE_COLUMNS = [
    'GENDER',
    'AGE',
    'SMOKING',
    'YELLOW_FINGERS',
    'FATIGUE',
    'WHEEZING',
    'COUGHING',
    'SHORTNESS OF BREATH',
    'SWALLOWING DIFFICULTY',
    'CHEST PAIN',
    'CHRONIC DISEASE',
]
LABEL_COLUMN = 'LUNG_CANCER'


def _load_processed(filename):
    """Read one CSV from the processed-data directory.

    Args:
        filename: Base name of the CSV file inside ``data/processed``.

    Returns:
        The file's contents as a ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If the file does not exist. The error is raised
            (not just printed) so the script stops here instead of failing
            later with an unrelated ``NameError``.
    """
    path = os.path.join(_PROCESSED_DIR, filename)
    try:
        # Passing the path (not an open handle) lets pandas open and close
        # the file itself, so no file descriptor is leaked.
        return pd.read_csv(path)
    except FileNotFoundError as err:
        raise FileNotFoundError(f'Processed dataset not found: {path}') from err


# Load processed datasets
original = _load_processed('original.csv')
synthetic = _load_processed('synthetic.csv')

# Mix both original and synthetic sets; ``keys`` adds an outer index level
# (1 = original, 2 = synthetic) recording which file each row came from.
data = pd.concat([original, synthetic], keys=[1, 2]).drop_duplicates().dropna()


# Select relevant features
def feature_selection():
    """Return the selected feature columns together with the label column.

    Returns:
        A ``pandas.DataFrame`` holding ``FEATURE_COLUMNS`` followed by
        ``LABEL_COLUMN``, with duplicate rows and rows containing missing
        values removed (duplicates can reappear once the other columns of
        ``data`` are dropped).
    """
    selected = data[FEATURE_COLUMNS + [LABEL_COLUMN]]
    return selected.drop_duplicates().dropna()


# Split train/test holdout sets
def data_split(feature, label, output_dir=None):
    """Split the data 80/20 (stratified on the label) and save both parts.

    Args:
        feature: DataFrame with the feature columns.
        label: Series with the target values, aligned with ``feature``.
        output_dir: Directory that receives ``train.csv`` and ``test.csv``.
            Defaults to ``data/input`` resolved relative to this file.

    Returns:
        A human-readable summary with the number of rows in each set.
    """
    target_dir = _INPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        feature, label, test_size=0.2, random_state=42, stratify=label
    )

    train_data = pd.concat([X_train, y_train], axis=1)
    test_data = pd.concat([X_test, y_test], axis=1)

    train_data.to_csv(os.path.join(target_dir, 'train.csv'), index=False)
    test_data.to_csv(os.path.join(target_dir, 'test.csv'), index=False)

    return f'Training set: {X_train.shape[0]} and Testing set: {X_test.shape[0]} save.'


if __name__ == '__main__':
    # Run the selection once and derive both the features and the label from it.
    prepared = feature_selection()
    print(data_split(prepared.drop(LABEL_COLUMN, axis='columns'), prepared[LABEL_COLUMN]))