--- a +++ b/scripts/data/processing.py @@ -0,0 +1,30 @@ +import warnings +import pandas as pd +import os + +warnings.filterwarnings('ignore') + +# Load the noisy dataset +try: + original = pd.read_csv(open(os.path.join(os.path.dirname(__file__), '../../data/raw/original.csv'), 'r')) + synthetic = pd.read_csv(open(os.path.join(os.path.dirname(__file__), '../../data/raw/synthetic.csv'), 'r')) +except FileNotFoundError as err: + print(f'Ann error occoured: {err}') + +# Process the original set +def process_original_dataset(): + return original.drop_duplicates().dropna().replace({'M': 1, 'F': 2, 'YES': 1, 'NO': 0}) + +# Process the synthetic set +def process_synthetic_dataset(): + return synthetic.drop_duplicates().dropna().replace({'M': 1, 'F': 2, 'YES': 1, 'NO': 0}) + +# Processing results +def processing(): + process_original_dataset().to_csv('../../data/processed/original.csv', index=False) + process_synthetic_dataset().to_csv('../../data/processed/synthetic.csv', index=False) + + return 'Dataset Processed.' + +if __name__ == '__main__': + print(processing()) \ No newline at end of file