Switch to unified view

a b/scripts/data/processing.py
1
import warnings
2
import pandas as pd
3
import os
4
5
warnings.filterwarnings('ignore')
6
7
# Load the noisy dataset
8
try:
    # Both CSVs are located relative to this script's directory. Each file is
    # opened in a `with` block so its handle is closed as soon as pandas has
    # finished parsing it, instead of being left open for the process lifetime.
    with open(os.path.join(os.path.dirname(__file__), '../../data/raw/original.csv'), 'r') as _csv_file:
        original = pd.read_csv(_csv_file)
    with open(os.path.join(os.path.dirname(__file__), '../../data/raw/synthetic.csv'), 'r') as _csv_file:
        synthetic = pd.read_csv(_csv_file)
except FileNotFoundError as err:
    # Best-effort load: report the missing file and continue. Whichever of
    # `original` / `synthetic` was not assigned before the failure stays
    # undefined, so the processing functions below will fail if called.
    print(f'An error occurred: {err}')
13
14
# Process the original set
15
def process_original_dataset():
    """Return a cleaned copy of the module-level ``original`` dataset.

    Duplicate rows are removed, rows containing missing values are dropped,
    and the values 'M', 'F', 'YES' and 'NO' are replaced with integer codes
    wherever they occur in the frame.
    """
    value_codes = {'M': 1, 'F': 2, 'YES': 1, 'NO': 0}
    unique_rows = original.drop_duplicates()
    complete_rows = unique_rows.dropna()
    return complete_rows.replace(value_codes)
17
    
18
# Process the synthetic set
19
def process_synthetic_dataset():
    """Return a cleaned copy of the module-level ``synthetic`` dataset.

    Duplicate rows are removed, rows containing missing values are dropped,
    and the values 'M', 'F', 'YES' and 'NO' are replaced with integer codes
    wherever they occur in the frame.
    """
    value_codes = {'M': 1, 'F': 2, 'YES': 1, 'NO': 0}
    unique_rows = synthetic.drop_duplicates()
    complete_rows = unique_rows.dropna()
    return complete_rows.replace(value_codes)
21
22
# Processing results
23
def processing():
    """Write the processed original and synthetic datasets to CSV files.

    The output files are ``original.csv`` and ``synthetic.csv`` under
    ``../../data/processed/``, resolved relative to this script's directory
    (the same anchor used when the raw inputs are read) so the destination
    does not depend on the caller's current working directory. The target
    directory is not created here.

    Returns:
        The status string ``'Dataset Processed.'``.
    """
    # Anchor on the script location rather than the working directory.
    output_dir = os.path.join(os.path.dirname(__file__), '../../data/processed')

    process_original_dataset().to_csv(os.path.join(output_dir, 'original.csv'), index=False)
    process_synthetic_dataset().to_csv(os.path.join(output_dir, 'synthetic.csv'), index=False)

    return 'Dataset Processed.'
28
29
if __name__ == '__main__':
    # Run the whole pipeline when executed as a script and echo its status.
    status_message = processing()
    print(status_message)