b/scripts/data/processing.py
|
import os
import warnings

import pandas as pd

# Suppress all warnings to keep the console output clean
warnings.filterwarnings('ignore')
|
|
# Resolve data paths relative to this script rather than the current working directory
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Load the noisy datasets
try:
    original = pd.read_csv(os.path.join(BASE_DIR, '../../data/raw/original.csv'))
    synthetic = pd.read_csv(os.path.join(BASE_DIR, '../../data/raw/synthetic.csv'))
except FileNotFoundError as err:
    print(f'An error occurred: {err}')
|
|
# Process the original set: drop duplicates and missing rows, then numerically encode categorical labels
def process_original_dataset():
    return original.drop_duplicates().dropna().replace({'M': 1, 'F': 2, 'YES': 1, 'NO': 0})


# Process the synthetic set with the same cleaning steps
def process_synthetic_dataset():
    return synthetic.drop_duplicates().dropna().replace({'M': 1, 'F': 2, 'YES': 1, 'NO': 0})
|
|
# Write the cleaned datasets to data/processed and report completion
def processing():
    process_original_dataset().to_csv(os.path.join(BASE_DIR, '../../data/processed/original.csv'), index=False)
    process_synthetic_dataset().to_csv(os.path.join(BASE_DIR, '../../data/processed/synthetic.csv'), index=False)

    return 'Datasets processed.'


if __name__ == '__main__':
    print(processing())
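
# A minimal sanity check (a sketch, not part of the script above): after running this
# module, the processed output can be re-read to confirm the cleaning took effect. The
# path below assumes the command is run from the repository root; adjust as needed.
#
#     >>> import pandas as pd
#     >>> df = pd.read_csv('data/processed/original.csv')
#     >>> int(df.isna().sum().sum())   # dropna() guarantees no missing values remain
#     0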