Switch to unified view

a b/helpers/old_dataset_processor.py
1
from optparse import OptionParser
2
3
import pandas as pd
4
from sklearn.model_selection import train_test_split
5
6
7
def split_data(admissions, ratio=None):
    """Split admissions into stratified train and test frames.

    Args:
        admissions: DataFrame that contains at least the columns
            ``HADM_ID``, ``text`` and ``readm_30d``.
        ratio: Forwarded to ``train_test_split`` as ``test_size``. When
            omitted (``None``), scikit-learn applies its own default.

    Returns:
        A ``(train, test)`` tuple of DataFrames, each holding the columns
        ``HADM_ID``, ``text`` and ``readm_30d``.
    """
    # Do some limited preprocessing: keep only the columns that are used.
    frame = admissions[['HADM_ID', 'text', 'readm_30d']]

    # Create a stratified train/test split to preserve the label
    # distribution. Splitting the whole frame at once keeps every label on
    # its own row, so no index-based merge is needed afterwards.
    train, test = train_test_split(
        frame,
        stratify=frame['readm_30d'],
        test_size=ratio,
    )

    return train, test
19
20
21
def main(input_data, ratio=0.25):
    """Read a CSV dataset, split it, and write ``train.csv`` / ``test.csv``.

    Args:
        input_data: Path (or other source accepted by ``pd.read_csv``) of
            the dataset to split.
        ratio: Test-set size handed to ``split_data``. The 0.25 default is
            a reviewer-chosen value, since the original call passed none
            -- TODO confirm the intended split.
    """
    # Read the dataset from file.
    admissions = pd.read_csv(input_data)

    # Split into training and testing. ``split_data`` takes the ratio as
    # its second argument; the original call omitted it and raised
    # TypeError.
    train, test = split_data(admissions, ratio)

    # Save both partitions to the working directory without the index
    # column.
    train.to_csv("train.csv", index=False)
    test.to_csv("test.csv", index=False)
32
33
34
if __name__ == "__main__":
    # Command-line entry point: --input names the dataset file to split.
    parser = OptionParser()
    parser.add_option("--input", help="specify the input data")

    (options, args) = parser.parse_args()

    # options.input is None when the flag is omitted; report a usage error
    # here instead of handing None to the CSV reader.
    if not options.input:
        parser.error("--input is required")

    # Load the data, split it, and write the output files.
    main(options.input)