[d129b2]: / helpers / old_dataset_processor.py

Download this file

43 lines (26 with data), 1.1 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from optparse import OptionParser
import pandas as pd
from sklearn.model_selection import train_test_split
def split_data(admissions: pd.DataFrame, ratio=0.2, *, random_state=None):
    """Split admissions into stratified train and test frames.

    Only the ``HADM_ID``, ``text`` and ``readm_30d`` columns are kept, in
    that order. The split is stratified on ``readm_30d`` to preserve the
    label distribution in both parts.

    Args:
        admissions: Frame containing at least the three columns above.
        ratio: Forwarded to ``train_test_split`` as ``test_size``.
            NOTE(review): the 0.2 default is a conventional choice, not
            something the original code specified -- confirm.
        random_state: Forwarded to ``train_test_split``; pass an int to
            make the split reproducible.

    Returns:
        A ``(train, test)`` pair of DataFrames that keep the row labels of
        ``admissions``.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Do some limited preprocessing: keep only the columns we need.
    frame = admissions[['HADM_ID', 'text', 'readm_30d']]

    # Split the combined frame in a single call instead of splitting
    # features and labels separately and re-joining them on the index:
    # an index join duplicates rows whenever index labels are not unique.
    train, test = train_test_split(
        frame,
        stratify=frame['readm_30d'],
        test_size=ratio,
        random_state=random_state,
    )
    return train, test
def main(input_data, ratio=0.2):
    """Read an admissions CSV, split it, and write ``train.csv``/``test.csv``.

    Args:
        input_data: Path or buffer handed to ``pd.read_csv``.
        ratio: Test-set size passed to ``split_data``.
            NOTE(review): the 0.2 default is a conventional choice; the
            original call omitted the argument entirely -- confirm the
            intended value.

    Both output files are written to the current working directory
    without the index column.
    """
    # Read the dataset from file.
    admissions = pd.read_csv(input_data)

    # Split into training and testing. ``split_data`` needs the ratio;
    # calling it with the frame alone raises TypeError.
    train, test = split_data(admissions, ratio)

    # Now save the files.
    train.to_csv("train.csv", index=False)
    test.to_csv("test.csv", index=False)
if __name__ == "__main__":
    # Command-line entry point: parse --input and run the split on it.
    parser = OptionParser()
    parser.add_option("--input", help="specify the input data")
    options, _args = parser.parse_args()

    # optparse options are optional by default; stop with a usage message
    # instead of handing None to main().
    if not options.input:
        parser.error("--input is required")

    # Load the data and write the train/test files.
    main(options.input)