|
a |
|
b/helpers/old_dataset_processor.py |
|
|
1 |
from optparse import OptionParser |
|
|
2 |
|
|
|
3 |
import pandas as pd |
|
|
4 |
from sklearn.model_selection import train_test_split |
|
|
5 |
|
|
|
6 |
|
|
|
7 |
def split_data(admissions, ratio=None, random_state=None):
    """Split the admissions frame into stratified train and test frames.

    Args:
        admissions: DataFrame containing the ``HADM_ID``, ``text`` and
            ``readm_30d`` columns.
        ratio: Forwarded to ``train_test_split`` as ``test_size``. When
            omitted (``None``), scikit-learn applies its own default.
        random_state: Optional seed forwarded to ``train_test_split`` so a
            split can be reproduced; ``None`` keeps the split random.

    Returns:
        A ``(train, test)`` tuple of DataFrames, each holding the columns
        ``HADM_ID``, ``text`` and ``readm_30d``.
    """
    # Do some limited preprocessing: keep only the columns used downstream.
    columns = ['HADM_ID', 'text', 'readm_30d']
    subset = admissions[columns]

    # Create a stratified train/test split to preserve the label distribution.
    # The frame is split in one piece so every row stays paired with its own
    # label; re-joining features and labels on the index would multiply rows
    # whenever the index contains duplicate values.
    train, test = train_test_split(
        subset,
        stratify=subset['readm_30d'],
        test_size=ratio,
        random_state=random_state,
    )

    return train, test
|
|
19 |
|
|
|
20 |
|
|
|
21 |
def main(input_data, ratio=None):
    """Read the admissions CSV, split it, and write the train/test CSV files.

    Args:
        input_data: Path or buffer handed to ``pd.read_csv``.
        ratio: Test-set ratio forwarded to ``split_data``, which passes it to
            ``train_test_split`` as ``test_size``. ``None`` leaves the choice
            to scikit-learn's default.

    The resulting frames are written to ``train.csv`` and ``test.csv``
    (relative paths) without the index column.
    """
    # read the dataset from file.
    admissions = pd.read_csv(input_data)

    # split into training and testing; the ratio is passed explicitly because
    # split_data takes it as a positional argument.
    train, test = split_data(admissions, ratio)

    # now save the files
    train.to_csv("train.csv", index=False)
    test.to_csv("test.csv", index=False)
|
|
32 |
|
|
|
33 |
|
|
|
34 |
if __name__ == "__main__":
    # Command-line entry point: parse --input and run the split pipeline.
    parser = OptionParser()
    parser.add_option("--input", help="specify the input data")
    (options, args) = parser.parse_args()

    # Fail fast with a usage message instead of letting a missing path
    # surface later as an obscure error while reading the CSV.
    if options.input is None:
        parser.error("--input is required")

    # load the data
    main(options.input)