# helpers/randomundersampler.py
1
import sys
2
import pandas as pd
3
from imblearn.under_sampling import RandomUnderSampler
4
5
6
def resample_data(t):
    """Return a randomly under-sampled copy of ``t``.

    Only the ``text`` and ``readm_30d`` columns of ``t`` are used; every other
    column is dropped. Rows are resampled with imblearn's
    ``RandomUnderSampler`` (default sampling strategy, ``random_state=42`` so
    the result is reproducible), using ``readm_30d`` as the class label.

    Args:
        t: pandas DataFrame containing at least the columns ``text`` and
            ``readm_30d``. It is not modified.

    Returns:
        A new DataFrame with the columns ``text`` and ``readm_30d`` (in that
        order) and a fresh 0..n-1 index.

    Raises:
        KeyError: if ``text`` or ``readm_30d`` is missing from ``t``.
    """
    # Select features and label separately instead of pop()-ing from a
    # column subset, which can trigger pandas' SettingWithCopyWarning.
    features = t[['text']]
    label = t['readm_30d']

    rus = RandomUnderSampler(random_state=42)
    X, y = rus.fit_resample(features, label.astype('category'))

    # Older imblearn releases return NumPy arrays, newer ones return pandas
    # objects when given pandas input. Wrapping in DataFrame/Series and
    # selecting by position handles both; the previous ``X[:, 0]`` only
    # worked for the NumPy case.
    text = pd.DataFrame(X).iloc[:, 0].reset_index(drop=True)
    target = pd.Series(y).reset_index(drop=True)

    # Both indexes were reset above, so the two columns align row by row.
    # ``columns=`` pins the column order regardless of dict ordering.
    df = pd.DataFrame(
        {'text': text, 'readm_30d': target},
        columns=['text', 'readm_30d'],
    )

    return df
18
19
20
if __name__ == '__main__':
    # Command-line entry point:
    #   python randomundersampler.py <input_csv> <output_csv>
    # Reads the input CSV, under-samples it with resample_data(), reports the
    # resulting shape and writes the result to the output CSV.
    if len(sys.argv) < 3:
        # Exit with a readable usage message instead of a bare IndexError.
        sys.exit("usage: {} <input_csv> <output_csv>".format(sys.argv[0]))

    input_file = sys.argv[1]
    output_file = sys.argv[2]

    df = pd.read_csv(input_file)

    df = resample_data(df)

    print("new shape: {}".format(df.shape))

    # NOTE: to_csv is called with its defaults, so the row index is written
    # as the first (unnamed) column of the output file.
    df.to_csv(output_file)