[d129b2]: / helpers / randomundersampler.py

Download this file

31 lines (19 with data), 623 Bytes

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import sys
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
def resample_data(t):
    """Randomly under-sample *t* on its ``readm_30d`` label.

    Only the ``text`` and ``readm_30d`` columns of *t* are used; any other
    columns are dropped. The sampler is seeded (``random_state=42``) so
    repeated runs on the same input select the same rows.

    Args:
        t: DataFrame containing at least the columns ``text`` and
            ``readm_30d``. It is not modified.

    Returns:
        A new DataFrame with the columns ``text`` and ``readm_30d`` (in that
        order) holding the resampled rows under a fresh 0..n-1 index.

    Raises:
        KeyError: if ``text`` or ``readm_30d`` is missing from *t*.
    """
    # Select instead of pop(): nothing derived from the caller's frame is
    # mutated, which also avoids pandas chained-assignment warnings.
    features = t[['text']]
    label = t['readm_30d'].astype('category')

    rus = RandomUnderSampler(random_state=42)
    X, y = rus.fit_resample(features, label)

    # Depending on the imbalanced-learn release, fit_resample returns either
    # NumPy arrays or pandas objects. Wrapping both results in pandas
    # containers and selecting the first column positionally works for
    # either, whereas ``X[:, 0]`` fails on a DataFrame.
    text = pd.DataFrame(X).iloc[:, 0].reset_index(drop=True)
    target = pd.Series(y).reset_index(drop=True)

    # Both pieces now share a 0..n-1 index, so the column assignment below
    # lines rows up by position rather than by whatever index came back.
    df = pd.DataFrame({'text': text})
    df['readm_30d'] = target
    return df
if __name__ == '__main__':
    # Command line: <script> INPUT_CSV OUTPUT_CSV
    # Reads INPUT_CSV, under-samples it with resample_data(), reports the
    # resulting shape on stdout and writes the result to OUTPUT_CSV.
    if len(sys.argv) < 3:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: {} INPUT_CSV OUTPUT_CSV".format(sys.argv[0]))

    input_file = sys.argv[1]
    output_file = sys.argv[2]

    df = pd.read_csv(input_file)
    df = resample_data(df)
    print("new shape: {}".format(df.shape))
    # NOTE(review): to_csv() with default arguments also writes the row index
    # as an unnamed first column; kept as-is in case consumers rely on it.
    df.to_csv(output_file)