|
a |
|
b/helpers/randomundersampler.py |
|
|
1 |
import sys |
|
|
2 |
import pandas as pd |
|
|
3 |
from imblearn.under_sampling import RandomUnderSampler |
|
|
4 |
|
|
|
5 |
|
|
|
6 |
def resample_data(t):
    """Return a class-balanced copy of *t* using random under-sampling.

    Only the ``text`` and ``readm_30d`` columns of *t* are used; ``readm_30d``
    is treated as the class label and passed to ``RandomUnderSampler`` as a
    categorical series. The caller's frame is not modified.

    Args:
        t: DataFrame containing at least the columns ``text`` and
            ``readm_30d``.

    Returns:
        A new DataFrame with columns ``text`` and ``readm_30d`` holding the
        resampled rows, indexed 0..n-1.

    Raises:
        KeyError: if either required column is missing from *t*.
    """
    # Select features and label separately instead of popping from a sliced
    # frame, which mutates a derived object and triggers SettingWithCopy
    # warnings.
    features = t[['text']]
    label = t['readm_30d']

    # Fixed seed keeps the sampled subset reproducible between runs.
    rus = RandomUnderSampler(random_state=42)
    X, y = rus.fit_resample(features, label.astype('category'))

    # fit_resample may hand back NumPy arrays or pandas objects depending on
    # the imbalanced-learn version. Wrapping in pandas containers and taking
    # the first column positionally works for both.
    text = pd.DataFrame(X).iloc[:, 0].reset_index(drop=True)
    # Drop any index carried over from the sampled rows so that text and
    # labels are paired by position rather than by index alignment.
    labels = pd.Series(y).reset_index(drop=True)

    df = pd.DataFrame({'text': text})
    df['readm_30d'] = labels

    return df
|
|
18 |
|
|
|
19 |
|
|
|
20 |
if __name__ == '__main__':
    # Usage: randomundersampler.py <input_csv> <output_csv>
    # Fail with a readable message instead of an IndexError when the two
    # required paths are not supplied.
    if len(sys.argv) < 3:
        sys.exit("usage: {} <input_csv> <output_csv>".format(sys.argv[0]))

    input_file = sys.argv[1]
    output_file = sys.argv[2]

    df = pd.read_csv(input_file)

    df = resample_data(df)

    print("new shape: {}".format(df.shape))

    # NOTE: to_csv is called with defaults, so the row index is written as
    # the first (unnamed) column of the output file.
    df.to_csv(output_file)