--- a +++ b/helpers/randomundersampler.py @@ -0,0 +1,30 @@ +import sys +import pandas as pd +from imblearn.under_sampling import RandomUnderSampler + + +def resample_data(t): + t = t[['text', 'readm_30d']] + label = t.pop('readm_30d') + + rus = RandomUnderSampler(random_state=42) + X, y = rus.fit_resample(t, label.astype('category')) + + df = pd.DataFrame(X[:, 0]) + df.columns = ['text'] + df['readm_30d'] = pd.Series(y) + + return df + + +if __name__ == '__main__': + input_file = args = sys.argv[1] + output_file = args = sys.argv[2] + + df = pd.read_csv(input_file) + + df = resample_data(df) + + print("new shape: {}".format(df.shape)) + + df.to_csv(output_file)