
--- a
+++ b/chexbert/src/bert_tokenizer.py
@@ -0,0 +1,59 @@
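+# Tokenize the "Report Impression" column of a radiology report CSV with
+# the bert-base-uncased BERT tokenizer and save the resulting input id
+# lists as JSON.
+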
+import pandas as pd
+from transformers import BertTokenizer
+import json
+from tqdm import tqdm
+import argparse
+
+def get_impressions_from_csv(path):
+        df = pd.read_csv(path)
+        imp = df['Report Impression']
+        # replace missing impressions (NaN) with empty strings
+        imp = imp.fillna('')
+        # collapse newlines and runs of whitespace into single spaces
+        imp = imp.replace('\n', ' ', regex=True)
+        imp = imp.replace(r'\s+', ' ', regex=True)
+        imp = imp.str.strip()
+        return imp
+
+def tokenize(impressions, tokenizer):
+        new_impressions = []
+        print("\nTokenizing report impressions. All reports are cut off at 512 tokens.")
+        for i in tqdm(range(impressions.shape[0])):
+                tokenized_imp = tokenizer.tokenize(impressions.iloc[i])
+                if tokenized_imp: #not an empty report
+                        res = tokenizer.encode_plus(tokenized_imp)['input_ids']
+                        if len(res) > 512: #length exceeds BERT's maximum input size
+                                #keep the first 511 tokens and re-append [SEP] so the sequence still ends with a separator
+                                res = res[:511] + [tokenizer.sep_token_id]
+                        new_impressions.append(res)
+                else: #an empty report is encoded as [CLS] [SEP] only
+                        new_impressions.append([tokenizer.cls_token_id, tokenizer.sep_token_id])
+        return new_impressions
+
+def load_list(path):
+        #reload a saved list of tokenized impressions from a JSON file
+        with open(path, 'r') as filehandle:
+                impressions = json.load(filehandle)
+                return impressions
+
+if __name__ == "__main__":
+        parser = argparse.ArgumentParser(description='Tokenize radiology report impressions and save them as a list.')
+        parser.add_argument('-d', '--data', type=str, required=True,
+                            help='path to csv containing reports. The reports should be \
+                            under the "Report Impression" column')
+        parser.add_argument('-o', '--output_path', type=str, required=True,
+                            help='path to intended output file')
+        args = parser.parse_args()
+        csv_path = args.data
+        out_path = args.output_path
+
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+        impressions = get_impressions_from_csv(csv_path)
+        new_impressions = tokenize(impressions, tokenizer)
+        with open(out_path, 'w') as filehandle:
+                json.dump(new_impressions, filehandle)
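For reference, a minimal sketch of how the script might be invoked and its output reloaded. The -d/-o flags and load_list come from the code above; the file names reports.csv and impressions.json are hypothetical:

    python bert_tokenizer.py -d reports.csv -o impressions.json

    # reload the saved token ids from Python
    from bert_tokenizer import load_list
    impressions = load_list('impressions.json')  # one list of input ids per report, each at most 512 tokens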