# bert_train_predict.py
import transformers
import torch
import pandas as pd
import argparse
import random
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, BertForSequenceClassification
from ray.tune.schedulers import PopulationBasedTraining, ASHAScheduler
import ray
from ray import tune
from ray.tune import CLIReporter
from datasets import Dataset, load_dataset, DatasetDict, concatenate_datasets
from functools import partial
from utils import grade_preproc, group_labels, undersample_dataset, data_split
import os
from collections import Counter
import pathlib
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType
from torch import nn
from ray.tune.search.bayesopt import BayesOptSearch
from ray.tune.search.hyperopt import HyperOptSearch
from sklearn.utils import class_weight

# Disable Ray Tune's auto-callback loggers. Ray will still create folders and
# JSON files for experiment state; they are small, but should be deleted
# afterwards (PATH: ./to_be_deleted_rayArtifact).
os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"

parser = argparse.ArgumentParser()
parser.add_argument('--logdir', type=str, help='Path to the directory for temporarily storing checkpoints')
parser.add_argument('--evaldir', type=str, help='Path to the directory for storing model evaluation results')
parser.add_argument('--num_trials', type=int, help='Number of hyperparameter trials', default=5)
parser.add_argument('--seqlens', type=str, help='Comma-separated sequence lengths for the Ray search space', default='20,35,50')
parser.add_argument('--batches', type=str, help='Comma-separated batch sizes for the Ray search space', default='32,64,128')
parser.add_argument('--model', type=str, help='Model to use for classification (e.g. BERT, ROBERTA, BIOBERT)', default='bert-base-uncased')
parser.add_argument('--synth_data', type=str, help='Path to a synthetic data file', default='')
parser.add_argument('--undersample', type=float, default=0.0, help='Undersample the majority class in the train set by this proportion; e.g. 0.2 keeps 20 percent of majority-class rows')
parser.add_argument('--ray', action='store_true', help='Tune hyperparameters with Ray Tune')
parser.add_argument('--adverse', action='store_true', help='Use only adverse SDoH labels (and, with --synth_data, only adverse synthetic examples)')
parser.add_argument('--epochs', type=int, default=5)

args = parser.parse_args()
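
# Example invocation (paths and values are illustrative, not from the repo):
#   python bert_train_predict.py --logdir ./checkpoints --evaldir ./eval \
#       --model bert-base-uncased --ray --num_trials 10 --epochs 5
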
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fix random seeds for reproducibility across runs
SEED_VAL = 42
random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
torch.manual_seed(SEED_VAL)
torch.cuda.manual_seed_all(SEED_VAL)

MLB = MultiLabelBinarizer()
if args.adverse:
    LABELS = {'TRANSPORTATION_distance', 'TRANSPORTATION_resource',
        'TRANSPORTATION_other', 'HOUSING_poor', 'HOUSING_undomiciled', 'HOUSING_other',
        'RELATIONSHIP_divorced', 'RELATIONSHIP_widowed', 'RELATIONSHIP_single',
        'PARENT', 'EMPLOYMENT_underemployed', 'EMPLOYMENT_unemployed', 'EMPLOYMENT_disability', 'SUPPORT_minus'}
else:
    LABELS = {'TRANSPORTATION_distance', 'TRANSPORTATION_resource',
        'TRANSPORTATION_other', 'HOUSING_poor', 'HOUSING_undomiciled',
        'HOUSING_other', 'RELATIONSHIP_married', 'RELATIONSHIP_partnered',
        'RELATIONSHIP_divorced', 'RELATIONSHIP_widowed', 'RELATIONSHIP_single',
        'PARENT', 'EMPLOYMENT_employed', 'EMPLOYMENT_underemployed',
        'EMPLOYMENT_unemployed', 'EMPLOYMENT_disability', 'EMPLOYMENT_retired',
        'EMPLOYMENT_student', 'SUPPORT_plus', 'SUPPORT_minus'}

# Broad (category-level) labels, e.g. 'TRANSPORTATION_distance' -> 'TRANSPORTATION'
BROAD_LABELS = {lab.split('_')[0] for lab in LABELS}
BROAD_LABELS.add('<NO_SDOH>')

LABEL_BROAD_NARROW = LABELS.union(BROAD_LABELS)
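
# Illustrative multi-hot encoding of the broad labels (assumes MLB is fit on
# the training label lists, as done in main()):
#   MLB.fit_transform([['HOUSING', 'PARENT'], ['<NO_SDOH>']])
#   -> [[0, 1, 1], [1, 0, 0]] with MLB.classes_ == ['<NO_SDOH>', 'HOUSING', 'PARENT']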

if args.ray:
    ray.init(log_to_driver=False)


class BCETrainer(Trainer):
    """Trainer subclass using binary cross-entropy for multi-label targets."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Multi-hot target vectors, e.g. [0, 1, 0, 1, 0, 0] per example
        labels = inputs.get("labels").to(DEVICE)
        # Forward pass; include the attention mask so padding tokens are ignored
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs.get('attention_mask'))
        logits = outputs.get("logits")
        # Binary cross-entropy with logits: each label is an independent binary decision
        loss_fct = nn.BCEWithLogitsLoss()
        loss = loss_fct(logits, labels.float().to(logits.device))
        return (loss, outputs) if return_outputs else loss

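# Toy check of the loss above (illustrative values, not executed at import time):
#   logits = torch.tensor([[2.0, -1.0]]); targets = torch.tensor([[1.0, 0.0]])
#   nn.BCEWithLogitsLoss()(logits, targets)  # sigmoid + BCE averaged over labels
#   -> tensor(0.2201)
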
def undersample(df, label, keep_percent):
    """
    Undersamples the majority class in a Pandas dataframe to balance the classes.

    Parameters:
    df (pandas.DataFrame): The dataframe to undersample.
    label (str): The name of the label column used to determine class membership.
    keep_percent (float): The fraction of the majority class to keep.

    Returns:
    pandas.DataFrame: The undersampled dataframe.
    """
    # Find the majority class based on the label column
    counts = df[label].value_counts()
    majority_class = counts.idxmax()

    # Get the indices of rows in the majority class
    majority_indices = df[df[label] == majority_class].index

    # Calculate the number of majority-class rows to keep
    num_majority_keep = int(keep_percent * counts[majority_class])

    # Get a random subset of the majority-class rows to keep
    majority_keep_indices = np.random.choice(majority_indices, num_majority_keep, replace=False)

    # Get the indices of rows in the minority classes
    minority_indices = df[df[label] != majority_class].index

    # Combine the majority-class subset and the minority-class rows
    undersampled_indices = np.concatenate([majority_keep_indices, minority_indices])

    # Return the undersampled dataframe
    return df.loc[undersampled_indices]
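
# For example (hypothetical counts): with keep_percent=0.2, a train set with
# 10,000 majority-class rows and 1,500 minority rows keeps ~2,000 randomly
# chosen majority rows plus all 1,500 minority rows.
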
def generate_label_list(row: pd.Series) -> str:
    """
    Generate a label list based on the given row from a Pandas DataFrame.

    Only columns whose names appear in LABELS are considered; each matching
    column with a value of 1 contributes its broad (category-level) label.

    Args:
        row (pd.Series): A row from a Pandas DataFrame.

    Returns:
        str: A comma-separated string of broad labels extracted from the row.

    Examples:
        >>> df = pd.DataFrame({'HOUSING_poor': [1], 'PARENT': [0]})
        >>> generate_label_list(df.iloc[0])
        'HOUSING'

        >>> df = pd.DataFrame({'HOUSING_poor': [0], 'PARENT': [0]})
        >>> generate_label_list(df.iloc[0])
        '<NO_SDOH>'
    """
    labels = set()
    for col_name, value in row.items():
        if col_name in LABELS and value == 1:
            labels.add(col_name.split('_')[0])
    if len(labels) == 0:
        labels.add('<NO_SDOH>')
    return ','.join(list(labels))

def compute_metrics(pred):
    """
    Calculate evaluation metrics for multi-label predictions.
    """
    labels = pred.label_ids
    logits = torch.tensor(pred.predictions)
    # Sigmoid each logit independently, then threshold at 0.5 (multi-label)
    act = nn.Sigmoid()
    probs = act(logits)
    preds = (probs >= 0.5).int()

    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, zero_division=0)
    micro_f1 = precision_recall_fscore_support(labels, preds, average='micro', zero_division=0)[2]
    weight_f1 = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)[2]
    macro_f1 = precision_recall_fscore_support(labels, preds, average='macro', zero_division=0)[2]

    metrics_out = {'macro_f1': macro_f1, 'micro_f1': micro_f1, 'weighted_f1': weight_f1}
    for i, lab in enumerate(list(MLB.classes_)):
        metrics_out['precision_' + str(lab)] = prec[i]
        metrics_out['recall_' + str(lab)] = rec[i]
        metrics_out['f1_' + str(lab)] = f1[i]
    print(classification_report(labels, preds, target_names=MLB.classes_))
    return metrics_out
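
# Shape of the returned dict (class names depend on how MLB was fit), e.g.:
#   {'macro_f1': ..., 'micro_f1': ..., 'weighted_f1': ...,
#    'precision_HOUSING': ..., 'recall_HOUSING': ..., 'f1_HOUSING': ..., ...}
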
def train_hf(config, dataset):
    # Define the TrainingArguments and Trainer objects.
    # Initialize the tokenizer; inputs are padded/truncated to config["sequence_length"].
    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=True)

    def tokenize(batch):
        return tokenizer(batch['text'], padding='max_length', truncation=True, return_tensors="pt", max_length=config["sequence_length"])

    tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
    training_args = TrainingArguments(
        output_dir=args.logdir,
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        learning_rate=config["learning_rate"],
        num_train_epochs=config["epochs"],
        disable_tqdm=False,
        bf16=True,  # bfloat16 training (requires supporting hardware)
        optim='adamw_hf',
        logging_dir=f"{args.logdir}/logs",
        overwrite_output_dir=True,
        evaluation_strategy='epoch',
        weight_decay=config["weight_decay"],
        save_strategy='epoch',
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        seed=SEED_VAL,
        gradient_accumulation_steps=config["gradient_accumulation_steps"]
        )

    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=args.model,
        num_labels=len(dataset['train']['labels'][0]),
        attention_probs_dropout_prob=config["hidden_dropout_prob"],
        hidden_dropout_prob=config["hidden_dropout_prob"]
        )

    # clws = torch.tensor([config["class_weight0"], config["class_weight1"]], dtype=torch.float).to(DEVICE)
    trainer = BCETrainer(
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['dev'],
        model=model,
        compute_metrics=compute_metrics,
        )

    # Train the model and return the evaluation
    trainer.train()
    eval_result = trainer.evaluate()
    if args.ray:
        # A positional dict passed to tune.report() is stored under Ray's
        # '_metric' key, hence '_metric/eval_macro_f1' in the Tune code below.
        tune.report(eval_result)
    else:
        return eval_result


def main(args):
    train_data = pd.read_csv('./data/train_sents.csv')
    dev_data = pd.read_csv('./data/dev_sents.csv')

    train_data.fillna(value={'text': ''}, inplace=True)
    dev_data.fillna(value={'text': ''}, inplace=True)

    dev_text = dev_data['text'].tolist()
    dev_labels = dev_data.apply(generate_label_list, axis=1).tolist()

    train_data['LABEL'] = train_data.apply(generate_label_list, axis=1).tolist()

    if args.undersample:
        train_data = undersample(train_data, label='LABEL', keep_percent=args.undersample)
    train_text = train_data['text'].tolist()
    train_labels = train_data['LABEL'].tolist()

    # Optionally extend the train set with synthetic examples
    if args.synth_data:
        synthetic_data = pd.read_csv(args.synth_data)
        if args.adverse:
            synthetic_data = synthetic_data[synthetic_data['adverse'] == 'adverse']
        synthetic_data.reset_index(inplace=True, drop=True)

        # One-hot the synthetic label column so generate_label_list can be reused
        binary_synthetic = pd.get_dummies(synthetic_data['label'])
        binary_synthetic['text'] = synthetic_data['text']
        synth_labels = binary_synthetic.apply(generate_label_list, axis=1).tolist()
        synth_text = synthetic_data['text'].tolist()

        train_text.extend(synth_text)
        train_labels.extend(synth_labels)

    # Binarize the comma-separated label strings into multi-hot vectors
    train_labels = [labs.split(',') for labs in train_labels]
    train_labs_mlb = MLB.fit_transform(train_labels)
    train_labs_mlb = [ar.tolist() for ar in train_labs_mlb]

    dev_labels = [labs.split(',') for labs in dev_labels]
    dev_labs_mlb = MLB.transform(dev_labels)
    dev_labs_mlb = [ar.tolist() for ar in dev_labs_mlb]

    train_t5 = pd.DataFrame({'text': train_text, 'labels': train_labs_mlb})
    dev_t5 = pd.DataFrame({'text': dev_text, 'labels': dev_labs_mlb})

    train_dataset = Dataset.from_pandas(train_t5)
    dev_dataset = Dataset.from_pandas(dev_t5)

    dataset = DatasetDict()
    dataset['train'] = train_dataset
    dataset['dev'] = dev_dataset

    seq_length_search = [int(x) for x in args.seqlens.split(',')]
    batch_size_search = [int(x) for x in args.batches.split(',')]

    params_dict = {
            'model': args.model,
            'undersample_bool': args.undersample
            }

    if args.ray:
        if args.undersample:
            usample = args.undersample
        else:
            usample = 1
        config_space = {
            "learning_rate": tune.loguniform(1e-5, 1e-3),
            "batch_size": tune.choice(batch_size_search),
            "hidden_dropout_prob": tune.uniform(0.1, 0.5),
            "undersample": usample,
            "weight_decay": tune.loguniform(1e-8, 1e-5),
            "sequence_length": tune.choice(seq_length_search),
            "gradient_accumulation_steps": 3,
            "epochs": args.epochs
            }

        scheduler = ASHAScheduler(
            metric="_metric/eval_macro_f1",
            mode="max",
            grace_period=1,
            reduction_factor=2
            )

        met_cols = ["training_iteration", "macro_f1", "micro_f1", "precision", "recall"]
        for i in range(len(train_labs_mlb[0])):
            met_cols.append('precision_' + str(i))
            met_cols.append('recall_' + str(i))
            met_cols.append('f1_' + str(i))

        reporter = CLIReporter(
            parameter_columns=list(config_space.keys()),
            metric_columns=met_cols,
        )
        result = tune.run(
            partial(train_hf, dataset=dataset),
            config=config_space,
            num_samples=args.num_trials,
            resources_per_trial={"gpu": 1},
            scheduler=scheduler,
            progress_reporter=reporter,
            local_dir="./to_be_deleted_rayArtifact",
            name='empty_folders',
            log_to_file=False,
            )

        best_trial = result.get_best_trial(metric='_metric/eval_macro_f1', mode='max', scope="all")
        config_dict = best_trial.config
        dev_eval_dict = best_trial.last_result['_metric']
        output_dict = {**params_dict, **config_dict, **dev_eval_dict}

        # Append the best trial's config and dev metrics to the evaluation CSV
        outpath = pathlib.Path(args.evaldir, 'multi_BERT_ray.csv')
        print(output_dict)
        if os.path.isfile(outpath):
            indf = pd.read_csv(outpath)
            outdf = pd.concat([indf, pd.DataFrame([output_dict])], ignore_index=True)
        else:
            outdf = pd.DataFrame([output_dict])
        outdf.to_csv(outpath, index=False)
    else:
        # Fixed hyperparameters for a single (non-Ray) training run
        config_space = {
            "learning_rate": 5e-5,
            "batch_size": 32,
            "hidden_dropout_prob": 0.1,
            "undersample": 1.0,
            "weight_decay": 2e-8,
            "sequence_length": 100,
            "gradient_accumulation_steps": 3,
            "epochs": 10
        }

        dev_eval_dict = train_hf(config_space, dataset)
        output_dict = {**params_dict, **config_space, **dev_eval_dict}
        outpath = pathlib.Path(args.evaldir, 'multi_BERT_noray.csv')
        print(output_dict)
        if os.path.isfile(outpath):
            indf = pd.read_csv(outpath)
            outdf = pd.concat([indf, pd.DataFrame([output_dict])], ignore_index=True)
        else:
            outdf = pd.DataFrame([output_dict])
        outdf.to_csv(outpath, index=False)


if __name__ == '__main__':
    main(args)