Switch to unified view

a b/src/Preporcessor/preprocessing.py
1
import joblib
2
from tqdm.auto import tqdm
3
from preprocessing_utils import eic_text_preprocessing
4
from TrialMatchAI.src.Preporcessor.preprocess_clinical_notes import tokenize_clinical_note
5
import pandas as pd
6
import os
7
8
# Module-level joblib.Memory object, constructed with "." as its location argument.
# NOTE(review): nothing in the visible code references `memory` — confirm it is
# still needed before relying on or removing it.
memory = joblib.Memory(".")
9
def ParallelExecutor(use_bar="tqdm", **joblib_args):
    """Build a factory of ``joblib.Parallel`` runners with an optional progress bar.

    Args:
        use_bar: Default progress-bar kind. ``"tqdm"`` wraps the task iterable
            in ``tqdm``; ``False``/``None`` (or the strings ``"False"`` /
            ``"None"``) iterate the tasks without a bar.
        **joblib_args: Keyword arguments forwarded to ``joblib.Parallel``.
            ``n_jobs`` defaults to 10 when it is not supplied.

    Returns:
        A function ``aprun(bar=use_bar, **tq_args)``. Calling it yields a
        runner that accepts an iterable of ``joblib.delayed`` tasks, executes
        them through ``joblib.Parallel`` and returns whatever ``Parallel``
        returns. ``tq_args`` are passed to ``tqdm`` when the bar is enabled.

    Raises:
        ValueError: Raised by the runner (at call time) when ``bar`` is not one
            of the supported kinds.
    """
    bar_factories = {
        "tqdm": lambda tq_args: lambda tasks: tqdm(tasks, **tq_args),
        "False": lambda tq_args: iter,
        "None": lambda tq_args: iter,
    }
    # Keep the historical default of 10 workers, but forward every other
    # option (backend, verbose, ...) instead of silently discarding it.
    parallel_kwargs = {"n_jobs": 10, **joblib_args}

    def aprun(bar=use_bar, **tq_args):
        def run(op_iter):
            # The bar kind is looked up by its string form so that the
            # literal values False and None select the plain iterator.
            try:
                make_bar = bar_factories[str(bar)]
            except KeyError:
                raise ValueError(
                    f"Value {bar} not supported as bar type"
                ) from None
            wrap = make_bar(tq_args)
            return joblib.Parallel(**parallel_kwargs)(wrap(op_iter))

        return run

    return aprun
27
28
29
class Preprocessor:
30
    def __init__(self, id_list, n_jobs):
31
        self.id_list = id_list
32
        self.n_jobs = n_jobs
33
34
    def preprocess_clinical_trials_text(self):
35
        parallel_runner = ParallelExecutor(n_jobs=self.n_jobs)(total=len(self.id_list))
36
        X = parallel_runner(
37
            joblib.delayed(eic_text_preprocessing)(
38
            [_id]
39
            )
40
            for _id in self.id_list
41
        )     
42
        return pd.concat(X).reset_index(drop=True)
43
    
44
    def preprocess_patient_clinical_notes(self):
45
        parallel_runner = ParallelExecutor(n_jobs=self.n_jobs)(total=len(self.id_list))
46
        X = parallel_runner(
47
            joblib.delayed(tokenize_clinical_note)(
48
            [_id]
49
            )
50
            for _id in self.id_list
51
        )     
52
        return pd.concat(X).reset_index(drop=True)
53