Diff of /Crawler/utilities.py [000000] .. [7c5f70]

--- a
+++ b/Crawler/utilities.py
@@ -0,0 +1,608 @@
+import os
+import subprocess
+from glob2 import glob
+from pydicom import dcmread
+import numpy as np
+from time import time
+import json
+import jsbeautifier
+from datetime import datetime
+import re
+import nibabel as nib
+from Crawler.crawler_segmentation import seg_from_model, display_segmentations
+from Crawler.crawler_radiomics import gen_images_csv, run_radiomics, load_study_data, init_dilation_class
+
+
+def load_log_file(log_file):
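+    """
+    Loads the JSON log of processed studies, returning an empty dict if the
+    log file does not exist.
+    """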
+
+    # Load database
+    if os.path.exists(log_file):
+        with open(log_file) as f:
+            log = json.load(f)
+
+    else:
+        log = {}
+
+    return log
+
+
+class DataLoader:
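+    """
+    Walks the data directory tree on the NAS, bundles scan folders by study,
+    and serves Dicom image volumes and header metadata one study at a time.
+    """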
+    def __init__(self, base_path, folder_regex, log_file):
+        self.base_path = base_path
+        self.folder_regex = folder_regex
+        self.log_file = log_file
+        self.ind = 0
+        self.current_files = False
+        self.patient_name = []
+        self.study_date = []
+        self.acq_time = []
+        self.protocol = []
+        self.next_folder = ''
+
+        print('Searching for data on the NAS')
+        t1 = time()
+        # Get folder names
+        walker = [x[0] for x in os.walk(base_path)]
+        walker.remove(base_path)
+
+        # Get subdirectory names
+        subdirs = [os.path.split(i) for i in walker]
+
+        # # Select folders which match the regex
+        # image_paths = [name for name in subdirs
+        #                if name[1].lower().find(folder_regex.lower()) != -1]
+        image_paths = []
+        for name in subdirs:
+
+            files = glob(os.path.join(name[0], name[1], folder_regex))
+
+            if files:
+                image_paths.append(name)
+
+        self.subdirs = [os.path.join(p[0], p[1]) for p in image_paths]
+
+        # Sort folders by path (oldest to newest for date-named folders)
+        self.subdirs.sort()
+
+        # Bundle scans
+        self._bundle_scans()
+
+        print('\tTime to build folder structure: %0.2f seconds' % (time() - t1))
+
+        # Load log file
+        print('Loading log file\n\t%s' % self.log_file)
+        self.log = load_log_file(log_file)
+
+    def _bundle_scans(self):
+        """
+        Bundles directories together by study: subdirectories sharing the same
+        top-level study folder under base_path are grouped into one list each,
+        and self.subdirs is replaced with the resulting list of bundles.
+        Returns:
+
+        """
+
+        # Set up used index
+        bases = []
+        bundles = []
+
+        for i in range(len(self.subdirs)):
+
+            # Get base directory
+            bs = self.subdirs[i]
+            flag = True
+            while flag:
+                bs1 = os.path.split(bs)[0]
+
+                if bs1 == self.base_path.rstrip('/').rstrip('\\'):
+                    flag = False
+                else:
+                    bs = bs1
+
+            # Proceed if the base name has not been processed
+            if bs not in bases:
+
+                bases.append(bs)
+
+                # Find subdirectories under the same base study folder (prefix
+                # match; a regex match can break on path separators/metacharacters)
+                matches = [k for (k, name) in enumerate(self.subdirs)
+                           if name == bs or name.startswith(bs + os.sep)]
+
+                # Fill in indexes and create output vector
+                bundles.append([self.subdirs[f] for f in matches])
+
+        # Update subdirectories
+        self.subdirs = bundles
+
+    def get_folder(self):
+        """
+        Advances to the next bundle of folders containing Dicom data and stores
+        it in self.next_folder.
+        Returns:
+            bool: True if another folder bundle was available, False otherwise
+        """
+
+        if self.ind == len(self.subdirs):
+
+            folder_exists = False
+
+        else:
+
+            self.next_folder = self.subdirs[self.ind]
+            self.ind += 1
+            self.current_files = False
+            folder_exists = True
+
+        return folder_exists
+
+    def load_study_data(self):
+        """
+        Loads the patient name and study date using the data's Dicom header.
+        Returns:
+            list: [patient_name, study_date]
+        """
+
+        # Get filenames
+        if not self.current_files:
+            self._get_dicom_files()
+
+        self.patient_name = []
+        self.study_date = []
+        self.acq_time = []
+        self.protocol = []
+
+        # Read header metadata from the first file of each contrast series
+        for i in range(len(self.names)):
+            dcm = dcmread(self.names[i][0])
+
+            # Get and decode animal ID (K-number, found in PatientName)
+            self.patient_name = dcm.PatientName.original_string.decode()
+            self.patient_name = self.patient_name.strip('^')  # Some names have trailing '^' characters
+            self.patient_name = 'K' + self.patient_name
+
+            # Get study date and time
+            self.study_date = dcm.StudyDate
+            self.acq_time.append(dcm.AcquisitionTime)
+
+            # Get protocol data
+            self.protocol.append(dcm.ProtocolName)
+
+        return [self.patient_name, self.study_date]
+
+    def load_dicom(self):
+        """
+        Loads Dicom image data from the last returned directory
+
+        Returns:
+            numpy array: a 4D image volume (rows, columns, slices, contrasts)
+        """
+
+        if not self.current_files:
+            self._get_dicom_files()
+
+        # Sort by image protocol
+        sort_inds = self._sort_protocols()
+
+        # For each of the three contrasts
+        for i in range(len(self.names)):
+
+            # Get index corresponding to the correct contrast (T1, T1c, T2)
+            ind = sort_inds[i]
+
+            if i == 0:
+                # Load Dicom data
+                dcm = dcmread(self.names[i][0])
+                dims = (dcm.Rows, dcm.Columns, len(self.names[0]), len(self.names))
+                im_vol = np.zeros(shape=dims)
+
+            for (ii, name) in enumerate(self.names[i]):
+
+                # Load Dicom
+                dcm = dcmread(name)
+
+                # Load files as a numpy array
+                im_vol[:, :, ii, ind] = dcm.pixel_array
+
+        self.current_files = False
+
+        return im_vol
+
+    def compare_with_log(self):
+        """
+        Checks the current animal and study date against the processed log,
+        a dictionary of the form {animal_id: {'StudyDate': [study_dates]}}.
+        Returns:
+            bool: True if the animal and scan date appear in the processed log
+
+        """
+
+        # Initialize variables
+        animal_scanned = False
+        date_scanned = False
+
+        if self.patient_name in self.log.keys():  # If the animal exists in the DB
+
+            animal_scanned = True
+
+            if self.study_date in self.log[self.patient_name]['StudyDate']:  # If the scan date exists for the animal
+
+                date_scanned = True
+
+        already_processed = animal_scanned and date_scanned
+
+        if already_processed:
+
+            print('\t\tScan already processed!')
+
+        return already_processed
+
+    def _get_dicom_files(self, file_regexp='*.dcm'):
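+        """
+        Collects and sorts the Dicom file names for each of the three contrast
+        folders in the current bundle, then flags the file list as current.
+        """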
+
+        self.names = []
+
+        # For each contrast
+        print('Processing:')
+        for c in range(3):
+            image_path = os.path.join(self.next_folder[c], file_regexp)
+            print('\t %s' % self.next_folder[c])
+
+            # Get file names
+            self.names.append(glob(image_path))
+            self.names[c].sort()
+
+        # Flag file list as current
+        self.current_files = True
+
+    def _sort_protocols(self):
+        """
+        Sort scans so that the data can be returned as T1, T1 with contrast, and T2.
+        Returns:
+            list: the destination contrast index for each scan
+                (0: T1, 1: T1 with contrast, 2: T2)
+        """
+
+        # Initialize every index to 2 (T2); only the T1 contrasts need rearranging
+        sort_inds = [2 for _ in range(len(self.names))]
+        times = []
+
+        # Collect protocol and time data
+        for i in range(len(self.names)):
+
+            if re.findall('T1', self.protocol[i]):
+                sort_inds[i] = 1
+
+            times.append(datetime.strptime(self.acq_time[i], '%H%M%S'))
+
+        # Sort T1 scans by time
+        t = [(i, times[i]) for (i, ind) in enumerate(sort_inds) if ind == 1]
+
+        if t[0][1] < t[1][1]:  # if the first T1 scan was acquired earlier
+            sort_inds[t[0][0]] = 0
+        else:
+            sort_inds[t[1][0]] = 0
+
+        return sort_inds
+
+
+class SaveResults:
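+    """
+    Manages output folders, re-saved NIfTI volumes, Dicom header dumps and the
+    JSON log of processed studies.
+    """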
+    def __init__(self, data_base_path, save_folder, log_file):
+
+        # Initialize variables
+        self.animal_folder = None
+        self.curr_scan_path = None
+        self.snames = []
+
+        # Set up paths
+        self.base_save_path = os.path.join(data_base_path, save_folder)
+        self.log_file = log_file
+
+        if not os.path.exists(self.base_save_path):
+            os.mkdir(self.base_save_path)
+
+        # Load log file
+        self.log = load_log_file(log_file)
+
+    def append_to_log(self, animal_id, study_date):
+        """
+        Appends the animal and study date to the log of processed studies
+        Args:
+            animal_id (str): The animal K-number
+            study_date (str): The study date (YYYYMMDD)
+
+        Returns:
+
+        """
+
+        if animal_id in self.log.keys():
+
+            # Animal has been scanned before
+            self.log[animal_id]['StudyDate'].append(study_date)
+
+        else:
+
+            # New entry
+            self.log[animal_id] = {'StudyDate': [study_date]}
+
+    def save_log(self):
+        """
+        Saves the dictionary detailing which sets have been processed
+        Returns:
+
+        """
+
+        # Reformat the json file for easier reading
+        formatted_log = jsbeautifier.beautify(json.dumps(self.log))
+
+        with open(self.log_file, 'w') as f:
+            f.write(formatted_log)
+
+    def gen_save_path(self, animal_id, study_date):
+        """
+        Generates output folders for analyzed data.
+        Args:
+            animal_id (str): The animal K-number
+            study_date (str): The study date (YYYYMMDD)
+
+        Returns:
+            list: the output NIfTI save paths for the T1, T1c and T2 volumes
+        """
+
+        # Generate animal folder
+        animal_folder = os.path.join(self.base_save_path, animal_id)
+
+        # Make the animal folder if it does not exist
+        if not os.path.exists(animal_folder):
+            os.mkdir(animal_folder)
+
+        # Set up current animal save path
+        curr_animal_path = os.path.join(animal_folder, study_date)
+
+        if not os.path.exists(curr_animal_path):
+            os.mkdir(curr_animal_path)
+
+        self.animal_folder = animal_folder
+        self.curr_scan_path = curr_animal_path
+
+        # Set up save names
+        snames = ['T1.nii.gz', 'T1c.nii.gz', 'T2.nii.gz']
+        self.snames = [os.path.join(self.curr_scan_path, name) for name in snames]
+
+        return self.snames
+
+    def resave_image_volumes(self, X):
+        """
+        Re-saves image volumes into the processing folder for ease of access
+        Args:
+            X (4D numpy array): image volume (dims: rows, columns, slices, num_vols)
+
+        Returns:
+
+        """
+
+        # Get image dimensions
+        sz = X.shape
+
+        # Save image volumes
+        for i in range(sz[-1]):
+
+            tmp = X[:, :, :, i].squeeze().swapaxes(0, 1)
+            nib.save(nib.Nifti1Image(tmp, np.eye(4)), self.snames[i])
+
+    def save_dicom_header(self, dicom_fname):
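+        """
+        Writes the full Dicom header of the given file to dicom_header.txt in
+        the current scan folder.
+        Args:
+            dicom_fname (str): path to a Dicom file from the current study
+        """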
+
+        # Collect Dicom header
+        dcm = dcmread(dicom_fname)
+
+        # Set up output file
+        sname = os.path.join(self.curr_scan_path, 'dicom_header.txt')
+
+        # Write output file
+        with open(sname, 'w') as f:
+            f.write(dcm.__str__())
+
+    @staticmethod
+    def clear_working_directory():
+        """
+        Clears all files from the working directory.
+        Returns:
+
+        """
+
+        # Get working directory
+        working_path = os.path.join(os.getcwd(), 'Working')
+
+        # Get a list of items in the working directory
+        working_files = glob(os.path.join(working_path, '*.*'))
+
+        # Delete working files
+        for file in working_files:
+            os.remove(file)
+
+
+class ProcessAnimal:
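+    """
+    Runs the per-study processing chain on the re-saved NIfTI volumes: N4 bias
+    correction, tumor segmentation, radiomic feature extraction, and sorting
+    of the resulting feature files by study group.
+    """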
+    def __init__(self, snames):
+
+        self.T1_file = snames[0]
+        self.T1c_file = snames[1]
+        self.T2_file = snames[2]
+        self.cur_scan_path = os.path.split(snames[0])[0]
+        self.mask_file = ''
+        self.radiomic_files = []
+        self.radiomics_sfile = []
+        self.dilate = 25
+
+        # init_dilation_class(self.dilate)
+
+    def bias_correct(self):
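+        """
+        Applies ANTs N4 bias field correction to the T2-weighted volume, using
+        the T1-weighted volume as the weight image, and points self.T2_file at
+        the corrected output.
+        """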
+
+        # Set up correction parameters
+
+        # Get weighted image
+        weight_im = self.T1_file
+
+        # Get the T2 weighted image (the only one that needs bias correction)
+        T2 = self.T2_file
+
+        # Set up output image
+        T2_out = os.path.join(self.cur_scan_path, 'T2_cor.nii.gz')
+
+        # Set up the bias corrector command
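+        # (Parameter choices: 1x1x1 B-spline control-point mesh with 3rd-order
+        #  splines, 3D input, up to 100 iterations at each of three fitting
+        #  levels with a 0.005 convergence threshold, a shrink factor of 4 for
+        #  speed, and histogram sharpening with FWHM 0.3, Wiener filter noise
+        #  0.01 and 200 histogram bins)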
+        cmd = 'N4BiasFieldCorrection ' \
+              '--bspline-fitting [ 1x1x1, 3 ] ' \
+              '-d 3 ' \
+              '--input-image "%s" ' \
+              '--convergence [ 100x100x100, 0.005 ] ' \
+              '--output "%s" ' \
+              '--shrink-factor 4 ' \
+              '--weight-image "%s" ' \
+              '--histogram-sharpening [0.3, 0.01, 200]' % (T2, T2_out, weight_im)
+
+        subprocess.Popen(cmd, shell=True).wait()
+
+        self.T2_file = T2_out
+
+    def segment_tumor(self, model_path):
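+        """
+        Segments the tumor with the trained model in model_path, using the
+        probability threshold recorded in the model's metrics.txt, and saves
+        the binary mask and segmentation images to the current scan folder.
+        Args:
+            model_path (str): path to the trained segmentation model
+        """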
+
+        # Load threshold from metrics file
+        mfile = os.path.join(model_path, 'metrics.txt')
+        with open(mfile, 'r') as f:
+            dat = f.readlines()
+
+        # Find last threshold calculated from the training set
+        threshold = 0.5
+        for z in range(-1, -min(len(dat), 11) - 1, -1):  # scan at most the last 11 lines
+            if 'Best threshold' in dat[z]:
+                # Keep only the digits and decimal point, e.g. 'Best threshold: 0.42' -> 0.42
+                threshold = [i for i in dat[z] if i.isdigit() or i == '.']
+                threshold = float(''.join(threshold))
+                break
+
+        # Run segmentation
+        t2, y_pred = seg_from_model(model_path=model_path,
+                                    im_paths=[self.T1_file, self.T1c_file, self.T2_file],
+                                    threshold=threshold)
+
+        # Save the new segmentation
+        self.mask_file = os.path.join(self.cur_scan_path, 'tumor_seg.nii.gz')
+        nib.save(nib.Nifti1Image(y_pred, np.eye(4)), self.mask_file)
+
+        # Make segmentation images
+        display_segmentations(t2, y_pred, self.cur_scan_path)
+
+    def compute_radiomics(self, animal_id):
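+        """
+        Extracts radiomic features for three mask configurations and writes one
+        CSV of features per configuration to the current scan folder.
+        Args:
+            animal_id (str): The animal K-number
+        """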
+
+        # Set up values for multiple mask configurations
+        sfiles = ['radiomic_features.csv',
+                  'radiomic_features_bed.csv',
+                  'radiomic_features_edge.csv']
+        dilate = [0, self.dilate, self.dilate]
+        diff_mask = [False, False, True]
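+        # Three mask configurations, matching the three output files above: the
+        # raw tumor mask, a dilated tumor 'bed' mask, and the tumor 'edge'
+        # (dilated mask with diff_mask set, presumably dilation minus the
+        # original mask)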
+
+        # Set up empty list for radiomics files
+        self.radiomics_sfile = []
+
+        for i in range(3):
+            if i == 0:
+                regen = True
+            else:
+                regen = False
+
+            # Data paths
+            base_path = self.cur_scan_path
+            save_base_path = os.path.join(os.getcwd(), 'Working')
+
+            # Append the current radiomics file
+            self.radiomics_sfile.append(os.path.join(base_path, sfiles[i]))
+
+            # Generate CSV file of images/masks and re-save images as 16 bit
+            csv_file = gen_images_csv([self.T1_file, self.T1c_file, self.T2_file],
+                                      mask_file=self.mask_file,
+                                      save_base_path=save_base_path,
+                                      dilate=dilate[i],
+                                      ncontrasts=3,
+                                      regen=regen,
+                                      diff_mask=diff_mask[i],
+                                      animal_id=animal_id)
+
+            # Run radiomics
+            run_radiomics(self.cur_scan_path, csv_file, self.radiomics_sfile[i])
+
+    def sort_radiomics(self, animal_id, study_date, base_path, summary_file):
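+        """
+        Appends the study's radiomics CSV paths to a group-level list (pre-RT,
+        control post-RT, PD1 post-RT, or unfiled) based on the study summary
+        spreadsheet.
+        Args:
+            animal_id (str): The animal K-number
+            study_date (str): The study date (YYYYMMDD)
+            base_path (str): base path containing the 'Results' folder
+            summary_file (str): path to the study summary spreadsheet
+        """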
+
+        # Define data base path
+        base_path = os.path.join(base_path, 'Results')
+
+        # Load study data
+        df = load_study_data(summary_file)
+
+        # Define dictionary keys
+        kid_key = 'Kirsch lab iD'
+        rtd_key = 'Date of irradiation 20Gy x'
+
+        # Convert study date to Datetime object for comparison
+        study_dt = datetime.strptime(study_date, '%Y%m%d')
+
+        # Convert the animal ID to a number for comparison
+        an_id = int(animal_id.strip('K').strip('^'))
+
+        # Get dataset animals
+        animals = df[kid_key].astype('int')
+
+        # Set up flags
+        classified = False
+        control_flag = False
+        post_rt = False
+
+        # Determine if the animal appears in the control or PD1 groups
+        if any(an_id == animals):
+
+            # Animal exists in database
+            classified = True
+
+            # Get animal study group
+            group = df['Group'][an_id == animals].to_list()
+            group = group[0]
+
+            if group == 'Control':
+                control_flag = True
+
+            if group == 'PD1':
+                control_flag = False
+
+        else:
+            print('Unclassified')
+
+        # Set up paths to radiomics files in the animal directory
+        rad_files = [os.path.join(self.cur_scan_path, file) for file in
+                     ['radiomic_features.csv',
+                      'radiomic_features_bed.csv',
+                      'radiomic_features_edge.csv']
+                     ]
+
+        # Determine if the imaging was performed pre or post RT
+        if classified:
+
+            # Get RT date (already in Datetime)
+            rt_date = df[rtd_key][an_id == animals]
+
+            # Compare the RT and study date
+            if all(rt_date < study_dt):
+
+                # The study happened post-RT
+                post_rt = True
+
+            if not post_rt:
+                # Pre-RT, all groups are the same
+                fname = os.path.join(base_path, 'Radiomics_preRT.txt')
+            elif control_flag:
+                # Control group, post RT
+                fname = os.path.join(base_path, 'Radiomics_control_postRT.txt')
+            else:
+                # PD1 group, post RT
+                fname = os.path.join(base_path, 'Radiomics_PD1_postRT.txt')
+
+        else:
+            # If the animal does not appear in the excel sheet
+            fname = os.path.join(base_path, 'Unfiled_animals.txt')
+
+        # Write log file
+        with open(fname, 'a') as f:
+            for file in rad_files:
+                f.write('%s\n' % file)
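+
+
+# A minimal usage sketch (hypothetical paths, regex and model location; the
+# real driver script is not part of this module):
+#
+#   loader = DataLoader('/mnt/nas/Scans', '*.dcm', 'processed_log.json')
+#   saver = SaveResults('/mnt/nas/Scans', 'Processed', 'processed_log.json')
+#   while loader.get_folder():
+#       animal_id, study_date = loader.load_study_data()
+#       if loader.compare_with_log():
+#           continue
+#       snames = saver.gen_save_path(animal_id, study_date)
+#       saver.resave_image_volumes(loader.load_dicom())
+#       proc = ProcessAnimal(snames)
+#       proc.bias_correct()
+#       proc.segment_tumor('/path/to/model')
+#       proc.compute_radiomics(animal_id)
+#       saver.append_to_log(animal_id, study_date)
+#       saver.save_log()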