import json
from datetime import datetime

from Crawler.crawler_radiomics import load_study_data


def find_missing_filed(summary_file, log_file, out_file):
    """Compare animal IDs in the MR-data processing log against the Summary
    Excel sheet and write a plain-text matching report.

    NOTE(review): the name keeps the original 'filed' spelling (likely meant
    'field') so existing callers continue to work.

    Args:
        summary_file: path to the Summary Excel sheet, read via the project
            helper ``load_study_data`` (assumed to return a pandas-style
            table with named columns -- TODO confirm).
        log_file: path to the processing-log JSON; keys are animal IDs,
            values carry a 'StudyDate' list of 'YYYYMMDD' strings.
        out_file: path of the report file to (over)write.
    """
    summary = load_study_data(summary_file)

    with open(log_file) as fh:
        log = json.load(fh)

    # Summary animal IDs, normalised to 'K<int>' form, each paired with its
    # first/second MRI dates from the sheet.
    sum_animals = ['K' + str(int(i)) for i in summary['Kirsch lab iD']]
    first_scan = list(summary['Date of MRI image 1st'])
    second_scan = list(summary['Date of MRI image 2nd'])
    sum_df = {an: [first_scan[i], second_scan[i]]
              for i, an in enumerate(sum_animals)}

    # Animal IDs found in the MR-data processing log.
    log_animals = list(log.keys())

    # Sets for O(1) membership tests below.
    sum_set = set(sum_animals)
    log_set = set(log_animals)

    # Union of both sources, first-seen order preserved.
    unique = list(dict.fromkeys(sum_animals + log_animals))

    # Intersection, in summary-sheet order.  (The original code appended a
    # second comprehension guarded by `not overlap_animals`; that branch was
    # provably a no-op and has been dropped.)
    overlap_animals = [an for an in sum_animals if an in log_set]

    # IDs present in exactly one of the two sources.
    sum_unique = [an for an in sum_animals if an not in log_set]
    log_unique = [an for an in log_animals if an not in sum_set]

    def _log_dates(an):
        """First/second scan dates from the log; '' when no second scan."""
        dates = log[an]['StudyDate']
        d1 = datetime.strptime(dates[0], '%Y%m%d').date()
        try:
            d2 = datetime.strptime(dates[1], '%Y%m%d').date()
        except IndexError:  # only one scan recorded for this animal
            d2 = ''
        return d1, d2

    def _sum_dates(an):
        """First/second scan dates from the summary sheet; '' when the
        second cell is empty (NaN/NaT has no usable .date())."""
        d1 = sum_df[an][0].date()
        try:
            d2 = sum_df[an][1].date()
        except (AttributeError, ValueError, TypeError):  # empty/NaT cell
            d2 = ''
        return d1, d2

    n_dashes = 70
    # `with` guarantees the report file is closed even if a lookup fails.
    with open(out_file, 'w') as f:
        f.write('Comparison of animals found in MR scans directories and the Summary Excel sheet.\n')
        f.write('Shown are animal IDs and the date of the first and second listed MR scans.\n')
        f.write('%d unique animal IDs have been found:\n' % len(unique))
        f.write('\t%d from the Summary\n\t%d from the MR data\n' % (len(sum_animals), len(log_animals)))
        f.write('-' * n_dashes + '\n\n')

        f.write('Animals found only in MR data (missing from Summary sheets): %d animals\n' % len(log_unique))
        f.write('\tID\t\t\tMR Data Dates\n')
        for an in log_unique:
            date1, date2 = _log_dates(an)
            f.write('\t%s\t\t%s\t%s\n' % (an, date1, date2))
        f.write('\n' + '-' * n_dashes + '\n')

        f.write('Animals found only in Summary sheet (missing from MR data): %d animals\n' % len(sum_unique))
        f.write('\tID\t\t\tSummary Dates\n')
        for an in sum_unique:
            date1, date2 = _sum_dates(an)
            f.write('\t%s\t\t%s\t%s\n' % (an, date1, date2))
        f.write('\n' + '-' * n_dashes + '\n')

        f.write('Animals matched in both data sources: %d animals\n' % len(overlap_animals))
        f.write('\tID\t\t\tSummary Dates\t\t\t\tMR Data Dates\n')
        for an in overlap_animals:
            sdate1, sdate2 = _sum_dates(an)
            ldate1, ldate2 = _log_dates(an)
            f.write('\t%s\t\t%s\t%s\t\t%s\t%s\n' % (an, sdate1, sdate2, ldate1, ldate2))
        f.write('\n' + '-' * n_dashes + '\n')

        f.write('All unique animal IDs from both sources: %d animals\n' % len(unique))
        f.write('\tID\t\tSummary\t\tMR Data\n')
        for an in unique:
            s = 'X' if an in sum_set else ''
            mr = 'X' if an in log_set else ''
            f.write('\t%s\t\t %s\t\t %s\n' % (an, s, mr))
        f.write('\n' + '-' * n_dashes + '\n')


if __name__ == "__main__":
    summary_file = '/media/matt/Seagate Expansion Drive/b7TData/Results/Summary.xlsx'
    log_file = '/media/matt/Seagate Expansion Drive/b7TData/Results/processing_log.json'
    out_file = '/media/matt/Seagate Expansion Drive/b7TData/Results/animal_matching.txt'

    # BUG FIX: this call previously sat at module level (outside the guard),
    # so importing the module both ran the report and raised NameError -- the
    # path variables above are only defined under the guard.
    find_missing_filed(summary_file, log_file, out_file)