[7c5f70]: / Crawler / data_management.py

Download this file

118 lines (94 with data), 4.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import json
from datetime import datetime
from Crawler.crawler_radiomics import load_study_data
def find_missing_filed(summary_file, log_file, out_file):
    """Compare animal IDs between the MR-scan processing log and the Summary sheet.

    Writes a plain-text report listing which animal IDs appear only in the
    MR data, only in the Summary Excel sheet, or in both, together with the
    first/second scan dates from each source.

    Parameters
    ----------
    summary_file : str
        Path to the Summary Excel sheet, loaded via ``load_study_data``.
        Expected columns: 'Kirsch lab iD', 'Date of MRI image 1st',
        'Date of MRI image 2nd'.
    log_file : str
        Path to the JSON processing log. Keys are animal IDs ('K<number>');
        each value has a 'StudyDate' list of 'YYYYMMDD' strings.
    out_file : str
        Path of the report file to (over)write.
    """
    summary = load_study_data(summary_file)
    with open(log_file) as f:
        log = json.load(f)

    # Summary IDs, normalised to the 'K<number>' form used by the log.
    sum_animals = ['K' + str(int(i)) for i in summary['Kirsch lab iD']]
    first_scan = list(summary['Date of MRI image 1st'])
    second_scan = list(summary['Date of MRI image 2nd'])
    sum_dates = {sum_animals[i]: [first_scan[i], second_scan[i]]
                 for i in range(len(sum_animals))}

    log_animals = list(log.keys())
    # Sets for O(1) membership tests; the ordered lists are kept for output.
    sum_set = set(sum_animals)
    log_set = set(log_animals)

    # All unique IDs, first-seen order (summary IDs first, then log-only IDs).
    seen = set()
    unique = []
    for animal in sum_animals + log_animals:
        if animal not in seen:
            unique.append(animal)
            seen.add(animal)

    overlap_animals = [i for i in sum_animals if i in log_set]
    sum_unique = [i for i in sum_animals if i not in log_set]
    log_unique = [i for i in log_animals if i not in sum_set]

    def _log_dates(animal):
        # First and (optional) second scan dates from the JSON log.
        dates = log[animal]['StudyDate']
        d1 = datetime.strptime(dates[0], '%Y%m%d').date()
        try:
            d2 = datetime.strptime(dates[1], '%Y%m%d').date()
        except (IndexError, ValueError):
            # No second scan recorded (or unparseable date string).
            d2 = ''
        return d1, d2

    def _summary_dates(animal):
        # First and (optional) second scan dates from the Summary sheet.
        # The second entry may be NaN/empty, which has no .date() method.
        d1 = sum_dates[animal][0].date()
        try:
            d2 = sum_dates[animal][1].date()
        except (AttributeError, ValueError):
            d2 = ''
        return d1, d2

    n_dashes = 70
    with open(out_file, 'w') as f:
        f.write('Comparison of animals found in MR scans directories and the Summary Excel sheet.\n')
        f.write('Shown are animal IDs and the date of the first and second listed MR scans.\n')
        f.write('%d unique animal IDs have been found:\n' % len(unique))
        f.write('\t%d from the Summary\n\t%d from the MR data\n' % (len(sum_animals), len(log_animals)))
        f.write('-' * n_dashes + '\n\n')

        f.write('Animals found only in MR data (missing from Summary sheets): %d animals\n' % len(log_unique))
        f.write('\tID\t\t\tMR Data Dates\n')
        for an in log_unique:
            date1, date2 = _log_dates(an)
            f.write('\t%s\t\t%s\t%s\n' % (an, date1, date2))
        f.write('\n' + '-' * n_dashes + '\n')

        f.write('Animals found only in Summary sheet (missing from MR data): %d animals\n' % len(sum_unique))
        f.write('\tID\t\t\tSummary Dates\n')
        for an in sum_unique:
            date1, date2 = _summary_dates(an)
            f.write('\t%s\t\t%s\t%s\n' % (an, date1, date2))
        f.write('\n' + '-' * n_dashes + '\n')

        f.write('Animals matched in both data sources: %d animals\n' % len(overlap_animals))
        f.write('\tID\t\t\tSummary Dates\t\t\t\tMR Data Dates\n')
        for an in overlap_animals:
            sdate1, sdate2 = _summary_dates(an)
            ldate1, ldate2 = _log_dates(an)
            f.write('\t%s\t\t%s\t%s\t\t%s\t%s\n' % (an, sdate1, sdate2, ldate1, ldate2))
        f.write('\n' + '-' * n_dashes + '\n')

        f.write('All unique animal IDs from both sources: %d animals\n' % len(unique))
        f.write('\tID\t\tSummary\t\tMR Data\n')
        for an in unique:
            s = 'X' if an in sum_set else ''
            mr = 'X' if an in log_set else ''
            f.write('\t%s\t\t %s\t\t %s\n' % (an, s, mr))
        f.write('\n' + '-' * n_dashes + '\n')
if __name__ == "__main__":
    # Crawler outputs live on the external drive; the alternate mount point
    # below was used on another machine.
    # results_dir = '/media/justify/b7TData/Results'
    results_dir = '/media/matt/Seagate Expansion Drive/b7TData/Results'
    find_missing_filed(results_dir + '/Summary.xlsx',
                       results_dir + '/processing_log.json',
                       results_dir + '/animal_matching.txt')