# Radiomics/radiomics_classifications.py

"""
A collection of functions used to analyze radiomic features. This code combines radiomic features
with RT, primary tumor recurrence, and survival data. The example code is at the end of this file.
"""

import os
import sys

# Add the project root to the import path
file = '/home/matt/Documents/SegSarcoma'
sys.path.append(file)

import pptx
import pickle
import json
from datetime import datetime, timedelta
import datetime as dt
from time import time
from subprocess import call
from lifelines import KaplanMeierFitter
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from glob2 import glob
from scipy import stats
from Crawler.crawler_radiomics import load_study_data
from Radiomics.radiomic_functions import *
from PIL import Image, ImageDraw, ImageFont

# Set up plotting properties
sns.set()
sns.set_style('whitegrid')
sns.set_context("paper")

PATH_TO_MRMR = '/home/matt/Downloads/mrmr_c_src/mrmr'

def concate_contrasts_small(df):
    """
    Takes a dataframe containing radiomics data for all animals and contrasts.
    Returns a dataframe with contrasts of the same animal concatenated together.
    Args:
        df (pandas dataframe): radiomic features, one row per contrast (T1, T1C, T2)

    Returns:
        pandas dataframe: a single-row dataframe with contrast-suffixed columns
    """

    # Turn multiple contrast images into a single vector of data
    keys = df.keys()
    cat_cols = [i + '_T1' for i in keys] + \
               [i + '_T1C' for i in keys] + \
               [i + '_T2' for i in keys]
    key = [[i + '_T1' for i in keys],
           [i + '_T1C' for i in keys],
           [i + '_T2' for i in keys]]
    df_cat = pd.DataFrame(columns=cat_cols, index=range(1))

    for i in range(3):

        tmp = df.iloc[i]

        # Append data into larger array
        df_cat[key[i]] = [n for n in tmp.to_numpy().ravel()]

    return df_cat

def strip_excluded(rfiles, exclude):
    """
    Remove animals which either did not have two scans or died shortly after surgery.
    Args:
        rfiles (list): list of all animals
        exclude (list): list of animals to exclude

    Returns:
        (list): filtered list
    """

    # Copy the list so that removals do not skip items while iterating
    out_rfiles = list(rfiles)

    for file in rfiles:

        # The animal ID is the directory two levels above the file
        path, _ = os.path.split(file)
        path, _ = os.path.split(path)
        _, animal = os.path.split(path)

        if animal in exclude:
            out_rfiles.remove(file)

    return out_rfiles

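# Example of the path convention assumed by these filters (hypothetical
# layout): for a file such as
#     /data/K520/post_RT/tumor/radiomics.csv
# the two os.path.split calls walk up to '/data/K520', so 'K520' is taken
# as the animal ID.
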
def keep_animals(rfiles, keep):
    """
    Only keep specific animals.
    Args:
        rfiles (list): list of all animals
        keep (list): list of animals to keep

    Returns:
        (list): filtered list
    """

    out_rfiles = []

    for file in rfiles:

        # The animal ID is the directory two levels above the file
        path, _ = os.path.split(file)
        path, _ = os.path.split(path)
        _, animal = os.path.split(path)

        if animal in keep:
            out_rfiles.append(file)

    return out_rfiles

def select_region(rfiles, region):
    """
    Select features based on the region they were calculated in.
    Args:
        rfiles (list): list of all animals
        region (str): 'tumor', 'bed', or 'edge'

    Returns:
        (list): files computed from the requested region
    """

    # Filter with list comprehensions rather than removing items from the
    # list while iterating over it (which silently skips elements)
    if region == 'tumor':
        # Tumor files are those that are neither bed nor edge
        out_rfiles = [f for f in rfiles if '_bed' not in f and 'edge' not in f]

    elif region == 'bed':
        out_rfiles = [f for f in rfiles if '_bed' in f]

    elif region == 'edge':
        out_rfiles = [f for f in rfiles if '_edge' in f]

    else:
        out_rfiles = list(rfiles)

    return out_rfiles

def load_radiomics(radiomics_paths, exclude, region, keep, group):
    """
    Load radiomic data (output of PyRadiomics)
    Args:
        radiomics_paths (str): path to a text file listing the radiomic files
        exclude (list): animals to exclude
        region (str): which region to select
        keep (list): animals to keep
        group (str): pre/post RT and control/treatment

    Returns:
        pandas dataframe: radiomic data
    """

    # Load radiomics file data
    with open(radiomics_paths, 'r') as f:
        rfiles = f.readlines()

    # Remove line endings '\n'
    rfiles = [i.strip() for i in rfiles]

    # Sort radiomics files
    rfiles = strip_excluded(rfiles, exclude)
    rfiles = select_region(rfiles, region)

    # Only load specific animals
    rfiles = keep_animals(rfiles, keep)

    # Load data and combine contrasts
    df = load_and_concate(rfiles)

    df['Group'] = group
    # df['Region'] = region

    df.index = range(df.shape[0])

    return df

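# Minimal usage sketch for load_radiomics (paths and IDs are hypothetical):
#     df_pre = load_radiomics('/data/radiomics_files.txt', exclude=['K500'],
#                             region='tumor', keep=['K520', 'K521'],
#                             group='PreCnt')
# The text file is assumed to list one PyRadiomics output CSV per line.
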
def load_study_logs(summary_file, log_file):
    """
    Load study logs which contain relevant dates
    Args:
        summary_file (str): path to the study summary file
        log_file (str): path to a JSON log file, or '' to skip loading one

    Returns:
        pandas dataframe and dict (or None): study summary and log
    """

    # Load summary file
    summary = load_study_data(summary_file)

    # Convert animal IDs to match the log file
    summary['Kirsch lab iD'] = ['K' + str(int(i)) for i in summary['Kirsch lab iD']]

    if log_file != '':
        # Load log file
        with open(log_file) as f:
            log = json.load(f)
    else:
        log = None

    # Antibody treatment date
    treat_date = summary['Date of Antibody treatment (0.2mg/mouse Ip) 1st']

    # Compute time from treatment until today
    summary['SinceAmputation'] = (datetime.now() - treat_date)

    return summary, log

def capitalize_method(key):
    """
    Capitalize the radiomics method portion of a feature name.
    Args:
        key (str): a feature name to capitalize

    Returns:
        (str): capitalized name
    """

    # Uppercase the known texture-method substrings (assumed intent of this
    # helper; the original body compared against the builtin `str`, which
    # can never match)
    for method in ('glcm', 'glszm', 'glrlm', 'ngtdm', 'gldm'):
        if method in key:
            key = key.replace(method, method.upper())

    return key

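# Example:
#     capitalize_method('original_glcm_Contrast_T1')
#     -> 'original_GLCM_Contrast_T1'
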
def load_and_concate(rfiles):
    """
    Loads and concatenates radiomic features from different MR contrasts
    Args:
        rfiles (list): list of radiomic files (one for each scan)

    Returns:
        pandas dataframe: radiomic features
    """

    animal_id = []
    for i, file in enumerate(rfiles):

        # pd.DataFrame.from_csv was removed from pandas; read_csv with
        # index_col=0 is the equivalent replacement
        df = pd.read_csv(file, index_col=0)

        if i == 0:
            cat_df = concate_contrasts_small(df)

        else:
            df = concate_contrasts_small(df)
            cat_df = pd.concat([cat_df, df])

        # The animal ID is the directory two levels above the file
        path, _ = os.path.split(file)
        path, _ = os.path.split(path)
        _, animal = os.path.split(path)

        if animal not in animal_id:
            animal_id.append(animal)

        else:
            # Flag duplicate animal IDs
            print(animal)

    # Filter out non-feature names
    features = cat_df.keys()
    feature_names = list(
        sorted(filter(lambda k: k.startswith("original_"), features)))
    cat_df = cat_df[feature_names]

    # Add animal name
    cat_df['ID'] = animal_id

    # Update df index
    cat_df.index = range(cat_df.shape[0])

    return cat_df

def load_recurrence_log(recurrence_file, threshold):
    """
    Load information about recurrence
    Args:
        recurrence_file (str): file containing primary tumor recurrence information.
        threshold (int): a limit (in days) on how long a mouse must survive post-surgery to be included

    Returns:
        pandas dataframe: animal and recurrence information
    """

    # Read in log file
    dat = pd.read_excel(recurrence_file)

    # Add 'K' to each animal name
    dat['Animal'] = ['K' + str(n) for n in dat['Animal']]

    # Filter out animals without amputation
    dat = dat[dat['AmputationDate'].notnull()]

    # Fill in death dates with today (use .loc to avoid chained assignment)
    dat.loc[dat['DeathDate'].isnull(), 'DeathDate'] = datetime.now()

    # Find lifespan post amputation
    dat['AmputationDate'] = [i.date() for i in dat['AmputationDate']]
    dat['DeathDate'] = [i.date() for i in dat['DeathDate']]
    dat['Recurrence'] = [i.date() for i in dat['Recurrence']]
    lifespan = dat['DeathDate'] - dat['AmputationDate']
    dat['since_amp'] = [int(life.days) for life in lifespan]

    # Filter out short lifespans without recurrence
    thresh = timedelta(threshold)
    recurr = ~pd.isna(dat['Recurrence'])
    inds = np.logical_or(lifespan > thresh, recurr)
    dat = dat[inds]
    # dat = dat[lifespan > thresh]

    # Get recurrence
    dat['bool_recur'] = dat['Recurrence'].notnull().tolist()

    # Reindex
    dat = dat.reset_index(drop=True)

    return dat

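# Minimal usage sketch (hypothetical file name): keep animals that recurred
# or survived at least 30 days after amputation:
#     rec_df = load_recurrence_log('/data/recurrence_log.xlsx', threshold=30)
#     rec_df[['Animal', 'since_amp', 'bool_recur']].head()
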
def sort_study_data(rec_df, sum_df, exclude):
    """
    Combine recurrence and study summary logs and print recurrence counts.
    Args:
        rec_df (pandas dataframe): recurrence log (see load_recurrence_log)
        sum_df (pandas dataframe): study summary (see load_study_logs)
        exclude (list): animals to exclude

    Returns:
        pandas dataframe: animal ID, group, recurrence flag, and days since amputation
    """

    # Create a new dictionary to sort recurrence data
    df = {'animalID': [], 'group': [], 'recurrence': [], 'rec_days': []}

    # Populate df with data from both logs
    n = 0
    for i, animal in enumerate(sum_df['Kirsch lab iD']):

        if any(animal == rec_df['Animal']) and animal not in exclude:

            tmp = rec_df[rec_df['Animal'].str.match(animal)]

            df['animalID'].append(animal)
            df['group'].append(sum_df['Group'][i])
            df['recurrence'].append(tmp['bool_recur'].tolist()[0])
            df['rec_days'].append(int(tmp['since_amp'].iloc[0]))

    # Make a Pandas df
    df = pd.DataFrame.from_dict(df)

    # Print general stats
    nrec = sum(df['recurrence'])
    norec = sum(df['recurrence'] == False)

    nrecpd1 = sum([df['group'][i] == 'PD1' and df['recurrence'][i] for i in range(len(df))])
    nreccon = sum([df['group'][i] == 'Control' and df['recurrence'][i] for i in range(len(df))])

    norecpd1 = sum([df['group'][i] == 'PD1' and not df['recurrence'][i] for i in range(len(df))])
    noreccon = sum([df['group'][i] == 'Control' and not df['recurrence'][i] for i in range(len(df))])

    print('Number of recurrent tumors: %d' % nrec)
    print('\tControl: \t%d' % nreccon)
    print('\tPD1: \t%d' % nrecpd1)
    print('Number of non-recurrent tumors: %d' % norec)
    print('\tControl: \t%d' % noreccon)
    print('\tPD1: \t%d' % norecpd1)

    return df

def append_rec(df_rec, df_tumor):
    """
    Append radiomics, RT, survival, and recurrence data together
    Args:
        df_rec (pandas dataframe): recurrence
        df_tumor (pandas dataframe): radiomics

    Returns:
        appended pandas dataframe
    """

    # Get the list of animals
    animals = df_rec['animalID'].tolist()
    keys = df_rec.keys().tolist()
    keys.remove('animalID')
    keys.remove('group')

    # Extend the output dataframe
    ltumor = len(df_tumor['ID'])
    df_out = df_tumor.copy()
    for key in keys:
        df_out[key] = pd.Series(np.zeros(ltumor), index=df_out.index)

    for animal in animals:

        rec = df_rec[df_rec['animalID'] == animal]

        for key in keys:
            df_out.loc[df_out['ID'] == animal, key] = rec[key].tolist()

    return df_out

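# Sketch of how the loading helpers combine (all inputs hypothetical):
#     sum_df, log = load_study_logs('/data/summary.xlsx', '')
#     rec_df = load_recurrence_log('/data/recurrence_log.xlsx', threshold=30)
#     study_df = sort_study_data(rec_df, sum_df, exclude=['K500'])
#     df_all = append_rec(study_df, df_tumor)  # adds 'recurrence'/'rec_days' by 'ID'
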
def examine_recurrence(df_tumor, spath, group='Pre'):
    """
    Run classifiers to predict recurrence
    Args:
        df_tumor (pandas dataframe): radiomic data
        spath (str): output path
        group (str): pre/post RT

    Returns:

    """

    # Ensure spath exists
    if not os.path.exists(spath):
        os.makedirs(spath)

    # Separate control and PD1
    df_cnt = df_tumor[df_tumor['Group'] == group + 'Cnt']
    df_pd1 = df_tumor[df_tumor['Group'] == group + 'PD1']

    # Separate by larger pre/post
    df_tumor = df_tumor.loc[(df_tumor['Group'] == group + 'Cnt') | (df_tumor['Group'] == group + 'PD1'), :]

    # Get numeric data
    keys = df_tumor.keys().tolist()
    rm_keys = ['ID', 'Group', 'rec_days']
    for i in rm_keys:
        keys.remove(i)
    lab_keys = 'recurrence'
    data_keys = keys
    data_keys.remove(lab_keys)

    # Scale measurements
    rad = df_tumor[data_keys].copy()
    # rad = scale_features(rad)
    label = df_tumor[lab_keys]
    ids = df_tumor['ID']
    # bar_plot(rad_data)

    # tmp = rad.loc[label, 'original_firstorder_10Percentile_T1']
    # rad.loc[label, 'original_firstorder_10Percentile_T1'] = tmp.copy()

    # Plot metrics
    plot_radiomics(rad, label, ids, spath)

    # SVM classification
    svm_classifier_loc(scale_features(rad), label, spath)
    # tspath = os.path.join(spath, 'cnt')
    # svm_classifier_loc(scale_features(df_cnt[data_keys]), df_cnt[lab_keys], tspath)
    # tspath = os.path.join(spath, 'pd1')
    # svm_classifier_loc(scale_features(df_pd1[data_keys]), df_pd1[lab_keys], tspath)

    # NN classification
    nn_classifier_loc(scale_features(rad), label, spath)
    # tspath = os.path.join(spath, 'cnt')
    # nn_classifier_loc(scale_features(df_cnt[data_keys]), df_cnt[lab_keys], tspath)
    # tspath = os.path.join(spath, 'pd1')
    # nn_classifier_loc(scale_features(df_pd1[data_keys]), df_pd1[lab_keys], tspath)

    # sname = os.path.join(spath, 'full.csv')
    # pd.concat((scale_features(rad), label), axis=1).to_csv(sname, index=False)
    # sname = os.path.join(spath, 'cnt.csv')
    # pd.concat((scale_features(df_cnt[data_keys]), df_cnt[lab_keys]), axis=1).to_csv(sname, index=False)
    # sname = os.path.join(spath, 'pd1.csv')
    # pd.concat((scale_features(df_pd1[data_keys]), df_pd1[lab_keys]), axis=1).to_csv(sname, index=False)

def svm_classifier_loc(rad, label, spath):
    """
    Attempts at SVM classification using the full dataset and PCA-based reduced datasets.
    Args:
        rad (pandas dataframe): radiomic features
        label (pandas dataframe): classification label
        spath (str): output directory

    Returns:

    """

    sns.set(font_scale=1.1)

    # Ensure spath exists
    if not os.path.exists(spath):
        os.makedirs(spath)

    # Redirect stdout to a log file
    old_stdout = sys.stdout
    log_file_path = os.path.join(spath, 'SVM_results.txt')
    log_file = open(log_file_path, 'w')
    sys.stdout = log_file

    print('Training samples: %d, test samples: %d' % (int(rad.shape[0]*0.75), int(rad.shape[0]*0.25)))
    print('Number of features %d' % rad.shape[1])

    # Get index
    inds_1 = label == True
    inds_2 = label == False

    # Create label vector
    label = label.to_numpy().astype(np.float64)
    rad = rad.to_numpy().astype(np.float64)

    # Plot euclidean distances - multidimensional scaling
    fig = plot_euclidean_distances(rad, inds_1, inds_2)
    fig.savefig(os.path.join(spath, 'SVM_EuclDist_SVM.png'), dpi=300)
    plt.close(fig)

    # SVM without reduction
    print('Trying SVM classifier without dimensionality reduction:')
    print('-' * 50)
    # fig, clf = svm_classifier(rad.get_values(), inds_1, inds_2)
    fig = svm_cross(rad, label)
    fig.savefig(os.path.join(spath, 'SVM_NoPCA_roc.svg'))
    fig.savefig(os.path.join(spath, 'SVM_NoPCA_roc.png'), dpi=200)
    plt.close(fig)

    # NOTE: svm_cross() returns only the figure, so there is no fitted
    # classifier object to pickle here; saving stays disabled until the
    # svm_classifier() call above is restored
    # with open(os.path.join(spath, 'NoReduction_SVM.model'), 'wb') as f:
    #     pickle.dump(clf, f)

    # PCA analysis
    # a look at 2 components and SVM
    print('\n')
    print('Trying SVM classifier with 2 principal components:')
    print('-' * 50)
    rad_pca = pca_reduction(rad, npcomps=2)
    fig = svm_cross(rad_pca, label)
    fig.savefig(os.path.join(spath, 'SVM_PCA.svg'))
    plt.close(fig)
    fig = svm_cross(rad_pca, label)
    fig.savefig(os.path.join(spath, 'SVM_PCA2_roc.svg'))
    fig.savefig(os.path.join(spath, 'SVM_PCA2_roc.png'), dpi=300)
    plt.close(fig)
    # with open(os.path.join(spath, 'PCA2_SVM.model'), 'wb') as f:
    #     pickle.dump(clf, f)

    # a look at variance
    print('\n')
    npcomps, fig = pca_analyis(rad)
    print('Trying SVM classifier with %d principal components found from data variance:' % npcomps)
    print('-' * 50)
    rad_pca = pca_reduction(rad, npcomps)
    fig.savefig(os.path.join(spath, 'Cumulative_var.svg'))
    # fig.savefig(os.path.join(spath, 'Cumulative_var.png'), dpi=300)
    plt.close(fig)

    fig = svm_cross(rad_pca, label)
    # fig.savefig(os.path.join(spath, 'SVM_PCA_roc_var.svg'))
    # fig.savefig(os.path.join(spath, 'SVM_PCA_roc_var.png'), dpi=300)
    plt.close(fig)
    # with open(os.path.join(spath, 'SVM_PCA95.model'), 'wb') as f:
    #     pickle.dump(clf, f)

    # Restore stdout and close the log file
    sys.stdout = old_stdout
    log_file.close()

def nn_classifier_loc(rad, label, spath):
    """
    Attempts at NN classification using the full dataset and PCA-based reduced datasets.
    Args:
        rad (pandas dataframe): radiomic features
        label (pandas dataframe): classification label
        spath (str): output directory

    Returns:

    """

    sns.set(font_scale=1.1)

    # Ensure spath exists
    if not os.path.exists(spath):
        os.makedirs(spath)

    # Redirect stdout to a log file
    old_stdout = sys.stdout
    log_file_path = os.path.join(spath, 'NN_results.txt')
    log_file = open(log_file_path, 'w')
    sys.stdout = log_file

    print('Training samples: %d, test samples: %d' % (int(rad.shape[0]*0.75), int(rad.shape[0]*0.25)))
    print('Number of features %d' % rad.shape[1])

    # Get index
    inds_1 = label == True
    inds_2 = label == False

    # Create label vector
    label = label.to_numpy().astype(np.float64)
    rad = rad.to_numpy().astype(np.float64)

    # NN without reduction
    print('Trying NN classifier without dimensionality reduction:')
    print('-' * 50)
    # fig, clf = svm_classifier(rad.get_values(), inds_1, inds_2)
    fig = neural_network_cross(rad, label)
    fig.savefig(os.path.join(spath, 'NN_NoPCA_roc.svg'))
    fig.savefig(os.path.join(spath, 'NN_NoPCA_roc.png'), dpi=200)
    plt.close(fig)

    # PCA analysis
    # a look at 2 components and NN
    print('\n')
    print('Trying NN classifier with 2 principal components:')
    print('-' * 50)
    rad_pca = pca_reduction(rad, npcomps=2)
    # fig, clf = svm_classifier(rad_pca, inds_1, inds_2)
    # fig.savefig(os.path.join(spath, 'PCA_NN.png'), dpi=300)
    # plt.close(fig)
    fig = neural_network_cross(rad_pca, label)
    # fig.savefig(os.path.join(spath, 'NN_PCA2_roc.svg'))
    # fig.savefig(os.path.join(spath, 'NN_PCA2_roc.png'), dpi=300)
    plt.close(fig)
    # with open(os.path.join(spath, 'PCA2_SVM.model'), 'wb') as f:
    #     pickle.dump(clf, f)

    # a look at variance
    print('\n')
    npcomps, fig = pca_analyis(rad)
    print('Trying NN classifier with %d principal components found from data variance:' % npcomps)
    print('-' * 50)
    rad_pca = pca_reduction(rad, npcomps)
    # fig.savefig(os.path.join(spath, 'Cumulative_var.png'), dpi=300)
    plt.close(fig)

    # fig, clf = svm_classifier(rad_pca, inds_1, inds_2)
    # fig = neural_network_cross(rad_pca, label)
    # fig.savefig(os.path.join(spath, 'NN_PCA_var_roc.svg'))
    # fig.savefig(os.path.join(spath, 'NN_PCA_var_roc.png'), dpi=300)
    # plt.close(fig)
    # with open(os.path.join(spath, 'PCA95_NN.model'), 'wb') as f:
    #     pickle.dump(clf, f)

    # Restore stdout and close the log file
    sys.stdout = old_stdout
    log_file.close()

def mRMR(df_tumor, base_path, group='Post', area='tumor', mrmr_file=None, num_features=10, htmaps=False):
    """
    Compute mRMR feature selection. This function has been bundled with code which computes correlation
    maps and later calls training on classifiers for tumor recurrence.
    Args:
        df_tumor (pandas dataframe): radiomic features with labels
        base_path (str): where to save mRMR output
        group (str): pre/post RT
        area (str): masked location
        mrmr_file (str): file of mRMR features to load or create
        num_features (int): number of features to use
        htmaps (bool): whether to create and save all heatmaps

    Returns:

    """

    print('\n')
    print('-' * 80)
    print('Processing %s, %s' % (group, area))
    print('Number of features: %d' % num_features)

    dfn = df_tumor.loc[(df_tumor['Group'] == group + 'Cnt') | (df_tumor['Group'] == group + 'PD1'), :]
    dfn = dfn.reset_index()

    keys = df_tumor.keys().tolist()
    rm_keys = ['ID', 'Group', 'rec_days']
    for i in rm_keys:
        keys.remove(i)
    lab_keys = 'recurrence'
    data_keys = keys
    data_keys.remove(lab_keys)

    # # Separate by larger pre/post
    # df_tumor = df_tumor[(df_tumor['Group'] == group + 'Cnt') | (df_tumor['Group'] == group + 'PD1')]

    # Create class in the first column
    category = dfn['recurrence']

    # Remove columns
    dfn = dfn[data_keys].copy()

    dfn['Class'] = category.astype(int)

    # Rearrange columns so class is first
    cols = dfn.columns.tolist()
    cols = [cols[-1]] + cols[:-1]
    dfn = dfn[cols]

    # Drop outliers (hard-coded value observed in this dataset)
    dfn = dfn.loc[dfn['original_gldm_GrayLevelNonUniformity_T1'] != 5607, :]

    # Write as csv
    sname = 'tumor_%s_%s.csv' % (group, area)
    dfn.to_csv(sname, index=False, index_label=False)

    # Generate savepath
    if mrmr_file:
        if 'post' in mrmr_file.lower():
            mrmr_time = 'Post'
        elif 'pre' in mrmr_file.lower():
            mrmr_time = 'Pre'
        else:
            mrmr_time = ''  # The filename encodes neither pre nor post
        if 'tumor' in mrmr_file.lower():
            mode = 'tumor'
        elif 'bed' in mrmr_file.lower():
            mode = 'bed'
        else:
            mode = 'edge'
        spath = os.path.join(base_path, 'Analysis', 'mrmr_%s_%s_feat_%s_%s' % (group, area, mrmr_time, mode))
    else:
        spath = os.path.join(base_path, 'Analysis', 'mrmr_%s_%s' % (group, area))

    print('\tSaving to: ', spath)
    if not os.path.exists(spath):
        os.makedirs(spath)

    # Run mRMR
    if not mrmr_file:
        print('\tCreating a new mRMR file')

        log_file_path = os.path.join(base_path, 'Analysis', 'mRMR_results_%s_%s.txt' % (group, area))
        log_file = open(log_file_path, 'w')
        call([PATH_TO_MRMR, '-i', sname, '-t', '1', '-n', '200'],
             stdout=log_file)
        log_file.close()

        # Select out predictive features
        mrmr_file = os.path.join(base_path, 'Analysis', 'mRMR_results_%s_%s.txt' % (group, area))
    else:
        mrmr_file = os.path.join(base_path, 'Analysis', mrmr_file)
        print('\tUsing the specified mRMR file:\n\t\t%s' % mrmr_file)

    # Read mRMR features (the header/skipfooter offsets are tied to the fixed layout of the mRMR text output)
    # res = pd.read_csv(mrmr_file, delimiter='\t', header=205, skipfooter=9, skipinitialspace=True, engine='python')
    res = pd.read_csv(mrmr_file, delimiter='\t', header=3, skipfooter=212, skipinitialspace=True, engine='python')
    cols = res.keys().tolist()
    cols = [i.strip() for i in cols]
    res.columns = cols

    features = res['Name'].tolist()
    features = [i.strip() for i in features]

    # Make heatmaps of control and pd1
    if htmaps:
        df_cnt = df_tumor.loc[df_tumor['Group'] == group + 'Cnt', :].copy()
        df_pd1 = df_tumor.loc[df_tumor['Group'] == group + 'PD1', :].copy()

        # Create class in the first column
        category_cnt = df_cnt['recurrence'].astype(int)
        category_pd1 = df_pd1['recurrence'].astype(int)

        # Remove columns
        df_cnt = df_cnt[features]
        df_pd1 = df_pd1[features]

        # Convert columns to float
        for key in features:
            df_cnt.loc[:, key] = pd.to_numeric(df_cnt.loc[:, key], errors='ignore')
            df_pd1.loc[:, key] = pd.to_numeric(df_pd1.loc[:, key], errors='ignore')

        # Rename to remove "original_"
        for key in df_cnt.keys():
            nkey = key.replace('original_', '')
            df_cnt = df_cnt.rename(columns={key: nkey})
            df_pd1 = df_pd1.rename(columns={key: nkey})

        # Set up colors for the animal classification
        lut = dict(zip(dfn['Class'].unique(), 'bg'))
        row_colors_cnt = category_cnt.map(lut)
        row_colors_pd1 = category_pd1.map(lut)
        row_colors_cnt.name = 'Recurrence'
        row_colors_pd1.name = 'Recurrence'

        # sns.set(font_scale=0.8)
        # g = sns.clustermap(df_cnt, figsize=(18, 18), standard_scale=1, metric='correlation', row_colors=row_colors_cnt)
        # g.savefig(os.path.join(spath, 'heatmap_animals_cnt.png'), dpi=300)
        # plt.close()
        #
        # sns.set(font_scale=0.8)
        # g = sns.clustermap(df_pd1, figsize=(18, 18), standard_scale=1, metric='correlation', row_colors=row_colors_pd1)
        # g.savefig(os.path.join(spath, 'heatmap_animals_pd1.png'), dpi=300)
        # plt.close()

        # Remove class category
        # df_cnt = df_cnt.drop('Class', axis=1)
        # df_pd1 = df_pd1.drop('Class', axis=1)

        # Compute correlation matrix
        df_cnt = df_cnt.corr()
        df_pd1 = df_pd1.corr()

        # Sort correlation matrix
        order_cnt = df_cnt.sum(axis=0).argsort()[::-1]
        order_pd1 = df_pd1.sum(axis=0).argsort()[::-1]

        # Reorder rows and columns (the control ordering is applied to both maps so they compare directly)
        df_cnt = df_cnt.iloc[order_cnt, order_cnt]
        df_pd1 = df_pd1.iloc[order_cnt, order_cnt]

        # Plot correlation - control
        sns.set(font_scale=0.8)
        f, ax = plt.subplots(figsize=(18, 10))
        sns.heatmap(df_cnt, vmax=1.0, square=True, ax=ax)
        plt.yticks(rotation=0)
        ax.get_xaxis().set_ticks([])
        plt.xticks(rotation=90)
        # f.savefig(os.path.join(spath, 'heatmap_control_corr.svg'))
        f.savefig(os.path.join(spath, 'heatmap_control_corr.png'), dpi=300)
        plt.close()

        # Plot correlation - PD1
        sns.set(font_scale=0.8)
        f, ax = plt.subplots(figsize=(18, 10))
        sns.heatmap(df_pd1, vmax=1.0, square=True, ax=ax)
        plt.yticks(rotation=0)
        ax.get_xaxis().set_ticks([])
        plt.xticks(rotation=90)
        # f.savefig(os.path.join(spath, 'heatmap_pd1_corr.svg'))
        f.savefig(os.path.join(spath, 'heatmap_pd1_corr.png'), dpi=300)
        plt.close()

        # Plot correlation - difference
        sns.set(font_scale=0.8)
        f, ax = plt.subplots(figsize=(18, 10))
        sns.heatmap(np.abs(df_cnt - df_pd1), vmax=1.0, square=True, ax=ax)
        plt.yticks(rotation=0)
        ax.get_xaxis().set_ticks([])
        plt.xticks(rotation=90)
        # f.savefig(os.path.join(spath, 'heatmap_cnt_pd1_diff_corr.svg'))
        f.savefig(os.path.join(spath, 'heatmap_cnt_pd1_diff_corr.png'), dpi=300)
        plt.close()

    if htmaps:
        # Make heatmap dataframe
        htmp1 = dfn[features].copy()

        # Convert columns to float (from object)
        for key in htmp1.keys():
            htmp1.loc[:, key] = pd.to_numeric(htmp1.loc[:, key], errors='ignore')

        # Rename to remove "original_"
        for key in htmp1.keys():
            nkey = key.replace('original_', '')
            htmp1 = htmp1.rename(columns={key: nkey})

        # Set up colors for the animal classification
        lut = dict(zip(dfn['Class'].unique(), 'bg'))
        row_colors = dfn['Class'].map(lut)
        row_colors.name = 'Recurrence'

        # sns.set(font_scale=0.8)
        # g = sns.clustermap(htmp1, figsize=(18, 18), standard_scale=1, metric='correlation', row_colors=row_colors)
        # g.savefig(os.path.join(spath, 'heatmap_animals.svg'))
        # # g.savefig(os.path.join(spath, 'heatmap_animals.png'), dpi=300)
        # plt.close()

        # Compute correlation matrix - recurrence and no recurrence
        htmp_nrec = dfn.loc[dfn['Class'] == 0, features]
        htmp_rec = dfn.loc[dfn['Class'] == 1, features]

        # Convert columns to float
        for key in features:
            htmp_nrec.loc[:, key] = pd.to_numeric(htmp_nrec.loc[:, key], errors='ignore')
            htmp_rec.loc[:, key] = pd.to_numeric(htmp_rec.loc[:, key], errors='ignore')

        # Rename to remove "original_"
        for key in htmp_nrec.keys():
            nkey = key.replace('original_', '')
            htmp_nrec = htmp_nrec.rename(columns={key: nkey})
            htmp_rec = htmp_rec.rename(columns={key: nkey})

        # Compute correlation matrix
        htmp_nrec = htmp_nrec.corr()
        htmp_rec = htmp_rec.corr()

        # Sort correlation matrix
        order_nrec = htmp_nrec.sum(axis=0).argsort()[::-1]
        order_rec = htmp_rec.sum(axis=0).argsort()[::-1]

        # Reorder rows and columns (the no-recurrence ordering is applied to both maps)
        htmp_nrec = htmp_nrec.iloc[order_nrec, order_nrec]
        htmp_rec = htmp_rec.iloc[order_nrec, order_nrec]

        # Plot correlation - no recurrence
        sns.set(font_scale=0.8)
        f, ax = plt.subplots(figsize=(18, 10))
        sns.heatmap(htmp_nrec, vmax=1.0, square=True, ax=ax)
        plt.yticks(rotation=0)
        ax.get_xaxis().set_ticks([])
        plt.xticks(rotation=90)
        f.savefig(os.path.join(spath, 'heatmap_noRecur_corr.svg'))
        # f.savefig(os.path.join(spath, 'heatmap_noRecur_corr.png'), dpi=300)
        plt.close()

        # Plot correlation - recurrence
        sns.set(font_scale=0.8)
        f, ax = plt.subplots(figsize=(18, 10))
        sns.heatmap(htmp_rec, vmax=1.0, square=True, ax=ax)
        plt.yticks(rotation=0)
        ax.get_xaxis().set_ticks([])
        plt.xticks(rotation=90)
        f.savefig(os.path.join(spath, 'heatmap_Recur_corr.svg'))
        # f.savefig(os.path.join(spath, 'heatmap_Recur_corr.png'), dpi=300)
        plt.close()

        # Plot correlation - difference
        sns.set(font_scale=0.8)
        f, ax = plt.subplots(figsize=(18, 10))
        sns.heatmap(np.abs(htmp_nrec - htmp_rec), vmax=1.0, square=True, ax=ax)
        plt.yticks(rotation=0)
        ax.get_xaxis().set_ticks([])
        plt.xticks(rotation=90)
        f.savefig(os.path.join(spath, 'heatmap_Recur_diff_corr.svg'))
        # f.savefig(os.path.join(spath, 'heatmap_Recur_diff_corr.png'), dpi=300)
        plt.close()

    # Use only the top num_features features
    features = features[:num_features]

    # Make heatmap dataframe - top features
    htmp1 = dfn[features].copy()

    # Convert columns to float (from object)
    for key in htmp1.keys():
        htmp1.loc[:, key] = pd.to_numeric(htmp1.loc[:, key], errors='ignore')

    # Rename to remove "original_"
    for key in htmp1.keys():
        nkey = key.replace('original_', '')
        htmp1 = htmp1.rename(columns={key: nkey})

    # Set up colors for the animal classification
    lut = dict(zip(dfn['Class'].unique(), 'bg'))
    row_colors = dfn['Class'].map(lut)
    row_colors.name = 'Recurrence'

    # sns.set(font_scale=0.8)
    # g = sns.clustermap(htmp1, figsize=(18, 18), standard_scale=1, metric='correlation', row_colors=row_colors)
    # g.savefig(os.path.join(spath, 'heatmap_animals.svg'))
    # # g.savefig(os.path.join(spath, 'heatmap_animals.png'), dpi=300)
    # plt.close()

    # Compute correlation matrix - recurrence and no recurrence
    htmp_nrec = dfn.loc[dfn['Class'] == 0, features]
    htmp_rec = dfn.loc[dfn['Class'] == 1, features]

    # Convert columns to float
    for key in features:
        htmp_nrec.loc[:, key] = pd.to_numeric(htmp_nrec.loc[:, key], errors='ignore')
        htmp_rec.loc[:, key] = pd.to_numeric(htmp_rec.loc[:, key], errors='ignore')

    # Rename to remove "original_"
    for key in htmp_nrec.keys():
        nkey = key.replace('original_', '')
        htmp_nrec = htmp_nrec.rename(columns={key: nkey})
        htmp_rec = htmp_rec.rename(columns={key: nkey})

    # Compute correlation matrix
    htmp_nrec = htmp_nrec.corr()
    htmp_rec = htmp_rec.corr()

    # Sort correlation matrix
    order_nrec = htmp_nrec.sum(axis=0).argsort()[::-1]
    order_rec = htmp_rec.sum(axis=0).argsort()[::-1]

    # Reorder rows and columns (the no-recurrence ordering is applied to both maps)
    htmp_nrec = htmp_nrec.iloc[order_nrec, order_nrec]
    htmp_rec = htmp_rec.iloc[order_nrec, order_nrec]

    # Plot correlation - no recurrence
    sns.set(font_scale=0.8)
    f, ax = plt.subplots(figsize=(18, 10))
    sns.heatmap(htmp_nrec, vmax=1.0, square=True, ax=ax)
    plt.yticks(rotation=0)
    ax.get_xaxis().set_ticks([])
    plt.xticks(rotation=90)
    f.savefig(os.path.join(spath, 'heatmap_noRecur_corr_10.svg'))
    # f.savefig(os.path.join(spath, 'heatmap_noRecur_corr_10.png'), dpi=300)
    plt.close()

    # Plot correlation - recurrence
    sns.set(font_scale=0.8)
    f, ax = plt.subplots(figsize=(18, 10))
    sns.heatmap(htmp_rec, vmax=1.0, square=True, ax=ax)
    plt.yticks(rotation=0)
    ax.get_xaxis().set_ticks([])
    plt.xticks(rotation=90)
    f.savefig(os.path.join(spath, 'heatmap_Recur_corr_10.svg'))
    # f.savefig(os.path.join(spath, 'heatmap_Recur_corr_10.png'), dpi=300)
    plt.close()

    # Plot correlation - difference
    sns.set(font_scale=0.8)
    f, ax = plt.subplots(figsize=(18, 10))
    sns.heatmap(np.abs(htmp_nrec - htmp_rec), vmax=1.0, square=True, ax=ax)
    plt.yticks(rotation=0)
    ax.get_xaxis().set_ticks([])
    plt.xticks(rotation=90)
    f.savefig(os.path.join(spath, 'heatmap_Recur_diff_corr_10.svg'))
    # f.savefig(os.path.join(spath, 'heatmap_Recur_diff_corr_10.png'), dpi=300)
    plt.close()

    # Create a DataFrame with mRMR features
    df_sub = df_tumor[features + rm_keys + [lab_keys]].copy()

    # Perform computations
    examine_recurrence(df_sub, spath, group=group)

    return spath

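# Minimal usage sketch for mRMR (hypothetical arguments): select the top 10
# post-RT tumor features and train the recurrence classifiers:
#     spath = mRMR(df_all, '/home/matt/Documents/SegSarcoma', group='Post',
#                  area='tumor', num_features=10, htmaps=False)
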
def plot_radiomics(rad, label, ids, spath):
    """
    Plot radiomic features used for recurrence classification.
    Args:
        rad (pandas dataframe): radiomic features
        label (pandas dataframe): recurrence labels
        ids (list): animal IDs
        spath (str): savepath

    Returns:

    """

    sns.set_style('whitegrid')

    # Reduce the number of features to plot to six
    keys = rad.keys()
    rad = rad[keys[:6]]

    # Set up figure
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_axes([0.1, 0.5, 0.80, 0.47])

    # Get the list of radiomic features
    keys = rad.keys()

    # Create a dictionary (later a Dataframe) for plotting with Seaborn
    df = {'Features': [], 'Value': [], 'Recurrence': [], 'ID': []}

    # Populate dictionary, df, with features and values
    for key in keys:

        # Get feature values
        tmp = rad[key].to_numpy()

        # Normalize between -1 and 1
        tmp -= tmp.min()
        tmp /= 0.5 * tmp.max()
        tmp -= 1

        # Build the display name once per feature (same logic as clean_key below)
        nkey = clean_key(key)

        for i in range(len(tmp)):

            # Append features, values, recurrence, and IDs to the dictionary
            df['Features'].append(nkey)
            df['Value'].append(tmp[i])

            if label.iloc[i]:
                df['Recurrence'].append('Yes')
            else:
                df['Recurrence'].append('No')

            df['ID'].append(ids.iloc[i])

    # Convert dictionary to Dataframe
    df = pd.DataFrame.from_dict(df)

    # Plot
    g = sns.boxplot(x='Features', y='Value', hue='Recurrence', data=df, ax=ax, palette="Set1")

    # Rotate feature names
    g.set_xticklabels(g.get_xticklabels(), rotation=60, fontsize=10)
    # g.set_yticklabels(g.get_yticklabels(), fontsize=10)
    g.set_xlabel('Features', fontsize=10)
    g.set_ylabel('Normalized Value')

    # Fix legend
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., title='Recurrence')

    # Save figure
    fig.savefig(os.path.join(spath, 'feature_box.svg'), format='svg')
    # fig.savefig(os.path.join(spath, 'feature_box.png'), format='png', dpi=300)

def clean_key(key):
    """
    Convert a raw feature name (e.g. 'original_glcm_Contrast_T1') into a
    multi-line display label (e.g. 'T1\nGLCM\nContrast').
    Args:
        key (str): raw radiomic feature name

    Returns:
        (str): display label
    """

    # Remove "original" from the plots
    nkey = key.replace('original_', '')

    # Capitalize method
    ind1 = nkey.find('_')
    method = nkey[:ind1]
    if method == 'firstorder':
        method = 'First Order'
    elif method == 'shape':
        method = method.capitalize()
    else:
        method = method.upper()

    # Get modality
    ind2 = nkey.rfind('_')
    modality = nkey[ind2 + 1:]
    if method != 'Shape':
        nkey = '%s\n%s\n%s' % (modality, method, nkey[ind1 + 1:ind2])
    else:
        nkey = '%s\n%s' % (method, nkey[ind1 + 1:ind2])

    return nkey

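# Examples of the labels clean_key produces:
#     clean_key('original_firstorder_Energy_T2')  -> 'T2\nFirst Order\nEnergy'
#     clean_key('original_shape_VoxelVolume_T1')  -> 'Shape\nVoxelVolume'
# (shape features omit the modality line)
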
def paired_ttests_loc(df, base_path, area):
    """
    Compute paired t-tests for data
    Args:
        df (pandas dataframe): full radiomics data (with label columns)
        base_path (str): output base directory
        area (str): mask area

    Returns:

    """
    sns.set_style('whitegrid')

    # Set up spath
    spath = os.path.join(base_path, 'Analysis', 'ttests')
    if not os.path.exists(spath):
        os.makedirs(spath)

    # Set up output
    log_file_path = os.path.join(spath, 'ttest_%s.txt' % area)
    f = open(log_file_path, 'w')

    # Get indices of pre/post RT
    inds_1 = df['Group'].str.contains('Pre')
    inds_2 = df['Group'].str.contains('Post')
    keys = df.keys()
    n_sets = len(df['ID'].unique())
    label = np.empty(shape=len(inds_1), dtype=str)
    non_data_labels = ['Group', 'recurrence', 'rec_days', 'ID']

    # Create a dictionary (later a Dataframe) for evaluation and plotting with Seaborn
    dfn = {'Features': [], 'P-Value': [], 'OrigFeatures': [], 'Contrast': []}

    # kkeys = [key for key in keys if key not in non_data_labels]
    # df_k = df[kkeys]
    # ob = MultiComparison(df_k, inds_1)
    # ob.allpairtest(stats.ttest_rel)

    # Populate dictionary, dfn, with features and p-values
    for key in keys:

        if key not in non_data_labels:

            # Get feature values
            tmp = df[key].to_numpy()

            p1 = tmp[inds_1]
            p2 = tmp[inds_2]

            results = stats.ttest_rel(p1, p2)

            if not np.isnan(results.pvalue):

                # Build the display name (strips 'original_', capitalizes the method)
                nkey = clean_key(key)

                # Add contrast
                if '_T1C' in key:
                    con = 'T1C'

                elif 'T1' in key:
                    con = 'T1'

                elif 'T2' in key:
                    con = 'T2'

                else:
                    con = None

                if nkey not in dfn['Features']:
                    # Append features, p-values, and contrasts to the dictionary
                    dfn['Features'].append(nkey)
                    dfn['OrigFeatures'].append(key)
                    dfn['P-Value'].append(results.pvalue)
                    dfn['Contrast'].append(con)

    # Convert dictionary to Dataframe
    dfn = pd.DataFrame.from_dict(dfn)

    f.write('%d significant features found after t-tests!\n' % np.sum(dfn['P-Value'] < 0.05))

    # Correct for multiple comparisons (multipletests defaults to Holm-Sidak)
    from statsmodels.stats.multitest import multipletests
    mod = multipletests(dfn['P-Value'], alpha=0.05)

    # Plot corrected P-values
    fig = plt.figure(figsize=(10, 6))
    plt.plot(dfn['P-Value'], 'bo', label='Original')
    plt.plot(mod[1], 'ro', label='Corrected')
    plt.legend()
    plt.xlim([0, 400])
    plt.ylabel('P-Value')
    plt.xlabel('Features')
    # plt.savefig(os.path.join(spath, 'compare_corrected_pvalues.png'), dpi=300)
    plt.savefig(os.path.join(spath, 'compare_corrected_pvalues.svg'), dpi=300)
    plt.close(fig)

    # Keep only the features that remain significant after correction
    dfn['P-Value'] = mod[1]
    dfn = dfn[mod[0]]
    dfn = dfn.sort_values('P-Value', ascending=True)  # Sort by P-Value, ascending
    dfn = dfn.reset_index(drop=True)
    f.write('%d significant features found after multiple p-value correction!\n' % len(dfn))

    # p_thresh = 0.05
    # dfn = dfn.drop(dfn[dfn['P-Value'] > p_thresh].index)
    # f.write('%d significant features found!\n' % len(dfn))
    # dfn = dfn.reset_index(drop=True)

    # Get stats on which contrasts are most significant
    t1 = 0
    t1c = 0
    t2 = 0
    shape = 0
    feature_cats = {'firstorder': 0,
                    'glcm': 0,
                    'glszm': 0,
                    'glrlm': 0,
                    'ngtdm': 0,
                    'gldm': 0}

    for key in dfn['Features']:

        if 'T1C' in key:
            t1c += 1

        elif 'T1' in key:
            t1 += 1

        elif 'T2' in key:
            t2 += 1

        elif 'Shape' in key:
            shape += 1

        if 'first order' in key.lower():
            feature_cats['firstorder'] += 1
        elif 'glcm' in key.lower():
            feature_cats['glcm'] += 1
        elif 'glszm' in key.lower():
            feature_cats['glszm'] += 1
        elif 'glrlm' in key.lower():
            feature_cats['glrlm'] += 1
        elif 'ngtdm' in key.lower():
            feature_cats['ngtdm'] += 1
        elif 'gldm' in key.lower():
            feature_cats['gldm'] += 1

    counts_df = pd.DataFrame({'Contrast': ['Shape', 'T1', 'T1C', 'T2'], 'Count': [shape, t1, t1c, t2]})

    f.write('Features summary for radiomics computed from %s mask\n' % area)
    f.write('-'*40 + '\n')

    f.write('Contrast\tCount\n')
    for i, con in enumerate(counts_df['Contrast']):

        f.write('%s\t\t%d\n' % (con, counts_df['Count'].iloc[i]))

    f.write('-'*40 + '\n')
    f.write('Feature\tCount\n')
    for key in feature_cats.keys():

        f.write('%s\t\t%s\n' % (key, feature_cats[key]))

    f.write('-'*40 + '\n')
    for con, pval in zip(dfn['Features'], dfn['P-Value']):

        con = con.splitlines()
        if len(con) == 3:
            f.write('%s\n%s\n%s\nP-value: %0.5f\n\n' % (con[1], con[2], con[0], pval))
        else:
            f.write('%s\n%s\nP-Value: %0.5f\n\n' % (con[1], con[0], pval))

    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_axes([0.11, 0.15, 0.87, 0.65])

    sns.barplot(x='Contrast', y='Count', data=counts_df, ax=ax, color='midnightblue', saturation=0.5)
    plt.ylabel('Num. features (p < 0.05)')
    # plt.show()
    fig.savefig(os.path.join(spath, 'ttest_feature_count_%s.svg' % area))

    # Plot specific categories
    cats = ['firstorder', 'shape', 'GLCM']
    mk_size = 6
    cmap = sns.color_palette()

    # Set up a df for plotting all values
    df_plot = {'Features': [], 'Value': [], 'RT': []}

    for key, nkey in zip(dfn['OrigFeatures'], dfn['Features']):

        # Get feature values
        tmp = df[key].to_numpy()

        # Normalize between -1 and 1
        tmp -= tmp.min()
        tmp /= 0.5 * tmp.max()
        tmp -= 1

        for i in range(len(tmp)):

            # Append features and values to the dictionary
            df_plot['Features'].append(nkey)
            df_plot['Value'].append(tmp[i])

            if inds_1.iloc[i]:
                df_plot['RT'].append('Pre RT')
            else:
                df_plot['RT'].append('Post RT')

    df_plot = pd.DataFrame.from_dict(df_plot)

    # Set up figure
    fig = plt.figure(figsize=(20 * len(dfn['Features'])/10, 8))
    ax = fig.add_axes([0.08, 0.4, 0.9, 0.60])

    # Plot all
    g = sns.boxplot(x='Features', y='Value', hue='RT', data=df_plot, ax=ax, palette="Set1")

    # Rotate feature names
    # g.set_xticklabels(g.get_xticklabels(), rotation=35, fontsize=18)
    # ax.set_ylabel('')
    g.set_xticklabels(g.get_xticklabels(), rotation=35, fontsize=10)
    g.set_yticklabels(g.get_yticklabels(), fontsize=10)
    g.set_xlabel('Features', fontsize=10)
    ax.set_ylabel('')

    # Fix legend
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., title='RT')

    # Save figure
    fig.savefig(os.path.join(spath, 'ttest_feature_box_%s.svg' % area), format='svg')
    # fig.savefig(os.path.join(spath, 'ttest_feature_box_%s.png' % area), format='png', dpi=300)

    # Plot top features
    nkeys = 6
    keep_keys = dfn['Features'].iloc[:nkeys]

    small_df_plot = df_plot.loc[df_plot['Features'].isin(keep_keys), :]

    fs = 10
    fig = plt.figure()  # plt.figure(figsize=(10, 6))
    ax = fig.add_axes([0.08, 0.4, 0.80, 0.59])

    # Plot
    g = sns.boxplot(x='Features', y='Value', hue='RT', data=small_df_plot, ax=ax, palette="Set1")

    # Rotate feature names
    g.set_xticklabels(g.get_xticklabels(), rotation=60, fontsize=fs)
    g.set_xlabel('Features', fontsize=fs)
    g.set_ylabel('Normalized Value', fontsize=fs)

    # Fix legend (the hue here is pre/post RT)
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., title='RT')

    # Save figure
    fig.savefig(os.path.join(spath, 'ttest_feature_box_select%d_%s.svg' % (nkeys, area)), format='svg')
    # fig.savefig(os.path.join(spath, 'ttest_feature_box_select%d_%s.png' % (nkeys, area)), format='png')

    # Plot specific features
    if 'tumor' in area:
        keep_keys = ['T2\nGLSZM\nZoneVariance',
                     'T2\nGLRLM\nRunLengthNonUniformity',
                     'Shape\nVoxelVolume',
                     'T1C\nGLSZM\nLargeAreaHighGrayLevelEmphasis',
                     'T1C\nGLDM\nDependenceNonUniformity',
                     'T1\nFirstOrder\nEnergy'
                     ]
    elif 'bed' in area:
        keep_keys = ['T2\nGLDM\nDependenceNonUniformity',
                     'T2\nFirst Order\nEnergy',
                     'Shape\nVoxelVolume',
                     'T1C\nGLRLM\nRunVariance',
                     'T1\nGLSZM\nLongRunLowGrayLevelEmphasis',
                     'T1C\nFirst Order\nRange'
                     ]
    else:  # Edge
        keep_keys = ['T2\nFirst Order\nEnergy',
                     'Shape\nSurfaceArea',
                     'Shape\nMajorAxisLength',
                     'T1C\nFirst Order\nRange',
                     'Shape\nMaximum3DDiameter',
                     'T1C\nGLRLM\nGrayLevelNonUniformity'
                     ]

    small_df_plot = df_plot.loc[df_plot['Features'].isin(keep_keys), :]

    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_axes([0.08, 0.4, 0.80, 0.59])

    # Plot
    g = sns.boxplot(x='Features', y='Value', hue='RT', data=small_df_plot, ax=ax, palette="Set1")

    # Rotate feature names
    g.set_xticklabels(g.get_xticklabels(), rotation=60, fontsize=10)
    g.set_xlabel('Features', fontsize=10)
    g.set_ylabel('Normalized Value')

    # Fix legend (the hue here is pre/post RT)
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., title='RT')

    # Save figure
    fig.savefig(os.path.join(spath, 'ttest_feature_box_hand_select_%s.svg' % area), format='svg')
    # fig.savefig(os.path.join(spath, 'ttest_feature_box_hand_select_%s.png' % area), format='png')

    # Close log file
    f.close()

def change_with_RT_rec(df, spath, area='tumor'): |
|
|
1476 |
""" |
|
|
1477 |
Computes the difference between pre/post RT and generates plots showing these values. |
|
|
1478 |
Args: |
|
|
1479 |
df (pandas dataframe): radiomic features |
|
|
1480 |
spath (str): savepath |
|
|
1481 |
area (str): mask |
|
|
1482 |
|
|
|
1483 |
Returns: |
|
|
1484 |
|
|
|
1485 |
""" |
|
|
1486 |
|
|
|
1487 |
sns.set_style('whitegrid') |
|
|
1488 |
|
|
|
1489 |
feature_seletion = 'ttest' |
|
|
1490 |
|
|
|
1491 |
if 'ttest' in feature_seletion: |
|
|
1492 |
# Set up save paths |
|
|
1493 |
spath = os.path.join(spath, 'Analysis', 'ttests') |
|
|
1494 |
if not os.path.exists(spath): |
|
|
1495 |
os.makedirs(spath) |
|
|
1496 |
|
|
|
1497 |
# Compute p=values |
|
|
1498 |
# Get indicies of pre/post RT |
|
|
1499 |
inds_1 = df['Group'].str.contains('Pre') |
|
|
1500 |
inds_2 = df['Group'].str.contains('Post') |
|
|
1501 |
keys = df.keys() |
|
|
1502 |
non_data_labels = ['Group', 'recurrence', 'rec_days', 'ID'] |
|
|
1503 |
|
|
|
1504 |
# Create a dictionary (later a Dataframe) for evaluation and plotting with Seaborn |
|
|
1505 |
dfn = {'Features': [], 'P-Value': [], 'OrigFeatures': [], 'Contrast': []} |
|
|
1506 |
|
|
|
1507 |
# Populate dictionary, df, with features and values |
|
|
1508 |
for key in keys: |
|
|
1509 |
|
|
|
1510 |
if key not in non_data_labels: |
|
|
1511 |
|
|
|
1512 |
# Get feature values |
|
|
1513 |
tmp = df[key].to_numpy() |
|
|
1514 |
|
|
|
1515 |
p1 = tmp[inds_1] |
|
|
1516 |
p2 = tmp[inds_2] |
|
|
1517 |
|
|
|
1518 |
results = stats.ttest_rel(p1, p2) |
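            # Note: ttest_rel is a paired test, so it assumes p1 and p2 are
            # aligned element-wise; the pre- and post-RT rows must therefore
            # be ordered identically by animal.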

            if not np.isnan(results.pvalue):

                # Remove "original" from the plot labels
                nkey = key.replace('original_', '')

                # Capitalize the method name
                ind1 = nkey.find('_')
                method = nkey[:ind1]
                if method == 'firstorder':
                    method = 'First Order'
                elif method == 'shape':
                    method = method.capitalize()
                else:
                    method = method.upper()

                # Get the modality
                ind2 = nkey.rfind('_')
                modality = nkey[ind2 + 1:]
                if method != 'Shape':
                    nkey = '%s\n%s\n%s' % (modality, method, nkey[ind1 + 1:ind2])
                else:
                    nkey = '%s\n%s' % (method, nkey[ind1 + 1:ind2])

                # Add contrast
                if '_T1C' in key:
                    con = 'T1C'
                elif 'T1' in key:
                    con = 'T1'
                elif 'T2' in key:
                    con = 'T2'
                else:
                    con = None

                if nkey not in dfn['Features']:
                    # Append features, values, recurrence, and IDs to the dictionary
                    dfn['Features'].append(nkey)
                    dfn['OrigFeatures'].append(key)
                    dfn['P-Value'].append(results.pvalue)
                    dfn['Contrast'].append(con)

    # Convert the dictionary to a DataFrame
    dfn = pd.DataFrame.from_dict(dfn)

    # Correct for multiple comparisons
    from statsmodels.stats.multitest import multipletests
    mod = multipletests(dfn['P-Value'], alpha=0.05)

    # Keep only the features that survive the correction
    dfn['P-Value'] = mod[1]
    dfn = dfn[mod[0]]
    dfn = dfn.sort_values('P-Value', ascending=True)  # Sort by P-Value, ascending
    dfn = dfn.reset_index(drop=True)
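
    # multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf)
    # and defaults to the Holm-Sidak method. A minimal sketch of the step above:
    #   reject, p_adj, _, _ = multipletests([0.001, 0.02, 0.3], alpha=0.05)
    #   reject -> array([ True,  True, False]); p_adj holds the adjusted p-values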

    # Get a list of good features
    features = dfn['OrigFeatures'].to_list()
    n_features = len(features)

    # Set up the save path
    spath_tmp = os.path.join(spath, 'Diff_{}'.format(area))
    if not os.path.exists(spath_tmp):
        os.makedirs(spath_tmp)

    # Get all unique IDs
    uniq_ids = df['ID'].unique()

    # Features to plot
    features_to_plot = ['original_shape_SurfaceArea_T1',
                        'original_firstorder_TotalEnergy_T1C',
                        'original_gldm_DependenceEntropy_T2',
                        'original_glrlm_LongRunLowGrayLevelEmphasis_T2',
                        'original_glrlm_GrayLevelNonUniformity_T1C',
                        'original_glrlm_RunVariance_T1'
                        ]

    # Compute the difference (f_post - f_pre) / f_post
    diff_df_plot = {'Feature': [], 'Value': [], 'recurrence': [], 'RT': []}  # Plotting df
    diff_df = {'recurrence': []}  # Processing df
    for id in uniq_ids:

        # Get this animal's data
        animal = df.loc[df['ID'] == id]

        # Get recurrence
        rec = int(animal.loc[['Pre' in a for a in animal['Group']]]['recurrence'])

        # Append to the processing dataframe
        diff_df['recurrence'].append(rec)

        # Compute the pre/post difference for each plotted feature
        for key in features_to_plot:

            # Skip data labels and features that did not survive selection
            if key not in non_data_labels and key in features[:n_features]:
                # Get pre/post values
                post_val = float(animal.loc[['Post' in a for a in animal['Group']]][key])
                pre_val = float(animal.loc[['Pre' in a for a in animal['Group']]][key])

                # Compute the percent change
                val = 100 * (post_val - pre_val) / (post_val + 1e-6)
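
                # The 1e-6 guards against division by zero when the post-RT
                # value is exactly 0; val is the percent change relative to
                # the post-RT value.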

                diff_df_plot['Value'].append(pre_val)
                diff_df_plot['RT'].append('Pre')

                diff_df_plot['Value'].append(post_val)
                diff_df_plot['RT'].append('Post')

                # Clean the key for plotting
                key = clean_key(key)

                for _ in range(2):

                    diff_df_plot['Feature'].append(key)

                    # Keep track of recurrence
                    if rec == 0:
                        diff_df_plot['recurrence'].append('No')
                    else:
                        diff_df_plot['recurrence'].append('Yes')

                if key in diff_df.keys():
                    diff_df[key].append(val)
                else:
                    diff_df[key] = [val]

    diff_df_plot = pd.DataFrame.from_dict(diff_df_plot)
    # diff_df = pd.DataFrame.from_dict(diff_df)

    # Normalize the data with per-feature min-max scaling
    features_to_plot = [clean_key(i) for i in features_to_plot]
    for key in features_to_plot:

        inds = diff_df_plot['Feature'] == key
        vals = diff_df_plot.loc[inds]['Value']
        mn = vals.min()
        mx = vals.max()

        vals = (vals - mn) / (mx - mn + 1e-6)

        diff_df_plot.loc[inds, 'Value'] = vals
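
    # Min-max scaling maps each feature onto [0, 1) so the Pre and Post
    # distributions share one axis; the 1e-6 again avoids 0/0 when a feature
    # is constant. E.g., values [2, 4, 6] map to roughly [0.0, 0.5, 1.0].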

    # diff_df_plot = diff_df_plot[keep_keys]

    # Set up the figure
    # fig = plt.figure(figsize=(60, 10))
    # ax = fig.add_axes([0.05, 0.35, 0.9, 0.60])
    #
    # # Plot all
    # g = sns.boxplot(x='Feature', y='Value', hue='recurrence', data=diff_df_plot, ax=ax, palette="Set1")
    #
    # # Rotate feature names
    # g.set_xticklabels(g.get_xticklabels(), rotation=35, fontsize=10)
    # g.set_yticklabels(g.get_yticklabels(), fontsize=10)
    # g.set_xlabel('Features', fontsize=10)
    # ax.set_ylabel('Change from Pre to Post RT [%]')
    # plt.ylim([-100, 100])
    #
    # # Fix legend
    # plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., title='Recurrence')
    #
    # plt.savefig(os.path.join(spath_tmp, 'diff_all_%s.svg' % area))
    #
    # plt.close(fig)

    # Set up a two-panel figure (Pre on top, Post below, shared x-axis)
    fig = plt.figure(figsize=(9, 6))
    ax1 = plt.subplot(212)
    ax2 = plt.subplot(211, sharex=ax1)
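
    # subplot(212) is the bottom panel and subplot(211, sharex=ax1) the top;
    # sharing the x-axis keeps the feature categories aligned between panels.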

    df_tmp = diff_df_plot.loc[diff_df_plot['RT'] == 'Post']
    g1 = sns.boxplot(x='Feature', y='Value', hue='recurrence', data=df_tmp, ax=ax1, palette="Set1")
    df_tmp = diff_df_plot.loc[diff_df_plot['RT'] == 'Pre']
    g2 = sns.boxplot(x='Feature', y='Value', hue='recurrence', data=df_tmp, ax=ax2, palette="Set1")
    ax1.get_legend().remove()
    ax2.get_legend().remove()

    # Rotate the feature names and hide the duplicate labels on the top panel
    g1.set_xticklabels(ax1.get_xticklabels(), rotation=40, fontsize=10)
    g1.set_xlabel('Features', fontsize=10)
    g1.set_ylabel('Normalized Value', fontsize=10)
    g2.set_xlabel('Features', fontsize=10, visible=False)
    g2.set_ylabel('Normalized Value', fontsize=10)
    g2.set_xticklabels(g1.get_xticklabels(), rotation=35, fontsize=10, visible=False)
    ax1.set_ylim([-0.05, 1.05])
    ax2.set_ylim([-0.05, 1.05])

    # Fix legend
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., title='Recurrence')
    plt.tight_layout()

    plt.savefig(os.path.join(spath_tmp, 'RT_rec_boxplots_%s.svg' % area))

    plt.close(fig)

    # Attempt classification

    # Get numeric data
    # keys = diff_df.keys().tolist()
    # lab_keys = 'recurrence'
    # data_keys = keys
    # data_keys.remove(lab_keys)
    #
    # # Separate data and labels
    # rad = diff_df[data_keys].copy()
    # rad = scale_features(rad)
    # label = diff_df[lab_keys]
    # # ids = diff_df['ID']
    #
    # # SVM classifier
    # svm_classifier_loc(rad, label, spath_tmp)
    #
    # # NN classifier
    # nn_classifier_loc(rad, label, spath_tmp)


def load_data(base_path):
    """
    Load (or build and cache) the tumor, edge, and bed radiomics dataframes.
    Args:
        base_path (str): root results directory

    Returns:
        (tuple of pandas dataframes): df_tumor, df_edge, df_bed
    """

    spath = os.path.join(base_path, 'Analysis')

    snames = [os.path.join(spath, 'tumor_df.csv'),
              os.path.join(spath, 'edge_df.csv'),
              os.path.join(spath, 'bed_df.csv')]
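
    # Cache-or-rebuild pattern: if all three CSVs exist they are loaded
    # directly; otherwise the dataframes are rebuilt from the raw logs and
    # saved so later runs can skip the expensive loading step.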

    if not all([os.path.exists(i) for i in snames]):

        # Paths
        summary_file = os.path.join(base_path, 'Summary.xlsx')
        recurrence_file = os.path.join(base_path, 'recurrence.xlsx')
        log_file = os.path.join(base_path, 'processing_log.json')

        recurrence_threshold = 20

        # Animals to exclude from analysis until segmentations are fixed
        exclude = ['K520719', 'K520918', 'K521092', 'K521104', 'K520762']

        # Load summary data
        summary_df, log = load_study_logs(summary_file, log_file)
        df_rec = load_recurrence_log(recurrence_file, recurrence_threshold)

        # Sort data
        df_rec = sort_study_data(df_rec, summary_df, exclude)
        keep = df_rec['animalID'].tolist()

        # Load pre-RT data
        radiomics_paths = os.path.join(base_path, 'Radiomics_control_preRT.txt')
        df_pre_tumor_cnt = load_radiomics(radiomics_paths, exclude, region='tumor', keep=keep, group='PreCnt')
        df_pre_edge_cnt = load_radiomics(radiomics_paths, exclude, region='edge', keep=keep, group='PreCnt')
        df_pre_bed_cnt = load_radiomics(radiomics_paths, exclude, region='bed', keep=keep, group='PreCnt')

        radiomics_paths = os.path.join(base_path, 'Radiomics_PD1_preRT.txt')
        df_pre_tumor_pd1 = load_radiomics(radiomics_paths, exclude, region='tumor', keep=keep, group='PrePD1')
        df_pre_edge_pd1 = load_radiomics(radiomics_paths, exclude, region='edge', keep=keep, group='PrePD1')
        df_pre_bed_pd1 = load_radiomics(radiomics_paths, exclude, region='bed', keep=keep, group='PrePD1')

        # Load post-RT data
        radiomics_paths = os.path.join(base_path, 'Radiomics_control_postRT.txt')
        df_post_tumor_cnt = load_radiomics(radiomics_paths, exclude, region='tumor', keep=keep, group='PostCnt')
        df_post_edge_cnt = load_radiomics(radiomics_paths, exclude, region='edge', keep=keep, group='PostCnt')
        df_post_bed_cnt = load_radiomics(radiomics_paths, exclude, region='bed', keep=keep, group='PostCnt')

        radiomics_paths = os.path.join(base_path, 'Radiomics_PD1_postRT.txt')
        df_post_tumor_pd1 = load_radiomics(radiomics_paths, exclude, region='tumor', keep=keep, group='PostPD1')
        df_post_edge_pd1 = load_radiomics(radiomics_paths, exclude, region='edge', keep=keep, group='PostPD1')
        df_post_bed_pd1 = load_radiomics(radiomics_paths, exclude, region='bed', keep=keep, group='PostPD1')

        # Concatenate pre and post for the RT analysis
        df_tumor = pd.concat((df_pre_tumor_cnt, df_pre_tumor_pd1, df_post_tumor_cnt, df_post_tumor_pd1), ignore_index=True)
        df_edge = pd.concat((df_pre_edge_cnt, df_pre_edge_pd1, df_post_edge_cnt, df_post_edge_pd1), ignore_index=True)
        df_bed = pd.concat((df_pre_bed_cnt, df_pre_bed_pd1, df_post_bed_cnt, df_post_bed_pd1), ignore_index=True)

        # Update recurrence data to account for incomplete animal sets
        uniq = df_tumor['ID'].unique()
        uniq_rec = df_rec['animalID'].unique()
        diff = list(set(uniq_rec).difference(uniq))
        for un in diff:
            df_rec = df_rec[df_rec['animalID'] != un]

        # Check that the sets contain the same animals
        uniq = df_tumor['ID'].unique()
        uniq_rec = df_rec['animalID'].unique()
        diff = list(set(uniq_rec).difference(uniq))
        print('Len of recurrence data: %d' % len(uniq_rec))
        print('Len of image data: %d' % len(uniq))
        print('Difference: ', diff)

        # Combine recurrence and radiomics dataframes
        df_tumor = append_rec(df_rec, df_tumor)
        df_edge = append_rec(df_rec, df_edge)
        df_bed = append_rec(df_rec, df_bed)

        # Save dataframes
        df_tumor.to_csv(snames[0])
        df_edge.to_csv(snames[1])
        df_bed.to_csv(snames[2])

    else:

        df_tumor = pd.read_csv(snames[0], index_col=0)
        df_edge = pd.read_csv(snames[1], index_col=0)
        df_bed = pd.read_csv(snames[2], index_col=0)

    return df_tumor, df_edge, df_bed


class PPT:
    def __init__(self, ppt_file):

        self.ppt_file = ppt_file
        self.title_slide = 0
        self.subtitle_slide = 2
        self.title_and_content = 5
        if not os.path.exists(ppt_file):
            prs = pptx.Presentation()
        else:
            prs = pptx.Presentation(ppt_file)

        title_slide_layout = prs.slide_layouts[self.title_slide]
        slide = prs.slides.add_slide(title_slide_layout)

        title = slide.shapes.title
        title.text = 'Radiomic classifications'

        subtitle = slide.placeholders[1]
        subtitle.text = 'Created %s' % datetime.strftime(datetime.now(), '%B %d, %Y')

        self.prs = prs

        # Temporary image file used when composing slides
        self.tmp_file = 'tmp_class.png'

    def add_slides(self, folders, nets=['NN', 'SVM'], im_files=['NN_NoPCA_roc.png', 'SVM_NoPCA_roc.png'], num_features=None):
        """
        Adds a slide with a classifier/region montage of ROC images.
        Args:
            folders (list of str): paths to 1. tumor, 2. edge, and 3. bed results
            nets (list of str): classifier names, one per row of the montage
            im_files (list of str): ROC image file name for each classifier
            num_features (int): number of mRMR features, appended to the slide title

        Returns:

        """

        # Get general information
        # Grab the folder name for getting group information
        folder_name = os.path.split(folders[0])[1]
        params = folder_name.split(sep='_')

        # Capitalize first letters
        params = [i.lower().capitalize() for i in params]

        # Get the group name
        group = params[1]
        mrmr_group = 'mRMR features: %s RT, %s' % (tuple(params[4:]))

        area = []
        files = dict(zip(nets, [[] for _ in nets]))
        for folder in folders:
            # Grab the folder name for getting group information
            folder_name = os.path.split(folder)[1]
            params = folder_name.split(sep='_')

            # Capitalize first letters
            params = [i.lower().capitalize() for i in params]

            # Get the region name
            area.append(params[2])

            # Get image names
            for i, key in enumerate(nets):
                files[key].append(os.path.join(folder, im_files[i]))

        # Create a single montage image
        offset = 60
        for z in range(len(files['NN'])):
            if z == 0:
                im = Image.open(files['NN'][z])
                height = im.height
                width = im.width

                full_im = Image.new(mode='RGBA', size=(3 * width + offset, len(im_files) * height), color=(255, 255, 255, 255))

            # Load images
            for i, key in enumerate(nets):
                tmp = Image.open(files[key][z])

                # Compute image coordinates
                bbox = (z * width + offset, i * height)

                # Paste into the larger image
                full_im.paste(tmp, box=bbox)
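
                # The montage is a grid: column index z comes from the folder
                # (region) and row index i from the classifier, so bbox is the
                # top-left corner of each pasted ROC image.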

        # Add text - horizontal (region labels along the top)
        font = ImageFont.truetype("/usr/share/fonts/truetype/freefont/FreeMono.ttf", 60)
        overlay = Image.new('RGBA', full_im.size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(overlay)
        for z in range(3):
            bbox1 = (z * width + offset, 45)
            w, h = draw.textsize(area[z], font=font)
            draw.text((bbox1[0] + (width - w) // 2 + 10, bbox1[1]), text=area[z], fill='black', font=font)

        # Add text - vertical (classifier labels along the left edge)
        overlay = overlay.rotate(-90, expand=1)
        draw = ImageDraw.Draw(overlay)
        for z in range(len(nets)):
            bbox1 = (z * height, 10)
            w, h = draw.textsize(nets[len(nets) - z - 1], font=font)
            draw.text((bbox1[0] + (height - w) // 2, bbox1[1]), text=nets[len(nets) - z - 1], fill='black', font=font)
        overlay = overlay.rotate(90, expand=1)

        # Create the overlaid image
        out = Image.alpha_composite(full_im, overlay)

        # Save the temporary large image
        out.save(self.tmp_file)

        # Create the slide title
        if not num_features:
            section_title = '%s RT. %s' % (group, mrmr_group)
        else:
            section_title = '%s RT. %s, x%d' % (group, mrmr_group, num_features)

        # Update the PowerPoint
        content_slide = self.prs.slide_layouts[self.title_and_content]
        slide = self.prs.slides.add_slide(content_slide)

        title = slide.shapes.title
        title.text = section_title

        # Insert the image into the slide
        left = pptx.util.Inches(0.1)
        top = pptx.util.Inches(1.6)
        width = pptx.util.Inches(9.8)
        height = pptx.util.Inches(5.7)
        pic = slide.shapes.add_picture(self.tmp_file, left, top, width, height)

    def save(self):

        # Save the PPT
        self.prs.save(self.ppt_file)

        # Remove the temporary image file
        os.remove(self.tmp_file)
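

# A minimal usage sketch for the PPT helper (the folder paths are
# hypothetical; real ones are produced by mRMR() in main() below):
#   ppt = PPT('classifications.pptx')
#   ppt.add_slides(folders=['/results/mrmr_tumor', '/results/mrmr_edge',
#                           '/results/mrmr_bed'], num_features=10)
#   ppt.save()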


def recur_survival(df_tumor, base_path):

    spath = os.path.join(base_path, 'Analysis', 'Survival')
    if not os.path.exists(spath):
        os.makedirs(spath)

    # Set up the fitter
    kmf = KaplanMeierFitter()
    df = df_tumor.loc[['Pre' in i for i in df_tumor['Group']]]

    fig, ax = plt.subplots()

    # Non-recurrent animals
    T = df.loc[~df['recurrence']]['rec_days']
    E = T > 180  # np.ones(T.shape)  # df_tumor['recurrence']
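
    # In lifelines, event_observed=True marks an observed event and False a
    # censored observation; here animals followed past 180 days are flagged
    # as events, which is this script's convention rather than the usual one.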

    kmf.fit(durations=T, event_observed=E, label='No recurrence')
    kmf.plot(ax=ax)

    # Recurrent animals
    T = df.loc[df['recurrence']]['rec_days']
    E = T > 180  # np.ones(T.shape)  # df_tumor['recurrence']

    kmf.fit(durations=T, event_observed=E, label='Recurrence')
    kmf.plot(ax=ax)

    plt.savefig(os.path.join(spath, 'survival.svg'))


def main():

    tstart = time()
    base_path = '/media/matt/Seagate Expansion Drive/b7TData_19/b7TData/Results'

    df_tumor, df_edge, df_bed = load_data(base_path)

    # Survival curves
    # recur_survival(df_tumor, base_path)

    """
    A look at differences
    """
    change_with_RT_rec(df_tumor, spath=base_path, area='tumor')
    change_with_RT_rec(df_bed, spath=base_path, area='bed')
    change_with_RT_rec(df_edge, spath=base_path, area='edge')

    # Paired t-tests for RT
    paired_ttests_loc(df_tumor, base_path=base_path, area='tumor')
    paired_ttests_loc(df_bed, base_path=base_path, area='bed')
    paired_ttests_loc(df_edge, base_path=base_path, area='edge')

    # mRMR - test all combinations
    groups = ['Pre', 'Post']
    mrmr_files = ['mRMR_results_%s_tumor.txt', 'mRMR_results_%s_bed.txt',
                  'mRMR_results_%s_edge.txt']
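
    # Each entry is a template; the %s is filled with the group name below,
    # e.g. 'mRMR_results_%s_tumor.txt' % 'Pre' -> 'mRMR_results_Pre_tumor.txt'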

    # Set up the PPT
    ppt_file = os.path.join(base_path, 'Analysis', 'classifications3.pptx')
    ppt = PPT(ppt_file)

    spaths = [''] * 3
    num_features = range(10, 115)
    mrmr_file = 'mRMR_results_%s_edge.txt'

    groups = ['Pre']
    mrmr_files = ['mRMR_results_%s_edge.txt']
    for group in groups:
        for mrmr_file in mrmr_files:
            nf = 108
            mrmr_file = mrmr_file % group
            spaths[0] = mRMR(df_tumor, group=group, base_path=base_path, area='tumor', mrmr_file=mrmr_file,
                             num_features=nf, htmaps=True)
            spaths[1] = mRMR(df_edge, group=group, base_path=base_path, area='edge', mrmr_file=mrmr_file,
                             num_features=nf, htmaps=True)
            spaths[2] = mRMR(df_bed, group=group, base_path=base_path, area='bed', mrmr_file=mrmr_file,
                             num_features=nf, htmaps=True)

            # Add the classifications to the PPT
            ppt.add_slides(folders=spaths, num_features=nf)

    ppt.save()

    # Test post-RT
    ppt_file = os.path.join(base_path, 'Analysis', 'classifications4.pptx')
    ppt = PPT(ppt_file)

    spaths = [''] * 3
    num_features = range(10, 115)
    # mrmr_file = 'mRMR_results_%s_edge.txt'

    groups = ['Post']
    htmaps = False
    # mrmr_files = ['mRMR_results_%s_edge.txt']
    for group in groups:
        for mrmr_file in mrmr_files:
            nf = 108
            mrmr_file = mrmr_file % group
            spaths[0] = mRMR(df_tumor, group=group, base_path=base_path, area='tumor', mrmr_file=mrmr_file,
                             num_features=nf, htmaps=htmaps)
            spaths[1] = mRMR(df_edge, group=group, base_path=base_path, area='edge', mrmr_file=mrmr_file,
                             num_features=nf, htmaps=htmaps)
            spaths[2] = mRMR(df_bed, group=group, base_path=base_path, area='bed', mrmr_file=mrmr_file,
                             num_features=nf, htmaps=htmaps)

            # Add the classifications to the PPT
            ppt.add_slides(folders=spaths, num_features=nf)

    ppt.save()

    print('\tTotal time (HH:MM:SS): %s\n\n' % (str(dt.timedelta(seconds=round(time() - tstart)))))


if __name__ == '__main__':

    main()

    folders = ['/media/matt/Seagate Expansion Drive/b7TData_19/b7TData/Results/Analysis/mrmr_Pre_tumor_feat_Pre_tumor',
               '/media/matt/Seagate Expansion Drive/b7TData_19/b7TData/Results/Analysis/mrmr_Pre_edge_feat_Pre_tumor',
               '/media/matt/Seagate Expansion Drive/b7TData_19/b7TData/Results/Analysis/mrmr_Pre_bed_feat_Pre_tumor']