Radiomics/radiomics_classifications.py
"""
A collection of functions used to analyze radiomic features. This code appends data from RT, primary tumor
recurrence, and survival times. The example code is at the end of this file.
"""

import os
import sys
file = '/home/matt/Documents/SegSarcoma'
sys.path.append(file)
import pptx
import pickle
import json
from datetime import datetime, timedelta
import datetime as dt
from time import time
from subprocess import call
from lifelines import KaplanMeierFitter
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from glob2 import glob
from scipy import stats
from Crawler.crawler_radiomics import load_study_data
from Radiomics.radiomic_functions import *
from PIL import Image, ImageDraw, ImageFont

# Set up plotting properties
sns.set()
sns.set_style('whitegrid')
sns.set_context("paper")

PATH_TO_MRMR = '/home/matt/Downloads/mrmr_c_src/mrmr'

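# Note: helper functions used below but not defined in this file
# (scale_features, svm_cross, neural_network_cross, pca_reduction,
# pca_analyis, plot_euclidean_distances) come from the star import of
# Radiomics.radiomic_functions above.
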
def concate_contrasts_small(df):
    """
    Takes a dataframe containing radiomics data for all animals and contrasts.
    Returns a dataframe with contrasts of the same animal concatenated together.
    Args:
        df (pandas dataframe): radiomic features, one row per contrast (T1, T1C, T2)

    Returns:
        pandas dataframe: single-row dataframe with contrast-suffixed columns
    """

    # Turn multiple contrast images into a single vector of data
    keys = df.keys()
    cat_cols = [i + '_T1' for i in keys] + \
               [i + '_T1C' for i in keys] + \
               [i + '_T2' for i in keys]
    key = [[i + '_T1' for i in keys],
           [i + '_T1C' for i in keys],
           [i + '_T2' for i in keys]]
    df_cat = pd.DataFrame(columns=cat_cols, index=range(1))

    for i in range(3):

        tmp = df.iloc[i]

        # Append data into larger array (get_values is deprecated; to_numpy
        # is the replacement)
        df_cat[key[i]] = [n for n in tmp.to_numpy().ravel()]

    return df_cat

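# Example (hypothetical toy input; one row per contrast in T1, T1C, T2 order):
#
#     df = pd.DataFrame({'original_firstorder_Mean': [1.0, 2.0, 3.0]},
#                       index=['T1', 'T1C', 'T2'])
#     row = concate_contrasts_small(df)
#     # row has a single row with columns 'original_firstorder_Mean_T1',
#     # 'original_firstorder_Mean_T1C', and 'original_firstorder_Mean_T2'
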
def strip_excluded(rfiles, exclude):
    """
    Remove animals which either did not have two scans or died shortly after surgery.
    Args:
        rfiles (list): list of all animals
        exclude (list): list of animals to exclude

    Returns:
        (list): filtered list
    """

    # Copy the list so that removing items does not skip elements of the
    # list being iterated
    out_rfiles = list(rfiles)

    for file in rfiles:

        path, _ = os.path.split(file)
        path, _ = os.path.split(path)
        _, animal = os.path.split(path)

        if animal in exclude:
            out_rfiles.remove(file)

    return out_rfiles

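# The animal ID is assumed to sit two directory levels above each radiomics
# file, so a (hypothetical) path like '<root>/K123/<scan>/radiomics.csv'
# yields animal 'K123'. keep_animals and load_and_concate below rely on the
# same convention.
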
def keep_animals(rfiles, keep):
    """
    Only keep specific animals.
    Args:
        rfiles (list): list of all animals
        keep (list): list of animals to keep

    Returns:
        (list): filtered list
    """

    out_rfiles = []

    for file in rfiles:

        path, _ = os.path.split(file)
        path, _ = os.path.split(path)
        _, animal = os.path.split(path)

        if animal in keep:
            out_rfiles.append(file)

    return out_rfiles

def select_region(rfiles, region):
    """
    Select features based on the region they were calculated in.
    Args:
        rfiles (list): list of all animals
        region (str): 'tumor', 'bed', or 'edge'

    Returns:
        (list): filtered list
    """

    # Filter with list comprehensions; the original removed items from the
    # list it was iterating over, which needed a second pass to catch
    # skipped elements
    if region == 'tumor':
        out_rfiles = [f for f in rfiles if '_bed' not in f and 'edge' not in f]
    elif region == 'bed':
        out_rfiles = [f for f in rfiles if '_bed' in f]
    elif region == 'edge':
        out_rfiles = [f for f in rfiles if '_edge' in f]
    else:
        out_rfiles = list(rfiles)

    return out_rfiles

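# Example (hypothetical file names):
#
#     rfiles = ['d/K1/s/rad_tumor.csv', 'd/K1/s/rad_bed.csv', 'd/K1/s/rad_edge.csv']
#     select_region(rfiles, 'bed')    # -> ['d/K1/s/rad_bed.csv']
#     select_region(rfiles, 'tumor')  # -> ['d/K1/s/rad_tumor.csv']
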
def load_radiomics(radiomics_paths, exclude, region, keep, group):
    """
    Load radiomic data (output of PyRadiomics).
    Args:
        radiomics_paths (str): path to a text file listing the radiomic feature files
        exclude (list): animals to exclude
        region (str): which region to select
        keep (list): animals to keep
        group (str): pre/post RT and control/treatment

    Returns:
        pandas dataframe: radiomic data
    """

    # Load radiomics file data
    with open(radiomics_paths, 'r') as f:
        rfiles = f.readlines()

    # Remove line endings '\n'
    rfiles = [i.strip() for i in rfiles]

    # Sort radiomics files
    rfiles = strip_excluded(rfiles, exclude)
    rfiles = select_region(rfiles, region)

    # Only load specific animals
    rfiles = keep_animals(rfiles, keep)

    # Load data and combine contrasts
    df = load_and_concate(rfiles)

    df['Group'] = group
    # df['Region'] = region

    df.index = range(df.shape[0])

    return df

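# Example call (hypothetical paths and animal IDs; a minimal sketch):
#
#     df_pre = load_radiomics('radiomics_paths.txt', exclude=['K101'],
#                             region='tumor', keep=['K102', 'K103'],
#                             group='PreCnt')
#
# Group names follow the '<Pre|Post><Cnt|PD1>' convention expected by the
# classification functions below.
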
def load_study_logs(summary_file, log_file):
    """
    Load study logs which contain relevant dates.
    Args:
        summary_file (str): study summary spreadsheet
        log_file (str): JSON log file, or '' to skip loading a log

    Returns:
        (tuple): summary dataframe and log dict (0 if no log file was given)
    """

    # Load summary file
    summary = load_study_data(summary_file)

    # Convert animal IDs to match the log file
    summary['Kirsch lab iD'] = ['K' + str(int(i)) for i in summary['Kirsch lab iD']]

    if log_file != '':
        # Load log file
        with open(log_file) as f:
            log = json.load(f)
    else:
        log = 0

    # Antibody treatment date, used as the reference date
    treat_date = summary['Date of Antibody treatment   (0.2mg/mouse Ip)  1st']

    # Compute time until today
    summary['SinceAmputation'] = (datetime.now() - treat_date)

    return summary, log

def capitalize_method(key, sub):
    """
    Capitalize a method substring within a feature name.
    Args:
        key (str): a feature name to capitalize
        sub (str): the substring to upper-case (e.g. 'glcm')

    Returns:
        (str): capitalized name
    """

    # The original tested `str in key`, comparing against the builtin `str`
    # type, which raises a TypeError; the substring is now an explicit
    # argument (assumed intent)
    if sub in key:

        key = key.replace(sub, sub.upper())

    return key

def load_and_concate(rfiles):
    """
    Loads and concatenates radiomic features from different MR contrasts.
    Args:
        rfiles (list): list of radiomic files (one for each scan)

    Returns:
        pandas dataframe: radiomic features
    """

    animal_id = []
    for i, file in enumerate(rfiles):

        # pd.DataFrame.from_csv was removed from pandas; read_csv with the
        # first column as the index is the equivalent call
        df = pd.read_csv(file, index_col=0)

        if i == 0:
            cat_df = concate_contrasts_small(df)

        else:
            df = concate_contrasts_small(df)
            cat_df = pd.concat([cat_df, df])

        path, _ = os.path.split(file)
        path, _ = os.path.split(path)
        _, animal = os.path.split(path)

        if animal not in animal_id:
            animal_id.append(animal)

        else:
            # Warn about duplicate animal IDs
            print(animal)

    # Filter out non-feature names
    features = cat_df.keys()
    feature_names = list(
        sorted(filter(lambda k: k.startswith("original_"), features)))
    cat_df = cat_df[feature_names]

    # Add animal name
    cat_df['ID'] = animal_id

    # Update df index
    cat_df.index = range(cat_df.shape[0])

    return cat_df

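# Each radiomics CSV is expected to hold one row per contrast (T1, T1C, T2),
# so the concatenated frame ends up with one row per animal and columns such
# as 'original_firstorder_Mean_T1'; only PyRadiomics 'original_' features
# survive the filter above.
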
def load_recurrence_log(recurrence_file, threshold):
    """
    Load information about recurrence.
    Args:
        recurrence_file (str): file containing primary tumor recurrence information.
        threshold (int): a limit on how long a mouse must survive post-surgery to be included

    Returns:
        pandas dataframe: animal and recurrence information
    """

    # Read in log file
    dat = pd.read_excel(recurrence_file)

    # Add 'K' to each animal name
    dat['Animal'] = ['K' + str(n) for n in dat['Animal']]

    # Filter out animals without amputation
    dat = dat[dat['AmputationDate'].notnull()]

    # Fill in death dates with today (use .loc to avoid chained assignment)
    dat.loc[dat['DeathDate'].isnull(), 'DeathDate'] = datetime.now()

    # Find lifespan post amputation
    dat['AmputationDate'] = [i.date() for i in dat['AmputationDate']]
    dat['DeathDate'] = [i.date() for i in dat['DeathDate']]
    dat['Recurrence'] = [i.date() for i in dat['Recurrence']]
    lifespan = dat['DeathDate'] - dat['AmputationDate']
    dat['since_amp'] = [int(life.days) for life in lifespan]

    # Filter out short lifespans without recurrence
    thresh = timedelta(threshold)
    recurr = ~pd.isna(dat['Recurrence'])
    inds = np.logical_or(lifespan > thresh, recurr)
    dat = dat[inds]
    # dat = dat[lifespan > thresh]

    # Get recurrence
    dat['bool_recur'] = dat['Recurrence'].notnull().tolist()

    # Reindex
    dat = dat.reset_index(drop=True)

    return dat

def sort_study_data(rec_df, sum_df, exclude):
    """
    Merge the recurrence and summary logs into one dataframe and print group counts.
    Args:
        rec_df (pandas dataframe): recurrence log (from load_recurrence_log)
        sum_df (pandas dataframe): study summary (from load_study_logs)
        exclude (list): animals to exclude

    Returns:
        pandas dataframe: animal ID, group, recurrence, and days since amputation
    """

    # Create a new dictionary to sort recurrence data
    df = {'animalID': [], 'group': [], 'recurrence': [], 'rec_days': []}

    # Populate df with data from both logs
    for i, animal in enumerate(sum_df['Kirsch lab iD']):

        if any(animal == rec_df['Animal']) and animal not in exclude:

            tmp = rec_df[rec_df['Animal'].str.match(animal)]

            df['animalID'].append(animal)
            df['group'].append(sum_df['Group'][i])
            df['recurrence'].append(tmp['bool_recur'].tolist()[0])
            df['rec_days'].append(int(tmp['since_amp']))

    # Make a Pandas df
    df = pd.DataFrame.from_dict(df)

    # Print general stats
    nrec = sum(df['recurrence'])
    norec = sum(df['recurrence'] == False)

    nrecpd1 = sum([df['group'][i] == 'PD1' and df['recurrence'][i] for i in range(len(df))])
    nreccon = sum([df['group'][i] == 'Control' and df['recurrence'][i] for i in range(len(df))])

    norecpd1 = sum([df['group'][i] == 'PD1' and not df['recurrence'][i] for i in range(len(df))])
    noreccon = sum([df['group'][i] == 'Control' and not df['recurrence'][i] for i in range(len(df))])

    print('Number of recurrent tumors:     %d' % nrec)
    print('\tControl:    \t%d' % nreccon)
    print('\tPD1:        \t%d' % nrecpd1)
    print('Number of non-recurrent tumors: %d' % norec)
    print('\tControl:    \t%d' % noreccon)
    print('\tPD1:        \t%d' % norecpd1)

    return df

def append_rec(df_rec, df_tumor):
    """
    Append radiomics, RT, survival, and recurrence data together.
    Args:
        df_rec (pandas dataframe): recurrence
        df_tumor (pandas dataframe): radiomics

    Returns:
        appended pandas dataframe
    """

    # Get the list of animals
    animals = df_rec['animalID'].tolist()
    keys = df_rec.keys().tolist()
    keys.remove('animalID')
    keys.remove('group')

    # Extend the output dataframe
    ltumor = len(df_tumor['ID'])
    df_out = df_tumor.copy()
    for key in keys:
        df_out[key] = pd.Series(np.zeros(ltumor), index=df_out.index)

    for animal in animals:

        rec = df_rec[df_rec['animalID'] == animal]

        for key in keys:
            df_out.loc[df_out['ID'] == animal, key] = rec[key].tolist()

    return df_out

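# Example: if df_rec has columns ['animalID', 'group', 'recurrence',
# 'rec_days'], append_rec copies 'recurrence' and 'rec_days' onto df_tumor,
# matching df_tumor['ID'] against df_rec['animalID'].
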
def examine_recurrence(df_tumor, spath, group='Pre'):
    """
    Run classifiers to predict recurrence.
    Args:
        df_tumor (pandas dataframe): radiomic data
        spath (str): output path
        group (str): pre/post RT

    Returns:

    """

    # Ensure spath exists
    if not os.path.exists(spath):
        os.makedirs(spath)

    # Separate control and PD1
    df_cnt = df_tumor[df_tumor['Group'] == group + 'Cnt']
    df_pd1 = df_tumor[df_tumor['Group'] == group + 'PD1']

    # Separate by larger pre/post
    df_tumor = df_tumor.loc[(df_tumor['Group'] == group + 'Cnt') | (df_tumor['Group'] == group + 'PD1'), :]

    # Get numeric data
    keys = df_tumor.keys().tolist()
    rm_keys = ['ID', 'Group', 'rec_days']
    for i in rm_keys:
        keys.remove(i)
    lab_keys = 'recurrence'
    data_keys = keys
    data_keys.remove(lab_keys)

    # Scale measurements
    rad = df_tumor[data_keys].copy()
    # rad = scale_features(rad)
    label = df_tumor[lab_keys]
    ids = df_tumor['ID']
    # bar_plot(rad_data)

    # tmp = rad.loc[label, 'original_firstorder_10Percentile_T1']
    # rad.loc[label, 'original_firstorder_10Percentile_T1'] = tmp.copy()

    # Plot metrics
    plot_radiomics(rad, label, ids, spath)

    # SVM classification
    svm_classifier_loc(scale_features(rad), label, spath)
    # tspath = os.path.join(spath, 'cnt')
    # svm_classifier_loc(scale_features(df_cnt[data_keys]), df_cnt[lab_keys], tspath)
    # tspath = os.path.join(spath, 'pd1')
    # svm_classifier_loc(scale_features(df_pd1[data_keys]), df_pd1[lab_keys], tspath)

    # NN classification
    nn_classifier_loc(scale_features(rad), label, spath)
    # tspath = os.path.join(spath, 'cnt')
    # nn_classifier_loc(scale_features(df_cnt[data_keys]), df_cnt[lab_keys], tspath)
    # tspath = os.path.join(spath, 'pd1')
    # nn_classifier_loc(scale_features(df_pd1[data_keys]), df_pd1[lab_keys], tspath)

    # sname = os.path.join(spath, 'full.csv')
    # pd.concat((scale_features(rad), label), axis=1).to_csv(sname, index=False)
    # sname = os.path.join(spath, 'cnt.csv')
    # pd.concat((scale_features(df_cnt[data_keys]), df_cnt[lab_keys]), axis=1).to_csv(sname, index=False)
    # sname = os.path.join(spath, 'pd1.csv')
    # pd.concat((scale_features(df_pd1[data_keys]), df_pd1[lab_keys]), axis=1).to_csv(sname, index=False)

def svm_classifier_loc(rad, label, spath):
    """
    Attempts at SVM classification using the full dataset and PCA-based reduced datasets.
    Args:
        rad (pandas dataframe): radiomic features
        label (pandas dataframe): classification label
        spath (str): output directory

    Returns:

    """

    sns.set(font_scale=1.1)

    # Ensure spath exists
    if not os.path.exists(spath):
        os.makedirs(spath)

    # Redirect stdout to a results log
    old_stdout = sys.stdout
    log_file_path = os.path.join(spath, 'SVM_results.txt')
    log_file = open(log_file_path, 'w')
    sys.stdout = log_file

    print('Training samples: %d, test samples: %d' % (int(rad.shape[0] * 0.75), int(rad.shape[0] * 0.25)))
    print('Number of features %d' % rad.shape[1])

    # Get index
    inds_1 = label == True
    inds_2 = label == False

    # Create label vector (get_values is deprecated; to_numpy is the replacement)
    label = label.to_numpy().astype(np.float64)
    rad = rad.to_numpy().astype(np.float64)

    # Plot euclidean distances - multidimensional scaling
    fig = plot_euclidean_distances(rad, inds_1, inds_2)
    fig.savefig(os.path.join(spath, 'SVM_EuclDist_SVM.png'), dpi=300)
    plt.close(fig)

    # SVM without reduction
    print('Trying SVM classifier without dimensionality reduction:')
    print('-' * 50)
    # fig, clf = svm_classifier(rad.get_values(), inds_1, inds_2)
    fig = svm_cross(rad, label)
    fig.savefig(os.path.join(spath, 'SVM_NoPCA_roc.svg'))
    fig.savefig(os.path.join(spath, 'SVM_NoPCA_roc.png'), dpi=200)
    plt.close(fig)

    # Model pickling relies on `clf` from the commented-out svm_classifier
    # call above; it is disabled until that call is restored, since `clf` is
    # otherwise undefined here
    # with open(os.path.join(spath, 'NoReduction_SVM.model'), 'wb') as f:
    #     pickle.dump(clf, f)

    # PCA analysis
    # a look at 2 components and SVM
    print('\n')
    print('Trying SVM classifier with 2 principal components:')
    print('-' * 50)
    rad_pca = pca_reduction(rad, npcomps=2)
    fig = svm_cross(rad_pca, label)
    fig.savefig(os.path.join(spath, 'SVM_PCA.svg'))
    plt.close(fig)
    fig = svm_cross(rad_pca, label)
    fig.savefig(os.path.join(spath, 'SVM_PCA2_roc.svg'))
    fig.savefig(os.path.join(spath, 'SVM_PCA2_roc.png'), dpi=300)
    plt.close(fig)
    # with open(os.path.join(spath, 'PCA2_SVM.model'), 'wb') as f:
    #     pickle.dump(clf, f)

    # a look at variance
    print('\n')
    npcomps, fig = pca_analyis(rad)
    print('Trying SVM classifier with %d principal components found from data variance:' % npcomps)
    print('-' * 50)
    rad_pca = pca_reduction(rad, npcomps)
    fig.savefig(os.path.join(spath, 'Cumulative_var.svg'))
    # fig.savefig(os.path.join(spath, 'Cumulative_var.png'), dpi=300)
    plt.close(fig)

    fig = svm_cross(rad_pca, label)
    # fig.savefig(os.path.join(spath, 'SVM_PCA_roc_var.svg'))
    # fig.savefig(os.path.join(spath, 'SVM_PCA_roc_var.png'), dpi=300)
    plt.close(fig)
    # with open(os.path.join(spath, 'SVM_PCA95.model'), 'wb') as f:
    #     pickle.dump(clf, f)

    # Restore stdout and close the log file
    sys.stdout = old_stdout
    log_file.close()

def nn_classifier_loc(rad, label, spath):
    """
    Attempts at NN classification using the full dataset and PCA-based reduced datasets.
    Args:
        rad (pandas dataframe): radiomic features
        label (pandas dataframe): classification label
        spath (str): output directory

    Returns:

    """

    sns.set(font_scale=1.1)

    # Ensure spath exists
    if not os.path.exists(spath):
        os.makedirs(spath)

    # Redirect stdout to a results log
    old_stdout = sys.stdout
    log_file_path = os.path.join(spath, 'NN_results.txt')
    log_file = open(log_file_path, 'w')
    sys.stdout = log_file

    print('Training samples: %d, test samples: %d' % (int(rad.shape[0] * 0.75), int(rad.shape[0] * 0.25)))
    print('Number of features %d' % rad.shape[1])

    # Get index
    inds_1 = label == True
    inds_2 = label == False

    # Create label vector (get_values is deprecated; to_numpy is the replacement)
    label = label.to_numpy().astype(np.float64)
    rad = rad.to_numpy().astype(np.float64)

    # NN without reduction
    print('Trying NN classifier without dimensionality reduction:')
    print('-' * 50)
    # fig, clf = svm_classifier(rad.get_values(), inds_1, inds_2)
    fig = neural_network_cross(rad, label)
    fig.savefig(os.path.join(spath, 'NN_NoPCA_roc.svg'))
    fig.savefig(os.path.join(spath, 'NN_NoPCA_roc.png'), dpi=200)
    plt.close(fig)

    # PCA analysis
    # a look at 2 components and NN
    print('\n')
    print('Trying NN classifier with 2 principal components:')
    print('-' * 50)
    rad_pca = pca_reduction(rad, npcomps=2)
    # fig, clf = svm_classifier(rad_pca, inds_1, inds_2)
    # fig.savefig(os.path.join(spath, 'PCA_NN.png'), dpi=300)
    # plt.close(fig)
    fig = neural_network_cross(rad_pca, label)
    # fig.savefig(os.path.join(spath, 'NN_PCA2_roc.svg'))
    # fig.savefig(os.path.join(spath, 'NN_PCA2_roc.png'), dpi=300)
    plt.close(fig)
    # with open(os.path.join(spath, 'PCA2_SVM.model'), 'wb') as f:
    #     pickle.dump(clf, f)

    # a look at variance
    print('\n')
    npcomps, fig = pca_analyis(rad)
    print('Trying NN classifier with %d principal components found from data variance:' % npcomps)
    print('-' * 50)
    rad_pca = pca_reduction(rad, npcomps)
    # fig.savefig(os.path.join(spath, 'Cumulative_var.png'), dpi=300)
    plt.close(fig)

    # fig, clf = svm_classifier(rad_pca, inds_1, inds_2)
    # fig = neural_network_cross(rad_pca, label)
    # fig.savefig(os.path.join(spath, 'NN_PCA_var_roc.svg'))
    # fig.savefig(os.path.join(spath, 'NN_PCA_var_roc.png'), dpi=300)
    # plt.close(fig)
    # with open(os.path.join(spath, 'PCA95_NN.model'), 'wb') as f:
    #     pickle.dump(clf, f)

    # Restore stdout and close the log file
    sys.stdout = old_stdout
    log_file.close()

def mRMR(df_tumor, base_path, group='Post', area='tumor', mrmr_file=None, num_features=10, htmaps=False):
    """
    Compute mRMR feature selection. This function has been bundled with code which computes correlation
    maps and later calls training on classifiers for tumor recurrence.
    Args:
        df_tumor (pandas dataframe): radiomic data
        base_path (str): where to save mRMR output
        group (str): pre/post RT
        area (str): masked location
        mrmr_file (str): file of mRMR features to load or create
        num_features (int): number of features to use
        htmaps (bool): whether to create and save all heatmaps

    Returns:

    """

    print('\n')
    print('-' * 80)
    print('Processing %s, %s' % (group, area))
    print('Number of features: %d' % num_features)

    dfn = df_tumor.loc[(df_tumor['Group'] == group + 'Cnt') | (df_tumor['Group'] == group + 'PD1'), :]
    dfn = dfn.reset_index()

    keys = df_tumor.keys().tolist()
    rm_keys = ['ID', 'Group', 'rec_days']
    for i in rm_keys:
        keys.remove(i)
    lab_keys = 'recurrence'
    data_keys = keys
    data_keys.remove(lab_keys)

    # # Separate by larger pre/post
    # df_tumor = df_tumor[(df_tumor['Group'] == group + 'Cnt') | (df_tumor['Group'] == group + 'PD1')]

    # Create class in the first column
    category = dfn['recurrence']

    # Remove columns
    dfn = dfn[data_keys].copy()

    dfn['Class'] = category.astype(int)

    # Rearrange columns so class is first
    cols = dfn.columns.tolist()
    cols = [cols[-1]] + cols[:-1]
    dfn = dfn[cols]

    # Drop outliers
    dfn = dfn.loc[dfn['original_gldm_GrayLevelNonUniformity_T1'] != 5607, :]

    # Write as csv
    sname = 'tumor_%s_%s.csv' % (group, area)
    dfn.to_csv(sname, index=False, index_label=False)

    # Generate savepath
    if mrmr_file:
        mrmr_time = group  # fall back to the current group if the file name gives no time point
        if 'post' in mrmr_file.lower():
            mrmr_time = 'Post'
        elif 'pre' in mrmr_file.lower():
            mrmr_time = 'Pre'
        if 'tumor' in mrmr_file.lower():
            mode = 'tumor'
        elif 'bed' in mrmr_file.lower():
            mode = 'bed'
        else:
            mode = 'edge'
        spath = os.path.join(base_path, 'Analysis', 'mrmr_%s_%s_feat_%s_%s' % (group, area, mrmr_time, mode))
    else:
        spath = os.path.join(base_path, 'Analysis', 'mrmr_%s_%s' % (group, area))

    print('\tSaving to: ', spath)
    if not os.path.exists(spath):
        os.makedirs(spath)

    # Run mRMR
    if not mrmr_file:
        print('\tCreating a new mRMR file')

        log_file_path = os.path.join(base_path, 'Analysis', 'mRMR_results_%s_%s.txt' % (group, area))
        log_file = open(log_file_path, 'w')
        call([PATH_TO_MRMR, '-i', sname, '-t', '1', '-n', '200'],
             stdout=log_file)
        log_file.close()

        # Select out predictive features
        mrmr_file = os.path.join(base_path, 'Analysis', 'mRMR_results_%s_%s.txt' % (group, area))
    else:
        mrmr_file = os.path.join(base_path, 'Analysis', mrmr_file)
        print('\tUsing the specified mRMR file:\n\t\t%s' % mrmr_file)

    # Read mRMR features
    # res = pd.read_csv(mrmr_file, delimiter='\t', header=205, skipfooter=9, skipinitialspace=True, engine='python')
    res = pd.read_csv(mrmr_file, delimiter='\t', header=3, skipfooter=212, skipinitialspace=True, engine='python')
    cols = res.keys().tolist()
    cols = [i.strip() for i in cols]
    res.columns = cols

    features = res['Name'].tolist()
    features = [i.strip() for i in features]

    # Make heatmaps of control and pd1
    if htmaps:
        # Make heatmaps for cntr and pd1
        df_cnt = df_tumor.loc[df_tumor['Group'] == group + 'Cnt', :].copy()
        df_pd1 = df_tumor.loc[df_tumor['Group'] == group + 'PD1', :].copy()

        # Create class in the first column
        category_cnt = df_cnt['recurrence'].astype(int)
        category_pd1 = df_pd1['recurrence'].astype(int)

        # Remove columns
        df_cnt = df_cnt[features]
        df_pd1 = df_pd1[features]

        # Convert columns to float
        for key in features:
            df_cnt.loc[:, key] = pd.to_numeric(df_cnt.loc[:, key], errors='ignore')
            df_pd1.loc[:, key] = pd.to_numeric(df_pd1.loc[:, key], errors='ignore')

        # Rename to remove "original_"
        for key in df_cnt.keys():
            nkey = key.replace('original_', '')
            df_cnt = df_cnt.rename(columns={key: nkey})
            df_pd1 = df_pd1.rename(columns={key: nkey})

        # Set up colors for the animal classification
        lut = dict(zip(dfn['Class'].unique(), 'bg'))
        row_colors_cnt = category_cnt.map(lut)
        row_colors_pd1 = category_pd1.map(lut)
        row_colors_cnt.name = 'Recurrence'
        row_colors_pd1.name = 'Recurrence'

        # sns.set(font_scale=0.8)
        # g = sns.clustermap(df_cnt, figsize=(18, 18), standard_scale=1, metric='correlation', row_colors=row_colors_cnt)
        # g.savefig(os.path.join(spath, 'heatmap_animals_cnt.png'), dpi=300)
        # plt.close()
        #
        # sns.set(font_scale=0.8)
        # g = sns.clustermap(df_pd1, figsize=(18, 18), standard_scale=1, metric='correlation', row_colors=row_colors_pd1)
        # g.savefig(os.path.join(spath, 'heatmap_animals_pd1.png'), dpi=300)
        # plt.close()

        # Remove class category
        # df_cnt = df_cnt.drop('Class', axis=1)
        # df_pd1 = df_pd1.drop('Class', axis=1)

        # Compute correlation matrix
        df_cnt = df_cnt.corr()
        df_pd1 = df_pd1.corr()

        # Sort correlation matrix
        order_cnt = df_cnt.sum(axis=0).argsort()[::-1]
        order_pd1 = df_pd1.sum(axis=0).argsort()[::-1]

        # Reorder rows and columns (the control ordering is applied to both
        # matrices so that the difference map below lines up)
        df_cnt = df_cnt.iloc[order_cnt, order_cnt]
        df_pd1 = df_pd1.iloc[order_cnt, order_cnt]

        # Plot correlation - control
        sns.set(font_scale=0.8)
        f, ax = plt.subplots(figsize=(18, 10))
        sns.heatmap(df_cnt, vmax=1.0, square=True, ax=ax)
        plt.yticks(rotation=0)
        ax.get_xaxis().set_ticks([])
        plt.xticks(rotation=90)
        # f.savefig(os.path.join(spath, 'heatmap_control_corr.svg'))
        f.savefig(os.path.join(spath, 'heatmap_control_corr.png'), dpi=300)
        plt.close()

        # Plot correlation - PD1
        sns.set(font_scale=0.8)
        f, ax = plt.subplots(figsize=(18, 10))
        sns.heatmap(df_pd1, vmax=1.0, square=True, ax=ax)
        plt.yticks(rotation=0)
        ax.get_xaxis().set_ticks([])
        plt.xticks(rotation=90)
        # f.savefig(os.path.join(spath, 'heatmap_pd1_corr.svg'))
        f.savefig(os.path.join(spath, 'heatmap_pd1_corr.png'), dpi=300)
        plt.close()

        # Plot correlation - difference
        sns.set(font_scale=0.8)
        f, ax = plt.subplots(figsize=(18, 10))
        sns.heatmap(np.abs(df_cnt - df_pd1), vmax=1.0, square=True, ax=ax)
        plt.yticks(rotation=0)
        ax.get_xaxis().set_ticks([])
        plt.xticks(rotation=90)
        # f.savefig(os.path.join(spath, 'heatmap_cnt_pd1_diff_corr.svg'))
        f.savefig(os.path.join(spath, 'heatmap_cnt_pd1_diff_corr.png'), dpi=300)
        plt.close()

    if htmaps:
        # Make heatmap dataframe
        htmp1 = dfn[features].copy()

        # Convert columns to float (from object)
        for key in htmp1.keys():
            htmp1.loc[:, key] = pd.to_numeric(htmp1.loc[:, key], errors='ignore')

        # Rename to remove "original_"
        for key in htmp1.keys():

            nkey = key.replace('original_', '')
            htmp1 = htmp1.rename(columns={key: nkey})

        # Set up colors for the animal classification
        lut = dict(zip(dfn['Class'].unique(), 'bg'))
        row_colors = dfn['Class'].map(lut)
        row_colors.name = 'Recurrence'

        # sns.set(font_scale=0.8)
        # g = sns.clustermap(htmp1, figsize=(18, 18), standard_scale=1, metric='correlation', row_colors=row_colors)
        # g.savefig(os.path.join(spath, 'heatmap_animals.svg'))
        # # g.savefig(os.path.join(spath, 'heatmap_animals.png'), dpi=300)
        # plt.close()

        # Compute correlation matrix - recurrence and no recurrence
        htmp_nrec = dfn.loc[dfn['Class'] == 0, features]
        htmp_rec = dfn.loc[dfn['Class'] == 1, features]

        # Convert columns to float
        for key in features:
            htmp_nrec.loc[:, key] = pd.to_numeric(htmp_nrec.loc[:, key], errors='ignore')
            htmp_rec.loc[:, key] = pd.to_numeric(htmp_rec.loc[:, key], errors='ignore')

        # Rename to remove "original_"
        for key in htmp_nrec.keys():
            nkey = key.replace('original_', '')
            htmp_nrec = htmp_nrec.rename(columns={key: nkey})
            htmp_rec = htmp_rec.rename(columns={key: nkey})

        # Compute correlation matrix
        htmp_nrec = htmp_nrec.corr()
        htmp_rec = htmp_rec.corr()

        # Sort correlation matrix
        order_nrec = htmp_nrec.sum(axis=0).argsort()[::-1]
        order_rec = htmp_rec.sum(axis=0).argsort()[::-1]

        # Reorder rows and columns (shared ordering so the difference map lines up)
        htmp_nrec = htmp_nrec.iloc[order_nrec, order_nrec]
        htmp_rec = htmp_rec.iloc[order_nrec, order_nrec]

        # Plot correlation - no recurrence
        sns.set(font_scale=0.8)
        f, ax = plt.subplots(figsize=(18, 10))
        sns.heatmap(htmp_nrec, vmax=1.0, square=True, ax=ax)
        plt.yticks(rotation=0)
        ax.get_xaxis().set_ticks([])
        plt.xticks(rotation=90)
        f.savefig(os.path.join(spath, 'heatmap_noRecur_corr.svg'))
        # f.savefig(os.path.join(spath, 'heatmap_noRecur_corr.png'), dpi=300)
        plt.close()

        # Plot correlation - recurrence
        sns.set(font_scale=0.8)
        f, ax = plt.subplots(figsize=(18, 10))
        sns.heatmap(htmp_rec, vmax=1.0, square=True, ax=ax)
        plt.yticks(rotation=0)
        ax.get_xaxis().set_ticks([])
        plt.xticks(rotation=90)
        f.savefig(os.path.join(spath, 'heatmap_Recur_corr.svg'))
        # f.savefig(os.path.join(spath, 'heatmap_Recur_corr.png'), dpi=300)
        plt.close()

        # Plot correlation - difference
        sns.set(font_scale=0.8)
        f, ax = plt.subplots(figsize=(18, 10))
        sns.heatmap(np.abs(htmp_nrec - htmp_rec), vmax=1.0, square=True, ax=ax)
        plt.yticks(rotation=0)
        ax.get_xaxis().set_ticks([])
        plt.xticks(rotation=90)
        f.savefig(os.path.join(spath, 'heatmap_Recur_diff_corr.svg'))
        # f.savefig(os.path.join(spath, 'heatmap_Recur_diff_corr.png'), dpi=300)
        plt.close()

    # Use only the top num_features features
    features = features[:num_features]

    # Make heatmap dataframe - top features
    htmp1 = dfn[features].copy()

    # Convert columns to float (from object)
    for key in htmp1.keys():
        htmp1.loc[:, key] = pd.to_numeric(htmp1.loc[:, key], errors='ignore')

    # Rename to remove "original_"
    for key in htmp1.keys():

        nkey = key.replace('original_', '')
        htmp1 = htmp1.rename(columns={key: nkey})

    # Set up colors for the animal classification
    lut = dict(zip(dfn['Class'].unique(), 'bg'))
    row_colors = dfn['Class'].map(lut)
    row_colors.name = 'Recurrence'

    # sns.set(font_scale=0.8)
    # g = sns.clustermap(htmp1, figsize=(18, 18), standard_scale=1, metric='correlation', row_colors=row_colors)
    # g.savefig(os.path.join(spath, 'heatmap_animals.svg'))
    # # g.savefig(os.path.join(spath, 'heatmap_animals.png'), dpi=300)
    # plt.close()

    # Compute correlation matrix - recurrence and no recurrence
    htmp_nrec = dfn.loc[dfn['Class'] == 0, features]
    htmp_rec = dfn.loc[dfn['Class'] == 1, features]

    # Convert columns to float
    for key in features:
        htmp_nrec.loc[:, key] = pd.to_numeric(htmp_nrec.loc[:, key], errors='ignore')
        htmp_rec.loc[:, key] = pd.to_numeric(htmp_rec.loc[:, key], errors='ignore')

    # Rename to remove "original_"
    for key in htmp_nrec.keys():
        nkey = key.replace('original_', '')
        htmp_nrec = htmp_nrec.rename(columns={key: nkey})
        htmp_rec = htmp_rec.rename(columns={key: nkey})

    # Compute correlation matrix
    htmp_nrec = htmp_nrec.corr()
    htmp_rec = htmp_rec.corr()

    # Sort correlation matrix
    order_nrec = htmp_nrec.sum(axis=0).argsort()[::-1]
    order_rec = htmp_rec.sum(axis=0).argsort()[::-1]

    # Reorder rows and columns (shared ordering so the difference map lines up)
    htmp_nrec = htmp_nrec.iloc[order_nrec, order_nrec]
    htmp_rec = htmp_rec.iloc[order_nrec, order_nrec]

    # Plot correlation - no recurrence
    sns.set(font_scale=0.8)
    f, ax = plt.subplots(figsize=(18, 10))
    sns.heatmap(htmp_nrec, vmax=1.0, square=True, ax=ax)
    plt.yticks(rotation=0)
    ax.get_xaxis().set_ticks([])
    plt.xticks(rotation=90)
    f.savefig(os.path.join(spath, 'heatmap_noRecur_corr_10.svg'))
    # f.savefig(os.path.join(spath, 'heatmap_noRecur_corr_10.png'), dpi=300)
    plt.close()

    # Plot correlation - recurrence
    sns.set(font_scale=0.8)
    f, ax = plt.subplots(figsize=(18, 10))
    sns.heatmap(htmp_rec, vmax=1.0, square=True, ax=ax)
    plt.yticks(rotation=0)
    ax.get_xaxis().set_ticks([])
    plt.xticks(rotation=90)
    f.savefig(os.path.join(spath, 'heatmap_Recur_corr_10.svg'))
    # f.savefig(os.path.join(spath, 'heatmap_Recur_corr_10.png'), dpi=300)
    plt.close()

    # Plot correlation - difference
    sns.set(font_scale=0.8)
    f, ax = plt.subplots(figsize=(18, 10))
    sns.heatmap(np.abs(htmp_nrec - htmp_rec), vmax=1.0, square=True, ax=ax)
    plt.yticks(rotation=0)
    ax.get_xaxis().set_ticks([])
    plt.xticks(rotation=90)
    f.savefig(os.path.join(spath, 'heatmap_Recur_diff_corr_10.svg'))
    # f.savefig(os.path.join(spath, 'heatmap_Recur_diff_corr_10.png'), dpi=300)
    plt.close()

    # Create a DataFrame with mRMR features
    df_sub = df_tumor[features + rm_keys + [lab_keys]].copy()

    # Perform computations
    examine_recurrence(df_sub, spath, group=group)

    return spath

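# The external mRMR binary above (the C implementation pointed to by
# PATH_TO_MRMR) is invoked through subprocess.call; the equivalent shell
# command would be
#
#     mrmr -i tumor_Post_tumor.csv -t 1 -n 200
#
# where -i is the input CSV (class label in the first column, as arranged
# above) and, per the mRMR tool's usage text, -t appears to be the
# discretization threshold and -n the number of features to rank.
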
def plot_radiomics(rad, label, ids, spath):
    """
    Plot radiomic features used for recurrence classification.
    Args:
        rad (pandas dataframe): radiomic features
        label (pandas dataframe): recurrence labels
        ids (list): animal IDs
        spath (str): savepath

    Returns:

    """

    sns.set_style('whitegrid')

    # Reduce the number of features to plot to the first six
    keys = rad.keys()
    rad = rad[keys[:6]]

    # Set up figure
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_axes([0.1, 0.5, 0.80, 0.47])

    # Get the list of radiomic features
    keys = rad.keys()

    # Create a dictionary (later a Dataframe) for plotting with Seaborn
    df = {'Features': [], 'Value': [], 'Recurrence': [], 'ID': []}

    # Populate dictionary, df, with features and values
    for key in keys:

        # Get feature values
        tmp = rad[key].to_numpy()

        # Normalize between -1 and 1
        tmp -= tmp.min()
        tmp /= 0.5 * tmp.max()
        tmp -= 1

        # Build the display name once per feature (clean_key strips the
        # "original_" prefix and formats modality/method/feature)
        nkey = clean_key(key)

        for i in range(len(tmp)):

            # Append features, values, recurrence, and IDs to the dictionary
            df['Features'].append(nkey)
            df['Value'].append(tmp[i])

            if label.iloc[i]:
                df['Recurrence'].append('Yes')
            else:
                df['Recurrence'].append('No')

            df['ID'].append(ids.iloc[i])

    # Convert dictionary to Dataframe
    df = pd.DataFrame.from_dict(df)

    # Plot
    g = sns.boxplot(x='Features', y='Value', hue='Recurrence', data=df, ax=ax, palette="Set1")

    # Rotate feature names
    g.set_xticklabels(g.get_xticklabels(), rotation=60, fontsize=10)
    # g.set_yticklabels(g.get_yticklabels(), fontsize=10)
    g.set_xlabel('Features', fontsize=10)
    g.set_ylabel('Normalized Value')

    # Fix legend
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., title='Recurrence')

    # Save figure
    fig.savefig(os.path.join(spath, 'feature_box.svg'), format='svg')
    # fig.savefig(os.path.join(spath, 'feature_box.png'), format='png', dpi=300)
    plt.close(fig)

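# The in-loop normalization above maps each feature onto [-1, 1]: after
# shifting by the minimum, dividing by half of the shifted maximum and
# subtracting one gives x -> 2 * (x - min) / (max - min) - 1.
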
def clean_key(key):
    """
    Format a PyRadiomics feature name for display: strip the "original_"
    prefix and split the name into modality, method, and feature parts.
    Args:
        key (str): feature name, e.g. 'original_glcm_Contrast_T1'

    Returns:
        (str): cleaned, newline-separated name
    """

    # Remove "original" from the plots
    nkey = key.replace('original_', '')

    # Capitalize method
    ind1 = nkey.find('_')
    method = nkey[:ind1]
    if method == 'firstorder':
        method = 'First Order'
    elif method == 'shape':
        method = method.capitalize()
    else:
        method = method.upper()

    # Get modality
    ind2 = nkey.rfind('_')
    modality = nkey[ind2 + 1:]
    if method != 'Shape':
        nkey = '%s\n%s\n%s' % (modality, method, nkey[ind1 + 1:ind2])
    else:
        nkey = '%s\n%s' % (method, nkey[ind1 + 1:ind2])

    return nkey

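# Example:
#
#     clean_key('original_glcm_Contrast_T1')      # -> 'T1\nGLCM\nContrast'
#     clean_key('original_shape_VoxelVolume_T2')  # -> 'Shape\nVoxelVolume'
#
# Shape features drop the modality suffix since they are geometry-only.
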
def paired_ttests_loc(df, base_path, area):
    """
    Compute paired t-tests between pre- and post-RT features.
    Args:
        df (Pandas df): full radiomics data (with label columns)
        base_path (str): output root directory
        area (str): mask area

    Returns:

    """
    sns.set_style('whitegrid')

    # Set up spath
    spath = os.path.join(base_path, 'Analysis', 'ttests')
    if not os.path.exists(spath):
        os.makedirs(spath)

    # Set up output
    log_file_path = os.path.join(spath, 'ttest_%s.txt' % area)
    f = open(log_file_path, 'w')

    # Get indices of pre/post RT
    inds_1 = df['Group'].str.contains('Pre')
    inds_2 = df['Group'].str.contains('Post')
    keys = df.keys()
    non_data_labels = ['Group', 'recurrence', 'rec_days', 'ID']

    # Create a dictionary (later a Dataframe) for evaluation and plotting with Seaborn
    dfn = {'Features': [], 'P-Value': [], 'OrigFeatures': [], 'Contrast': []}

    # kkeys = [key for key in keys if key not in non_data_labels]
    # df_k = df[kkeys]
    # ob = MultiComparison(df_k, inds_1)
    # ob.allpairtest(stats.ttest_rel)

    # Populate dictionary, dfn, with features and p-values; ttest_rel assumes
    # the pre and post selections contain the same animals in the same order
    for key in keys:

        if key not in non_data_labels:

            # Get feature values
            tmp = df[key].to_numpy()

            p1 = tmp[inds_1]
            p2 = tmp[inds_2]

            results = stats.ttest_rel(p1, p2)

            if not np.isnan(results.pvalue):

                # Clean the feature name for display
                nkey = clean_key(key)

                # Add contrast
                if '_T1C' in key:
                    con = 'T1C'

                elif 'T1' in key:
                    con = 'T1'

                elif 'T2' in key:
                    con = 'T2'

                else:
                    con = None

                if nkey not in dfn['Features']:
                    # Append features, p-values, and contrasts to the dictionary
                    dfn['Features'].append(nkey)
                    dfn['OrigFeatures'].append(key)
                    dfn['P-Value'].append(results.pvalue)
                    dfn['Contrast'].append(con)

    # Convert dictionary to Dataframe
    dfn = pd.DataFrame.from_dict(dfn)

    f.write('%d significant features found after t-tests!\n' % np.sum(dfn['P-Value'] < 0.05))

    # The statsmodels.sandbox path still resolves, but the supported location
    # for multipletests is statsmodels.stats.multitest
    from statsmodels.stats.multitest import multipletests
    mod = multipletests(dfn['P-Value'], alpha=0.05)

    # Plot corrected P-values
    fig = plt.figure(figsize=(10, 6))
    plt.plot(dfn['P-Value'], 'bo', label='Original')
    plt.plot(mod[1], 'ro', label='Corrected')
    plt.legend()
    plt.xlim([0, 400])
    plt.ylabel('P-Value')
    plt.xlabel('Features')
    # plt.savefig(os.path.join(spath, 'compare_corrected_pvalues.png'), dpi=300)
    plt.savefig(os.path.join(spath, 'compare_corrected_pvalues.svg'), dpi=300)
    plt.close(fig)

    # Keep only the features that survive multiple-comparison correction
    dfn['P-Value'] = mod[1]
    dfn = dfn[mod[0]]
    dfn = dfn.sort_values('P-Value', ascending=True)  # Sort by P-Value, ascending
    dfn = dfn.reset_index(drop=True)
    f.write('%d significant features found after multiple p-value correction!\n' % len(dfn))

    # p_thresh = 0.05
    # dfn = dfn.drop(dfn[dfn['P-Value'] > p_thresh].index)
    # f.write('%d significant features found!\n' % len(dfn))
    # dfn = dfn.reset_index(drop=True)

    # Get stats on which contrasts are most significant
    t1 = 0
    t1c = 0
    t2 = 0
    shape = 0
    feature_cats = {'firstorder': 0,
                    'glcm': 0,
                    'glszm': 0,
                    'glrlm': 0,
                    'ngtdm': 0,
                    'gldm': 0}

    for key in dfn['Features']:

        if 'T1C' in key:
            t1c += 1

        elif 'T1' in key:
            t1 += 1

        elif 'T2' in key:
            t2 += 1

        elif 'Shape' in key:
            shape += 1

        if 'first order' in key.lower():
            feature_cats['firstorder'] += 1
        elif 'glcm' in key.lower():
            feature_cats['glcm'] += 1
        elif 'glszm' in key.lower():
            feature_cats['glszm'] += 1
        elif 'glrlm' in key.lower():
            feature_cats['glrlm'] += 1
        elif 'ngtdm' in key.lower():
            feature_cats['ngtdm'] += 1
        elif 'gldm' in key.lower():
            feature_cats['gldm'] += 1

    counts_df = pd.DataFrame({'Contrast': ['Shape', 'T1', 'T1C', 'T2'], 'Count': [shape, t1, t1c, t2]})

    f.write('Features summary for radiomics computed from %s mask\n' % area)
    f.write('-' * 40 + '\n')

    f.write('Contrast\tCount\n')
    for i, con in enumerate(counts_df['Contrast']):

        f.write('%s\t\t%d\n' % (con, counts_df['Count'].iloc[i]))

    f.write('-' * 40 + '\n')
    f.write('Feature\tCount\n')
    for key in feature_cats.keys():

        f.write('%s\t\t%s\n' % (key, feature_cats[key]))

    f.write('-' * 40 + '\n')
    for con, pval in zip(dfn['Features'], dfn['P-Value']):

        con = con.splitlines()
        if len(con) == 3:
            f.write('%s\n%s\n%s\nP-value: %0.5f\n\n' % (con[1], con[2], con[0], pval))
        else:
            f.write('%s\n%s\nP-Value: %0.5f\n\n' % (con[1], con[0], pval))

    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_axes([0.11, 0.15, 0.87, 0.65])

    sns.barplot(x='Contrast', y='Count', data=counts_df, ax=ax, color='midnightblue', saturation=0.5)
    plt.ylabel('Num. features (p < 0.05)')
    # plt.show()
    fig.savefig(os.path.join(spath, 'ttest_feature_count_%s.svg' % area))

    # Plot specific categories
    cats = ['firstorder', 'shape', 'GLCM']
    mk_size = 6
    cmap = sns.color_palette()

    # Set up a df for plotting all values
    df_plot = {'Features': [], 'Value': [], 'RT': []}

    for key, nkey in zip(dfn['OrigFeatures'], dfn['Features']):

        # Get feature values
        tmp = df[key].to_numpy()

        # Normalize between -1 and 1
        tmp -= tmp.min()
        tmp /= 0.5 * tmp.max()
        tmp -= 1

        for i in range(len(tmp)):

            # Append features and values to the dictionary
            df_plot['Features'].append(nkey)
            df_plot['Value'].append(tmp[i])

            if inds_1.iloc[i]:
                df_plot['RT'].append('Pre RT')
            else:
                df_plot['RT'].append('Post RT')

    df_plot = pd.DataFrame.from_dict(df_plot)

    # Set up figure
    fig = plt.figure(figsize=(20 * len(dfn['Features']) / 10, 8))
    ax = fig.add_axes([0.08, 0.4, 0.9, 0.60])

    # Plot all
    g = sns.boxplot(x='Features', y='Value', hue='RT', data=df_plot, ax=ax, palette="Set1")

    # Rotate feature names
    # g.set_xticklabels(g.get_xticklabels(), rotation=35, fontsize=18)
    # ax.set_ylabel('')
    g.set_xticklabels(g.get_xticklabels(), rotation=35, fontsize=10)
    g.set_yticklabels(g.get_yticklabels(), fontsize=10)
    g.set_xlabel('Features', fontsize=10)
    ax.set_ylabel('')

    # Fix legend
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., title='RT')

    # Save figure
    fig.savefig(os.path.join(spath, 'ttest_feature_box_%s.svg' % area), format='svg')
    # fig.savefig(os.path.join(spath, 'ttest_feature_box_%s.png' % area), format='png', dpi=300)

    # Plot top features
    nkeys = 6
    keep_keys = dfn['Features'].iloc[:nkeys]

    small_df_plot = df_plot.loc[df_plot['Features'].isin(keep_keys), :]

    fs = 10
    fig = plt.figure()  # plt.figure(figsize=(10, 6))
    ax = fig.add_axes([0.08, 0.4, 0.80, 0.59])

    # Plot
    g = sns.boxplot(x='Features', y='Value', hue='RT', data=small_df_plot, ax=ax, palette="Set1")

    # Rotate feature names
    g.set_xticklabels(g.get_xticklabels(), rotation=60, fontsize=fs)
    g.set_xlabel('Features', fontsize=fs)
    g.set_ylabel('Normalized Value', fontsize=fs)

    # Fix legend (the hue here is pre/post RT, not recurrence)
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., title='RT')

    # Save figure
    fig.savefig(os.path.join(spath, 'ttest_feature_box_select%d_%s.svg' % (nkeys, area)), format='svg')
    # fig.savefig(os.path.join(spath, 'ttest_feature_box_select%d_%s.png' % (nkeys, area)), format='png')

    # Plot specific features
    if 'tumor' in area:
        keep_keys = ['T2\nGLSZM\nZoneVariance',
                     'T2\nGLRLM\nRunLengthNonUniformity',
                     'Shape\nVoxelVolume',
                     'T1C\nGLSZM\nLargeAreaHighGrayLevelEmphasis',
                     'T1C\nGLDM\nDependenceNonUniformity',
                     'T1\nFirst Order\nEnergy'
                     ]
    elif 'bed' in area:
        keep_keys = ['T2\nGLDM\nDependenceNonUniformity',
                     'T2\nFirst Order\nEnergy',
                     'Shape\nVoxelVolume',
                     'T1C\nGLRLM\nRunVariance',
                     'T1\nGLSZM\nLongRunLowGrayLevelEmphasis',
                     'T1C\nFirst Order\nRange'
                     ]
    else:  # Edge
        keep_keys = ['T2\nFirst Order\nEnergy',
                     'Shape\nSurfaceArea',
                     'Shape\nMajorAxisLength',
                     'T1C\nFirst Order\nRange',
                     'Shape\nMaximum3DDiameter',
                     'T1C\nGLRLM\nGrayLevelNonUniformity'
                     ]

    small_df_plot = df_plot.loc[df_plot['Features'].isin(keep_keys), :]

    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_axes([0.08, 0.4, 0.80, 0.59])

    # Plot
    g = sns.boxplot(x='Features', y='Value', hue='RT', data=small_df_plot, ax=ax, palette="Set1")

    # Rotate feature names
    g.set_xticklabels(g.get_xticklabels(), rotation=60, fontsize=10)
    g.set_xlabel('Features', fontsize=10)
    g.set_ylabel('Normalized Value')

    # Fix legend (the hue here is pre/post RT, not recurrence)
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., title='RT')

    # Save figure
    fig.savefig(os.path.join(spath, 'ttest_feature_box_hand_select_%s.svg' % area), format='svg')
    # fig.savefig(os.path.join(spath, 'ttest_feature_box_hand_select_%s.png' % area), format='png')

    # Close log file
    f.close()

def change_with_RT_rec(df, spath, area='tumor'):
1476
    """
1477
    Computes the difference between pre/post RT and generates plots showing these values.
1478
    Args:
1479
        df (pandas dataframe): radiomic features
1480
        spath (str): savepath
1481
        area (str): mask
1482
1483
    Returns:
1484
1485
    """
1486
1487
    sns.set_style('whitegrid')
1488
1489
    feature_seletion = 'ttest'
1490
1491
    if 'ttest' in feature_seletion:
1492
        # Set up save paths
1493
        spath = os.path.join(spath, 'Analysis', 'ttests')
1494
        if not os.path.exists(spath):
1495
            os.makedirs(spath)
1496
1497
        # Compute p=values
1498
        # Get indicies of pre/post RT
1499
        inds_1 = df['Group'].str.contains('Pre')
1500
        inds_2 = df['Group'].str.contains('Post')
1501
        keys = df.keys()
1502
        non_data_labels = ['Group', 'recurrence', 'rec_days', 'ID']
1503
1504
        # Create a dictionary (later a Dataframe) for evaluation and plotting with Seaborn
1505
        dfn = {'Features': [], 'P-Value': [], 'OrigFeatures': [], 'Contrast': []}
1506
1507
        # Populate dictionary, df, with features and values
1508
        for key in keys:
1509
1510
            if key not in non_data_labels:
1511
1512
                # Get feature values
1513
                tmp = df[key].to_numpy()
1514
1515
                p1 = tmp[inds_1]
1516
                p2 = tmp[inds_2]
1517
1518
                results = stats.ttest_rel(p1, p2)
1519
1520
                if not np.isnan(results.pvalue):
1521
1522
                    # Remove "original" from the plots
1523
                    nkey = key.replace('original_', '')
1524
1525
                    # Capitalize method
1526
                    ind1 = nkey.find('_')
1527
                    method = nkey[:ind1]
1528
                    if method == 'firstorder':
1529
                        method = 'First Order'
1530
                    elif method == 'shape':
1531
                        method = method.capitalize()
1532
                    else:
1533
                        method = method.upper()
1534
1535
                    # Get modality
1536
                    ind2 = nkey.rfind('_')
1537
                    modality = nkey[ind2+1:]
1538
                    if method != 'Shape':
1539
                        nkey = '%s\n%s\n%s' % (modality, method, nkey[ind1+1:ind2])
1540
                    else:
1541
                        nkey = '%s\n%s' % (method, nkey[ind1+1:ind2])
1542
1543
                    # Add contrast
1544
                    if '_T1C' in key:
1545
                        con = 'T1C'
1546
1547
                    elif 'T1' in key:
1548
                        con = 'T1'
1549
1550
                    elif 'T2' in key:
1551
                        con = 'T2'
1552
1553
                    else:
1554
                        con = None
1555
1556
                    if not nkey in dfn['Features']:
1557
                        # Append features, values, recurrence, and IDs to the dictionary
1558
                        dfn['Features'].append(nkey)
1559
                        dfn['OrigFeatures'].append(key)
1560
                        dfn['P-Value'].append(results.pvalue)
1561
                        dfn['Contrast'].append(con)
1562
1563
        # Convert dictionary to Dataframe
1564
        dfn = pd.DataFrame.from_dict(dfn)
1565
1566
        from statsmodels.sandbox.stats.multicomp import multipletests
1567
        mod = multipletests(dfn['P-Value'], alpha=0.05)
1568
1569
        # Remove all but the smallest p-values
1570
        dfn['P-Value'] = mod[1]
1571
        dfn = dfn[mod[0]]
1572
        dfn = dfn.sort_values('P-Value', ascending=True)  # Sort by P-Value, ascending
1573
        dfn = dfn.reset_index(drop=True)
1574
1575
        # Get a list of good features
1576
        features = dfn['OrigFeatures'].to_list()
1577
        # features = features
1578
        n_features = len(features)
1579
1580
1581
    # Set up save path
    spath_tmp = os.path.join(spath, 'Diff_{}'.format(area))
    if not os.path.exists(spath_tmp):
        os.makedirs(spath_tmp)

    # Get all unique ids
    uniq_ids = df['ID'].unique()

    # Features to plot
    features_to_plot = ['original_shape_SurfaceArea_T1',
                        'original_firstorder_TotalEnergy_T1C',
                        'original_gldm_DependenceEntropy_T2',
                        'original_glrlm_LongRunLowGrayLevelEmphasis_T2',
                        'original_glrlm_GrayLevelNonUniformity_T1C',
                        'original_glrlm_RunVariance_T1'
                        ]

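    # Key anatomy: '<image type>_<feature class>_<feature name>_<contrast>', e.g.
    # 'original_glrlm_RunVariance_T1' is the GLRLM Run Variance computed on the
    # unfiltered T1 image; the clean_key() helper used below shortens these for
    # plot labels.
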
    # Compute the relative difference, 100 * (f_post - f_pre) / f_post, with a
    # small epsilon guarding against division by zero
    diff_df_plot = {'Feature': [], 'Value': [], 'recurrence': [], 'RT': []}   # Plotting df
    diff_df = {'recurrence': []}                                              # Processing df
    for id in uniq_ids:

        # Get animal data
        animal = df.loc[df['ID'] == id]

        # Get recurrence
        rec = int(animal.loc[['Pre' in a for a in animal['Group']]]['recurrence'].iloc[0])

        # Append to processing dataframe
        diff_df['recurrence'].append(rec)

        # Compute the relative change for each plotted feature
        for key in features_to_plot:

            # Skip data labels and any feature that did not survive selection
            if key not in non_data_labels and key in features[:n_features]:
                # Get pre/post values
                post_val = float(animal.loc[['Post' in a for a in animal['Group']]][key].iloc[0])
                pre_val = float(animal.loc[['Pre' in a for a in animal['Group']]][key].iloc[0])

                # Perform difference
                val = 100 * (post_val - pre_val) / (post_val + 1e-6)

                diff_df_plot['Value'].append(pre_val)
                diff_df_plot['RT'].append('Pre')

                diff_df_plot['Value'].append(post_val)
                diff_df_plot['RT'].append('Post')

                # Clean key for plotting
                key = clean_key(key)

                # Record one relative change per animal in the processing df
                if key in diff_df:
                    diff_df[key].append(val)
                else:
                    diff_df[key] = [val]

                # One plotting row each for the pre and post values
                for _ in range(2):

                    diff_df_plot['Feature'].append(key)

                    # Keep track of recurrence
                    if rec == 0:
                        diff_df_plot['recurrence'].append('No')
                    else:
                        diff_df_plot['recurrence'].append('Yes')

    diff_df_plot = pd.DataFrame.from_dict(diff_df_plot)
    # diff_df = pd.DataFrame.from_dict(diff_df)

    # Normalize each feature to [0, 1] for plotting
    features_to_plot = [clean_key(i) for i in features_to_plot]
    for key in features_to_plot:

        inds = diff_df_plot['Feature'] == key
        vals = diff_df_plot.loc[inds]['Value']
        mn = vals.min()
        mx = vals.max()

        vals = (vals - mn) / (mx - mn + 1e-6)

        # Assign back with .loc to avoid chained-assignment pitfalls
        diff_df_plot.loc[inds, 'Value'] = vals

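    # Min-max scaling sketch: with mn = 2 and mx = 10, a value of 6 maps to
    # (6 - 2) / (10 - 2 + 1e-6) ~= 0.5; the epsilon only matters when a feature
    # is constant across animals (mx == mn).
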
    # Single-panel alternative, kept for reference
    # fig = plt.figure(figsize=(60, 10))
    # ax = fig.add_axes([0.05, 0.35, 0.9, 0.60])
    #
    # # Plot all
    # g = sns.boxplot(x='Feature', y='Value', hue='recurrence', data=diff_df_plot, ax=ax, palette="Set1")
    #
    # # Rotate feature names
    # g.set_xticklabels(g.get_xticklabels(), rotation=35, fontsize=10)
    # g.set_yticklabels(g.get_yticklabels(), fontsize=10)
    # g.set_xlabel('Features', fontsize=10)
    # ax.set_ylabel('Change from Pre to Post RT [%]')
    # plt.ylim([-100, 100])
    #
    # # Fix legend
    # plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., title='Recurrence')
    #
    # plt.savefig(os.path.join(spath_tmp, 'diff_all_%s.svg' % area))
    #
    # plt.close(fig)

    # Set up figure: pre-RT on top (ax2), post-RT on bottom (ax1)
    fig = plt.figure(figsize=(9, 6))
    ax1 = plt.subplot(212)
    ax2 = plt.subplot(211, sharex=ax1)

    df_tmp = diff_df_plot.loc[diff_df_plot['RT'] == 'Post']
    g1 = sns.boxplot(x='Feature', y='Value', hue='recurrence', data=df_tmp, ax=ax1, palette="Set1")
    df_tmp = diff_df_plot.loc[diff_df_plot['RT'] == 'Pre']
    g2 = sns.boxplot(x='Feature', y='Value', hue='recurrence', data=df_tmp, ax=ax2, palette="Set1")
    ax1.get_legend().remove()
    ax2.get_legend().remove()

    # Rotate feature names; the top panel's x labels are hidden
    g1.set_xticklabels(ax1.get_xticklabels(), rotation=40, fontsize=10)
    g1.set_xlabel('Features', fontsize=10)
    g1.set_ylabel('Normalized Value', fontsize=10)
    g2.set_xlabel('Features', fontsize=10, visible=False)
    g2.set_ylabel('Normalized Value', fontsize=10)
    g2.set_xticklabels(g1.get_xticklabels(), rotation=40, fontsize=10, visible=False)
    ax1.set_ylim([-0.05, 1.05])
    ax2.set_ylim([-0.05, 1.05])

    # Fix legend
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., title='Recurrence')
    plt.tight_layout()

    plt.savefig(os.path.join(spath_tmp, 'RT_rec_boxplots_%s.svg' % area))

    plt.close(fig)

    # Attempt classification (disabled; the diff_df conversion above is commented out)

    # Get numeric data
    # keys = diff_df.keys().tolist()
    # lab_keys = 'recurrence'
    # data_keys = keys
    # data_keys.remove(lab_keys)
    #
    # # Separate data and labels
    # rad = diff_df[data_keys].copy()
    # rad = scale_features(rad)
    # label = diff_df[lab_keys]
    # # ids = diff_df['ID']
    #
    # # SVM classifier
    # svm_classifier_loc(rad, label, spath_tmp)
    #
    # # NN classifier
    # nn_classifier_loc(rad, label, spath_tmp)


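# A minimal, self-contained sketch of the pre/post change computed above (this
# helper is illustrative only and is not called anywhere in this module):
def _relative_change_example():
    pre = np.array([10.0, 4.0])
    post = np.array([12.0, 2.0])

    # Same formula as above: signed percent change relative to the post value
    return 100 * (post - pre) / (post + 1e-6)  # -> approximately [16.7, -100.0]

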
def load_data(base_path):
    """Load the tumor, edge, and bed DataFrames, building and caching them as CSVs on the first call."""

    spath = os.path.join(base_path, 'Analysis')

    snames = [os.path.join(spath, 'tumor_df.csv'),
              os.path.join(spath, 'edge_df.csv'),
              os.path.join(spath, 'bed_df.csv')]

    if not all([os.path.exists(i) for i in snames]):

        # Paths
        summary_file = os.path.join(base_path, 'Summary.xlsx')
        recurrence_file = os.path.join(base_path, 'recurrence.xlsx')
        log_file = os.path.join(base_path, 'processing_log.json')

        recurrence_threshold = 20

        # Animals to exclude from analysis until segmentations are fixed
        exclude = ['K520719', 'K520918', 'K521092', 'K521104', 'K520762']

        # Load summary data
        summary_df, log = load_study_logs(summary_file, log_file)
        df_rec = load_recurrence_log(recurrence_file, recurrence_threshold)

        # Sort data
        df_rec = sort_study_data(df_rec, summary_df, exclude)
        keep = df_rec['animalID'].tolist()

        # Load pre-RT data
        radiomics_paths = os.path.join(base_path, 'Radiomics_control_preRT.txt')
        df_pre_tumor_cnt = load_radiomics(radiomics_paths, exclude, region='tumor', keep=keep, group='PreCnt')
        df_pre_edge_cnt = load_radiomics(radiomics_paths, exclude, region='edge', keep=keep, group='PreCnt')
        df_pre_bed_cnt = load_radiomics(radiomics_paths, exclude, region='bed', keep=keep, group='PreCnt')

        radiomics_paths = os.path.join(base_path, 'Radiomics_PD1_preRT.txt')
        df_pre_tumor_pd1 = load_radiomics(radiomics_paths, exclude, region='tumor', keep=keep, group='PrePD1')
        df_pre_edge_pd1 = load_radiomics(radiomics_paths, exclude, region='edge', keep=keep, group='PrePD1')
        df_pre_bed_pd1 = load_radiomics(radiomics_paths, exclude, region='bed', keep=keep, group='PrePD1')

        # Load post-RT data
        radiomics_paths = os.path.join(base_path, 'Radiomics_control_postRT.txt')
        df_post_tumor_cnt = load_radiomics(radiomics_paths, exclude, region='tumor', keep=keep, group='PostCnt')
        df_post_edge_cnt = load_radiomics(radiomics_paths, exclude, region='edge', keep=keep, group='PostCnt')
        df_post_bed_cnt = load_radiomics(radiomics_paths, exclude, region='bed', keep=keep, group='PostCnt')

        radiomics_paths = os.path.join(base_path, 'Radiomics_PD1_postRT.txt')
        df_post_tumor_pd1 = load_radiomics(radiomics_paths, exclude, region='tumor', keep=keep, group='PostPD1')
        df_post_edge_pd1 = load_radiomics(radiomics_paths, exclude, region='edge', keep=keep, group='PostPD1')
        df_post_bed_pd1 = load_radiomics(radiomics_paths, exclude, region='bed', keep=keep, group='PostPD1')

        # Concatenate pre and post for RT analysis
        df_tumor = pd.concat((df_pre_tumor_cnt, df_pre_tumor_pd1, df_post_tumor_cnt, df_post_tumor_pd1), ignore_index=True)
        df_edge = pd.concat((df_pre_edge_cnt, df_pre_edge_pd1, df_post_edge_cnt, df_post_edge_pd1), ignore_index=True)
        df_bed = pd.concat((df_pre_bed_cnt, df_pre_bed_pd1, df_post_bed_cnt, df_post_bed_pd1), ignore_index=True)

        # Update recurrence data to account for incomplete animal sets
        uniq = df_tumor['ID'].unique()
        uniq_rec = df_rec['animalID'].unique()
        diff = list(set(uniq_rec).difference(uniq))
        for un in diff:
            df_rec = df_rec[df_rec['animalID'] != un]

        # Check that the sets contain the same animals
        uniq = df_tumor['ID'].unique()
        uniq_rec = df_rec['animalID'].unique()
        diff = list(set(uniq_rec).difference(uniq))
        print('Len of recurrence data: %d' % len(uniq_rec))
        print('Len of image data:      %d' % len(uniq))
        print('Difference: ', diff)

        # Combine recurrence and radiomics dataframes
        df_tumor = append_rec(df_rec, df_tumor)
        df_edge = append_rec(df_rec, df_edge)
        df_bed = append_rec(df_rec, df_bed)

        # Save dataframes
        df_tumor.to_csv(snames[0])
        df_edge.to_csv(snames[1])
        df_bed.to_csv(snames[2])

    else:

        # pd.DataFrame.from_csv was removed from pandas; read_csv is its replacement
        df_tumor = pd.read_csv(snames[0], index_col=0)
        df_edge = pd.read_csv(snames[1], index_col=0)
        df_bed = pd.read_csv(snames[2], index_col=0)

    return df_tumor, df_edge, df_bed


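# Example usage (hypothetical path; the CSV caches are written on the first call):
#   df_tumor, df_edge, df_bed = load_data('/path/to/Results')

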
class PPT:
    def __init__(self, ppt_file):

        self.ppt_file = ppt_file
        self.title_slide = 0
        self.subtitle_slide = 2
        self.title_and_content = 5
        if not os.path.exists(ppt_file):
            prs = pptx.Presentation()

        else:
            prs = pptx.Presentation(ppt_file)

        title_slide_layout = prs.slide_layouts[self.title_slide]
        slide = prs.slides.add_slide(title_slide_layout)

        title = slide.shapes.title
        title.text = 'Radiomic classifications'

        subtitle = slide.placeholders[1]
        subtitle.text = 'Created %s' % datetime.strftime(datetime.now(), '%B %d, %Y')

        self.prs = prs

        # Temporary image file used when composing slides
        self.tmp_file = 'tmp_class.png'

    def add_slides(self, folders, nets=['NN', 'SVM'], im_files=['NN_NoPCA_roc.png', 'SVM_NoPCA_roc.png'], num_features=None):
        """
        Add a slide containing a grid of ROC images: one column per region, one row per classifier.

        Args:
            folders (list of str): result folders for 1. tumor, 2. edge, and 3. bed
            nets (list of str): classifier names used as row labels
            im_files (list of str): ROC image file name for each classifier
            num_features (int): number of mRMR features, appended to the slide title

        Returns:
            None
        """

        # Get general information
        # Grab folder name for getting group information
        folder_name = os.path.split(folders[0])[1]
        params = folder_name.split(sep='_')

        # Capitalize first letters
        params = [i.lower().capitalize() for i in params]

        # Get group name
        group = params[1]
        mrmr_group = 'mRMR features: %s RT, %s' % tuple(params[4:])

        area = []
        files = dict(zip(nets, [[] for _ in nets]))
        for folder in folders:
            # Grab folder name for getting group information
            folder_name = os.path.split(folder)[1]
            params = folder_name.split(sep='_')

            # Capitalize first letters
            params = [i.lower().capitalize() for i in params]

            # Get region name
            area.append(params[2])

            # Get image names
            for i, key in enumerate(nets):
                files[key].append(os.path.join(folder, im_files[i]))

        # Create a single composite image (PIL is already imported at module level)
        offset = 60
        for z in range(len(files[nets[0]])):
            if z == 0:
                im = Image.open(files[nets[0]][z])
                height = im.height
                width = im.width

                full_im = Image.new(mode='RGBA', size=(len(folders) * width + offset, len(im_files) * height),
                                    color=(255, 255, 255, 255))  # or (255, 255, 255, 0) for transparency

            # Load images
            for i, key in enumerate(nets):
                tmp = Image.open(files[key][z])

                # Compute image coordinates (column z, row i)
                bbox = (z * width + offset, i * height)

                # Paste into the larger image
                full_im.paste(tmp, box=bbox)

        # Add text - horizontal region labels (note: textsize was removed in
        # Pillow 10, where textbbox replaces it)
        font = ImageFont.truetype("/usr/share/fonts/truetype/freefont/FreeMono.ttf", 60)
        overlay = Image.new('RGBA', full_im.size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(overlay)
        for z in range(len(area)):
            bbox1 = (z * width + offset, 45)
            w, h = draw.textsize(area[z], font=font)
            draw.text((bbox1[0] + (width - w) // 2 + 10, bbox1[1]), text=area[z], fill='black', font=font)

        # Add text - vertical classifier labels (measure the label actually drawn)
        overlay = overlay.rotate(-90, expand=1)
        draw = ImageDraw.Draw(overlay)
        for z in range(len(nets)):
            bbox1 = (z * height, 10)
            label = nets[len(nets) - z - 1]
            w, h = draw.textsize(label, font=font)
            draw.text((bbox1[0] + (height - w) // 2, bbox1[1]), text=label, fill='black', font=font)
        overlay = overlay.rotate(90, expand=1)

        # Composite the text overlay onto the image grid
        out = Image.alpha_composite(full_im, overlay)

        # Save the temporary large image
        out.save(self.tmp_file)

        # Create slide title
        if not num_features:
            section_title = '%s RT. %s' % (group, mrmr_group)
        else:
            section_title = '%s RT. %s, x%d' % (group, mrmr_group, num_features)

        # Update PowerPoint
        content_slide = self.prs.slide_layouts[self.title_and_content]
        slide = self.prs.slides.add_slide(content_slide)

        title = slide.shapes.title
        title.text = section_title

        # Insert image into the slide
        left = pptx.util.Inches(0.1)
        top = pptx.util.Inches(1.6)
        width = pptx.util.Inches(9.8)
        height = pptx.util.Inches(5.7)
        slide.shapes.add_picture(self.tmp_file, left, top, width, height)

    def save(self):

        # Save PPT
        self.prs.save(self.ppt_file)

        # Remove temp image file
        os.remove(self.tmp_file)


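# Example usage (hypothetical paths; the slide-layout indices above assume the
# stock python-pptx template):
#   ppt = PPT('classifications.pptx')
#   ppt.add_slides(folders=[tumor_dir, edge_dir, bed_dir], num_features=108)
#   ppt.save()

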
def recur_survival(df_tumor, base_path):
    """Plot Kaplan-Meier curves for non-recurrent vs. recurrent animals (pre-RT scans only)."""

    spath = os.path.join(base_path, 'Analysis', 'Survival')
    if not os.path.exists(spath):
        os.makedirs(spath)

    # Set up fitter
    kmf = KaplanMeierFitter()
    df = df_tumor.loc[['Pre' in i for i in df_tumor['Group']]]

    fig, ax = plt.subplots()

    # Non recurrent
    T = df.loc[~df['recurrence']]['rec_days']
    E = T > 180  # Event-observed indicator (alternatives tried: all ones, the recurrence flag)

    kmf.fit(durations=T, event_observed=E, label='No recurrence')
    kmf.plot(ax=ax)

    # Recurrent
    T = df.loc[df['recurrence']]['rec_days']
    E = T > 180

    kmf.fit(durations=T, event_observed=E, label='Recurrence')
    kmf.plot(ax=ax)

    plt.savefig(os.path.join(spath, 'survival.svg'))
    plt.close(fig)

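
# A possible companion analysis (a sketch, not called anywhere in this module):
# compare the two Kaplan-Meier curves above with a log-rank test, assuming the
# same 'recurrence'/'rec_days' columns and event rule used in recur_survival().
def logrank_recurrence_example(df):
    from lifelines.statistics import logrank_test

    rec = df.loc[df['recurrence']]['rec_days']
    non = df.loc[~df['recurrence']]['rec_days']

    result = logrank_test(rec, non,
                          event_observed_A=rec > 180,
                          event_observed_B=non > 180)
    return result.p_value

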
def main():

    tstart = time()
    base_path = '/media/matt/Seagate Expansion Drive/b7TData_19/b7TData/Results'

    df_tumor, df_edge, df_bed = load_data(base_path)

    # Survival curves
    # recur_survival(df_tumor, base_path)

    # A look at differences
    change_with_RT_rec(df_tumor, spath=base_path, area='tumor')
    change_with_RT_rec(df_bed, spath=base_path, area='bed')
    change_with_RT_rec(df_edge, spath=base_path, area='edge')

    # Paired t-tests for RT
    paired_ttests_loc(df_tumor, base_path=base_path, area='tumor')
    paired_ttests_loc(df_bed, base_path=base_path, area='bed')
    paired_ttests_loc(df_edge, base_path=base_path, area='edge')


    # mRMR - Test all combinations
    groups = ['Pre', 'Post']
    mrmr_files = ['mRMR_results_%s_tumor.txt', 'mRMR_results_%s_bed.txt',
                  'mRMR_results_%s_edge.txt']

    # Set up PPT
    ppt_file = os.path.join(base_path, 'Analysis', 'classifications3.pptx')
    ppt = PPT(ppt_file)

    spaths = [''] * 3
    # num_features = range(10, 115)

    # Pre-RT pass, restricted to the edge feature file
    groups = ['Pre']
    mrmr_files = ['mRMR_results_%s_edge.txt']
    for group in groups:
        for mrmr_file in mrmr_files:
            nf = 108
            mrmr_file = mrmr_file % group
            spaths[0] = mRMR(df_tumor, group=group, base_path=base_path, area='tumor', mrmr_file=mrmr_file,
                             num_features=nf, htmaps=True)
            spaths[1] = mRMR(df_edge, group=group, base_path=base_path, area='edge', mrmr_file=mrmr_file,
                             num_features=nf, htmaps=True)
            spaths[2] = mRMR(df_bed, group=group, base_path=base_path, area='bed', mrmr_file=mrmr_file,
                             num_features=nf, htmaps=True)

            # Add classifications to a PPT
            ppt.add_slides(folders=spaths, num_features=nf)

            ppt.save()

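    # Note: mrmr_file % group expands the template, e.g.
    # 'mRMR_results_%s_edge.txt' % 'Pre' -> 'mRMR_results_Pre_edge.txt'; the
    # same template list is reused for the post-RT pass below.
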
    # Test Post RT
    ppt_file = os.path.join(base_path, 'Analysis', 'classifications4.pptx')
    ppt = PPT(ppt_file)

    spaths = [''] * 3
    # num_features = range(10, 115)

    groups = ['Post']
    htmaps = False
    # mrmr_files = ['mRMR_results_%s_edge.txt']
    for group in groups:
        for mrmr_file in mrmr_files:
            nf = 108
            mrmr_file = mrmr_file % group
            spaths[0] = mRMR(df_tumor, group=group, base_path=base_path, area='tumor', mrmr_file=mrmr_file,
                             num_features=nf, htmaps=htmaps)
            spaths[1] = mRMR(df_edge, group=group, base_path=base_path, area='edge', mrmr_file=mrmr_file,
                             num_features=nf, htmaps=htmaps)
            spaths[2] = mRMR(df_bed, group=group, base_path=base_path, area='bed', mrmr_file=mrmr_file,
                             num_features=nf, htmaps=htmaps)

            # Add classifications to a PPT
            ppt.add_slides(folders=spaths, num_features=nf)

            ppt.save()

    print('\tTotal time (HH:MM:SS): %s\n\n' % dt.timedelta(seconds=round(time() - tstart)))


if __name__ == '__main__':

    main()

    folders = ['/media/matt/Seagate Expansion Drive/b7TData_19/b7TData/Results/Analysis/mrmr_Pre_tumor_feat_Pre_tumor',
               '/media/matt/Seagate Expansion Drive/b7TData_19/b7TData/Results/Analysis/mrmr_Pre_edge_feat_Pre_tumor',
               '/media/matt/Seagate Expansion Drive/b7TData_19/b7TData/Results/Analysis/mrmr_Pre_bed_feat_Pre_tumor']