Diff of /inputdata_setup.py [000000] .. [2cc208]

Switch to unified view

a b/inputdata_setup.py
1
import os, sys
2
import numpy as np
3
import pandas as pd
4
import fnmatch
5
import pickle
6
7
8
def all_files_exist(flist):
9
    numfiles = len(flist)
10
    allexist = True
11
    co = 0
12
    while allexist and co < numfiles:
13
        allexist = os.path.isfile(flist[co])
14
        co += 1
15
    return allexist
16
17
18
def file_len(fname):
19
    if os.path.isfile(fname) and os.path.getsize(fname)>0:
20
        with open(fname) as f:
21
            for i, l in enumerate(f):
22
                pass
23
        return i + 1
24
    else: 
25
        print('Failed to read {}!'.format(fname))
26
        return -1
27
28
29
try:
30
    stubdir = sys.argv[1]
31
    print('Reading mesh motion data from directory {}....'.format(stubdir))
32
except:
33
    print('Please pass name of directory containing segmented data...')
34
35
36
mpointsfile = 'matchedpointsnew.txt'
37
pim2 = 'subjnames.txt'
38
outcomefile = os.path.join(stubdir,'surv_outcomes.csv')
39
stepsuccess = [True for _ in range(3)]
40
41
42
# Read mpointsfile
43
try: 
44
    mpoints = np.loadtxt(os.path.join(stubdir,mpointsfile), dtype=int)
45
except: 
46
    stepsuccess[0] = False
47
    print('{} read failed!'.format(mpointsfile))
48
49
# Read list of subjects
50
try: 
51
    with open(os.path.join(stubdir,pim2)) as f: IDlist = [lin.strip('\n') for lin in f.readlines() if len(lin)>1]
52
except: 
53
    stepsuccess[1] = False
54
    print('{} read failed!'.format(pim2))
55
56
# Find number of vertices
57
if stepsuccess[1]:
58
    try:
59
        meshtxtfile = os.path.join(stubdir,IDlist[0],'motion/RV_fr00.txt')
60
        num_vertx = file_len(meshtxtfile)
61
        if num_vertx <= 0:
62
            stepsuccess[2] = False
63
            print('There was a problem reading {} in order to determine number of vertices in 3D meshes!'.format(meshtxtfile))
64
    except:
65
        stepsuccess[2] = False
66
        print('Failed to read {} in order to determine number of vertices in 3D meshes!'.format(meshtxtfile))
67
68
69
validIDs = [False]
70
numframes = 20
71
if all(stepsuccess):
72
    print('\n\n------------------------------------------')
73
    print('Reading mesh motion data from directory {}...'.format(stubdir))
74
    print('Subject IDs will be read from file {}...'.format(pim2))
75
    print('Expected number of vertices per mesh = {0}, of which {1} will be extracted'.format(num_vertx, mpoints.shape[0]))
76
    print('Outcome data will be read from file {}...'.format(outcomefile))
77
    print('------------------------------------------\n\n\n')
78
    if os.path.exists(stubdir):
79
        if len(IDlist)>0:
80
            validIDs = [False for _ in range(len(IDlist))]
81
            X_all = np.zeros(shape=(len(IDlist),(numframes-1),mpoints.shape[0],3), dtype=float)
82
            for counter,ID in enumerate(IDlist):
83
                if os.path.exists(os.path.join(stubdir,ID)):
84
                    if os.path.exists(os.path.join(stubdir,ID,'motion')):
85
                        frames_file_list = [os.path.join(stubdir, ID, 'motion/RV_fr' + '{:0>2}'.format(b) + '.txt') for b in range(numframes)]
86
                        if all_files_exist(frames_file_list):
87
                            nframes = len(fnmatch.filter(os.listdir(os.path.join(stubdir , ID , 'motion')), 'RV_fr*.txt'))
88
                            if nframes == numframes:
89
                                if np.sum([file_len(frames_file_list[i]) == num_vertx for i in range(numframes)]) == numframes:
90
                                    vs = [True for _ in range(numframes)]
91
                                    try:
92
                                        coords_fr0 = np.loadtxt(frames_file_list[0])[mpoints[:,1]]
93
                                    except: 
94
                                        print('Error! could not read file {} !'.format(frames_file_list[0]))
95
                                        vs[0]=False
96
                                    if vs[0]:
97
                                        for j in range(1, numframes):
98
                                            try: 
99
                                                coords_frj = np.loadtxt(frames_file_list[j])[mpoints[:,1]]
100
                                            except: 
101
                                                print('Error! could not read file {} !'.format(frames_file_list[j]))
102
                                                vs[j]=False
103
                                            if vs[j]:
104
                                                X_all[counter, j-1, :, :] = coords_frj - coords_fr0
105
                                            else:
106
                                                break
107
                                        if np.all(vs):
108
                                            validIDs[counter] = True
109
                                            print('Successfully read motion data for ID {}'.format(ID))
110
                #               else: print(ID + ' RV files do not have ' + str(num_vertx) + ' vertices')
111
                                else:
112
                                    print('{0} : wrong # of vertices, expected {1} for all {2} frames but got {3}'.format(ID,num_vertx,numframes,str([file_len(frames_file_list[i]) for i in range(numframes)])))
113
                            else:
114
                                print('{0} : RV files exist but not {1} in number. Skipping to next ID....'.format(ID,numframes))
115
                        else:
116
                            print(ID + ' : folder exists but not all RV files exist. Skipping to next ID....')
117
                    else:
118
                        print('There is no motion folder under directory {} !'.format(os.path.join(stubdir,ID)))
119
                else:
120
                    print('{0} folder does not exist under directory {1}'.format(ID,stubdir))
121
        else:
122
            print('No IDs found in predinput_master2.txt !')
123
    else:
124
        print('directory meant to contain IDs is not valid!' )
125
else: pass
126
127
128
if any(validIDs): 
129
    numvalids = np.sum(validIDs)
130
    print('{} IDs with valid mesh motion data were found'.format(numvalids))
131
    X = X_all[validIDs]
132
else: print('No valid mesh motion data could be read!')
133
134
135
# Processing outcome data
136
137
# Read outcome master file - Column 1: ID, Column 2: censoring status, Column 3: time to event/censoring
138
# Tests of outcome file:
139
    # number of columns is 3
140
    # columns ordered correctly - ID, status, time
141
    # columns contain correct data (ID is string, status = 0 or 1, time > 0)
142
if any(validIDs):
143
    oreadable = True
144
    ofmtcorr1 = True
145
    if os.path.exists(outcomefile):
146
        try:
147
            outcome_df = pd.read_csv(outcomefile)
148
        except: 
149
            print('Error in reading outcome file {} !'.format(outcomefile))
150
            oreadable = False
151
        if oreadable:
152
            print('Outcome file {0} read: {1} rows and {2} columns...'.format(outcomefile, outcome_df.shape[0], outcome_df.shape[1]))
153
            if len(outcome_df.columns) != 3:
154
                print('Wrong number of columns in outcome file {} ! Expected 3 columns'.format(outcomefile))
155
            else:
156
                outcome_df.columns = ['ID','status','time']
157
                try: 
158
                    ocorrfmt = np.all([ i and j for (i,j) in zip([l in [0,1] for l in list(outcome_df.status)], [k>=0 for k in list(outcome_df.time)])])
159
                except: 
160
                    ofmtcorr1 = False
161
                    ocorrfmt = False
162
                if not (ofmtcorr1 and ocorrfmt):
163
                    print('status and/or time columns in {} are incorrectly formatted!'.format(outcomefile))
164
                    if ofmtcorr1==True and ocorrfmt == False:
165
                        aw = np.argwhere([ not(i and j) for (i,j) in zip([l in [0,1] for l in list(outcome_df.status)], [k>=0 for k in list(outcome_df.time)])])
166
                        if aw.shape[0] > 0:
167
                            print('{} {rw} {w} problematic: '.format(aw.shape[0],rw='rows' if aw.shape[0]>1 else 'row',w='were' if aw.shape[0]>1 else 'was'))
168
                            print(outcome_df.iloc[list(aw[:,0])])
169
                else: 
170
                    if any(validIDs):
171
                        print('matching mesh motion data IDs to outcome data IDs....')
172
                        IDlist_valids = list(np.array(IDlist)[validIDs])
173
                        #IDlist_woutc = [ii for ii in IDlist_valids if ii in list(outcome_df.ID)]
174
                        IDlist_woutc = list(set(list(outcome_df.ID)).intersection(set(IDlist_valids)))
175
                        if len(IDlist_woutc)==0:
176
                            print('None of the IDs from the mesh motion data were found in outcome file {}'.format(outcomefile))
177
                        else:
178
                            print('{1} of {2} valid IDs from mesh motion data were found in outcome file {0}'.format(outcomefile, len(IDlist_woutc), len(IDlist_valids)))
179
                            if len(IDlist_woutc) < len(IDlist_valids):
180
                                print('The following IDs from the mesh motion data were not found in outcome file {} :'.format(outcomefile))
181
                                print([ii for ii in IDlist_valids if ii not in list(outcome_df.ID)])
182
                            y = outcome_df[(outcome_df['ID'].isin(IDlist_woutc))]
183
                            matchmask = [(u in IDlist_woutc) for u in IDlist_valids]
184
                            Xout = X[matchmask]
185
                            xshp = Xout.shape
186
                            xymatch = (y.shape[0]==xshp[0])
187
                            assert xymatch, 'ERROR: mesh motion (x) data has {1} rows while outcome (y) data has {0} rows'.format(y.shape[0], xshp[0])
188
                            if xymatch:
189
                                Xfin = Xout.reshape(xshp[:2]+(np.prod(xshp[2:]),)).reshape((xshp[0],-1))
190
                                plist = [Xfin,np.array(y[['status','time']]),list(y.ID)]
191
                                pklname = 'inputdata_DL' + '.pkl'
192
                                pklpath = os.path.join(os.getcwd(),'data',pklname)
193
                                with open(pklpath, 'wb') as f: pickle.dump(obj=plist, file=f)
194
                                print('Mesh motion and corresponding survival data for {0} subjects has been saved in {1}'.format(xshp[0],pklpath))
195
196
    else:
197
        print('Outcome file {} does not exist! Outcome data cannot be read!'.format(outcomefile))