Diff of /inputdata_setup.py [000000] .. [2cc208]

Switch to side-by-side view

--- a
+++ b/inputdata_setup.py
@@ -0,0 +1,197 @@
+import os, sys
+import numpy as np
+import pandas as pd
+import fnmatch
+import pickle
+
+
+def all_files_exist(flist):
+    numfiles = len(flist)
+    allexist = True
+    co = 0
+    while allexist and co < numfiles:
+        allexist = os.path.isfile(flist[co])
+        co += 1
+    return allexist
+
+
+def file_len(fname):
+    if os.path.isfile(fname) and os.path.getsize(fname)>0:
+        with open(fname) as f:
+            for i, l in enumerate(f):
+                pass
+        return i + 1
+    else: 
+        print('Failed to read {}!'.format(fname))
+        return -1
+
+
+try:
+    stubdir = sys.argv[1]
+    print('Reading mesh motion data from directory {}....'.format(stubdir))
+except:
+    print('Please pass name of directory containing segmented data...')
+
+
+mpointsfile = 'matchedpointsnew.txt'
+pim2 = 'subjnames.txt'
+outcomefile = os.path.join(stubdir,'surv_outcomes.csv')
+stepsuccess = [True for _ in range(3)]
+
+
+# Read mpointsfile
+try: 
+    mpoints = np.loadtxt(os.path.join(stubdir,mpointsfile), dtype=int)
+except: 
+    stepsuccess[0] = False
+    print('{} read failed!'.format(mpointsfile))
+
+# Read list of subjects
+try: 
+    with open(os.path.join(stubdir,pim2)) as f: IDlist = [lin.strip('\n') for lin in f.readlines() if len(lin)>1]
+except: 
+    stepsuccess[1] = False
+    print('{} read failed!'.format(pim2))
+
+# Find number of vertices
+if stepsuccess[1]:
+    try:
+        meshtxtfile = os.path.join(stubdir,IDlist[0],'motion/RV_fr00.txt')
+        num_vertx = file_len(meshtxtfile)
+        if num_vertx <= 0:
+            stepsuccess[2] = False
+            print('There was a problem reading {} in order to determine number of vertices in 3D meshes!'.format(meshtxtfile))
+    except:
+        stepsuccess[2] = False
+        print('Failed to read {} in order to determine number of vertices in 3D meshes!'.format(meshtxtfile))
+
+
+validIDs = [False]
+numframes = 20
+if all(stepsuccess):
+    print('\n\n------------------------------------------')
+    print('Reading mesh motion data from directory {}...'.format(stubdir))
+    print('Subject IDs will be read from file {}...'.format(pim2))
+    print('Expected number of vertices per mesh = {0}, of which {1} will be extracted'.format(num_vertx, mpoints.shape[0]))
+    print('Outcome data will be read from file {}...'.format(outcomefile))
+    print('------------------------------------------\n\n\n')
+    if os.path.exists(stubdir):
+        if len(IDlist)>0:
+            validIDs = [False for _ in range(len(IDlist))]
+            X_all = np.zeros(shape=(len(IDlist),(numframes-1),mpoints.shape[0],3), dtype=float)
+            for counter,ID in enumerate(IDlist):
+                if os.path.exists(os.path.join(stubdir,ID)):
+                    if os.path.exists(os.path.join(stubdir,ID,'motion')):
+                        frames_file_list = [os.path.join(stubdir, ID, 'motion/RV_fr' + '{:0>2}'.format(b) + '.txt') for b in range(numframes)]
+                        if all_files_exist(frames_file_list):
+                            nframes = len(fnmatch.filter(os.listdir(os.path.join(stubdir , ID , 'motion')), 'RV_fr*.txt'))
+                            if nframes == numframes:
+                                if np.sum([file_len(frames_file_list[i]) == num_vertx for i in range(numframes)]) == numframes:
+                                    vs = [True for _ in range(numframes)]
+                                    try:
+                                        coords_fr0 = np.loadtxt(frames_file_list[0])[mpoints[:,1]]
+                                    except: 
+                                        print('Error! could not read file {} !'.format(frames_file_list[0]))
+                                        vs[0]=False
+                                    if vs[0]:
+                                        for j in range(1, numframes):
+                                            try: 
+                                                coords_frj = np.loadtxt(frames_file_list[j])[mpoints[:,1]]
+                                            except: 
+                                                print('Error! could not read file {} !'.format(frames_file_list[j]))
+                                                vs[j]=False
+                                            if vs[j]:
+                                                X_all[counter, j-1, :, :] = coords_frj - coords_fr0
+                                            else:
+                                                break
+                                        if np.all(vs):
+                                            validIDs[counter] = True
+                                            print('Successfully read motion data for ID {}'.format(ID))
+                #               else: print(ID + ' RV files do not have ' + str(num_vertx) + ' vertices')
+                                else:
+                                    print('{0} : wrong # of vertices, expected {1} for all {2} frames but got {3}'.format(ID,num_vertx,numframes,str([file_len(frames_file_list[i]) for i in range(numframes)])))
+                            else:
+                                print('{0} : RV files exist but not {1} in number. Skipping to next ID....'.format(ID,numframes))
+                        else:
+                            print(ID + ' : folder exists but not all RV files exist. Skipping to next ID....')
+                    else:
+                        print('There is no motion folder under directory {} !'.format(os.path.join(stubdir,ID)))
+                else:
+                    print('{0} folder does not exist under directory {1}'.format(ID,stubdir))
+        else:
+            print('No IDs found in predinput_master2.txt !')
+    else:
+        print('directory meant to contain IDs is not valid!' )
+else: pass
+
+
+if any(validIDs): 
+    numvalids = np.sum(validIDs)
+    print('{} IDs with valid mesh motion data were found'.format(numvalids))
+    X = X_all[validIDs]
+else: print('No valid mesh motion data could be read!')
+
+
+# Processing outcome data
+
+# Read outcome master file - Column 1: ID, Column 2: censoring status, Column 3: time to event/censoring
+# Tests of outcome file:
+    # number of columns is 3
+    # columns ordered correctly - ID, status, time
+    # columns contain correct data (ID is string, status = 0 or 1, time > 0)
+if any(validIDs):
+    oreadable = True
+    ofmtcorr1 = True
+    if os.path.exists(outcomefile):
+        try:
+            outcome_df = pd.read_csv(outcomefile)
+        except: 
+            print('Error in reading outcome file {} !'.format(outcomefile))
+            oreadable = False
+        if oreadable:
+            print('Outcome file {0} read: {1} rows and {2} columns...'.format(outcomefile, outcome_df.shape[0], outcome_df.shape[1]))
+            if len(outcome_df.columns) != 3:
+                print('Wrong number of columns in outcome file {} ! Expected 3 columns'.format(outcomefile))
+            else:
+                outcome_df.columns = ['ID','status','time']
+                try: 
+                    ocorrfmt = np.all([ i and j for (i,j) in zip([l in [0,1] for l in list(outcome_df.status)], [k>=0 for k in list(outcome_df.time)])])
+                except: 
+                    ofmtcorr1 = False
+                    ocorrfmt = False
+                if not (ofmtcorr1 and ocorrfmt):
+                    print('status and/or time columns in {} are incorrectly formatted!'.format(outcomefile))
+                    if ofmtcorr1==True and ocorrfmt == False:
+                        aw = np.argwhere([ not(i and j) for (i,j) in zip([l in [0,1] for l in list(outcome_df.status)], [k>=0 for k in list(outcome_df.time)])])
+                        if aw.shape[0] > 0:
+                            print('{} {rw} {w} problematic: '.format(aw.shape[0],rw='rows' if aw.shape[0]>1 else 'row',w='were' if aw.shape[0]>1 else 'was'))
+                            print(outcome_df.iloc[list(aw[:,0])])
+                else: 
+                    if any(validIDs):
+                        print('matching mesh motion data IDs to outcome data IDs....')
+                        IDlist_valids = list(np.array(IDlist)[validIDs])
+                        #IDlist_woutc = [ii for ii in IDlist_valids if ii in list(outcome_df.ID)]
+                        IDlist_woutc = list(set(list(outcome_df.ID)).intersection(set(IDlist_valids)))
+                        if len(IDlist_woutc)==0:
+                            print('None of the IDs from the mesh motion data were found in outcome file {}'.format(outcomefile))
+                        else:
+                            print('{1} of {2} valid IDs from mesh motion data were found in outcome file {0}'.format(outcomefile, len(IDlist_woutc), len(IDlist_valids)))
+                            if len(IDlist_woutc) < len(IDlist_valids):
+                                print('The following IDs from the mesh motion data were not found in outcome file {} :'.format(outcomefile))
+                                print([ii for ii in IDlist_valids if ii not in list(outcome_df.ID)])
+                            y = outcome_df[(outcome_df['ID'].isin(IDlist_woutc))]
+                            matchmask = [(u in IDlist_woutc) for u in IDlist_valids]
+                            Xout = X[matchmask]
+                            xshp = Xout.shape
+                            xymatch = (y.shape[0]==xshp[0])
+                            assert xymatch, 'ERROR: mesh motion (x) data has {1} rows while outcome (y) data has {0} rows'.format(y.shape[0], xshp[0])
+                            if xymatch:
+                                Xfin = Xout.reshape(xshp[:2]+(np.prod(xshp[2:]),)).reshape((xshp[0],-1))
+                                plist = [Xfin,np.array(y[['status','time']]),list(y.ID)]
+                                pklname = 'inputdata_DL' + '.pkl'
+                                pklpath = os.path.join(os.getcwd(),'data',pklname)
+                                with open(pklpath, 'wb') as f: pickle.dump(obj=plist, file=f)
+                                print('Mesh motion and corresponding survival data for {0} subjects has been saved in {1}'.format(xshp[0],pklpath))
+
+    else:
+        print('Outcome file {} does not exist! Outcome data cannot be read!'.format(outcomefile))