--- a
+++ b/inputdata_setup.py
@@ -0,0 +1,197 @@
+"""Assemble mesh motion and survival outcome data into a single model-input file.
+
+For each subject, the per-frame RV mesh vertex files are read, displacements of a
+matched subset of vertices relative to frame 0 are computed, and the result is
+paired with the survival outcomes (status, time) and saved as a pickle under ./data.
+"""
+import os, sys
+import numpy as np
+import pandas as pd
+import fnmatch
+import pickle
+
+
+def all_files_exist(flist):
+    """Return True only if every path in flist is an existing file."""
+    numfiles = len(flist)
+    allexist = True
+    co = 0
+    while allexist and co < numfiles:
+        allexist = os.path.isfile(flist[co])
+        co += 1
+    return allexist
+
+
+def file_len(fname):
+    """Return the number of lines in fname, or -1 if the file cannot be read."""
+    if os.path.isfile(fname) and os.path.getsize(fname) > 0:
+        with open(fname) as f:
+            for i, l in enumerate(f):
+                pass
+        return i + 1
+    else:
+        print('Failed to read {}!'.format(fname))
+        return -1
+
+
+try:
+    stubdir = sys.argv[1]
+    print('Reading mesh motion data from directory {}....'.format(stubdir))
+except IndexError:
+    print('Please pass the name of the directory containing the segmented data...')
+    sys.exit(1)
+
+
+mpointsfile = 'matchedpointsnew.txt'
+pim2 = 'subjnames.txt'
+outcomefile = os.path.join(stubdir, 'surv_outcomes.csv')
+stepsuccess = [True for _ in range(3)]
+
+
+# Read matched-points file (its second column indexes the mesh vertices to extract)
+try:
+    mpoints = np.loadtxt(os.path.join(stubdir, mpointsfile), dtype=int)
+except:
+    stepsuccess[0] = False
+    print('{} read failed!'.format(mpointsfile))
+
+# Read list of subjects
+try:
+    with open(os.path.join(stubdir, pim2)) as f:
+        IDlist = [lin.strip('\n') for lin in f.readlines() if len(lin) > 1]
+except:
+    stepsuccess[1] = False
+    print('{} read failed!'.format(pim2))
+
+# Find the number of vertices from the first subject's frame-0 mesh
+if stepsuccess[1]:
+    try:
+        meshtxtfile = os.path.join(stubdir, IDlist[0], 'motion/RV_fr00.txt')
+        num_vertx = file_len(meshtxtfile)
+        if num_vertx <= 0:
+            stepsuccess[2] = False
+            print('There was a problem reading {} in order to determine the number of vertices in the 3D meshes!'.format(meshtxtfile))
+    except:
+        stepsuccess[2] = False
+        print('Failed to determine the number of vertices in the 3D meshes from the first subject listed in {}!'.format(pim2))
+
+
+validIDs = [False]
+numframes = 20  # expected number of motion frames per subject (frame 00 is the reference)
+if all(stepsuccess):
+    print('\n\n------------------------------------------')
+    print('Reading mesh motion data from directory {}...'.format(stubdir))
+    print('Subject IDs will be read from file {}...'.format(pim2))
+    print('Expected number of vertices per mesh = {0}, of which {1} will be extracted'.format(num_vertx, mpoints.shape[0]))
+    print('Outcome data will be read from file {}...'.format(outcomefile))
+    print('------------------------------------------\n\n\n')
+    if os.path.exists(stubdir):
+        if len(IDlist) > 0:
+            validIDs = [False for _ in range(len(IDlist))]
+            # displacement of each matched vertex relative to frame 0, for frames 1..numframes-1
+            X_all = np.zeros(shape=(len(IDlist), (numframes - 1), mpoints.shape[0], 3), dtype=float)
+            for counter, ID in enumerate(IDlist):
+                if os.path.exists(os.path.join(stubdir, ID)):
+                    if os.path.exists(os.path.join(stubdir, ID, 'motion')):
+                        frames_file_list = [os.path.join(stubdir, ID, 'motion/RV_fr' + '{:0>2}'.format(b) + '.txt') for b in range(numframes)]
+                        if all_files_exist(frames_file_list):
+                            nframes = len(fnmatch.filter(os.listdir(os.path.join(stubdir, ID, 'motion')), 'RV_fr*.txt'))
+                            if nframes == numframes:
+                                if np.sum([file_len(frames_file_list[i]) == num_vertx for i in range(numframes)]) == numframes:
+                                    vs = [True for _ in range(numframes)]
+                                    try:
+                                        coords_fr0 = np.loadtxt(frames_file_list[0])[mpoints[:, 1]]
+                                    except:
+                                        print('Error! Could not read file {} !'.format(frames_file_list[0]))
+                                        vs[0] = False
+                                    if vs[0]:
+                                        for j in range(1, numframes):
+                                            try:
+                                                coords_frj = np.loadtxt(frames_file_list[j])[mpoints[:, 1]]
+                                            except:
+                                                print('Error! Could not read file {} !'.format(frames_file_list[j]))
+                                                vs[j] = False
+                                            if vs[j]:
+                                                X_all[counter, j - 1, :, :] = coords_frj - coords_fr0
+                                            else:
+                                                break
+                                    if np.all(vs):
+                                        validIDs[counter] = True
+                                        print('Successfully read motion data for ID {}'.format(ID))
+                                else:
+                                    print('{0} : wrong # of vertices, expected {1} for all {2} frames but got {3}'.format(ID, num_vertx, numframes, str([file_len(frames_file_list[i]) for i in range(numframes)])))
+                            else:
+                                print('{0} : RV files exist but not {1} in number. Skipping to next ID....'.format(ID, numframes))
+                        else:
+                            print(ID + ' : folder exists but not all RV files exist. Skipping to next ID....')
+                    else:
+                        print('There is no motion folder under directory {} !'.format(os.path.join(stubdir, ID)))
+                else:
+                    print('{0} folder does not exist under directory {1}'.format(ID, stubdir))
+        else:
+            print('No IDs found in {} !'.format(pim2))
+    else:
+        print('Directory {} meant to contain the subject IDs does not exist!'.format(stubdir))
+
+
+if any(validIDs):
+    numvalids = np.sum(validIDs)
+    print('{} IDs with valid mesh motion data were found'.format(numvalids))
+    X = X_all[validIDs]
+else:
+    print('No valid mesh motion data could be read!')
+
+
+# Processing outcome data
+
+# Read outcome master file - Column 1: ID, Column 2: censoring status, Column 3: time to event/censoring
+# Tests of outcome file:
+#   - number of columns is 3
+#   - columns ordered correctly: ID, status, time
+#   - columns contain correct data (ID is a string, status = 0 or 1, time >= 0)
+if any(validIDs):
+    oreadable = True
+    ofmtcorr1 = True
+    if os.path.exists(outcomefile):
+        try:
+            outcome_df = pd.read_csv(outcomefile)
+        except:
+            print('Error in reading outcome file {} !'.format(outcomefile))
+            oreadable = False
+        if oreadable:
+            print('Outcome file {0} read: {1} rows and {2} columns...'.format(outcomefile, outcome_df.shape[0], outcome_df.shape[1]))
+            if len(outcome_df.columns) != 3:
+                print('Wrong number of columns in outcome file {} ! Expected 3 columns'.format(outcomefile))
+            else:
+                outcome_df.columns = ['ID', 'status', 'time']
+                try:
+                    ocorrfmt = np.all([i and j for (i, j) in zip([l in [0, 1] for l in list(outcome_df.status)], [k >= 0 for k in list(outcome_df.time)])])
+                except:
+                    ofmtcorr1 = False
+                    ocorrfmt = False
+                if not (ofmtcorr1 and ocorrfmt):
+                    print('status and/or time columns in {} are incorrectly formatted!'.format(outcomefile))
+                    if ofmtcorr1 and not ocorrfmt:
+                        aw = np.argwhere([not (i and j) for (i, j) in zip([l in [0, 1] for l in list(outcome_df.status)], [k >= 0 for k in list(outcome_df.time)])])
+                        if aw.shape[0] > 0:
+                            print('{} {rw} {w} problematic: '.format(aw.shape[0], rw='rows' if aw.shape[0] > 1 else 'row', w='were' if aw.shape[0] > 1 else 'was'))
+                            print(outcome_df.iloc[list(aw[:, 0])])
+                else:
+                    print('Matching mesh motion data IDs to outcome data IDs....')
+                    IDlist_valids = list(np.array(IDlist)[validIDs])
+                    IDlist_woutc = list(set(list(outcome_df.ID)).intersection(set(IDlist_valids)))
+                    if len(IDlist_woutc) == 0:
+                        print('None of the IDs from the mesh motion data were found in outcome file {}'.format(outcomefile))
+                    else:
+                        print('{1} of {2} valid IDs from the mesh motion data were found in outcome file {0}'.format(outcomefile, len(IDlist_woutc), len(IDlist_valids)))
+                        if len(IDlist_woutc) < len(IDlist_valids):
+                            print('The following IDs from the mesh motion data were not found in outcome file {} :'.format(outcomefile))
+                            print([ii for ii in IDlist_valids if ii not in list(outcome_df.ID)])
+                        matchmask = [(u in IDlist_woutc) for u in IDlist_valids]
+                        Xout = X[matchmask]
+                        # order the outcome rows to follow the row order of Xout so that features and outcomes stay aligned
+                        IDs_in_x_order = [u for u in IDlist_valids if u in IDlist_woutc]
+                        y = outcome_df.set_index('ID').loc[IDs_in_x_order].reset_index()
+                        xshp = Xout.shape
+                        xymatch = (y.shape[0] == xshp[0])
+                        assert xymatch, 'ERROR: mesh motion (x) data has {1} rows while outcome (y) data has {0} rows'.format(y.shape[0], xshp[0])
+                        # flatten the per-frame, per-vertex displacements into one feature vector per subject
+                        Xfin = Xout.reshape((xshp[0], -1))
+                        plist = [Xfin, np.array(y[['status', 'time']]), list(y.ID)]
+                        pklname = 'inputdata_DL' + '.pkl'
+                        pklpath = os.path.join(os.getcwd(), 'data', pklname)
+                        os.makedirs(os.path.dirname(pklpath), exist_ok=True)  # output goes to ./data under the current working directory
+                        with open(pklpath, 'wb') as f:
+                            pickle.dump(obj=plist, file=f)
+                        print('Mesh motion and corresponding survival data for {0} subjects has been saved in {1}'.format(xshp[0], pklpath))
+    else:
+        print('Outcome file {} does not exist! Outcome data cannot be read!'.format(outcomefile))
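
For reference, a minimal sketch of how the saved pickle could be loaded downstream. This loader is illustrative and not part of the patch; it only assumes the default ./data/inputdata_DL.pkl path written by the script above:

    import os
    import pickle

    # load the [features, outcomes, IDs] list written by inputdata_setup.py
    with open(os.path.join(os.getcwd(), 'data', 'inputdata_DL.pkl'), 'rb') as f:
        Xfin, outcomes, IDs = pickle.load(f)
    # Xfin     : (n_subjects, (numframes-1) * n_matched_points * 3) flattened displacement features
    # outcomes : (n_subjects, 2) array with columns [status, time]
    # IDs      : list of subject IDs aligned with the rows of Xfin and outcomes
    print('{} subjects, {} features each'.format(Xfin.shape[0], Xfin.shape[1]))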