--- /dev/null
+++ b/modules/instantiate_nwk.py
@@ -0,0 +1,137 @@
+import pandas as pd
+import argparse
+import numpy as np
+import time
+from multiprocessing import Pool
+from scipy.stats import pearsonr
+
+def isNum(x):
+    '''return True if x can be parsed as a float'''
+    try:
+        float(x)
+        return True
+    except (TypeError, ValueError):
+        return False
+
+def corrCut(nwk, cutoff=None):
+    '''drop edges without a weight, sort by weight (descending), and
+    keep edges whose absolute weight passes the cutoff (if one is given)'''
+    nwk.dropna(subset=['weight'], inplace=True)
+    nwk.sort_values(by=['weight'], inplace=True, ascending=False)
+    if cutoff is not None:
+        return nwk.loc[lambda x: abs(x.weight) >= cutoff]
+    else:
+        return nwk
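+
+# A toy illustration (hypothetical data) of what corrCut keeps: the NaN-weight
+# edge is dropped, rows are sorted by weight in descending order, and only
+# edges with |weight| >= cutoff survive:
+#   >>> toy = pd.DataFrame({'node_1': ['A', 'B', 'C'],
+#   ...                     'node_2': ['B', 'C', 'D'],
+#   ...                     'weight': [0.2, 0.9, None]})
+#   >>> corrCut(toy, cutoff=0.5)
+#     node_1 node_2  weight
+#   1      B      C     0.9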
+
+def setMinExp(nwk, exp, cutoff):
+    '''remove genes whose expression never reaches cutoff in any sample'''
+    filtered_gene = exp[exp.max(axis=1) > cutoff]['Hybridization REF']
+    boolean_mask = np.logical_and(nwk['protein1'].isin(filtered_gene), nwk['protein2'].isin(filtered_gene))
+    return nwk[boolean_mask]
+
+def expCut(nwk, exp, sample_list, cutoff):
+    '''remove genes whose mean expression is below cutoff in both the
+    mutated and the not-mutated sample group'''
+    with open(sample_list) as f:
+        mutSamples = f.read().strip().split()
+    exp['no_mut'] = exp.loc[:, ~exp.columns.isin(mutSamples)].mean(axis=1)
+    exp['mut'] = exp.loc[:, exp.columns.isin(mutSamples)].mean(axis=1)
+    boolean_mask = np.logical_or(exp['no_mut'] >= cutoff, exp['mut'] >= cutoff)
+    gene_selected = exp[boolean_mask]['Hybridization REF']
+    boolean_mask2 = np.logical_and(nwk['protein1'].isin(gene_selected), nwk['protein2'].isin(gene_selected))
+    return nwk[boolean_mask2]
+
+def FCcut(nwk, FC_df, cutoff):
+    '''keep edges whose two endpoints both have |fold change| >= cutoff
+    and fold changes of the same sign'''
+    keys = FC_df.iloc[:, 0]
+    values = FC_df.iloc[:, 1]
+    dictionary = dict(zip(keys, values))
+    dictionary.pop('?', None)  # drop the placeholder gene symbol if present
+    first_col = np.array([dictionary[i] for i in nwk.iloc[:, 0]])
+    second_col = np.array([dictionary[i] for i in nwk.iloc[:, 1]])
+    boolean_mask = np.logical_and(abs(first_col) >= cutoff, abs(second_col) >= cutoff)
+    boolean_mask2 = nwk['protein1'].apply(lambda x: dictionary[x]) * nwk['protein2'].apply(lambda x: dictionary[x]) > 0
+    bigFC_nwk = nwk[boolean_mask & boolean_mask2]
+    return bigFC_nwk
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='python instantiate_nwk.py nwk exp -corrCut [] -nThreads [] -o []')
+    parser.add_argument('nwk', help='network edge list (tab-separated)')
+    parser.add_argument('exp', help='expression file (tab-separated)')
+    parser.add_argument('-corrCut', type=float, required=False, help='correlation cutoff')
+    parser.add_argument('-nThreads', type=int, required=False, default=1)
+    parser.add_argument('-o', required=True, help='output file')
+    args = parser.parse_args()
+
+    #### correlation score combined with STRING score
+    start = time.time()
+    exp = pd.read_csv(args.exp, sep='\t', header=0, index_col=0)
+
+    # read edges; sort each pair so (A,B) and (B,A) collapse as duplicates
+    data = []
+    with open(args.nwk) as f:
+        for line in f:
+            tmp = line.strip().split('\t')
+            data.append(sorted(tmp))
+
+    # node names are strings, so no dtype coercion here
+    df_nwk = pd.DataFrame(data[1:], columns=['node_1', 'node_2'])
+
+    df_nwk.drop_duplicates(subset=['node_1', 'node_2'], inplace=True)
+    # keep edges whose endpoints are both measured in the expression data; drop self-loops
+    df_nwk_filt = df_nwk.loc[lambda x: np.logical_and(x.node_1.isin(exp.index), x.node_2.isin(exp.index))].loc[lambda x: x.node_1 != x.node_2]
+
+    # gene -> expression vector dictionary used to compute edge weights
+    lst_exps = dict()
+    with open(args.exp) as f:
+        for line in f:
+            s = line.strip().split('\t')
+            if not isNum(s[1]):  # skip the header row
+                continue
+            gene, exps = s[0], list(map(float, s[1:]))
+            lst_exps[gene] = exps
+    lst_pairs2 = zip(df_nwk_filt['node_1'], df_nwk_filt['node_2'])
+
+    def myCorr(x):
+        '''weight an edge by the absolute Pearson correlation of its endpoints'''
+        g1, g2 = sorted(x)
+        # pearsonr is undefined for a constant vector, so give those edges zero weight
+        if np.all(np.array(lst_exps[g1]) == lst_exps[g1][0]) or np.all(np.array(lst_exps[g2]) == lst_exps[g2][0]):
+            val, pval = (0, 1)
+        else:
+            val, pval = pearsonr(lst_exps[g1], lst_exps[g2])
+        return (g1, g2, abs(val))
+
+    # myCorr is defined under the __main__ guard, so the pool relies on the
+    # fork start method (the default on Linux) for workers to see it
+    p = Pool(args.nThreads)
+    res2 = p.imap_unordered(myCorr, lst_pairs2)
+    p.close()
+    p.join()
+
+    corr_res2 = []
+    for g1, g2, val in res2:
+        if g1 == g2:  # self-loops were already filtered; kept as a safety net
+            continue
+        corr_res2.append([g1, g2, val])
+
+    df_nwk_corr = pd.DataFrame(corr_res2, columns=['node_1', 'node_2', 'weight'])
+    df_nwk_corrCut = corrCut(df_nwk_corr, args.corrCut)
+
+    end = time.time()
+    time_elapsed = end - start
+    print('tr_tr', df_nwk_corrCut.shape)  # edges kept after the correlation cutoff
+    #df_nwk_corrCut.to_csv(args.o, sep='\t', header=False, index=False)
+    df_nwk_corrCut.loc[:, ['node_1', 'node_2']].to_csv(args.o, sep='\t', header=False, index=False)
+    #print(args.o, 'time_elapsed', time_elapsed)
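
Usage sketch (file names and cutoff value are illustrative): the script expects a two-column, tab-separated edge list with a header row, plus a tab-separated expression matrix whose first column holds gene identifiers.

    python modules/instantiate_nwk.py string_edges.tsv expression.tsv -corrCut 0.5 -nThreads 8 -o instantiated_edges.tsv

The output written to -o is the two-column edge list that survives the absolute-Pearson-correlation cutoff; the weights themselves are dropped on write (the commented-out to_csv call is the variant that would keep them).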