import argparse
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
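# Weights each edge of an input network by the absolute Pearson correlation of
# its two genes' expression profiles, optionally applies a correlation cutoff,
# and writes the surviving node pairs to the output file. Helper filters for
# expression level and fold change are also defined below.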
def isNum(x):
    '''return True if x can be parsed as a float'''
    try:
        float(x)
        return True
    except (ValueError, TypeError):
        return False
def corrCut(nwk, cutoff=None):
    '''drop edges with missing weights, sort by weight (descending),
    and optionally keep only edges with |weight| >= cutoff'''
    nwk.dropna(subset=['weight'], inplace=True)
    nwk.sort_values(by=['weight'], inplace=True, ascending=False)
    if cutoff is not None:
        return nwk.loc[lambda x: abs(x.weight) >= cutoff]
    return nwk
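# Usage sketch (hypothetical values): with weights [0.9, 0.2, NaN] and
# cutoff=0.5, corrCut keeps only the 0.9 edge; with cutoff=None every
# non-NaN edge is returned, sorted by descending weight.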
def setMinExp(nwk, exp, expCut):
    '''keep only edges whose two genes each reach expCut in at least one sample'''
    # note: the threshold was originally hard-coded as 1; expCut is used here
    # so the parameter actually takes effect
    filtered_gene = exp[exp.max(axis=1) >= expCut]['Hybridization REF']
    boolean_mask = np.logical_and(nwk['protein1'].isin(filtered_gene),
                                  nwk['protein2'].isin(filtered_gene))
    return nwk[boolean_mask]
def expCut(nwk, exp, sample_list, expCut):
    '''keep only edges whose two genes have a group-mean expression
    (mutated or non-mutated samples) of at least expCut'''
    with open(sample_list) as f:
        mutSamples = f.read().strip().split()
    exp['no_mut'] = exp.loc[:, ~exp.columns.isin(mutSamples)].mean(axis=1)
    exp['mut'] = exp.loc[:, exp.columns.isin(mutSamples)].mean(axis=1)
    # note: the threshold was originally hard-coded as 1; expCut is used here
    # so the parameter actually takes effect
    boolean_mask = np.logical_or(exp['no_mut'] >= expCut, exp['mut'] >= expCut)
    gene_selected = exp[boolean_mask]['Hybridization REF']
    boolean_mask2 = np.logical_and(nwk['protein1'].isin(gene_selected),
                                   nwk['protein2'].isin(gene_selected))
    return nwk[boolean_mask2]
def FCcut(nwk, FC_df, FCcut):
    '''keep only edges whose two genes both pass the |fold-change| cutoff
    and whose fold changes have the same sign'''
    fc = dict(zip(FC_df.iloc[:, 0], FC_df.iloc[:, 1]))
    fc.pop('?', None)  # drop the '?' placeholder gene symbol if present
    # map each endpoint to its fold change; genes missing from FC_df become
    # NaN and fail both masks instead of raising KeyError
    first_col = nwk.iloc[:, 0].map(fc).to_numpy()
    second_col = nwk.iloc[:, 1].map(fc).to_numpy()
    boolean_mask = np.logical_and(np.abs(first_col) >= FCcut, np.abs(second_col) >= FCcut)
    boolean_mask2 = first_col * second_col > 0  # same direction of change
    return nwk[boolean_mask & boolean_mask2]
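# Note: setMinExp, expCut, and FCcut are standalone filters that the __main__
# block below does not call; they expect 'protein1'/'protein2' edge columns.
# Hypothetical usage sketch (file names and thresholds are illustrative):
#   nwk = setMinExp(nwk, exp, 1)
#   nwk = expCut(nwk, exp, 'mutated_samples.txt', 1)
#   nwk = FCcut(nwk, fc_df, 1.5)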
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='python 0_instantiate_nwk.py nwk exp -corrCut [] -nThreads [] -o []')
    parser.add_argument('nwk', help='network file (tab-separated edge list)')
    parser.add_argument('exp', help='expression file (tab-separated, genes x samples)')
    parser.add_argument('-corrCut', type=float, required=False, help='correlation cutoff')
    parser.add_argument('-nThreads', type=int, required=False, default=1, help='number of worker processes')
    parser.add_argument('-o', required=True, help='output file')
    args = parser.parse_args()
    #### correlation score combined with string score
    start = time.time()
    exp = pd.read_csv(args.exp, sep='\t', header=0, index_col=0)
    # read the edge list; sorting each pair lets duplicates listed in either
    # order collapse to a single row
    data = []
    with open(args.nwk) as f:
        for line in f:
            tmp = line.strip().split('\t')
            data.append(sorted(tmp))
    # data[1:] skips the header line; node identifiers stay strings so they
    # can be matched against exp.index (the original dtype=float would break
    # that match for gene-symbol identifiers)
    df_nwk = pd.DataFrame(data[1:], columns=['node_1', 'node_2'], dtype=str)
    df_nwk.drop_duplicates(subset=['node_1', 'node_2'], inplace=True)
    # keep edges whose two endpoints both appear in the expression matrix,
    # and drop self-loops
    df_nwk_filt = (df_nwk
                   .loc[lambda x: np.logical_and(x.node_1.isin(exp.index), x.node_2.isin(exp.index))]
                   .loc[lambda x: x.node_1 != x.node_2])
    # build a gene -> expression-vector dictionary for the correlation workers
    lst_exps = dict()
    with open(args.exp) as f:
        for line in f:
            s = line.strip().split('\t')
            # skip the header row (and any line whose first value is not numeric)
            if len(s) < 2 or not isNum(s[1]):
                continue
            gene, exps = s[0], list(map(float, s[1:]))
            lst_exps[gene] = exps
    lst_pairs2 = zip(df_nwk_filt['node_1'], df_nwk_filt['node_2'])
    def myCorr(x):
        g1, g2 = sorted(x)
        # pearsonr is undefined for a constant series, so report zero correlation
        if np.all(np.array(lst_exps[g1]) == lst_exps[g1][0]) or np.all(np.array(lst_exps[g2]) == lst_exps[g2][0]):
            val = 0.0
        else:
            val, _ = pearsonr(lst_exps[g1], lst_exps[g2])
        return (g1, g2, abs(val))
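    # myCorr reads lst_exps from this module's globals, which the worker
    # processes inherit under the 'fork' start method (the default on Linux);
    # under 'spawn' (Windows, and macOS on recent Python versions) the workers
    # would not see a function defined inside this __main__ block.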
    p = Pool(args.nThreads)
    res2 = p.imap_unordered(myCorr, lst_pairs2)
    p.close()
    p.join()
    corr_res2 = []
    for g1, g2, val in res2:
        if g1 == g2:  # self-loops were already removed above; kept as a safety net
            continue
        corr_res2.append([g1, g2, val])
    df_nwk_corr = pd.DataFrame(corr_res2, columns=['node_1', 'node_2', 'weight'])
    df_nwk_corrCut = corrCut(df_nwk_corr, args.corrCut)
    end = time.time()
    time_elapsed = end - start
    print('tr_tr', df_nwk_corrCut.shape)
    # write the surviving edges (node pairs only; the weight column is dropped)
    df_nwk_corrCut.loc[:, ['node_1', 'node_2']].to_csv(args.o, sep='\t', header=False, index=False)
    print(args.o, 'time_elapsed', time_elapsed)