|
# modas/regiongwas.py
|
import pandas as pd
import numpy as np
import modas.multiprocess as mp
import modas.gwas_cmd as gc
from sklearn.decomposition import PCA
from rpy2.robjects import pandas2ri
from rpy2.rinterface_lib.embedded import RRuntimeError
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.rinterface_lib.callbacks import logger as rpy2_logger
import subprocess
import logging
import glob
import os
import shutil
import re
|
|
pandas2ri.activate()
rpy2_logger.setLevel(logging.ERROR)  # silence R warnings routed through rpy2
rMVP = importr('rMVP')
base = importr('base')
bigmemory = importr('bigmemory')

# Locate the bundled helper binaries (gemma/plink) shipped under modas/utils.
# This relies on `locate` being installed and its database being up to date.
utils_path = subprocess.check_output('locate modas/utils', shell=True, text=True, encoding='utf-8')
#utils_path = '/'.join(re.search('\n(.*site-packages.*)\n', utils_path).group(1).split('/')[:-1])
utils_path = re.search('\n(.*site-packages.*)\n', utils_path).group(1)
if not utils_path.endswith('utils'):
    utils_path = '/'.join(utils_path.split('/')[:-1])
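
# A note on the lookup above: `locate` only sees paths already indexed in the
# mlocate database, so fresh installs may not be found. A hedged alternative
# (not what this module itself does) would be to resolve the bundled binaries
# relative to the installed package:
#   import modas
#   utils_path = os.path.join(os.path.dirname(modas.__file__), 'utils')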
|
|
def region_gwas_parallel(bed_dir, threads, geno, gwas_model):
    """Run per-region GWAS in parallel: GEMMA for LM/MLM, rMVP for GLM."""
    region_gwas_args = list()
    geno_prefix = geno.split('/')[-1]
    if gwas_model in ('GLM', 'MLM'):
        # link the genotype under a local prefix and fill the phenotype
        # column of the .fam with a dummy value
        fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
        fam[5] = 1
        fam.to_csv(geno_prefix + '.link.fam', sep='\t', na_rep='NA', header=False, index=False)
        if os.path.exists(geno_prefix + '.link.bed'):
            os.remove(geno_prefix + '.link.bed')
        if os.path.exists(geno_prefix + '.link.bim'):
            os.remove(geno_prefix + '.link.bim')
        os.symlink(geno + '.bed', geno_prefix + '.link.bed')
        os.symlink(geno + '.bim', geno_prefix + '.link.bim')
    if gwas_model == 'MLM':
        related_matrix_cmd = utils_path + '/gemma -bfile {0}.link -gk 1 -o {1}'.format(geno_prefix, geno_prefix)
        s = mp.run(related_matrix_cmd)
        if s != 0:
            return None
    # if gwas_model=='MLM':
    #     gemma_cmd = 'gemma.linux -bfile {0} -k ./output/{1}.cXX.txt -lmm -n 1 -o {2}'
    # elif gwas_model=='LM':
    #     gemma_cmd = 'gemma.linux -bfile {0} -lm -o {1}'

    for i in glob.glob(bed_dir + '/*.bed'):
        i = i.replace('.bed', '')
        i = i.replace('m/z', 'm.z')
        prefix = i.split('/')[-1]
        if gwas_model == 'MLM':
            region_gwas_args.append((gc.gemma_cmd('MLM', i, geno_prefix, 1, prefix + '_plink'),))
            # region_gwas_args.append((gemma_cmd.format(i, geno_prefix, prefix+'_plink'),))
        elif gwas_model == 'LM':
            region_gwas_args.append((gc.gemma_cmd('LM', i, None, None, prefix + '_plink'),))
            # region_gwas_args.append((gemma_cmd.format(i, prefix+'_plink'),))
        else:
            phe = pd.read_csv(i + '.fam', sep=r'\s+', header=None)
            phe = phe.iloc[:, [0, -1]]
            phe.columns = ['Taxa', prefix]
            # region_gwas_args.append((phe, '../'+geno_prefix + '.link', '../'+i, 1))
            region_gwas_args.append(('GLM', geno_prefix + '.link', i, phe, 1, './output'))
    if gwas_model in ('LM', 'MLM'):
        s = mp.parallel(mp.run, region_gwas_args, threads)
    else:
        if not os.path.exists('./output'):
            os.mkdir('./output')
        # os.chdir('./output')
        # s = mp.parallel(glm_gwas, (region_gwas_args[0],), 1)
        # s = mp.parallel(glm_gwas, region_gwas_args[1:], threads)
        # os.chdir('../')
        # the first rMVP job runs alone, presumably so shared setup (genotype
        # conversion) finishes before the parallel batch starts
        s = mp.parallel(gc.rmvp, (region_gwas_args[0],), 1)
        s = mp.parallel(gc.rmvp, region_gwas_args[1:], threads)
    if gwas_model in ('GLM', 'MLM'):
        os.remove(geno_prefix + '.link.bed')
        os.remove(geno_prefix + '.link.bim')
        os.remove(geno_prefix + '.link.fam')
    return s
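
# Illustrative call (paths and thread count are hypothetical): run MLM GWAS on
# every region bed file under ./qtl_bed, using the kinship matrix GEMMA writes
# to ./output:
#   s = region_gwas_parallel('./qtl_bed', 8, '/data/maize/geno', 'MLM')
# `s` is the list of per-task exit codes from mp.parallel, or None when the
# kinship step itself fails.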
|
|
def glm_gwas(omics_phe, pc_geno_prefix, geno_prefix, threads):
    """Single GLM GWAS run via rMVP; returns 1 on success, 0 on an R-side error."""
    try:
        base.sink('/dev/null')  # discard rMVP console output
        if not os.path.exists(pc_geno_prefix + '.pc.desc'):
            rMVP.MVP_Data(fileBed=pc_geno_prefix, fileKin=False, filePC=False, out=pc_geno_prefix,
                          verbose=False)
            rMVP.MVP_Data_PC(True, mvp_prefix=pc_geno_prefix, pcs_keep=3, verbose=False)
        rMVP.MVP_Data(fileBed=geno_prefix, fileKin=False, filePC=False, out=geno_prefix, verbose=False)
        geno = bigmemory.attach_big_matrix(geno_prefix + '.geno.desc')
        map_file = pd.read_csv(geno_prefix + '.geno.map', sep='\t')
        Covariates_PC = bigmemory.as_matrix(bigmemory.attach_big_matrix(pc_geno_prefix + '.pc.desc'))
        # base.setwd('./output')
        rMVP.MVP(phe=omics_phe, geno=geno, map=map_file, CV_GLM=Covariates_PC, priority="speed",
                 ncpus=threads, maxLoop=10, threshold=0.05, method=['GLM'], file_output=True, verbose=False)
        base.sink()
        # gwas(omics_phe, pc_geno_prefix, geno_prefix, threads)
    except (RRuntimeError, ValueError):
        return 0
    else:
        return 1
|
|
def generate_qtl_batch(omics_phe, phe_sig_qtl, geno_name, threads, bed_dir, rs_dir):
    """Extract one bed subset per significant QTL region and write the phenotype into its .fam."""
    plink_extract = utils_path + '/plink --bfile {} --extract {} --make-bed --out {}'
    bim = pd.read_csv(geno_name + '.bim', sep='\t', header=None)
    qtl_batch = list()
    rs = dict()
    for index, row in phe_sig_qtl.iterrows():
        rs.setdefault(row['phe_name'], []).extend(bim.loc[(bim[0] == row['chr']) & (bim[3] >= row['start']) & (bim[3] <= row['end']), 1].values.tolist())
    for phe_name in rs:
        out_name = bed_dir.rstrip('/') + '/' + '_'.join(['tmp', phe_name])
        rs_name = rs_dir.rstrip('/') + '/' + '_'.join(['tmp', phe_name, 'rs.txt'])
        pd.Series(rs[phe_name]).to_frame().to_csv(rs_name, index=False, header=False)
        qtl_batch.append((plink_extract.format(geno_name, rs_name, out_name),))
    mp.parallel(mp.run, qtl_batch, threads)
    for fn in glob.glob(bed_dir.rstrip('/') + '/*fam'):
        fam = pd.read_csv(fn, sep=' ', header=None)
        phe_name = '_'.join(fn.split('/')[-1].split('_')[1:]).replace('m.z', 'm/z').replace('.fam', '')
        fam.loc[:, 5] = omics_phe.loc[:, phe_name].reindex(fam.loc[:, 0]).values
        fam.to_csv(fn, index=False, header=False, sep=' ', na_rep='NA')
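
# Sketch of the expected `phe_sig_qtl` layout (toy values; the function reads
# the phe_name/chr/start/end columns used above):
#   phe_name     chr    start      end
#   m/z0001.23     1  1500000  2600000
# Each row becomes one tmp_<phe_name> bed subset in `bed_dir` plus a matching
# tmp_<phe_name>_rs.txt SNP list in `rs_dir`, with the phenotype written into
# column 6 of the .fam file.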
|
|
# def generate_clump_input(dir, num_threads):
#     if os.path.exists('./clump_input'):
#         shutil.rmtree('./clump_input')
#     os.mkdir('./clump_input')
#     cmd = '''awk '{if(NR==1)print "SNP\\tP"; else print $2"\\t"$11}' '''
#     cmds = list()
#     fns = list()
#     for fn in glob.glob(dir.strip('/')+'/*_plink.assoc.txt'):
#         filename = fn.split('/')[-1]
#         cmds.append((cmd+'{0} > ./clump_input/{1}'.format(fn, filename.replace('_plink.assoc.txt', '.assoc')),))
#         fns.append(filename)
#     s = mp.parallel(mp.run, cmds, num_threads)
#     if sum(s) != 0:
#         print(','.join(list(np.array(fns)[s]))+' do not successfully generated clump input file.')
#     return s
|
|
def generate_clump_input(dir, gwas_model):
    """Reduce GWAS output to the two-column SNP/P format PLINK --clump reads."""
    if os.path.exists('./clump_input'):
        shutil.rmtree('./clump_input')
    os.mkdir('./clump_input')
    if gwas_model in ('LM', 'MLM'):
        # GEMMA output: keep the SNP id and the Wald-test p-value
        for fn in glob.glob(dir.rstrip('/') + '/*_plink.assoc.txt'):
            filename = fn.split('/')[-1]
            assoc = pd.read_csv(fn, sep='\t')
            assoc = assoc[['rs', 'p_wald']]
            assoc.columns = ['SNP', 'P']
            assoc.to_csv('./clump_input/' + filename.replace('_plink.assoc.txt', '.assoc'), index=False, sep='\t')
    else:
        # rMVP GLM output: first column is the SNP id, last is the p-value
        for fn in glob.glob(dir.rstrip('/') + '/tmp_*GLM.csv'):
            filename = fn.split('/')[-1]
            assoc = pd.read_csv(fn)
            assoc = assoc.iloc[:, [0, -1]]
            assoc.columns = ['SNP', 'P']
            assoc.to_csv('./clump_input/' + filename.replace('.GLM.csv', '.assoc'), index=False, sep='\t')
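
# The resulting ./clump_input/<phe>.assoc files are two-column, tab-separated
# SNP/P tables, which matches the default field names (--clump-snp-field SNP,
# --clump-field P) that PLINK's --clump step below expects.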
|
|
def plink_clump(geno_path, p1, p2, num_threads):
    """LD-clump every association file in ./clump_input with PLINK."""
    if os.path.exists('./clump_result'):
        shutil.rmtree('./clump_result')
    os.mkdir('./clump_result')
    cmd = utils_path + '/plink --bfile {0} --clump {1} --clump-p1 {2} --clump-p2 {3} --clump-kb {4} --clump-r2 0.2 --out {5}'
    cmds = list()
    ms = list()
    for fn in glob.glob('./clump_input/*'):
        phe_name = fn.split('/')[-1].replace('.assoc', '')
        cmds.append((cmd.format(geno_path + '/' + phe_name, fn, p1, p2, str(500), './clump_result/' + phe_name + '_' + str(500)),))
        ms.append(phe_name)
    s = mp.parallel(mp.run, cmds, num_threads)
    if sum(s) != 0:
        failed = np.array(ms)[np.array(s) != 0]
        print(','.join(failed.tolist()) + ' failed to generate clumped files.')
    return s
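
# Illustrative call (arguments are hypothetical): clump the association files
# produced by generate_clump_input, with index-SNP threshold p1=1e-5 and
# secondary-SNP threshold p2=1e-3, against the per-phenotype bed subsets:
#   s = plink_clump('./qtl_bed', 1e-5, 1e-3, 8)
# The window (500 kb) and LD cutoff (r2=0.2) are fixed in the command template.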
|
|
# def merge_qtl(qtl):
#     qtl = qtl.sort_values(by=['CHR','BP'])
#     merged_qtl = list()
#     for index,row in qtl.iterrows():
#         if not merged_qtl:
#             merged_qtl.append(row)
#         else:
#             if row['CHR'] != merged_qtl[-1]['CHR']:
#                 merged_qtl.append(row)
#             else:
#                 if row['BP'] - merged_qtl[-1]['BP'] <= 1000000:
#                     if row['P'] < merged_qtl[-1]['P']:
#                         merged_qtl[-1]['P'] = row['P']
#                         merged_qtl[-1]['BP'] = row['BP']
#                         merged_qtl[-1]['SNP'] = row['SNP']
#                     merged_qtl[-1]['SP2_num'] += row['SP2_num']
#                     merged_qtl[-1]['SP2'] += ',' + row['SP2']
#                 else:
#                     merged_qtl.append(row)
#     merged_qtl = pd.DataFrame(merged_qtl)
#     return merged_qtl
|
|
def merge_qtl_phe(qtl):
    """Merge QTL of a single phenotype whose intervals fall within 3 Mb on the same chromosome."""
    qtl = qtl.sort_values(by=['CHR', 'qtl_start'])
    merged_phe_qtl = list()
    for index, row in qtl.iterrows():
        if not merged_phe_qtl:
            merged_phe_qtl.append(row)
        elif row['CHR'] != merged_phe_qtl[-1]['CHR']:
            merged_phe_qtl.append(row)
        elif row['qtl_start'] < merged_phe_qtl[-1]['qtl_end'] + 3000000:
            # collapse into the previous QTL: keep the stronger peak, widen the
            # interval to the union, and accumulate the supporting SNP count
            if row['P'] < merged_phe_qtl[-1]['P']:
                merged_phe_qtl[-1]['P'] = row['P']
                merged_phe_qtl[-1]['SNP'] = row['SNP']
            merged_phe_qtl[-1]['qtl_start'] = min(merged_phe_qtl[-1]['qtl_start'], row['qtl_start'])
            merged_phe_qtl[-1]['qtl_end'] = max(merged_phe_qtl[-1]['qtl_end'], row['qtl_end'])
            merged_phe_qtl[-1]['SP2_num'] += row['SP2_num']
        else:
            merged_phe_qtl.append(row)
    merged_phe_qtl = pd.DataFrame(merged_phe_qtl)
    return merged_phe_qtl
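
# Toy example of the rule above: two chr1 QTL at 1.0-2.0 Mb (P=1e-8, SP2_num=12)
# and 3.5-4.5 Mb (P=1e-6, SP2_num=7) start within 3 Mb of each other's bounds,
# so they collapse into a single record spanning 1.0-4.5 Mb with P=1e-8 and
# SP2_num=19; a QTL starting beyond the 3 Mb window would be kept separate.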
|
|
def merge_qtl(qtl):
    """Merge overlapping QTL across phenotypes; phe_name becomes a comma-separated list."""
    qtl = qtl.sort_values(by=['CHR', 'qtl_start'])
    merged_qtl = pd.DataFrame()
    for index, row in qtl.iterrows():
        if merged_qtl.empty:
            merged_qtl = pd.concat([merged_qtl, row.to_frame().T])
        else:
            qtl_length = row['qtl_end'] - row['qtl_start']
            qtl_ratio = (merged_qtl['qtl_end'] - row['qtl_start']) / qtl_length
            qtl_index = (merged_qtl['CHR'] == row['CHR']) & (qtl_ratio >= 0.1)
            if qtl_index.sum() > 0:
                # distance between the current peak and each candidate QTL's
                # peak, both parsed from the trailing '_<bp>' of the SNP id
                peak_dis = (merged_qtl.loc[qtl_index, 'SNP'].apply(lambda x: int(x.split('_')[-1])) - int(row['SNP'].split('_')[-1])).abs()
                if (peak_dis <= 2000000).sum() == 0:
                    merged_qtl = pd.concat([merged_qtl, row.to_frame().T])
                else:
                    merged_qtl_index = peak_dis.idxmin()
                    if merged_qtl.loc[merged_qtl_index, 'P'] > row['P']:
                        merged_qtl.loc[merged_qtl_index, 'P'] = row['P']
                        merged_qtl.loc[merged_qtl_index, 'SNP'] = row['SNP']
                    merged_qtl.loc[merged_qtl_index, 'qtl_start'] = min(merged_qtl.loc[merged_qtl_index, 'qtl_start'], row['qtl_start'])
                    merged_qtl.loc[merged_qtl_index, 'qtl_end'] = max(merged_qtl.loc[merged_qtl_index, 'qtl_end'], row['qtl_end'])
                    merged_qtl.loc[merged_qtl_index, 'SP2_num'] += row['SP2_num']
                    merged_qtl.loc[merged_qtl_index, 'phe_name'] = merged_qtl.loc[merged_qtl_index, 'phe_name'] + ',' + row['phe_name']
            else:
                merged_qtl = pd.concat([merged_qtl, row.to_frame().T])
    return merged_qtl
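
# Worked example of the two merge criteria above: an incoming QTL on chr1
# spanning 1.0-3.0 Mb with peak SNP 'chr1_2000000' is folded into an existing
# QTL only if (i) the existing interval reaches at least 10% into the new one
# (qtl_ratio >= 0.1) and (ii) the two peak positions, parsed from the trailing
# '_<bp>' of the SNP ids, lie within 2 Mb; otherwise it starts a new record.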
|
|
def phe_cluster(phe, phe_labeled, n):
    """Iteratively group correlated phenotypes and replace each group by its PC1.

    A phenotype joins cluster n when its |r| with the cluster PC1 is >= 0.6;
    phenotypes that never reach the threshold are returned unchanged.
    """
    pca = PCA(n_components=1)
    phe_pc1 = pca.fit_transform(phe)
    phe_corr = phe.corrwith(pd.Series(phe_pc1[:, 0], index=phe.index)).abs()
    if (phe_corr >= 0.6).all():
        phe_labeled.loc[phe_corr.index, 'label'] = n
        return pd.DataFrame(phe_pc1, index=phe.index, columns=['cluster' + str(n) + '_PC1']), phe_labeled, n + 1
    else:
        phe_pc1 = pd.DataFrame()
        while not phe.empty:
            phe_corr = phe.corrwith(pd.Series(pca.fit_transform(phe)[:, 0], index=phe.index)).abs()
            if (phe_corr < 0.6).sum() == 1:
                if phe_corr.shape[0] == 2:
                    # only two phenotypes and one is uncorrelated: keep both raw
                    phe_pc1 = pd.concat([phe_pc1, phe.loc[:, phe_corr.index]], axis=1)
                else:
                    phe_pc1 = pd.concat([phe_pc1, pd.DataFrame(pca.fit_transform(phe.loc[:, phe_corr >= 0.6]), index=phe.index, columns=['cluster' + str(n) + '_PC1'])], axis=1)
                    phe_labeled.loc[phe.loc[:, phe_corr >= 0.6].columns, 'label'] = n
                    n = n + 1
                    phe_pc1 = pd.concat([phe_pc1, phe.loc[:, phe_corr < 0.6]], axis=1)
                phe = pd.DataFrame()
            else:
                if (phe_corr >= 0.6).any():
                    if (phe_corr >= 0.6).sum() == 1:
                        phe_pc1 = pd.concat([phe_pc1, phe.loc[:, phe_corr >= 0.6]], axis=1)
                    else:
                        phe_pc1 = pd.concat([phe_pc1, pd.DataFrame(pca.fit_transform(phe.loc[:, phe_corr >= 0.6]), index=phe.index, columns=['cluster' + str(n) + '_PC1'])], axis=1)
                        phe_labeled.loc[phe.loc[:, phe_corr >= 0.6].columns, 'label'] = n
                        n = n + 1
                    phe = phe.loc[:, phe_corr < 0.6]
                else:
                    # nothing correlates with PC1: keep the remainder raw and stop
                    phe_pc1 = pd.concat([phe_pc1, phe], axis=1)
                    phe = pd.DataFrame()
                    # phe_corr = phe.corrwith(pd.Series(pca.fit_transform(phe)[:,0],index=phe.index)).abs()
        return phe_pc1, phe_labeled, n
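
# Behavioural sketch (toy column names): given three phenotypes whose |r| with
# their joint PC1 is >= 0.6, the call
#   pc1, labeled, n = phe_cluster(phe[['m/z1', 'm/z2', 'm/z3']], labeled, 1)
# returns a one-column frame 'cluster1_PC1', marks the three rows of
# `labeled['label']` with 1, and bumps n to 2; columns that never reach the
# 0.6 threshold come back unchanged with label 0.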
|
|
def generate_qtl(clump_result_dir, p2):
    """Turn PLINK .clumped files into merged QTL intervals; returns (qtl_res, bad_qtl)."""
    qtl_res = list()
    bad_qtl = list()
    for fn in glob.glob(clump_result_dir.rstrip('/') + '/*clumped'):
        phe_name = '_'.join(fn.split('/')[-1].split('_')[1:-1])
        clump_result = pd.read_csv(fn, sep=r'\s+')
        clump_result = clump_result.loc[clump_result.SP2 != 'NONE', :]
        qtl = clump_result[['CHR', 'BP', 'SNP', 'P', 'SP2']].copy()
        qtl.loc[:, 'SP2_num'] = qtl['SP2'].apply(lambda x: len(x.split(',')))
        qtl.loc[:, 'log10P'] = -np.log10(qtl['P'])
        if (qtl['SP2_num'] >= 10).sum() > 0:
            # QTL bounds come from the first and last clumped SNP, whose ids
            # are assumed to end in '_<bp position>'
            qtl['qtl_start'] = qtl['SP2'].apply(lambda x: int(re.findall(r'_(\d+)', x)[0]))
            qtl['qtl_end'] = qtl['SP2'].apply(lambda x: int(re.findall(r'_(\d+)', x)[-1]))
            qtl['phe_name'] = phe_name
            qtl_filter = qtl.loc[qtl.SP2_num >= 5, :]
            mer_qtl_filter = merge_qtl_phe(qtl_filter)
            mer_qtl_filter.loc[:, 'qtl_length'] = mer_qtl_filter['qtl_end'] - mer_qtl_filter['qtl_start'] + 1
            if mer_qtl_filter.shape[0] < 10:
                qtl = qtl.loc[(qtl.SP2_num >= 5) & (qtl.log10P >= -np.log10(p2)), ['CHR', 'qtl_start', 'qtl_end', 'SNP', 'P', 'SP2_num', 'phe_name']]
                mer_qtl = merge_qtl_phe(qtl)
                mer_qtl.loc[:, 'qtl_length'] = mer_qtl['qtl_end'] - mer_qtl['qtl_start'] + 1
                mer_qtl = mer_qtl.loc[:, ['CHR', 'qtl_start', 'qtl_end', 'SNP', 'P', 'SP2_num', 'qtl_length', 'phe_name']]
                if mer_qtl.shape[0] < 4:
                    qtl_res.append(mer_qtl)
                else:
                    bad_qtl.append(mer_qtl.loc[mer_qtl['SP2_num'] >= 10, :])
            else:
                bad_qtl.append(mer_qtl_filter.loc[mer_qtl_filter['SP2_num'] >= 10, ['CHR', 'qtl_start', 'qtl_end', 'SNP', 'P', 'SP2_num', 'qtl_length', 'phe_name']])
    if not qtl_res:
        qtl_res = pd.DataFrame()
    else:
        qtl_res = pd.concat(qtl_res)
        qtl_res = qtl_res.loc[qtl_res['SP2_num'] >= 10, :]
    if not bad_qtl:
        bad_qtl = pd.DataFrame()
    else:
        bad_qtl = pd.concat(bad_qtl)
    return qtl_res, bad_qtl
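
# Reading the thresholds above: a clump counts as a QTL signal only when it has
# at least 5 clumped support SNPs (SP2_num >= 5, >= 10 after merging), and a
# phenotype is set aside as noisy ('bad') when it yields 10 or more merged QTL
# before the p2 filter or 4 or more afterwards.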
|
|
def phe_PCA(omics_phe, qtl):
    """Pick one representative QTL per phenotype, merge across phenotypes, and
    collapse phenotypes sharing a QTL into cluster-PC1 pseudo-phenotypes."""
    omics_phe = omics_phe.fillna(omics_phe.mean())
    qtl_uniq = pd.DataFrame()
    for phe_name in qtl['phe_name'].unique():
        qtl_sub = qtl.loc[qtl.phe_name == phe_name, :]
        if qtl_sub.shape[0] == 1:
            qtl_uniq = pd.concat([qtl_uniq, qtl_sub])
        else:
            # keep the QTL with clearly dominant support, otherwise the lowest P
            qtl_sub = qtl_sub.sort_values(by=['SP2_num'], ascending=False)
            if qtl_sub.iloc[0, :]['SP2_num'] / qtl_sub.iloc[1, :]['SP2_num'] > 2:
                qtl_uniq = pd.concat([qtl_uniq, qtl_sub.iloc[0, :].to_frame().T])
            else:
                qtl_uniq = pd.concat([qtl_uniq, qtl_sub.loc[qtl_sub['P'].idxmin(), :].to_frame().T])
    merged_qtl_uniq = merge_qtl(qtl_uniq)
    merged_qtl_uniq.loc[:, 'phe_name_num'] = merged_qtl_uniq['phe_name'].apply(lambda x: len(x.split(',')))
    merged_qtl_uniq = merged_qtl_uniq.sort_values(by='phe_name_num', ascending=False)
    omics_sig_phe = omics_phe.loc[:, [p.replace('m.z', 'm/z') for p in qtl['phe_name'].unique()]]
    omics_sig_phe_labeled = omics_sig_phe.T
    omics_sig_phe_labeled.loc[:, 'label'] = 0
    omics_phe_pc = pd.DataFrame()
    n = 1
    for phe_name in merged_qtl_uniq.loc[merged_qtl_uniq['phe_name_num'] >= 2, 'phe_name']:
        omics_phe_sub = omics_sig_phe.loc[:, [p.replace('m.z', 'm/z') for p in phe_name.split(',')]]
        omics_phe_sub_pc, omics_sig_phe_labeled, n = phe_cluster(omics_phe_sub, omics_sig_phe_labeled, n)
        omics_phe_pc = pd.concat([omics_phe_pc, omics_phe_sub_pc], axis=1)
    clustered_omics_phe = pd.merge(omics_phe_pc.loc[:, ~omics_phe_pc.columns.isin(omics_sig_phe.columns)],
                                   omics_sig_phe_labeled.loc[omics_sig_phe_labeled.label == 0, :].drop('label', axis=1).T,
                                   left_index=True, right_index=True)
    return clustered_omics_phe, omics_sig_phe_labeled
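
# Hedged end-to-end sketch (hypothetical paths; MODAS normally drives these
# steps from its command line, so this is illustration, not an entry point):
#   generate_qtl_batch(omics_phe, phe_sig_qtl, '/data/maize/geno', 8, './qtl_bed', './rs_dir')
#   region_gwas_parallel('./qtl_bed', 8, '/data/maize/geno', 'MLM')
#   generate_clump_input('./output', 'MLM')   # GEMMA writes into ./output
#   plink_clump('./qtl_bed', 1e-5, 1e-3, 8)
#   qtl_res, bad_qtl = generate_qtl('./clump_result', 1e-3)
#   clustered_phe, labeled = phe_PCA(omics_phe, qtl_res)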