[c66173]: / DeepMod_tools / sum_chr_mod.py

Download this file

119 lines (94 with data), 3.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python
import os, sys, string
import multiprocessing
import time;
import glob
from collections import defaultdict;
def mprint(mstr):
print(mstr); sys.stdout.flush()
if len(sys.argv)<4:
print ("Usage: python {} pred_folder-of-DeepMod Base-of-interest unique-fileid-in-sum-file [chr-list]".format(sys.argv[0]))
print (" pred_folder-of-DeepMod: the prediction must in its sub-folder.")
sys.exit(1)
pred_folder = sys.argv[1]
baseofint = sys.argv[2]
sum_fileid = sys.argv[3]
if len(sys.argv)>4:
chrkeys = ["%s" % cid for cid in sys.argv[4].split(',')]
else:
chrkeys = []
for i in range(1, 23):
chrkeys.append("chr%d" % i)
chrkeys.append("chrX")
chrkeys.append("chrY")
chrkeys.append("chrM")
chrkeys = set(chrkeys)
#####################################
def readbed2(bedf):
cur_ad = defaultdict();
with open(bedf, 'r') as mr:
while True:
line = mr.readline();
if not line: break;
line = line.strip();
lsp = line.split();
cur_ad[(lsp[0], int(lsp[1]), lsp[5])] = [int(lsp[9]), int(lsp[11])]
return cur_ad
def mergeMod(g_ad, cur_ad):
for k in cur_ad:
if k in g_ad:
g_ad[k][0] += cur_ad[k][0]
g_ad[k][1] += cur_ad[k][1]
else: g_ad[k] = cur_ad[k]
def save_mod(res_file, amod_dict, baseOfin):
poskeys = sorted(list(amod_dict.keys()))
for pk in poskeys:
if amod_dict[pk][1]==0: del amod_dict[pk]
poskeys = list(amod_dict.keys())
poskeys = sorted(poskeys);
with open(res_file, 'w') as mw:
for pk in poskeys:
mw.write('%s %d %d %s %d %s %d %d 0,0,0 %d %d %d\n' % (pk[0], pk[1],pk[1]+1, baseOfin,amod_dict[pk][0] if amod_dict[pk][0]<1000 else 1000,pk[2], pk[1],pk[1]+1, amod_dict[pk][0], int(amod_dict[pk][1]*100/amod_dict[pk][0]) if amod_dict[pk][0]>0 else 0, amod_dict[pk][1] ))
def sum_amod_handler(run_Q):
handli = 0;
while not run_Q.empty():
try:
ck, pred_folder, baseOfin = run_Q.get(block=False)
#print (ck, pred_folder, baseOfin); continue
except:
break;
allbedfiles = glob.glob(os.path.join(pred_folder, ("*/*/*/*.%s-.%s.bed" % (ck, baseOfin)) ))
allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*/*.%s-.%s.bed" % (ck, baseOfin)) )))
allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*.%s-.%s.bed" % (ck, baseOfin)) )))
mprint ("%s - %s: %d" % (ck, baseOfin, len(allbedfiles) ))
allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*/*/*.%s+.%s.bed" % (ck, baseOfin)) )))
allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*/*.%s+.%s.bed" % (ck, baseOfin)) )))
allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*.%s+.%s.bed" % (ck, baseOfin)) )))
mprint ("%s -+ %s: %d" % (ck, baseOfin, len(allbedfiles) ))
# 0 1 2 3 4 5 6 7 8 9 0 1
#chr1 949802 949803 T 1 - 949802 949803 0,0,0 1 0 0
amod_dict = defaultdict();
res_file = "%s/%s.%s.%s.bed" % (pred_folder, sum_fileid, ck, baseOfin)
for bedf_ind in range(len(allbedfiles)):
mprint("\t %s %s %d/%d" % (ck, baseOfin, bedf_ind+1, len(allbedfiles)))
cur_ad = readbed2(allbedfiles[bedf_ind])
mergeMod(amod_dict, cur_ad)
save_mod(res_file, amod_dict, baseOfin)
##############################
pmanager = multiprocessing.Manager();
run_Q = pmanager.Queue();
for ck in chrkeys:
run_Q.put((ck, pred_folder, baseofint))
mhandlers = [];
share_var = (run_Q, )
m_thread_num = len(chrkeys);
for i in range(m_thread_num+1):
p = multiprocessing.Process(target=sum_amod_handler, args=share_var)
p.start();
mhandlers.append(p);
while any(p.is_alive() for p in mhandlers):
try:
time.sleep(1);
except:
time.sleep(1);
continue;