[c66173]: / DeepMod_tools / generate_motif_pos.py

Download this file

141 lines (114 with data), 4.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
import os, sys, string, time
from collections import defaultdict;
import multiprocessing
def read_genome(mfafile):
ref_genome = defaultdict();
with open(mfafile, 'r') as mr:
cur_chr = None;
while True:
line = mr.readline();
if not line: break;
line = line.strip();
if len(line)==0: continue;
if line[0]=='>':
if not cur_chr==None:
ref_genome[cur_chr] = ''.join(seqlist);
cur_chr = line[1:].split()[0];
seqlist = []
else:
seqlist.append(line.upper())
ref_genome[cur_chr] = ''.join(seqlist);
print("Total chr: {}".format(len(ref_genome))); sys.stdout.flush()
return ref_genome
#ref_genome = read_genome(ref_fa);
def handle_motif_pos(run_Q):
handli = 0;
while not run_Q.empty():
try:
rgkey, ref_genome, res_folder, na_bp, curna, curmotif = run_Q.get(block=False)
#print(rgkey, ref_genome, res_folder, na_bp, curna, curmotif); continue
except:
break;
#curna_dict = defaultdict();
#curmotif_dict = defaultdict();
nafile = '%sna_%s_%s.bed' % (res_folder, rgkey, curna)
motiffile = '%smotif_%s_%s.bed' % (res_folder, rgkey, curna)
mw_na = open(nafile, 'w')
mw_motif = open(motiffile, 'w')
batchsize = 500000
print("get motif for {}={}".format(rgkey, len(ref_genome))); sys.stdout.flush()
cur_hi = 0; start_time = time.time();
for na_ind in range(len(ref_genome)):
cur_hi += 1
if cur_hi % batchsize == 0:
print('\t time consuming ({})= {} {}'.format( rgkey, cur_hi, time.time() - start_time ) )
sys.stdout.flush()
start_time = time.time()
if (ref_genome[na_ind]==curna or na_bp[ref_genome[na_ind]]==curna):
#curna_dict[(rgkey, na_ind, '+' if ref_genome[rgkey][na_ind]==curna else '-')] = True;
mw_na.write('%s\t%s\t%s\n' % (rgkey, na_ind, '+' if ref_genome[na_ind]==curna else '-'))
if ref_genome[na_ind]==curna and (not curmotif==None):
for cur_mot in curmotif:
is_mot = True; mot_ind = 0;
for cur_sub_r_ind in range(na_ind - curmotif[cur_mot], na_ind + len(cur_mot) - curmotif[cur_mot] ):
if cur_sub_r_ind<0 or cur_sub_r_ind>len(ref_genome)-1:
is_mot = False; break;
if not ref_genome[cur_sub_r_ind] == cur_mot[mot_ind]:
is_mot = False; break;
mot_ind += 1
if is_mot:
mw_motif.write('%s\t%s\t%s\n' % (rgkey, na_ind, '+'))
mw_motif.write('%s\t%s\t%s\n' % (rgkey, na_ind+1, '-'))
break;
mw_na.close(); mw_motif.close()
ref_fa = 'ref/hg38.fa'
ref_fa = sys.argv[1]
res_folder = 'genome.motif/C/'
res_folder = sys.argv[2]+'/'
if not os.path.isdir(res_folder):
os.system('mkdir -p '+res_folder)
curna='C'
curmotif={'CG':0}
curna = sys.argv[3];
curmotif = {sys.argv[4]:int(sys.argv[5])}
if len(sys.argv)>6:
chrkeys = ["chr%s" % cid for cid in sys.argv[6].split(',')]
else:
chrkeys = []
for i in range(1, 23):
chrkeys.append("chr%d" % i)
chrkeys.append("chrX")
chrkeys.append("chrY")
chrkeys.append("chrM")
chrkeys = set(chrkeys)
na_bp = {"A":"T", \
"C":"G", \
"G":"C", \
"T":"A", \
"a":"t", \
"c":"g", \
"g":"c", \
"t":"a", \
"N":"N", \
"n":"n" \
}
ref_genome = read_genome(ref_fa);
##############################
pmanager = multiprocessing.Manager();
run_Q = pmanager.Queue();
for curk in chrkeys:
run_Q.put((curk, ref_genome[curk], res_folder, na_bp, curna, curmotif))
mhandlers = [];
share_var = (run_Q, )
m_thread_num = len(chrkeys);
for i in range(m_thread_num):
p = multiprocessing.Process(target=handle_motif_pos, args=share_var)
p.start();
mhandlers.append(p);
while any(p.is_alive() for p in mhandlers):
try:
time.sleep(1);
except:
time.sleep(1);
continue;