a b/bin/DeepMod_scripts/EventTable.py
1
2
import os,sys
3
import numpy as np
4
import h5py
5
6
7
def get_extreme_N(m_signal_dif, n_splits, p_signal_start, p_signal_end, moptions, sp_param):
8
   cu_region_sort_pos = m_signal_dif[int(p_signal_start-sp_param['min_signal_num']+0.5):int(p_signal_end-sp_param['min_signal_num']+0.5)].argsort()[::-1]+p_signal_start;
9
   m_nb_pos = set();
10
   # print n_splits, type(n_splits), p_signal_start, type(p_signal_start), p_signal_end, type(p_signal_end), sp_param['min_signal_num'], type( sp_param['min_signal_num']), type(p_signal_start+sp_param['min_signal_num']-1)
11
   m_nb_pos.update(range(p_signal_start, int(p_signal_start+sp_param['min_signal_num']-0.5)));
12
   m_nb_pos.update(range(int(p_signal_end-sp_param['min_signal_num']+1.5), p_signal_end));
13
   split_points_list = []
14
   for c_pos in cu_region_sort_pos:
15
      if c_pos not in m_nb_pos:
16
         split_points_list.append(c_pos);
17
         if (len(split_points_list)==n_splits): break;
18
         m_nb_pos.update(range(c_pos-sp_param['min_signal_num']+1, c_pos+sp_param['min_signal_num']+1));
19
   return sorted(split_points_list);
20
21
def getEvent_Info(moptions, sp_param, events_data):
22
   event_info = []
23
   sp_param['min_signal_num'] = 4;
24
25
   signal_sum = np.cumsum(np.insert(np.round(sp_param['raw_signals']/50.0,5), 0, 0));
26
   m_signal_dif = np.abs(signal_sum[sp_param['min_signal_num']:-sp_param['min_signal_num']]*2 - signal_sum[:-2*sp_param['min_signal_num']] - signal_sum[2*sp_param['min_signal_num']:])
27
   #print (sp_param['raw_signals'][:20]);
28
   #print (np.round(sp_param['raw_signals']/50.0,5)[:20]);
29
   #print (signal_sum[:20]);
30
   #print (m_signal_dif[:20])
31
   # sp_param['fq_seq'] = fq_data[1]
32
   last_ev_i = 0;
33
   last_signal_i = events_data[0]['start'];
34
   fq_seq_i = 2;
35
   c_move_num = 1
36
   incrrt_event_list = []
37
   for ev_i in range(1, len(events_data)):
38
      if (events_data['move'][ev_i])==0:
39
         pass;
40
      else:
41
         c_move_num += events_data['move'][ev_i]
42
         split_points = get_extreme_N(m_signal_dif, c_move_num-1, last_signal_i, events_data[ev_i]['start']+events_data[ev_i]['length'], moptions, sp_param);
43
         #print c_move_num-1, last_signal_i, ev_i, events_data[ev_i]['start']+events_data[ev_i]['length'], split_points
44
         #for s_i in range(last_signal_i, events_data[ev_i]['start']+events_data[ev_i]['length']): 
45
         #    if s_i in split_points:
46
         #       print '|',
47
         #    print sp_param['raw_signals'][s_i],
48
         #print '';
49
         for c_m_i in range(c_move_num-1):
50
            if c_m_i < len(split_points): 
51
               h_m_i = c_m_i;
52
               c_e_p = split_points[h_m_i]
53
            else: 
54
               h_m_i = len(split_points)-1
55
               c_e_p = last_signal_i + sp_param['min_signal_num']
56
               incrrt_event_list.append(len(event_info));
57
58
            c_mnn = np.mean(sp_param['raw_signals'][last_signal_i:c_e_p]);
59
            c_std = np.std(sp_param['raw_signals'][last_signal_i:c_e_p]);
60
            c_start = last_signal_i;
61
            c_length = c_e_p - last_signal_i;
62
            c_mode = sp_param['fq_seq'][fq_seq_i-2:fq_seq_i+3];
63
            event_info.append((c_mnn, c_std, c_start, c_length, c_mode))
64
65
            last_signal_i = split_points[h_m_i]
66
            fq_seq_i += 1;
67
 
68
         c_move_num = 1;
69
   ev_i = len(events_data)-1 
70
   c_e_p = events_data[ev_i]['start'] + events_data[ev_i]['length']
71
   c_mnn = np.mean(sp_param['raw_signals'][last_signal_i:c_e_p]);
72
   c_std = np.std(sp_param['raw_signals'][last_signal_i:c_e_p]);
73
   c_start = last_signal_i;
74
   c_length = c_e_p - last_signal_i;
75
   c_mode = sp_param['fq_seq'][fq_seq_i-2:fq_seq_i+3];
76
   event_info.append((c_mnn, c_std, c_start, c_length, c_mode))
77
78
   event_info = np.array(event_info, dtype=[('mean', '<f4'), ('stdv', '<f4'), ('start', np.uint64), ('length', np.uint64), ('model_state', 'U5')])
79
   #c_seq = ''.join([event_model_state[2] for event_model_state in event_info['model_state'] ] )
80
 
81
   #print '\n' 
82
   for c_ev_i in incrrt_event_list:
83
      #print c_ev_i, event_info[c_ev_i-1]['start'], event_info[c_ev_i-1]['length'], event_info[c_ev_i]['start'], event_info[c_ev_i]['length'], event_info[c_ev_i+1]['start'], event_info[c_ev_i+1]['length']
84
      h_2 = int((event_info[c_ev_i+1]['length'] + event_info[c_ev_i+1]['start'] - event_info[c_ev_i]['start'] )/2+0.2)
85
      event_info[c_ev_i]['length'] = h_2
86
      event_info[c_ev_i+1]['start'] = event_info[c_ev_i]['start'] + event_info[c_ev_i]['length']
87
      event_info[c_ev_i+1]['length'] = event_info[c_ev_i+1]['length'] - h_2
88
      #print '\t', c_ev_i, event_info[c_ev_i-1]['start'], event_info[c_ev_i-1]['length'], event_info[c_ev_i]['start'], event_info[c_ev_i]['length'], event_info[c_ev_i+1]['start'], event_info[c_ev_i+1]['length']
89
90
   #for c_ev_i in range(len(event_info)):
91
   #   print c_ev_i, event_info[c_ev_i]['start'], event_info[c_ev_i]['length'], ':', 
92
   #   for s_i in range(event_info[c_ev_i]['start'], event_info[c_ev_i]['start']+event_info[c_ev_i]['length']):
93
   #       pass # print sp_param['raw_signals'][s_i],
94
   #   print ''
95
 
96
   #msi = 50;
97
   #print (c_seq[:msi])
98
   #print (sp_param['fq_seq'][2:(msi+2)])
99
   #print (c_seq[-msi:])
100
   #print (sp_param['fq_seq'][-(msi+2):-2])
101
   #print len(events_data), len(event_info), len(sp_param['fq_seq'])
102
   #ei_i = 0;
103
   #for ev_i in range(0, len(events_data)):
104
   #   if (events_data[ev_i]['move']>0):
105
   #      print ("%d/%s %d-%d vs %d-%d %s=%s%s" % (ev_i, ei_i,events_data[ev_i]['start'], events_data[ev_i]['start']+events_data[ev_i]['length'],  event_info[ei_i]['start'], event_info[ei_i]['start']+event_info[ei_i]['length'], events_data[ev_i]['model_state'][2],event_info[ei_i]['model_state'][2],sp_param['fq_seq'][ei_i+2]))
106
   #      ei_i += events_data[ev_i]['move']
107
108
   return event_info
109
110
111
if __name__=='__main__':
112
   moptions = {}
113
   sp_param = {}
114
115
   exple_data = ['/home/liuq1/project/DeepNanoRepeat/scripts/fortest/f6343e53-9454-41ae-8398-7be6e1b7557d.fast5', \
116
                 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/000a7916-373c-4cc3-a3f2-6bed205b09cb.fast5', \
117
                 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/00264c38-4945-4263-ae0d-253e6c6a39ba.fast5', \
118
                 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/0039f109-46ac-4a81-883d-b55900924dd4.fast5', \
119
                 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/0045bf1d-d7be-44b1-9b6c-9bb76a634e0f.fast5' \
120
                ]
121
122
   sp_param['f5reader'] = h5py.File(sys.argv[1] if len(sys.argv)>1 else exple_data[0], 'r');   
123
124
   fq_str = '/Analyses/Basecall_1D_000/BaseCalled_template/Fastq'
125
   ev_str = '/Analyses/Basecall_1D_000/BaseCalled_template/Events'
126
   fq_str = '/Analyses/Basecall_1D_001/BaseCalled_template/Fastq'
127
   ev_str = '/Analyses/Basecall_1D_001/BaseCalled_template/Events'
128
   sg_str = '/Raw/Reads/'
129
130
   sp_param['fq_seq'] = sp_param['f5reader'][fq_str][()].split('\n')[1];
131
   sp_param['raw_signals'] = sp_param['f5reader'][sg_str].values()[0]['Signal'].value
132
   events_data = sp_param['f5reader'][ev_str].value;
133
134
   getEvent_Info(moptions, sp_param, events_data)
135
136
   sp_param['f5reader'].close();   
137