--- a +++ b/bin/DeepMod_scripts/EventTable.py @@ -0,0 +1,137 @@ + +import os,sys +import numpy as np +import h5py + + +def get_extreme_N(m_signal_dif, n_splits, p_signal_start, p_signal_end, moptions, sp_param): + cu_region_sort_pos = m_signal_dif[int(p_signal_start-sp_param['min_signal_num']+0.5):int(p_signal_end-sp_param['min_signal_num']+0.5)].argsort()[::-1]+p_signal_start; + m_nb_pos = set(); + # print n_splits, type(n_splits), p_signal_start, type(p_signal_start), p_signal_end, type(p_signal_end), sp_param['min_signal_num'], type( sp_param['min_signal_num']), type(p_signal_start+sp_param['min_signal_num']-1) + m_nb_pos.update(range(p_signal_start, int(p_signal_start+sp_param['min_signal_num']-0.5))); + m_nb_pos.update(range(int(p_signal_end-sp_param['min_signal_num']+1.5), p_signal_end)); + split_points_list = [] + for c_pos in cu_region_sort_pos: + if c_pos not in m_nb_pos: + split_points_list.append(c_pos); + if (len(split_points_list)==n_splits): break; + m_nb_pos.update(range(c_pos-sp_param['min_signal_num']+1, c_pos+sp_param['min_signal_num']+1)); + return sorted(split_points_list); + +def getEvent_Info(moptions, sp_param, events_data): + event_info = [] + sp_param['min_signal_num'] = 4; + + signal_sum = np.cumsum(np.insert(np.round(sp_param['raw_signals']/50.0,5), 0, 0)); + m_signal_dif = np.abs(signal_sum[sp_param['min_signal_num']:-sp_param['min_signal_num']]*2 - signal_sum[:-2*sp_param['min_signal_num']] - signal_sum[2*sp_param['min_signal_num']:]) + #print (sp_param['raw_signals'][:20]); + #print (np.round(sp_param['raw_signals']/50.0,5)[:20]); + #print (signal_sum[:20]); + #print (m_signal_dif[:20]) + # sp_param['fq_seq'] = fq_data[1] + last_ev_i = 0; + last_signal_i = events_data[0]['start']; + fq_seq_i = 2; + c_move_num = 1 + incrrt_event_list = [] + for ev_i in range(1, len(events_data)): + if (events_data['move'][ev_i])==0: + pass; + else: + c_move_num += events_data['move'][ev_i] + split_points = get_extreme_N(m_signal_dif, c_move_num-1, last_signal_i, events_data[ev_i]['start']+events_data[ev_i]['length'], moptions, sp_param); + #print c_move_num-1, last_signal_i, ev_i, events_data[ev_i]['start']+events_data[ev_i]['length'], split_points + #for s_i in range(last_signal_i, events_data[ev_i]['start']+events_data[ev_i]['length']): + # if s_i in split_points: + # print '|', + # print sp_param['raw_signals'][s_i], + #print ''; + for c_m_i in range(c_move_num-1): + if c_m_i < len(split_points): + h_m_i = c_m_i; + c_e_p = split_points[h_m_i] + else: + h_m_i = len(split_points)-1 + c_e_p = last_signal_i + sp_param['min_signal_num'] + incrrt_event_list.append(len(event_info)); + + c_mnn = np.mean(sp_param['raw_signals'][last_signal_i:c_e_p]); + c_std = np.std(sp_param['raw_signals'][last_signal_i:c_e_p]); + c_start = last_signal_i; + c_length = c_e_p - last_signal_i; + c_mode = sp_param['fq_seq'][fq_seq_i-2:fq_seq_i+3]; + event_info.append((c_mnn, c_std, c_start, c_length, c_mode)) + + last_signal_i = split_points[h_m_i] + fq_seq_i += 1; + + c_move_num = 1; + ev_i = len(events_data)-1 + c_e_p = events_data[ev_i]['start'] + events_data[ev_i]['length'] + c_mnn = np.mean(sp_param['raw_signals'][last_signal_i:c_e_p]); + c_std = np.std(sp_param['raw_signals'][last_signal_i:c_e_p]); + c_start = last_signal_i; + c_length = c_e_p - last_signal_i; + c_mode = sp_param['fq_seq'][fq_seq_i-2:fq_seq_i+3]; + event_info.append((c_mnn, c_std, c_start, c_length, c_mode)) + + event_info = np.array(event_info, dtype=[('mean', '<f4'), ('stdv', '<f4'), ('start', np.uint64), ('length', np.uint64), ('model_state', 'U5')]) + #c_seq = ''.join([event_model_state[2] for event_model_state in event_info['model_state'] ] ) + + #print '\n' + for c_ev_i in incrrt_event_list: + #print c_ev_i, event_info[c_ev_i-1]['start'], event_info[c_ev_i-1]['length'], event_info[c_ev_i]['start'], event_info[c_ev_i]['length'], event_info[c_ev_i+1]['start'], event_info[c_ev_i+1]['length'] + h_2 = int((event_info[c_ev_i+1]['length'] + event_info[c_ev_i+1]['start'] - event_info[c_ev_i]['start'] )/2+0.2) + event_info[c_ev_i]['length'] = h_2 + event_info[c_ev_i+1]['start'] = event_info[c_ev_i]['start'] + event_info[c_ev_i]['length'] + event_info[c_ev_i+1]['length'] = event_info[c_ev_i+1]['length'] - h_2 + #print '\t', c_ev_i, event_info[c_ev_i-1]['start'], event_info[c_ev_i-1]['length'], event_info[c_ev_i]['start'], event_info[c_ev_i]['length'], event_info[c_ev_i+1]['start'], event_info[c_ev_i+1]['length'] + + #for c_ev_i in range(len(event_info)): + # print c_ev_i, event_info[c_ev_i]['start'], event_info[c_ev_i]['length'], ':', + # for s_i in range(event_info[c_ev_i]['start'], event_info[c_ev_i]['start']+event_info[c_ev_i]['length']): + # pass # print sp_param['raw_signals'][s_i], + # print '' + + #msi = 50; + #print (c_seq[:msi]) + #print (sp_param['fq_seq'][2:(msi+2)]) + #print (c_seq[-msi:]) + #print (sp_param['fq_seq'][-(msi+2):-2]) + #print len(events_data), len(event_info), len(sp_param['fq_seq']) + #ei_i = 0; + #for ev_i in range(0, len(events_data)): + # if (events_data[ev_i]['move']>0): + # print ("%d/%s %d-%d vs %d-%d %s=%s%s" % (ev_i, ei_i,events_data[ev_i]['start'], events_data[ev_i]['start']+events_data[ev_i]['length'], event_info[ei_i]['start'], event_info[ei_i]['start']+event_info[ei_i]['length'], events_data[ev_i]['model_state'][2],event_info[ei_i]['model_state'][2],sp_param['fq_seq'][ei_i+2])) + # ei_i += events_data[ev_i]['move'] + + return event_info + + +if __name__=='__main__': + moptions = {} + sp_param = {} + + exple_data = ['/home/liuq1/project/DeepNanoRepeat/scripts/fortest/f6343e53-9454-41ae-8398-7be6e1b7557d.fast5', \ + 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/000a7916-373c-4cc3-a3f2-6bed205b09cb.fast5', \ + 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/00264c38-4945-4263-ae0d-253e6c6a39ba.fast5', \ + 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/0039f109-46ac-4a81-883d-b55900924dd4.fast5', \ + 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/0045bf1d-d7be-44b1-9b6c-9bb76a634e0f.fast5' \ + ] + + sp_param['f5reader'] = h5py.File(sys.argv[1] if len(sys.argv)>1 else exple_data[0], 'r'); + + fq_str = '/Analyses/Basecall_1D_000/BaseCalled_template/Fastq' + ev_str = '/Analyses/Basecall_1D_000/BaseCalled_template/Events' + fq_str = '/Analyses/Basecall_1D_001/BaseCalled_template/Fastq' + ev_str = '/Analyses/Basecall_1D_001/BaseCalled_template/Events' + sg_str = '/Raw/Reads/' + + sp_param['fq_seq'] = sp_param['f5reader'][fq_str][()].split('\n')[1]; + sp_param['raw_signals'] = sp_param['f5reader'][sg_str].values()[0]['Signal'].value + events_data = sp_param['f5reader'][ev_str].value; + + getEvent_Info(moptions, sp_param, events_data) + + sp_param['f5reader'].close(); +