|
a |
|
b/BioAid/deepSeqInsH/annoInsH.py |
|
|
1 |
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa |
|
|
2 |
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com |
|
|
3 |
|
|
|
4 |
## Contents: |
|
|
5 |
## dataFrameImport |
|
|
6 |
## matchSequence |
|
|
7 |
## findReadLength |
|
|
8 |
## trimseq |
|
|
9 |
## classify |
|
|
10 |
|
|
|
11 |
import regex as re |
|
|
12 |
import os |
|
|
13 |
import pandas as pd |
|
|
14 |
|
|
|
15 |
def dataFrameImport(directory,datatype = 'sam'): |
|
|
16 |
import pysam |
|
|
17 |
|
|
|
18 |
#This function imports all files inside of the specified DIR path into two lists; of panda DF and sample names |
|
|
19 |
frames_list_samples = [] |
|
|
20 |
sample_names = [] |
|
|
21 |
|
|
|
22 |
counter=1 |
|
|
23 |
|
|
|
24 |
#import all files in the directory |
|
|
25 |
for filename in os.listdir(directory): |
|
|
26 |
if filename.endswith(f".{datatype}"): |
|
|
27 |
sample_names.append(filename[:8].strip()) |
|
|
28 |
path = os.path.join(directory, filename) |
|
|
29 |
globals()[f"sample{counter}"] = pd.DataFrame(({'name': x.qname, 'seq': x.seq, 'qual': x.query_qualities} for x in pysam.Samfile(path).fetch())) |
|
|
30 |
print(f'dataframe {sample_names[-1]} has {len(globals()[f"sample{counter}"])} total rows') |
|
|
31 |
frames_list_samples.append(globals()[f"sample{counter}"]) |
|
|
32 |
counter+=1 |
|
|
33 |
print(f"found {len(frames_list_samples)} samples in {directory}") |
|
|
34 |
print(len(sample_names), sample_names) |
|
|
35 |
|
|
|
36 |
return(frames_list_samples, sample_names) |
|
|
37 |
|
|
|
38 |
def matchSequence(row, sequence): |
|
|
39 |
#for searching for perfectly matched sequences. |
|
|
40 |
if (sequence in row): |
|
|
41 |
return(True) |
|
|
42 |
else: |
|
|
43 |
return(False) |
|
|
44 |
|
|
|
45 |
def findReadLength(row): |
|
|
46 |
#returns a length of a read |
|
|
47 |
return(len(row)) |
|
|
48 |
|
|
|
49 |
def trimseq(row, f_seq, r_seq, consensus): |
|
|
50 |
#trims sequences to the 5' end of primers and finds reads that support non-deletions. |
|
|
51 |
#allows for 2 errors in non-deletions. |
|
|
52 |
|
|
|
53 |
if row.forward_primer == True: |
|
|
54 |
f_seq_index = row.seq.find(f_seq) |
|
|
55 |
if f_seq_index == -1: |
|
|
56 |
print("no forward primer found") |
|
|
57 |
row.forward_primer = False |
|
|
58 |
row.seq = row.seq[f_seq_index:] |
|
|
59 |
|
|
|
60 |
if row.reverse_primer == True: |
|
|
61 |
r_seq_index = row.seq.find(r_seq) |
|
|
62 |
if r_seq_index == -1: |
|
|
63 |
print("no reverse primer found") |
|
|
64 |
row.reverse_primer = False |
|
|
65 |
row.seq = row.seq[:r_seq_index+len(r_seq)] |
|
|
66 |
if (row.forward_primer == False) & (row.reverse_primer == False): |
|
|
67 |
row.seq = "no_primers" |
|
|
68 |
|
|
|
69 |
if re.search('('+row.seq+')'+'{e<=2}', consensus): |
|
|
70 |
row.seq = "no_excision" |
|
|
71 |
return(row) |
|
|
72 |
|
|
|
73 |
def classify(row, consensus, classification): |
|
|
74 |
#this function classifies the sequence based on the provided consensus, allows for 2 errors. |
|
|
75 |
|
|
|
76 |
if row.classification != "other": |
|
|
77 |
return row |
|
|
78 |
|
|
|
79 |
if re.search('('+row.seq+')'+'{e<=2}', consensus): |
|
|
80 |
#if row.seq in consensus: |
|
|
81 |
row.classification = classification |
|
|
82 |
return row |