Switch to unified view

a b/BioAid/deepSeqInsH/annoInsH.py
1
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa 
2
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com
3
4
## Contents:
5
## dataFrameImport
6
## matchSequence
7
## findReadLength
8
## trimseq
9
## classify
10
11
import regex as re
12
import os
13
import pandas as pd
14
15
def dataFrameImport(directory,datatype = 'sam'):
16
  import pysam
17
18
  #This function imports all files inside of the specified DIR path into two lists; of panda DF and sample names
19
  frames_list_samples = []
20
  sample_names = []
21
22
  counter=1
23
24
  #import all files in the directory
25
  for filename in os.listdir(directory):
26
      if filename.endswith(f".{datatype}"):
27
          sample_names.append(filename[:8].strip())
28
          path = os.path.join(directory, filename)
29
          globals()[f"sample{counter}"] = pd.DataFrame(({'name': x.qname, 'seq': x.seq, 'qual': x.query_qualities} for x in pysam.Samfile(path).fetch()))
30
          print(f'dataframe {sample_names[-1]} has {len(globals()[f"sample{counter}"])} total rows')
31
          frames_list_samples.append(globals()[f"sample{counter}"])
32
          counter+=1
33
  print(f"found {len(frames_list_samples)} samples in {directory}")
34
  print(len(sample_names), sample_names)
35
36
  return(frames_list_samples, sample_names)
37
38
def matchSequence(row, sequence):
39
  #for searching for perfectly matched sequences.
40
  if (sequence in row):
41
    return(True)
42
  else:
43
    return(False)
44
45
def findReadLength(row):
46
  #returns a length of a read
47
  return(len(row))
48
49
def trimseq(row, f_seq, r_seq, consensus):
50
  #trims sequences to the 5' end of primers and finds reads that support non-deletions.
51
  #allows for 2 errors in non-deletions.
52
53
  if row.forward_primer == True:
54
    f_seq_index = row.seq.find(f_seq)
55
    if f_seq_index == -1:
56
      print("no forward primer found")
57
      row.forward_primer = False
58
    row.seq = row.seq[f_seq_index:]
59
60
  if row.reverse_primer == True:
61
    r_seq_index = row.seq.find(r_seq)
62
    if r_seq_index == -1:
63
      print("no reverse primer found")
64
      row.reverse_primer = False
65
    row.seq = row.seq[:r_seq_index+len(r_seq)]
66
  if (row.forward_primer == False) & (row.reverse_primer == False):
67
    row.seq = "no_primers"
68
69
  if re.search('('+row.seq+')'+'{e<=2}', consensus):
70
    row.seq = "no_excision"
71
  return(row)
72
73
def classify(row, consensus, classification):
74
  #this function classifies the sequence based on the provided consensus, allows for 2 errors.
75
76
  if row.classification != "other":
77
    return row
78
79
  if re.search('('+row.seq+')'+'{e<=2}', consensus):
80
  #if row.seq in consensus:
81
    row.classification = classification
82
  return row