Switch to unified view

a b/singlecellmultiomics/fastqProcessing/trim_vasa.py
1
#!/usr/bin/env python
2
from more_itertools import chunked
3
import argparse
4
import gzip
5
import re
6
7
if __name__=='__main__':
8
    argparser = argparse.ArgumentParser(
9
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
10
        description="""Trim vasa fastq files for homo-polymers
11
    """)
12
13
14
    argparser.add_argument('R2_fastq', metavar='R2_fastq', type=str)
15
    argparser.add_argument('R2_singletons_out', type=str)
16
17
    argparser.add_argument('-poly_length', type=int, default=10)
18
    argparser.add_argument('-min_read_len', type=int, default=20)
19
    args = argparser.parse_args()
20
21
    poly_A = args.poly_length*'A'
22
    poly_T = args.poly_length*'T'
23
    poly_G = args.poly_length*'G'
24
25
    poly_length = args.poly_length
26
27
    r2_trimmer = re.compile('[GA]*$')
28
29
    def trim_r2(header, sequence, comment, qualities, min_read_len ):
30
31
        start = sequence.find(poly_A)
32
        if start != -1:
33
            sequence, qualities = sequence[:start], qualities[:start]
34
35
        start = sequence.find(poly_G)
36
        if start != -1:
37
            sequence, qualities = sequence[:start], qualities[:start]
38
39
        # Trim any trailing A and G bases from the end and # Trim down 3 bases
40
        sequence = r2_trimmer.sub('',sequence)[:-3]
41
        qualities = qualities[:len(sequence)]
42
43
44
        return f'{header}{sequence.rstrip()}\n{comment}{qualities.rstrip()}\n', len(sequence)>=min_read_len
45
46
    def trim_r1(header, sequence, comment, qualities, min_read_len ):
47
48
        start = sequence.rfind(poly_T) # Trim poly T
49
        if start != -1:
50
            sequence, qualities = sequence[start+poly_length:], qualities[start+poly_length:]
51
52
        start = sequence.find(poly_G)
53
        if start != -1:
54
            sequence, qualities = sequence[:start], qualities[:start]
55
56
57
        return f'{header}{sequence.rstrip()}\n{comment}{qualities.rstrip()}\n', len(sequence)>=min_read_len
58
59
    with gzip.open(args.R2_fastq,'rt') as r2, \
60
         gzip.open(args.R2_singletons_out,'wt',compresslevel=1) as r2_single_out:
61
62
        for read2 in chunked(r2,4):
63
            (r2_o, valid_r2)  = trim_r2(*read2, args.min_read_len)
64
            if valid_r2:
65
                r2_single_out.write(r2_o)