Switch to unified view

a b/singlecellmultiomics/fastaProcessing/createMappabilityIndex.py
1
#!/usr/bin/env python3
2
import singlecellmultiomics.utils
3
import pysam
4
import collections
5
import numpy as np
6
import re
7
import more_itertools
8
import gzip
9
import argparse
10
import uuid
11
import os
12
from singlecellmultiomics.utils import BlockZip
13
14
15
if __name__ == '__main__':
16
    argparser = argparse.ArgumentParser(
17
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
18
        description="""In silico digest genome""")
19
    argparser.add_argument(
20
        'fasta',
21
        metavar='fastafile',
22
        type=str,
23
        help='Fasta file to digest')
24
    argparser.add_argument(
25
        '-minlen',
26
        type=int,
27
        default=20,
28
        help='minimum read length')
29
    argparser.add_argument(
30
        '-maxlen',
31
        type=int,
32
        default=69,
33
        help='maximum read length')
34
    argparser.add_argument('-digest_sequence', type=str, default='CATG')
35
    argparser.add_argument('-head', type=int)
36
    argparser.add_argument(
37
        '-contigs',
38
        type=str,
39
        default=None,
40
        help='Comma separated contigs to run analysis for, when None specified all contigs are used')
41
    args = argparser.parse_args()
42
43
    r1_read_length = args.maxlen
44
    minlen = args.minlen
45
46
    ref = pysam.FastaFile(args.fasta)
47
48
    def seq_to_fastq(seq, header):
49
        return f'@{header}\n{seq}\n+\n{"E"*len(seq)}\n'
50
51
    selected_contigs = None if args.contigs is None else set(
52
        args.contigs.split(','))
53
54
    fastq_path = f'simulated_{args.digest_sequence}_single_{r1_read_length}_{uuid.uuid4()}.fastq.gz'
55
56
    outbam = f'simulated_{args.digest_sequence}_single_{r1_read_length}.bam'
57
58
    with gzip.open(fastq_path, 'wt') as out:
59
        processed = 0
60
61
        for contig, contig_len in zip(ref.references, ref.lengths):
62
            if selected_contigs is not None and contig not in selected_contigs:
63
                continue
64
            print(f'{contig}\t{contig_len}')
65
            seq = ref.fetch(contig).upper()
66
            frag_locations = np.diff(
67
                [m.start() for m in re.finditer(args.digest_sequence, seq)])
68
            # Generate fragments:
69
            for i, (start, end_excluding_catg) in enumerate(more_itertools.windowed(
70
                    (m.start() for m in re.finditer(args.digest_sequence, seq)), 2)):
71
                if end_excluding_catg is None or args.head is not None and i >= (
72
                        args.head - 1):
73
                    continue
74
                end = end_excluding_catg + len(args.digest_sequence)
75
76
                forward_read = seq[start:end][:r1_read_length]
77
                if len(forward_read) >= minlen:
78
                    out.write(
79
                        seq_to_fastq(
80
                            forward_read,
81
                            f'DR:fwd;ct:{contig};ds:{start};qn:{len(forward_read)}'))
82
                reverse_read = singlecellmultiomics.utils.reverse_complement(seq[start:end])[
83
                    :r1_read_length]
84
                if len(reverse_read) >= minlen:
85
                    out.write(
86
                        seq_to_fastq(
87
                            reverse_read,
88
                            f'DR:rev;ct:{contig};ds:{end_excluding_catg};qn:{len(reverse_read)}'))
89
90
            processed += contig_len
91
92
    # Map the fastq file:
93
    print("Now mapping ...")
94
    os.system(f"bwa mem -t 4 {args.fasta} {fastq_path} | samtools view -b - > ./{outbam}.unsorted.bam; samtools sort -T ./temp_sort -@ 4 ./{outbam}.unsorted.bam > ./{outbam}.unfinished.bam ; mv ./{outbam}.unfinished.bam ./{outbam} ; samtools index ./{outbam} & rm {fastq_path}")
95
96
    print("Creating site database ...")
97
    # Create site dictionary:
98
    sites = {}  # site-> wrongly_assinged_to, correctly_assigned, lost
99
100
    ref_reads = pysam.AlignmentFile(outbam)
101
    # Perform site detection:
102
    for read in ref_reads:
103
104
        if read.is_supplementary:
105
            continue
106
        # Check if there is reads mapping to this site
107
108
        kv = {kv.split(':')[0]: kv.split(':')[1]
109
              for kv in read.query_name.split(';')}
110
111
        site_pos = int(kv['ds'])
112
        site_chrom = kv['ct']
113
        strand = (kv['DR'] == 'rev')
114
115
        read_site = read.reference_end - 4 if read.is_reverse else read.reference_start
116
        read_contig = read.reference_name
117
118
        key = (site_chrom, site_pos, strand)
119
        key_mapped = (read_contig, read_site, read.is_reverse)
120
121
        if key not in sites:
122
            sites[key] = {'correct': 0, 'wrong_gain': 0, 'lost': 0}
123
124
        if read_site == site_pos and site_chrom == read_contig and read.is_reverse == strand:  # Correct assignment
125
            sites[key]['correct'] += 1
126
        else:
127
            # Assign a missing count to the origin site:
128
            sites[key]['lost'] += 1
129
130
            # Assing a wrong annotation to the target site:
131
            if key_mapped not in sites:
132
                sites[key_mapped] = {'correct': 0, 'wrong_gain': 0, 'lost': 0}
133
            sites[key_mapped]['wrong_gain'] += 1
134
135
    print("Writing site statistics ...")
136
    outtab = f'simulated_{args.digest_sequence}_single_{r1_read_length}.mappability.stats.bgzf'
137
    outtabsafe = f'simulated_{args.digest_sequence}_single_{r1_read_length}.mappability.safe.bgzf'
138
139
    keys = sorted(
140
    [(contig, pos, strand)
141
     for  (contig, pos, strand) in sites.keys()
142
     if contig is not None])
143
144
    with BlockZip(outtab, 'w') as stats, BlockZip(outtabsafe, 'w') as safe:
145
        for (contig, pos, strand) in keys:
146
            measured= sites[(contig, pos, strand)]
147
            stats.write(
148
                contig,
149
                pos,
150
                strand,
151
                f'{measured["correct"]}\t{measured["lost"]}\t{measured["wrong_gain"]}')
152
153
            if measured['wrong_gain'] == 0 and measured['lost'] == 0 and measured['correct'] == 1:
154
                safe.write(contig, pos, strand, 'ok')