|
a |
|
b/singlecellmultiomics/fastaProcessing/createMappabilityIndex.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
import singlecellmultiomics.utils |
|
|
3 |
import pysam |
|
|
4 |
import collections |
|
|
5 |
import numpy as np |
|
|
6 |
import re |
|
|
7 |
import more_itertools |
|
|
8 |
import gzip |
|
|
9 |
import argparse |
|
|
10 |
import uuid |
|
|
11 |
import os |
|
|
12 |
from singlecellmultiomics.utils import BlockZip |
|
|
13 |
|
|
|
14 |
|
|
|
15 |
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""In silico digest genome""")
    argparser.add_argument(
        'fasta',
        metavar='fastafile',
        type=str,
        help='Fasta file to digest')
    argparser.add_argument(
        '-minlen',
        type=int,
        default=20,
        help='minimum read length')
    argparser.add_argument(
        '-maxlen',
        type=int,
        default=69,
        help='maximum read length')
    argparser.add_argument('-digest_sequence', type=str, default='CATG')
    argparser.add_argument('-head', type=int)
    argparser.add_argument(
        '-contigs',
        type=str,
        default=None,
        help='Comma separated contigs to run analysis for, when None specified all contigs are used')
    args = argparser.parse_args()

    r1_read_length = args.maxlen
    minlen = args.minlen

    ref = pysam.FastaFile(args.fasta)

    def seq_to_fastq(seq, header):
        """Return a 4-line FASTQ record for *seq* with constant 'E' base quality."""
        return f'@{header}\n{seq}\n+\n{"E"*len(seq)}\n'

    selected_contigs = None if args.contigs is None else set(
        args.contigs.split(','))

    fastq_path = f'simulated_{args.digest_sequence}_single_{r1_read_length}_{uuid.uuid4()}.fastq.gz'

    outbam = f'simulated_{args.digest_sequence}_single_{r1_read_length}.bam'

    # The digest motif is a literal DNA sequence, not a regex; escape it so a
    # motif containing regex metacharacters is still matched verbatim.
    digest_pattern = re.compile(re.escape(args.digest_sequence))
    digest_len = len(args.digest_sequence)

    # Step 1: write a simulated single-end read in both orientations for every
    # digest fragment (the span between two consecutive motif starts).
    with gzip.open(fastq_path, 'wt') as out:
        processed = 0  # total bases of all contigs handled so far
        for contig, contig_len in zip(ref.references, ref.lengths):
            if selected_contigs is not None and contig not in selected_contigs:
                continue
            print(f'{contig}\t{contig_len}')
            seq = ref.fetch(contig).upper()
            # Generate fragments: sliding window of size 2 over motif starts.
            for i, (start, end_excluding_catg) in enumerate(more_itertools.windowed(
                    (m.start() for m in digest_pattern.finditer(seq)), 2)):
                # Skip the incomplete trailing window, and stop early when the
                # -head fragment limit has been reached.
                if end_excluding_catg is None or (
                        args.head is not None and i >= (args.head - 1)):
                    continue
                end = end_excluding_catg + digest_len

                # Forward read: fragment 5' end, clipped to the read length.
                forward_read = seq[start:end][:r1_read_length]
                if len(forward_read) >= minlen:
                    out.write(
                        seq_to_fastq(
                            forward_read,
                            f'DR:fwd;ct:{contig};ds:{start};qn:{len(forward_read)}'))
                # Reverse read: reverse complement of the fragment, so it
                # starts from the fragment's 3' end (the terminal motif).
                reverse_read = singlecellmultiomics.utils.reverse_complement(
                    seq[start:end])[:r1_read_length]
                if len(reverse_read) >= minlen:
                    out.write(
                        seq_to_fastq(
                            reverse_read,
                            f'DR:rev;ct:{contig};ds:{end_excluding_catg};qn:{len(reverse_read)}'))

            processed += contig_len

    # Step 2: map the simulated reads back to the reference.
    # NOTE(review): the command interpolates CLI arguments into a shell string;
    # paths with shell metacharacters will break it. Acceptable for an
    # operator-run tool, but subprocess.run([...]) would be safer.
    # The index step is chained with ';' (not '&') so os.system only returns
    # once indexing has finished.
    print("Now mapping ...")
    os.system(
        f"bwa mem -t 4 {args.fasta} {fastq_path} | samtools view -b - > ./{outbam}.unsorted.bam; "
        f"samtools sort -T ./temp_sort -@ 4 ./{outbam}.unsorted.bam > ./{outbam}.unfinished.bam ; "
        f"mv ./{outbam}.unfinished.bam ./{outbam} ; "
        f"samtools index ./{outbam} ; rm {fastq_path}")

    print("Creating site database ...")
    # Per digest site: how many simulated reads were correctly assigned back to
    # it, wrongly gained from another site, or lost to another location.
    sites = {}  # (contig, pos, strand) -> {'correct', 'wrong_gain', 'lost'}

    ref_reads = pysam.AlignmentFile(outbam)
    # Step 3: compare each read's mapped position against the origin site
    # encoded in its query name ('DR:fwd;ct:chr1;ds:1000;qn:69').
    for read in ref_reads:

        if read.is_supplementary:
            continue

        tags = {field.split(':')[0]: field.split(':')[1]
                for field in read.query_name.split(';')}

        site_pos = int(tags['ds'])
        site_chrom = tags['ct']
        strand = (tags['DR'] == 'rev')  # True for the reverse-orientation read

        key = (site_chrom, site_pos, strand)
        if key not in sites:
            sites[key] = {'correct': 0, 'wrong_gain': 0, 'lost': 0}

        if read.is_unmapped:
            # An unmapped read is a loss for its origin site; there is no
            # mapped location to credit a wrong gain to. Without this guard,
            # read.reference_end below is None and would raise.
            sites[key]['lost'] += 1
            continue

        # For reverse reads the encoded site is the start of the fragment's
        # terminal motif, i.e. digest_len bases before the alignment end.
        read_site = read.reference_end - digest_len if read.is_reverse else read.reference_start
        read_contig = read.reference_name

        key_mapped = (read_contig, read_site, read.is_reverse)

        if read_site == site_pos and site_chrom == read_contig and read.is_reverse == strand:
            # Correct assignment
            sites[key]['correct'] += 1
        else:
            # Assign a missing count to the origin site:
            sites[key]['lost'] += 1

            # Assign a wrong annotation to the target site:
            if key_mapped not in sites:
                sites[key_mapped] = {'correct': 0, 'wrong_gain': 0, 'lost': 0}
            sites[key_mapped]['wrong_gain'] += 1

    print("Writing site statistics ...")
    outtab = f'simulated_{args.digest_sequence}_single_{r1_read_length}.mappability.stats.bgzf'
    outtabsafe = f'simulated_{args.digest_sequence}_single_{r1_read_length}.mappability.safe.bgzf'

    # BlockZip requires sorted keys; drop entries with a None contig.
    keys = sorted(
        (contig, pos, strand)
        for (contig, pos, strand) in sites.keys()
        if contig is not None)

    with BlockZip(outtab, 'w') as stats, BlockZip(outtabsafe, 'w') as safe:
        for (contig, pos, strand) in keys:
            measured = sites[(contig, pos, strand)]
            stats.write(
                contig,
                pos,
                strand,
                f'{measured["correct"]}\t{measured["lost"]}\t{measured["wrong_gain"]}')

            # A site is 'safe' when exactly its one simulated read maps back
            # uniquely and correctly.
            if measured['wrong_gain'] == 0 and measured['lost'] == 0 and measured['correct'] == 1:
                safe.write(contig, pos, strand, 'ok')