[45ad7e]: / tests / test_fragment.py

Download this file

182 lines (151 with data), 7.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import unittest
import singlecellmultiomics.molecule
import singlecellmultiomics.fragment
import pysam
import pysamiterators.iterators
import os
from singlecellmultiomics.fragment import Fragment, CHICFragment
from singlecellmultiomics.utils import create_MD_tag
from singlecellmultiomics.utils import complement
"""
These tests check if the Molecule module is working correctly
"""
class TestTAPs(unittest.TestCase):
def test_random_priming(self):
temp_folder = 'data'
enable_ref_write=False
ref_path = f'{temp_folder}/chic_ref.fa'
alignments_path = f'{temp_folder}/chic_test_alignments.bam'
if not os.path.exists(temp_folder):
os.makedirs(temp_folder)
# Create reference bam file
refseq = 'TTAATCATGAAACCGTGGAGGCAAATCGGAGTGTAAGGCTTGACTGGATTCCTACGTTGCGTAGGTTCATGGGGGG'
if enable_ref_write:
with open(ref_path, 'w') as f:
f.write(f">chr1\n{refseq}\n>chr2\n{complement(refseq)}\n""")
# This command needs to finish, which is not working properly during testing
pysam.faidx(ref_path)
# CATG at base 5
# Create BAM file with NLA fragment:
def get_reads():
with pysam.AlignmentFile(alignments_path_unsorted,'wb',reference_names=['chr1'],reference_lengths=[len(refseq)]) as bam:
### Nla III mate pair example, containing 2 CpGs and 1 call on the wrong strand
read_A = pysam.AlignedSegment(bam.header)
read_A.reference_name = 'chr1'
read_A.reference_start = 5
# Before last A is a bogus G>A conversion to test strandness:
read_A.query_sequence = 'CATGAAACCGTGGAGGCAAATTGGAGTAT'
read_A.cigarstring = f'{len(read_A.query_sequence)}M'
read_A.qual = 'A'*len(read_A.query_sequence)
read_A.mapping_quality = 60
read_A.query_name = 'EX1_GA_CONV_2x_CpG_TAPS'
read_A.set_tag('SM', 'Cell_A')
read_A.is_read1 = True
read_A.is_read2 = False
read_A.set_tag('lh','TG')
# Set substitution tag:
read_A.set_tag('MD',
create_MD_tag(
refseq[read_A.reference_start:read_A.reference_end], read_A.query_sequence))
read_A.is_paired = True
read_A.is_proper_pair = True
# Create a second read which is a mate of the previous
read_B = pysam.AlignedSegment(bam.header)
read_B.reference_name = 'chr1'
read_B.reference_start = 25
read_B.query_sequence = refseq[25:60].replace('TGT','TAT').replace('CG', 'TG')
read_B.cigarstring = f'{len(read_B.query_sequence)}M'
read_B.qual = 'A'*len(read_B.query_sequence)
read_B.mapping_quality = 60
read_B.is_read2 = True
read_B.is_read1 = False
read_B.is_reverse = True
read_B.query_name = 'EX1_GA_CONV_2x_CpG_TAPS'
read_B.set_tag('SM', 'Cell_A')
read_B.set_tag('lh','TG')
read_B.set_tag('MD',
create_MD_tag(refseq[read_B.reference_start:read_B.reference_end],
read_B.query_sequence,
))
read_B.is_paired = True
read_B.is_proper_pair = True
read_A.next_reference_id = read_B.reference_id
read_A.next_reference_start = read_B.reference_start
read_B.next_reference_id = read_A.reference_id
read_B.next_reference_start = read_A.reference_start
read_A.mate_is_reverse = read_B.is_reverse
read_B.mate_is_reverse = read_A.is_reverse
return read_A, read_B
alignments_path_unsorted = f'{alignments_path}.unsorted.bam'
read_A, read_B = get_reads()
read_B.set_tag('rS','AATTAA') # Set random primer sequence
## Act on reads with random primer set:
frag = Fragment([read_A, read_B])
self.assertTrue(frag.R2_primer_length==0)
self.assertTrue(frag.unsafe_trimmed)
## Act on reads without random primer
read_A, read_B = get_reads()
frag = Fragment([read_A, read_B], R2_primer_length=10)
self.assertEqual(frag.R2_primer_length, 10)
self.assertFalse(frag.unsafe_trimmed)
read_A, read_B = get_reads()
frag = CHICFragment([read_A, read_B], R2_primer_length=0)
self.assertEqual(frag.R2_primer_length, 0)
self.assertFalse(frag.unsafe_trimmed)
read_A, read_B = get_reads()
read_B.set_tag('rS','AATTAA') # Set random primer sequence
frag = CHICFragment([read_A, read_B], R2_primer_length=0)
self.assertEqual(frag.R2_primer_length, 0)
self.assertTrue(frag.unsafe_trimmed)
read_A, read_B = get_reads()
read_B.set_tag('MX','scCHIC384C8U3l')
frag = CHICFragment([read_A, read_B])
self.assertEqual(frag.R2_primer_length, 0)
self.assertFalse(frag.unsafe_trimmed)
read_A, read_B = get_reads()
read_B.set_tag('MX','scCHIC384C8U3')
frag = CHICFragment([read_A, read_B])
self.assertEqual(frag.R2_primer_length,6)
self.assertFalse(frag.unsafe_trimmed)
def test_init(self):
"""Test if the fragment can be initialised"""
with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
for i,(R1,R2) in enumerate(pysamiterators.iterators.MatePairIterator(f)):
frag = singlecellmultiomics.fragment.Fragment([R1,R2])
self.assertIsNotNone(frag)
def test_get(self):
"""Test if the fragment can be initialised"""
with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
for i,(R1,R2) in enumerate(pysamiterators.iterators.MatePairIterator(f)):
frag = singlecellmultiomics.fragment.Fragment([R1,R2])
if R1 is not None:
self.assertEqual(frag.get_R1(), R1 )
self.assertEqual(frag[0], R1 )
if R2 is not None:
self.assertEqual(frag.get_R2(), R2 )
self.assertEqual(frag[1], R2 )
def test_get_sample(self):
"""Test if the sample name of the fragment can be obtained"""
with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
for i,(R1,R2) in enumerate(pysamiterators.iterators.MatePairIterator(f)):
frag = singlecellmultiomics.fragment.Fragment([R1,R2])
self.assertTrue( frag.get_sample().startswith('A') )
def test_set_sample(self):
"""Test if the sample name of the fragment can be changed"""
with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
for i,(R1,R2) in enumerate(pysamiterators.iterators.MatePairIterator(f)):
frag = singlecellmultiomics.fragment.Fragment([R1,R2])
frag.set_sample(f'TEST{i}')
self.assertEqual(frag.get_sample(), f'TEST{i}' )
def test_strand(self):
"""Test if the strand can be obtained (doesn't check validity)"""
with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
for i,(R1,R2) in enumerate(pysamiterators.iterators.MatePairIterator(f)):
frag = singlecellmultiomics.fragment.Fragment([R1,R2])
self.assertIn( frag.get_strand(), [None,0,1])
if frag.is_mapped:
self.assertIn( frag.get_strand(), [0,1])
if __name__ == '__main__':
unittest.main()