|
a |
|
b/singlecellmultiomics/molecule/consensus.py |
|
|
1 |
import sklearn.ensemble |
|
|
2 |
import numpy as np |
|
|
3 |
import pandas as pd |
|
|
4 |
import time |
|
|
5 |
|
|
|
6 |
def calculate_consensus(molecule, consensus_model, molecular_identifier, out, **model_kwargs ): |
|
|
7 |
""" |
|
|
8 |
Create consensus read for molecule |
|
|
9 |
|
|
|
10 |
Args: |
|
|
11 |
molecule (singlecellmultiomics.molecule.Molecule) |
|
|
12 |
|
|
|
13 |
consensus_model |
|
|
14 |
|
|
|
15 |
molecular_identifier (str) : identier for this molecule, will be suffixed to the reference_id |
|
|
16 |
|
|
|
17 |
out(pysam.AlingmentFile) : target bam file |
|
|
18 |
|
|
|
19 |
**model_kwargs : arguments passed to the consensus model |
|
|
20 |
|
|
|
21 |
""" |
|
|
22 |
try: |
|
|
23 |
consensus_reads = molecule.deduplicate_to_single_CIGAR_spaced( |
|
|
24 |
out, |
|
|
25 |
f'c_{molecule.get_a_reference_id()}_{molecular_identifier}', |
|
|
26 |
consensus_model, |
|
|
27 |
NUC_RADIUS=model_kwargs['consensus_k_rad'] |
|
|
28 |
) |
|
|
29 |
for consensus_read in consensus_reads: |
|
|
30 |
consensus_read.set_tag('RG', molecule[0].get_read_group()) |
|
|
31 |
consensus_read.set_tag('mi', molecular_identifier) |
|
|
32 |
out.write(consensus_read) |
|
|
33 |
except Exception as e: |
|
|
34 |
|
|
|
35 |
molecule.set_rejection_reason('CONSENSUS_FAILED',set_qcfail=True) |
|
|
36 |
molecule.write_pysam(out) |
|
|
37 |
|
|
|
38 |
def base_calling_matrix_to_df( |
|
|
39 |
x, |
|
|
40 |
ref_info=None, |
|
|
41 |
NUC_RADIUS=1, |
|
|
42 |
USE_RT=True): |
|
|
43 |
""" |
|
|
44 |
Convert numpy base calling feature matrix to pandas dataframe with annotated columns |
|
|
45 |
|
|
|
46 |
Args: |
|
|
47 |
x(np.array) : feature matrix |
|
|
48 |
ref_info(list) : reference position annotations (will be used as index) |
|
|
49 |
NUC_RADIUS(int) : generate kmer features target nucleotide |
|
|
50 |
USE_RT(bool) : use RT reaction features |
|
|
51 |
|
|
|
52 |
Returns: |
|
|
53 |
df (pd.DataFrame) |
|
|
54 |
""" |
|
|
55 |
df = pd.DataFrame(x) |
|
|
56 |
# annotate the columns |
|
|
57 |
BASE_COUNT = 5 |
|
|
58 |
RT_INDEX = 7 if USE_RT else None |
|
|
59 |
STRAND_INDEX = 0 |
|
|
60 |
PHRED_INDEX = 1 |
|
|
61 |
RC_INDEX = 2 |
|
|
62 |
MATE_INDEX = 3 |
|
|
63 |
CYCLE_INDEX = 4 |
|
|
64 |
MQ_INDEX = 5 |
|
|
65 |
FS_INDEX = 6 |
|
|
66 |
COLUMN_OFFSET = 0 |
|
|
67 |
features_per_block = 8 - (not USE_RT) |
|
|
68 |
|
|
|
69 |
block_header = ["?"] * features_per_block |
|
|
70 |
block_header[STRAND_INDEX] = 'strand' |
|
|
71 |
block_header[PHRED_INDEX] = 'phred' |
|
|
72 |
block_header[RC_INDEX] = 'read_count' |
|
|
73 |
block_header[MATE_INDEX] = 'mate' |
|
|
74 |
block_header[CYCLE_INDEX] = 'cycle' |
|
|
75 |
block_header[MQ_INDEX] = 'mq' |
|
|
76 |
block_header[FS_INDEX] = 'fragment_size' |
|
|
77 |
block_header[RT_INDEX] = 'rt_reactions' |
|
|
78 |
k_header = [] |
|
|
79 |
for k in range(NUC_RADIUS * 2 + 1): |
|
|
80 |
for base in 'ACGTN': |
|
|
81 |
k_header += [(k, b, base) for b in block_header] |
|
|
82 |
|
|
|
83 |
try: |
|
|
84 |
df.columns = pd.MultiIndex.from_tuples(k_header) |
|
|
85 |
except ValueError: # the dataframe is a concateenation of multiple molecules |
|
|
86 |
pass |
|
|
87 |
if ref_info is not None: |
|
|
88 |
df.index = pd.MultiIndex.from_tuples(ref_info) |
|
|
89 |
return df |
|
|
90 |
|
|
|
91 |
def get_consensus_training_data( |
|
|
92 |
molecule_iterator, |
|
|
93 |
mask_variants=None, |
|
|
94 |
n_train=100_000, # When None, training data is created until molecule source depletion |
|
|
95 |
skip_already_covered_bases = True, |
|
|
96 |
#yield_results=False, # Yield results instead of returning a matrix |
|
|
97 |
**feature_matrix_args): |
|
|
98 |
|
|
|
99 |
""" |
|
|
100 |
Create a tensor/matrix containing alignment and base calling information, which can be used for consensus calling. |
|
|
101 |
This function also creates a vector containing the corresponding reference bases, which can be used for training a consensus model. |
|
|
102 |
|
|
|
103 |
Args: |
|
|
104 |
molecule_iterator : generator which generates molecules from which base calling feature matrices are extracted |
|
|
105 |
|
|
|
106 |
mask_variants (pysam.VariantFile) : variant locations which should be excluded from the matrix |
|
|
107 |
|
|
|
108 |
n_train (int) : amount of rows in the matrix |
|
|
109 |
|
|
|
110 |
skip_already_covered_bases(bool) : when True every reference position is at most a single row in the output matrix, this prevents overfitting |
|
|
111 |
|
|
|
112 |
**feature_matrix_args : Arguments to pass to the feature matrix function of the molecules. |
|
|
113 |
|
|
|
114 |
""" |
|
|
115 |
|
|
|
116 |
#if not yield_results: |
|
|
117 |
X = None |
|
|
118 |
y = [] |
|
|
119 |
molecules_used = 0 |
|
|
120 |
training_set_size = 0 |
|
|
121 |
last_end = None |
|
|
122 |
last_chrom = None |
|
|
123 |
try: |
|
|
124 |
|
|
|
125 |
for i, molecule in enumerate(molecule_iterator): |
|
|
126 |
# Never train the same genomic location twice |
|
|
127 |
if skip_already_covered_bases: |
|
|
128 |
if last_chrom is not None and last_chrom != molecule.chromosome: |
|
|
129 |
last_end = None |
|
|
130 |
if last_end is not None and molecule.spanStart < last_end: |
|
|
131 |
continue |
|
|
132 |
|
|
|
133 |
train_result = molecule.get_base_calling_training_data( |
|
|
134 |
mask_variants, **feature_matrix_args) |
|
|
135 |
|
|
|
136 |
if train_result is None: |
|
|
137 |
# Continue when the molecule does not have bases where we can learn from |
|
|
138 |
continue |
|
|
139 |
|
|
|
140 |
x, _y = train_result |
|
|
141 |
training_set_size += len(_y) |
|
|
142 |
|
|
|
143 |
#if yield_results: |
|
|
144 |
# yield x, _y |
|
|
145 |
|
|
|
146 |
#else: |
|
|
147 |
if X is None: |
|
|
148 |
X = np.empty((0, x.shape[1])) |
|
|
149 |
print( |
|
|
150 |
f"Creating feature matrix with {x.shape[1]} dimensions and {n_train} training base-calls") |
|
|
151 |
y += _y |
|
|
152 |
X = np.append(X, x, axis=0) |
|
|
153 |
last_chrom = molecule.chromosome |
|
|
154 |
|
|
|
155 |
if training_set_size >= n_train: |
|
|
156 |
break |
|
|
157 |
|
|
|
158 |
molecules_used+=1 |
|
|
159 |
if molecule.spanEnd is not None: |
|
|
160 |
last_end = molecule.spanEnd |
|
|
161 |
else: |
|
|
162 |
last_end += len(_y) |
|
|
163 |
|
|
|
164 |
|
|
|
165 |
except KeyboardInterrupt as e: |
|
|
166 |
print("Got keyboard interrupt, stopping to load more data") |
|
|
167 |
|
|
|
168 |
if molecules_used > 0: |
|
|
169 |
print( |
|
|
170 |
f'Finished, last genomic coordinate: {molecule.chromosome} {molecule.spanEnd}, training set size is {training_set_size}, used {molecules_used} molecules for training') |
|
|
171 |
#if not yield_results: |
|
|
172 |
return X, y |
|
|
173 |
|
|
|
174 |
|
|
|
175 |
|
|
|
176 |
|
|
|
177 |
|
|
|
178 |
def train_consensus_model( |
|
|
179 |
molecule_iterator, |
|
|
180 |
mask_variants=None, |
|
|
181 |
classifier=None, |
|
|
182 |
n_train=100_000, |
|
|
183 |
skip_already_covered_bases = True, |
|
|
184 |
**feature_matrix_args): |
|
|
185 |
if classifier is None: # default to random forest |
|
|
186 |
classifier = sklearn.ensemble.RandomForestClassifier( |
|
|
187 |
n_jobs=-1, |
|
|
188 |
n_estimators=100, |
|
|
189 |
oob_score=True, |
|
|
190 |
max_depth=7, |
|
|
191 |
min_samples_leaf=5 |
|
|
192 |
) |
|
|
193 |
X, y = get_consensus_training_data( |
|
|
194 |
molecule_iterator, mask_variants=mask_variants, n_train=n_train, |
|
|
195 |
skip_already_covered_bases=skip_already_covered_bases,**feature_matrix_args) |
|
|
196 |
y = np.array(y) |
|
|
197 |
# remove unkown ref bases from set |
|
|
198 |
X = np.array(X)[y != 'N'] |
|
|
199 |
y = y[y != 'N'] |
|
|
200 |
classifier.fit(X, y) |
|
|
201 |
if isinstance(classifier, sklearn.ensemble.forest.RandomForestClassifier): |
|
|
202 |
print(f"Model out of bag accuracy: {classifier.oob_score_}") |
|
|
203 |
classifier.n_jobs = 1 # fix amount of jobs to one, otherwise apply will be very slow |
|
|
204 |
return classifier |