Switch to unified view

a b/singlecellmultiomics/molecule/consensus.py
1
import sklearn.ensemble
2
import numpy as np
3
import pandas as pd
4
import time
5
6
def calculate_consensus(molecule, consensus_model, molecular_identifier, out, **model_kwargs ):
7
    """
8
    Create consensus read for molecule
9
10
    Args:
11
        molecule (singlecellmultiomics.molecule.Molecule)
12
13
        consensus_model
14
15
        molecular_identifier (str) : identier for this molecule, will be suffixed to the reference_id
16
17
        out(pysam.AlingmentFile) : target bam file
18
19
        **model_kwargs : arguments passed to the consensus model
20
21
    """
22
    try:
23
        consensus_reads = molecule.deduplicate_to_single_CIGAR_spaced(
24
            out,
25
            f'c_{molecule.get_a_reference_id()}_{molecular_identifier}',
26
            consensus_model,
27
            NUC_RADIUS=model_kwargs['consensus_k_rad']
28
            )
29
        for consensus_read in consensus_reads:
30
            consensus_read.set_tag('RG', molecule[0].get_read_group())
31
            consensus_read.set_tag('mi', molecular_identifier)
32
            out.write(consensus_read)
33
    except Exception as e:
34
35
        molecule.set_rejection_reason('CONSENSUS_FAILED',set_qcfail=True)
36
        molecule.write_pysam(out)
37
38
def base_calling_matrix_to_df(
39
        x,
40
        ref_info=None,
41
        NUC_RADIUS=1,
42
        USE_RT=True):
43
    """
44
    Convert numpy base calling feature matrix to pandas dataframe with annotated columns
45
46
    Args:
47
        x(np.array) : feature matrix
48
        ref_info(list) : reference position annotations (will be used as index)
49
        NUC_RADIUS(int) : generate kmer features target nucleotide
50
        USE_RT(bool) : use RT reaction features
51
52
    Returns:
53
        df (pd.DataFrame)
54
    """
55
    df = pd.DataFrame(x)
56
    # annotate the columns
57
    BASE_COUNT = 5
58
    RT_INDEX = 7 if USE_RT else None
59
    STRAND_INDEX = 0
60
    PHRED_INDEX = 1
61
    RC_INDEX = 2
62
    MATE_INDEX = 3
63
    CYCLE_INDEX = 4
64
    MQ_INDEX = 5
65
    FS_INDEX = 6
66
    COLUMN_OFFSET = 0
67
    features_per_block = 8 - (not USE_RT)
68
69
    block_header = ["?"] * features_per_block
70
    block_header[STRAND_INDEX] = 'strand'
71
    block_header[PHRED_INDEX] = 'phred'
72
    block_header[RC_INDEX] = 'read_count'
73
    block_header[MATE_INDEX] = 'mate'
74
    block_header[CYCLE_INDEX] = 'cycle'
75
    block_header[MQ_INDEX] = 'mq'
76
    block_header[FS_INDEX] = 'fragment_size'
77
    block_header[RT_INDEX] = 'rt_reactions'
78
    k_header = []
79
    for k in range(NUC_RADIUS * 2 + 1):
80
        for base in 'ACGTN':
81
            k_header += [(k, b, base) for b in block_header]
82
83
    try:
84
        df.columns = pd.MultiIndex.from_tuples(k_header)
85
    except ValueError:  # the dataframe is a concateenation of multiple molecules
86
        pass
87
    if ref_info is not None:
88
        df.index = pd.MultiIndex.from_tuples(ref_info)
89
    return df
90
91
def get_consensus_training_data(
92
        molecule_iterator,
93
        mask_variants=None,
94
        n_train=100_000, # When None, training data is created until molecule source depletion
95
        skip_already_covered_bases = True,
96
        #yield_results=False, # Yield results instead of returning a matrix
97
        **feature_matrix_args):
98
99
    """
100
    Create a tensor/matrix containing alignment and base calling information, which can be used for consensus calling.
101
    This function also creates a vector containing the corresponding reference bases, which can be used for training a consensus model.
102
103
    Args:
104
        molecule_iterator : generator which generates molecules from which base calling feature matrices are extracted
105
106
        mask_variants (pysam.VariantFile) : variant locations which should be excluded from the matrix
107
108
        n_train (int) : amount of rows in the matrix
109
110
        skip_already_covered_bases(bool) : when True every reference position is at most a single row in the output matrix, this prevents overfitting
111
112
        **feature_matrix_args : Arguments to pass to the feature matrix function of the molecules.
113
114
    """
115
116
    #if not yield_results:
117
    X = None
118
    y = []
119
    molecules_used = 0
120
    training_set_size = 0
121
    last_end = None
122
    last_chrom = None
123
    try:
124
125
        for i, molecule in enumerate(molecule_iterator):
126
            # Never train the same genomic location twice
127
            if skip_already_covered_bases:
128
                if last_chrom is not None and last_chrom != molecule.chromosome:
129
                    last_end = None
130
                if last_end is not None and molecule.spanStart < last_end:
131
                    continue
132
133
            train_result = molecule.get_base_calling_training_data(
134
                mask_variants, **feature_matrix_args)
135
136
            if train_result is None:
137
                # Continue when the molecule does not have bases where we can learn from
138
                continue
139
140
            x, _y = train_result
141
            training_set_size += len(_y)
142
143
            #if yield_results:
144
            #    yield x, _y
145
146
            #else:
147
            if X is None:
148
                X = np.empty((0, x.shape[1]))
149
                print(
150
                    f"Creating feature matrix with {x.shape[1]} dimensions and {n_train} training base-calls")
151
            y += _y
152
            X = np.append(X, x, axis=0)
153
            last_chrom = molecule.chromosome
154
155
            if training_set_size >= n_train:
156
                break
157
158
            molecules_used+=1
159
            if molecule.spanEnd is not None:
160
                last_end = molecule.spanEnd
161
            else:
162
                last_end += len(_y)
163
164
165
    except KeyboardInterrupt as e:
166
        print("Got keyboard interrupt, stopping to load more data")
167
168
    if molecules_used > 0:
169
        print(
170
                f'Finished, last genomic coordinate: {molecule.chromosome} {molecule.spanEnd}, training set size is {training_set_size}, used {molecules_used} molecules for training')
171
    #if not yield_results:
172
    return X, y
173
174
175
176
177
178
def train_consensus_model(
179
        molecule_iterator,
180
        mask_variants=None,
181
        classifier=None,
182
        n_train=100_000,
183
        skip_already_covered_bases = True,
184
        **feature_matrix_args):
185
    if classifier is None:  # default to random forest
186
        classifier = sklearn.ensemble.RandomForestClassifier(
187
            n_jobs=-1,
188
            n_estimators=100,
189
            oob_score=True,
190
            max_depth=7,
191
            min_samples_leaf=5
192
        )
193
    X, y = get_consensus_training_data(
194
        molecule_iterator, mask_variants=mask_variants, n_train=n_train,
195
        skip_already_covered_bases=skip_already_covered_bases,**feature_matrix_args)
196
    y = np.array(y)
197
    # remove unkown ref bases from set
198
    X = np.array(X)[y != 'N']
199
    y = y[y != 'N']
200
    classifier.fit(X, y)
201
    if isinstance(classifier, sklearn.ensemble.forest.RandomForestClassifier):
202
        print(f"Model out of bag accuracy: {classifier.oob_score_}")
203
    classifier.n_jobs = 1  # fix amount of jobs to one, otherwise apply will be very slow
204
    return classifier