Diff of /generate_input.py [000000] .. [78e588]

Switch to unified view

a b/generate_input.py
1
#!/usr/bin/env python
2
3
# Created by Christopher Rushton 2020/04/20
4
# This converts a MAF file and copy number matrix (optional) into the input files used be the LymphGen lymphoma
5
# classifier. See https://llmpp.nih.gov/lymphgen/index.php for more information
6
7
import argparse
8
import math
9
import os
10
import bisect
11
import logging
12
from collections import Counter
13
from sortedcontainers import SortedSet
14
15
16
class Gene:
17
    """
18
    A simple class storing the chromosome, start, end, and name of a gene
19
    """
20
    __slots__ = ["name", "chrom", "start", "end"]
21
22
    def __init__(self, chrom, start, end, name):
23
        self.name = name
24
        self.chrom = chrom
25
        self.start = start
26
        self.end = end
27
28
29
class Chromosome:
30
    """
31
    A simple class storying the genomic range of each chromosome, as well as the coordiates of the p and q arm
32
    """
33
34
    def __init__(self, chrom):
35
        self.chrom = chrom
36
        self.p_start = None
37
        self.p_end = None
38
        self.q_start = None
39
        self.q_end = None
40
        self.q_length = 0.0
41
        self.p_length = 0.0
42
43
    def add(self, start, end, arm):
44
45
        if arm == "p":
46
            # Check to see if we already have genomic coordinates for this arm
47
            if self.p_start is not None or self.p_end is not None:
48
                raise AttributeError("Coordinates already assigned for %s arm of chromosome \'%s\'" % (arm, self.chrom))
49
            self.p_start = start
50
            self.p_end = end
51
            self.p_length = end - start
52
53
            # Sanity check this does not overlap the q arm
54
            if self.q_start is not None and self.p_end > self.q_start:
55
                raise AttributeError("For chromosome \'%s\'. The q arm starts before the end of the p arm" % self.chrom)
56
57
        elif arm == "q":
58
            # Check to see if we already have genomic coordinates for this arm
59
            if self.q_start is not None or self.q_end is not None:
60
                raise AttributeError("Coordinates already assigned for %s arm of chromosome \'%s\'" % (arm, self.chrom))
61
            self.q_start = start
62
            self.q_end = end
63
            self.q_length = end - start
64
65
            # Sanity check this does not overlap the p arm
66
            if self.p_end is not None and self.q_start < self.p_end:
67
                raise AttributeError("For chromosome \'%s\'. The q arm starts before the end of the p arm" % self.chrom)
68
        else:
69
            # Not the p or q arm. Invalid
70
            raise AttributeError("Unknown arm specified for chromosome \'%s\':\'%s\'" % (self.chrom, arm))
71
72
73
class SampleCNVs:
74
    """
75
    Stores all the copy number events within a given class
76
    I was going to write this in a function, but having a class is a lot cleaner
77
    """
78
79
    def __init__(self):
80
        self.starts = {}
81
        self.ends = {}
82
        self.cn_states = {}
83
        self.len_seg = {}
84
        self.is_chr_prefixed = None
85
        self.ploidy = None
86
87
    def merge_overlapping_cnvs(self, existStarts: iter, existEnds: iter, existCNs: iter, newStart: int, newEnd: int, newCN:int):
88
        """
89
        Handle overlapping copy number segments, merging and editing existing segments as necessary
90
91
        In essence, if this segment overlaps an existing segment with the same copy number state, those segments are merged
92
        together. If the copy number state is different, the largest increase is kept,
93
         truncating or even breaking apart the copy-neutral segment in the process. In the worse case senario,
94
        this will lead to three different segments being created
95
96
        :param existStarts: The start position of the existing segment
97
        :param existEnds: The end position of the existing segment
98
        :param existCNs: The copy number state of the existing segment
99
        :param newStart: The start position of the new segment
100
        :param newEnd: The end position of the new segment
101
        :param newCN: The copy number state of the new segment
102
        :return: A tuple containing the new and old events. Ex {[newEvent1, newEvent2], [oldEvent1, oldEvent2]}
103
        """
104
105
        def reduce_new_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd):
106
107
            # The existing event has higher priority than the old event
108
            # Truncate the newer event
109
            # Since this could break the newer event apart into two pieces, lets do that now, and see
110
            # if the output segments are valid
111
            outStart1 = newStart
112
            outEnd1 = oldStart
113
            outStart2 = oldEnd
114
            outEnd2 = newEnd
115
            if outStart1 < outEnd1:  # Valid segment, process
116
                existStarts, existEnds, existCNs = self.merge_overlapping_cnvs(existStarts, existEnds, existCNs,
117
                                                                             outStart1, outEnd1, newCN)
118
            if outStart2 < outEnd2:  # Valid segment
119
                existStarts, existEnds, existCNs = self.merge_overlapping_cnvs(existStarts, existEnds, existCNs,
120
                                                                             outStart2, outEnd2, newCN)
121
            return existStarts, existEnds, existCNs
122
123
        def reduce_old_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd, oldCN):
124
125
            # The newer event is a "higher"-level deletion
126
            # Split/Truncate the older event
127
            outStart1 = oldStart
128
            outEnd1 = newStart
129
            outStart2 = newEnd
130
            outEnd2 = oldEnd
131
            outCN = oldCN
132
            # Delete existing event
133
            existStarts.pop(start_bisect)
134
            existEnds.pop(start_bisect)
135
            existCNs.pop(start_bisect)
136
137
            if outStart2 < outEnd2:  # Valid segment, do the last one first to maintain sorted order
138
                existStarts.insert(start_bisect, outStart2)
139
                existEnds.insert(start_bisect, outEnd2)
140
                existCNs.insert(start_bisect, outCN)
141
            if outStart1 < outEnd1:  # Also valid
142
                existStarts.insert(start_bisect, outStart1)
143
                existEnds.insert(start_bisect, outEnd1)
144
                existCNs.insert(start_bisect, outCN)
145
146
            # Check for any more overlaps
147
            return self.merge_overlapping_cnvs(existStarts, existEnds, existCNs, newStart, newEnd, newCN)
148
149
        # First, does this new segment actually overlap anything?
150
        start_bisect = bisect.bisect_right(existEnds, newStart)
151
        end_bisect = bisect.bisect_left(existStarts, newEnd)
152
153
        if start_bisect == end_bisect:
154
            # There are no overlaps. Simply add this new event in, and return it
155
            existStarts.insert(start_bisect, newStart)
156
            existEnds.insert(start_bisect, newEnd)
157
            existCNs.insert(start_bisect, newCN)
158
            return existStarts, existEnds, existCNs
159
160
        # Grab the first overlap
161
        oldStart = existStarts[start_bisect]
162
        oldEnd = existEnds[start_bisect]
163
        oldCN = existCNs[start_bisect]
164
165
        # Simplest case first. If both these events have the same CN state, lets merge them together
166
        if oldCN == newCN:
167
168
            outStart = oldStart if oldStart < newStart else newStart
169
            outEnd = oldEnd if oldEnd > newEnd else newEnd
170
            # Delete the existing event
171
            existStarts.pop(start_bisect)
172
            existEnds.pop(start_bisect)
173
            existCNs.pop(start_bisect)
174
175
            # Check for any more overlaps
176
            return self.merge_overlapping_cnvs(existStarts, existEnds, existCNs, outStart, outEnd, newCN)
177
        else:
178
            # These segments overlap and have different CN states.
179
            # Lets keep the highest-level event
180
            if newCN <= 2 and oldCN <= 2: # Deletion
181
                if oldCN < newCN:
182
                    # The older event is a "higher"-level deletion
183
                    return reduce_new_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd)
184
                else:
185
                    return reduce_old_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd, oldCN)
186
            if newCN >= 2 and oldCN >= 2:  # Gain/Amp
187
                if oldCN > newCN:
188
                    # The older event is a higher level gain. Split/truncate the new event
189
                    return reduce_new_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart,
190
                                            oldEnd)
191
                else:
192
                    return reduce_old_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd,
193
                                          oldCN)
194
            else:
195
                # One event must be a gain/amp, while the other is a deletion. In this case, keep both, and subset the
196
                # larger event by the smaller one
197
                if oldEnd - oldStart < newEnd - newStart:
198
                    # The older event is smaller. Split the newer event
199
                    return reduce_new_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart,
200
                                            oldEnd)
201
                else:
202
                    # The newer event is smaller. We should split the older event
203
                    return reduce_old_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd,
204
                                          oldCN)
205
206
207
    def add(self, chrom: str, start: int, end: int, cn: int):
208
209
        # For LymphGen compatability: Strip "chr" prefix
210
        chrom = chrom.replace("chr", "")
211
212
        if end <= start:
213
            if start == end:
214
                # This segment has a length of 0?!. Skip this I guesss
215
                return None
216
            else:
217
                # This segment has a negative length??
218
                raise TypeError("Invalid CNV segment: \'%s:%s-%s\'" % (chrom, start, end))
219
220
        # Have we seen any events for this chromosome before?
221
        if chrom not in self.len_seg:
222
            # If not, just store this segment. Simple!
223
            self.starts[chrom] = [start]
224
            self.ends[chrom] = [end]
225
            self.cn_states[chrom] = [cn]
226
            self.len_seg[chrom] = 1
227
228
            # Are we using chr-prefixed contig names?
229
            if self.is_chr_prefixed is None:
230
                self.is_chr_prefixed = True if chrom.startswith("chr") else False
231
        else:
232
            # If we have seen events on this chromosome before, we need to compare this new segment with existing segments
233
            # In theory, events should be non-overlapping, but I don't want to assume that because then things will break
234
            # horribly later
235
236
            self.starts[chrom], self.ends[chrom], self.cn_states[chrom] = \
237
                self.merge_overlapping_cnvs(self.starts[chrom], self.ends[chrom], self.cn_states[chrom], start, end, cn)
238
            self.len_seg[chrom] = len(self.cn_states)
239
240
241
    def overlap_chrom(self, chromosome: Chromosome, threshold: float = 0.8):
242
243
        """
244
        Calculates the overlap between the segments stored here and a given chromosome
245
        :param chromosome: A Chromosome() object defining the start and end of the p and q arms, respectively
246
        :return: A dictionary containing
247
        """
248
249
        # Do the CNVs within this chromosome encompass a given chromosome more than the specified threshold?
250
        chrom_name = chromosome.chrom
251
252
        # Handle "chr" prefix nonsense
253
        if self.is_chr_prefixed and not chrom_name.startswith("chr"):
254
            chrom_name = "chr" + chrom_name
255
        elif not self.is_chr_prefixed and chrom_name.startswith("chr"):
256
            chrom_name = chrom_name.replace("chr", "")
257
258
        try:
259
            chrom_starts = self.starts[chrom_name]
260
        except KeyError:
261
            # No CN events were provided for this chromosome, hence there can be no overlap
262
            return {}
263
        chrom_ends = self.ends[chrom_name]
264
        chrom_cn = self.cn_states[chrom_name]
265
266
        homdel_sum = {"p": 0, "q": 0, "Chrom": 0}
267
        del_sum = {"p": 0, "q": 0, "Chrom": 0}
268
        gain_sum = {"p": 0, "q": 0, "Chrom": 0}
269
        amp_sum = {"p": 0, "q": 0, "Chrom": 0}
270
271
        # Check for overlapping segments for each arm
272
        for (start, end, cn) in zip(chrom_starts, chrom_ends, chrom_cn):
273
274
            # Ignore copy-neutral segments
275
            if cn == 2:
276
                continue
277
278
            if end < chromosome.p_start or start > chromosome.q_end:
279
                # Segment falls out of range
280
                continue
281
            if start < chromosome.p_end:
282
                olap_start = start if start > chromosome.p_start else chromosome.p_start
283
                olap_end = end if end < chromosome.p_end else chromosome.p_end
284
                if cn < 2:
285
                    del_sum["p"] += olap_end - olap_start
286
                    del_sum["Chrom"] +=  olap_end - olap_start
287
                    if cn < 1:
288
                        homdel_sum["p"] += olap_end - olap_start
289
                        homdel_sum["Chrom"] += olap_end - olap_start
290
                elif cn > 2:
291
                    gain_sum["p"] += olap_end - olap_start
292
                    gain_sum["Chrom"] += olap_end - olap_start
293
                    if cn > 3:
294
                        amp_sum["p"] += olap_end - olap_start
295
                        amp_sum["Chrom"] += olap_end - olap_start
296
297
            if end > chromosome.q_start:  # We use an if, not elif, in case a segment overlaps both the p and q arm
298
                olap_start = start if start > chromosome.q_start else chromosome.q_start
299
                olap_end = end if end < chromosome.q_end else chromosome.q_end
300
                if cn < 2:
301
                    del_sum["q"] += olap_end - olap_start
302
                    del_sum["Chrom"] +=  olap_end - olap_start
303
                    if cn < 1:
304
                        homdel_sum["q"] += olap_end - olap_start
305
                        homdel_sum["Chrom"] += olap_end - olap_start
306
                elif cn > 2:
307
                    gain_sum["q"] += olap_end - olap_start
308
                    gain_sum["Chrom"] += olap_end - olap_start
309
                    if cn > 3:
310
                        amp_sum["q"] += olap_end - olap_start
311
                        amp_sum["Chrom"] += olap_end - olap_start
312
313
        events = {}
314
        # Now, calculate the fraction of each overlap
315
        # Start from the highest level and biggest event, and work our way down/smaller
316
        # Amplifications/gains
317
        if amp_sum["Chrom"] / (chromosome.p_length + chromosome.q_length) > threshold: # Whole chromosome Amp
318
            events[chrom_name + "Chrom"] = "AMP"
319
        # Now check for arm level events
320
        else:
321
            if amp_sum["p"] / chromosome.p_length > threshold:  # p Amp
322
                events[chrom_name + "p"] = "AMP"
323
            elif amp_sum["q"] / chromosome.q_length > threshold:  # q Amp
324
                events[chrom_name + "q"] = "AMP"
325
            if chrom_name + "Chrom" not in events and gain_sum["Chrom"] / (chromosome.p_length + chromosome.q_length) > threshold:  # Whole chromosome gain
326
                events[chrom_name + "Chrom"] = "GAIN"
327
            else:
328
                if chrom_name + "p" not in events and gain_sum["p"] / chromosome.p_length > threshold:  # p Gain
329
                    events[chrom_name + "p"] = "GAIN"
330
                if  chrom_name + "q" not in events and gain_sum["q"] / chromosome.q_length > threshold:  # q Gain
331
                    events[chrom_name + "q"] = "GAIN"
332
333
        # Homozygous and heterozygous deletions
334
        if homdel_sum["Chrom"] / (chromosome.p_length + chromosome.q_length) > threshold: # Whole chromosome Homozygous deletion
335
            events[chrom_name + "Chrom"] = "HOMDEL"
336
        # Now check for arm level events
337
        else:
338
            if homdel_sum["p"] / chromosome.p_length > threshold:  # p HOMDEL
339
                events[chrom_name + "p"] = "HOMDEL"
340
            elif homdel_sum["q"] / chromosome.q_length > threshold:  # q HOMDEL
341
                events[chrom_name + "q"] = "HOMDEL"
342
            if chrom_name + "Chrom" not in events and del_sum["Chrom"] / (chromosome.p_length + chromosome.q_length) > threshold:  # Whole chromosome deletions
343
                events[chrom_name + "Chrom"] = "HETLOSS"
344
            else:
345
                if chrom_name + "p" not in events and del_sum["p"] / chromosome.p_length > threshold:  # p deletion
346
                    events[chrom_name + "p"] = "HETLOSS"
347
                if  chrom_name + "q" not in events and del_sum["q"] / chromosome.q_length > threshold:  # q deletions
348
                    events[chrom_name + "q"] = "HETLOSS"
349
350
        return events
351
352
353
    def adjust_ploidy(self, arm_coords, sample_name, redo=False):
354
        """
355
        Adjust a sample's CN states based upon ploidy
356
357
        Calculate the average copy number state across the entire genome. If a sample is not diploid, normalize all the CN
358
        changes
359
        :param arm_coords: A dictionary containing {"chromosome_name": Chromosome()} objects which list chromosomal coordinates
360
        :param sample_name: A string specifying the sample name. Used for debugging/error purposes
361
        :param redo: Should we override any existing ploidy state for this sample?
362
        :return: None
363
        """
364
365
        if self.ploidy is not None and not redo:
366
            # Ploidy for this sample has already been calculated. Don't do anything
367
            return None
368
369
        # Store the length affected for each ploidy state
370
        ploidy_cov = {}
371
        genome_size = 0
372
373
        for chromosome in arm_coords.values():
374
            # Find overlapping CNVs for this arm
375
376
            # if the p or q arm coordinates are not set, use a placeholder
377
            if chromosome.p_start is None:
378
                chromosome.p_start = -100000
379
                chromosome.p_end = -100000
380
                chromosome.p_length = 1
381
            if chromosome.q_start is None:
382
                chromosome.q_start = -100000
383
                chromosome.q_end = -100000
384
                chromosome.q_length = 1
385
386
            # Save the size of this chromosome
387
            genome_size += chromosome.p_length
388
            genome_size += chromosome.q_length
389
390
            chrom_name = chromosome.chrom
391
            # Handle "chr" prefix nonsense
392
            if self.is_chr_prefixed and not chrom_name.startswith("chr"):
393
                chrom_name = "chr" + chrom_name
394
            elif not self.is_chr_prefixed and chrom_name.startswith("chr"):
395
                chrom_name = chrom_name.replace("chr", "")
396
            try:
397
                chrom_starts = self.starts[chrom_name]
398
            except KeyError:
399
                # No CN events were provided for this chromosome, hence there can be no overlap
400
                continue
401
            chrom_ends = self.ends[chrom_name]
402
            chrom_cn = self.cn_states[chrom_name]
403
404
            # Check for overlapping segments for each arm
405
            for (start, end, cn) in zip(chrom_starts, chrom_ends, chrom_cn):
406
407
                # Check p-arm overlap
408
                if end < chromosome.p_start or start > chromosome.q_end:
409
                    # Segment falls out of range
410
                    continue
411
                if start < chromosome.p_end:
412
                    olap_start = start if start > chromosome.p_start else chromosome.p_start
413
                    olap_end = end if end < chromosome.p_end else chromosome.p_end
414
415
                    # Store the length of this overlap and associated CN
416
                    if cn not in ploidy_cov:
417
                        ploidy_cov[cn] = 0
418
                    ploidy_cov[cn] += olap_end - olap_start
419
420
                if end > chromosome.q_start:  # We use an if, not elif, in case a segment overlaps both the p and q arm
421
                    olap_start = start if start > chromosome.q_start else chromosome.q_start
422
423
                    olap_end = end if end < chromosome.q_end else chromosome.q_end
424
                    # Store the length of this overlap and associated CN
425
                    if cn not in ploidy_cov:
426
                        ploidy_cov[cn] = 0
427
                    ploidy_cov[cn] += olap_end - olap_start
428
429
        # Now that we have calculated the number of bases affected by each CN state, calculate the ploidy
430
        x = 0
431
        for cn, bases in ploidy_cov.items():
432
            x += cn * bases
433
434
        ploidy = x / genome_size
435
        av_ploidy = round(ploidy)
436
437
        # Sanity check
438
        if av_ploidy < 1:
439
            raise TypeError("Ploidy of sample \'%s\' was calculated to be below 1!" % sample_name)
440
441
        # If this tumour is not diploid, adjust the ploidy
442
        if av_ploidy != 2:
443
            logging.debug("\'%s\' was calculated to have a ploidy of %s" % (sample_name, av_ploidy))
444
            ploidy_dif = av_ploidy - 2
445
            new_cns = {}
446
            for chrom, cn in self.cn_states.items():
447
                x = list(y - ploidy_dif for y in cn)
448
                new_cns[chrom] = x
449
            self.cn_states = new_cns
450
451
        self.ploidy = av_ploidy
452
453
454
def get_args():
455
456
    def is_valid_dir(path, parser):
457
        """
458
        Checks to ensure the directory path exists
459
        :param path: A string containing a filepath to a directory
460
        :param parser: An argparse.ArgumentParser() object
461
        :return: path, if the string is a valid directory
462
        :raises: parser.error() if the directory does not exist
463
        """
464
        if os.path.exists(path) and os.path.isdir(path):
465
            return path
466
        else:
467
            raise parser.error("Unable to set \'%s\' as the output directory: Not a valid directory" % path)
468
469
    def is_valid_file(path, parser):
470
        """
471
        Checks to ensure the specified file exists
472
        :param path: A string containing a filepath
473
        :param parser: An argparse.ArgumentParser() object
474
        :return: path, if the string is a valid directory
475
        :raises: parser.error() if the file does not exist
476
        """
477
        if os.path.exists(path) and os.path.isfile(path):
478
            return path
479
        else:
480
            raise parser.error("Unable to locate \'%s\': No such file or directory" % path)
481
482
    epilog = os.linesep.join(["The --arms, --entrez_ids, and --lymphgen_genes files can be found in the \'resources\' folder. ",
483
                              "Note that genome and exome sequencing types are handled exactly the same. ",
484
                              "The --entrez-ids file must have the Hugo_Symbol under the column \"Approved Symbol\" and the Entrez ID under the column \"NCBI Gene ID(supplied by NCBI)\". ",
485
                              "The --cnvs file should contain the following colummns: Tumor_Sample_Barcode, chromosome, start, end, CN. ",
486
                              "The --arms file should contain the following columns: chromosome, start, end, arm. "
487
                              ])
488
    parser = argparse.ArgumentParser(description="Generates input files for the LymphGen classifier\nVersion 1.0.0", epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter)
489
    snvs_args = parser.add_argument_group("Input mutation files")
490
    snvs_args.add_argument("-m", "--maf", metavar="MAF", required=True, type=lambda x: is_valid_file(x, parser), help="Input MAF file listing somatic mutations")
491
    snvs_args.add_argument("-e", "--entrez_ids", metavar="TSV", required=True, type=lambda x: is_valid_file(x, parser), help="A tab-delimited file containing gene names (Hugo Symbol) and the corresponding Entrez gene ID")
492
    cnv_args = parser.add_argument_group("(Optional) Input CNV files")
493
    cnv_args.add_argument("-c", "--cnvs", metavar="TSV", default=None, type=lambda x: is_valid_file(x, parser), help="Input tab-delimited file summarizing copy number events")
494
    cnv_args.add_argument("--log2", action="store_true", help="Does the input CNV file use log2 ratios? (i.e. log2(absoluteCN) - 1)")
495
    cnv_args.add_argument("-g", "--genes", metavar="BED", default=None, type=lambda x: is_valid_file(x, parser), help="Input BED4+ file listing start and end positions of genes/exons")
496
    cnv_args.add_argument("-a", "--arms", metavar="TSV", default=None, type=lambda x: is_valid_file(x, parser), help="Input tab-delimited file listing the positions of chromosome arms")
497
498
    parser.add_argument("-l", "--lymphgen_genes", metavar="TXT", default=None, type=lambda x: is_valid_file(x, parser), required=False, help="An optional file listing all Entrez IDs considered by the LymphGen classifier. Output will be subset to these genes")
499
    parser.add_argument("-s", "--sequencing_type", metavar="SEQ", choices=["targeted", "exome", "genome"], required=True, help="Sequencing type used to obtain somatic mutations")
500
    parser.add_argument("-o", "--outdir", metavar="PATH", required=True, type=lambda x: is_valid_dir(x, parser), help="Output directory for LymphGen input files")
501
    parser.add_argument("--outprefix", metavar="STRING", default=None, help="Output files will be prefixed using this string [Default: Use the base name of the MAF file]")
502
    parser.add_argument("-v", "--verbose", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], default="INFO", help="Set logging verbosity")
503
504
    args = parser.parse_args()
505
    # If no outprefix was set, use the basename of the maf file as the output prefix
506
    if args.outprefix is None:
507
        args.outprefix = os.path.basename(args.maf).split(".")[0]
508
509
    # If --cnvs is specified, the annotation files are also required
510
    if args.cnvs:
511
        if not args.genes or not args.arms:
512
            raise parser.error("--genes and --arms are required if --cnvs is specified")
513
514
    logging.basicConfig(level=getattr(logging, args.verbose), format='%(levelname)s:%(message)s')
515
516
    return args
517
518
519
def load_entrez_ids(entrez_file: str, hugo_column: str="Approved symbol", entrez_column: str="NCBI Gene ID(supplied by NCBI)",
520
                    status_column: str="Status", prev_name_column: str="Previous symbols"):
521
    """
522
    Creates a dictionary which contains the gene name (Hugo Symbol) and corresponding entrez ID (NCBI ID) from the
523
    specified text file
524
525
    Note the input file must have a header. An example file can be obtained from https://www.genenames.org/download/custom/
526
    Ensure the "Approved symbol" and "NCBI Gene ID" columns are selected. The default column names reflect those obtained
527
    from this source.
528
    The "Approval Status" column is optional, but will be used to remove gene names which have since been withdrawn.
529
    If the "Previous symbols" solumn is included, these gene names will also be assigned to that Entrez ID
530
    The subset_file should list one Entrez ID per line
531
    All extra columns are ignored
532
533
    :param entrez_file: A string containing a filepath to a tsv file containing the gene names and gene IDs
534
    :param hugo_column: A string specifying the name of the column which contains
535
    :param entrez_column: A string specifying the name of the column which contains the Entrez gene ID
536
    :param status_column: A string specifying the name of the column which contains the Approval Status. Optional.
537
    :param prev_name_column: A string specifting the name of the column which specifies previously used names for each gene. Optional
538
    :return: Two dictionaries containing {"Gene Name": "Entrez_ID"}
539
    """
540
    hugo_name_to_id = {}
541
    alt_hugo_name_to_id = {}  # If Previous symbols are included
542
    skipped_genes = []
543
544
    hugo_col_num = None
545
    entrez_col_num = None
546
    status_col_num = None  # Optional
547
    prev_name_col_num = None  # Optional
548
    i = 0
549
550
    with open(entrez_file) as f:
551
        for line in f:
552
            i += 1
553
            line = line.rstrip("\n").rstrip("\r")  # Remove line endings
554
            cols = line.split("\t")
555
            if len(cols) < 2:  # Sanity check this file has multiple columns
556
                raise TypeError("Input file %s does not appear to be a tab-delimited file" % entrez_file)
557
558
            # If we have not assigned column IDs yet, then we have not processed the file header
559
            # Load the file header
560
            if hugo_col_num is None or entrez_col_num is None:
561
                j = 0
562
                for col in cols:
563
                    if col == hugo_column:
564
                        hugo_col_num = j
565
                    if col == entrez_column:
566
                        entrez_col_num = j
567
                    if status_column is not None and col == status_column:
568
                        status_col_num = j
569
                    if prev_name_column is not None and col == prev_name_column:
570
                        prev_name_col_num = j
571
572
                    j += 1
573
                # Sanity check that we have found all the columns we need
574
                if hugo_col_num is None:
575
                    raise AttributeError("Unable to locate a column corresponding to \'%s\' in the input file \'%s\'" % (hugo_column, entrez_file))
576
                elif entrez_col_num is None:
577
                    raise AttributeError("Unable to locate a column corresponding to \'%s\' in the input file \'%s\'" % (entrez_column, entrez_file))
578
                # If we have made it this far, then the header was valid
579
                continue
580
581
            # Check the Approval Status column to see if the gene name has been withdrawn
582
            if status_col_num is not None:
583
                if cols[status_col_num] != "Approved":
584
                    continue
585
586
            # Parse the gene name and ID from the input file
587
            try:
588
                hugo_name = cols[hugo_col_num]
589
                entrez_id = cols[entrez_col_num]
590
                # Sanity check: The entrez ID is ALWAYS a number
591
                if not entrez_id.isdigit():
592
                    # If there is no entrez ID, skip this gene
593
                    if entrez_id == "":
594
                        skipped_genes.append(hugo_name)
595
                        continue
596
                    else:
597
                        raise TypeError("When parsing line %s of \'%s\', the Entrez gene ID \'%s\' does not appear to be a valid Entrez Gene ID" % (i, entrez_file, entrez_id))
598
599
            except IndexError as e:
600
                # If we end up in here, then there were too few columns in the current line to parse the IDs
601
                raise AttributeError("Unable to parse line %s of \'%s\': Line is truncated?" % (i, entrez_file)) from e
602
603
            # Sanity check: Make sure we haven't seen this gene name before
604
            if hugo_name in hugo_name_to_id:
605
                raise AttributeError("Gene \'%s\' appears duplicated in \'%s\'" % (hugo_name, entrez_file))
606
            hugo_name_to_id[hugo_name] = entrez_id
607
608
            # If the Previous symbols column was found, annotate these gene names with the same ID
609
            previous_names = cols[prev_name_col_num].split(",")
610
            if previous_names[0] == "":
611
                continue  # There are no previous names for this gene
612
            previous_names = [x.replace(" ", "") for x in previous_names]  # Remove extra spaces
613
            for hugo_name in previous_names:
614
                # If we have seen this gene name before, then don't overwrite it. Just assume that the first instance
615
                # encountered is correct. This likely isn't the case, but these cases should be so rare that they are
616
                # not worth worrying about
617
                if hugo_name in alt_hugo_name_to_id:
618
                    continue
619
                alt_hugo_name_to_id[hugo_name] = entrez_id
620
621
    # Final sanity check: As Hugo Symbols and Entrez IDs have a 1-1 mapping, make sure there are no duplicate Entrez IDs
622
    if len(hugo_name_to_id) != len(set(hugo_name_to_id.values())):
623
        num_ids = Counter(hugo_name_to_id.values())
624
        # For simplicity, just print out one Entrez ID which is duplicated
625
        bad_id = num_ids.most_common(1)[0][0]
626
        bad_count = num_ids.most_common(1)[0][1]
627
        raise AttributeError("Entrez gene ID \'\%s\' appears %s times in the input file" % (bad_id, bad_count))
628
629
    return hugo_name_to_id, alt_hugo_name_to_id
630
631
632
def load_subset_ids(id_file):
633
    """
634
    Loads a set of Entrez IDs from a specified file. One ID/line
635
636
    :param id_file:
637
    """
638
639
    subset_ids = []
640
    with open(id_file) as f:
641
        for line in f:
642
            line = line.rstrip("\n").rstrip("\r")
643
            if not line.isdigit():
644
                raise TypeError("Invalid Entrez ID when processing the --lyphmgen_genes file: %s" % line)
645
            subset_ids.append(line)
646
647
    subset_ids = set(subset_ids)
648
    return subset_ids
649
650
651
def generate_mut_flat(in_maf: str, seq_type: str, gene_ids: dict, out_mut_flat: str, out_gene_list: str, alt_gene_ids: dict = None, subset_ids: iter = None):
652
    """
653
    Converts a MAF file into the mutation flat file and gene list used by LymphGen
654
    WARNING: GRCh37/hg19 is currently only supported!
655
656
    An example of the output mutation FLAT file:
657
    Sample  ENTREZ.ID   Type    Location
658
    Sample1 604 MUTATION    187451395
659
    Sample1 604 MUTATION    187451408
660
    Sample1 613 MUTATION    23523596
661
    Sample1 970 TRUNC   6590925
662
    Sample1 1840    Synon   113495932
663
664
    An example of the output gene list:
665
    "ENTREZ.ID"
666
    60
667
    355
668
    567
669
    596
670
    604
671
    605
672
    613
673
    639
674
    673
675
676
    If targeted sequencing is specified, only the genes with at least one non-synonmymous mutation in the MAF file are used for
677
    the gene list. Otherwise, ALL genes are used.
678
679
    The following MAF columns are required: Hugo_Symbol, Variant_Classification, Tumor_Sample_Barcode. If NCBI_Build and
680
    Start_Position are found, the output mutation flat file will contain an additional column specifying "Location".
681
     If Tumor_Seq_Allele2 is provided, the MYD88 hotspot mutations will be annotated as "L265P"
682
683
    :param in_maf: A string containing a filepath to a MAF file containing the variants of interest
684
    :param seq_type: A string specifying the sequencing type used to identify mutations. Only "targeted", "exome", and "genome" are supported
685
    :param gene_ids: A dictionary containing the mapping between Hugo_Symbol: Entrez_Gene_ID.
686
    :param out_mut_flat: A string specifying the output path to the mutation flat file
687
    :param out_gene_list: A string specifying the output path to the gene list
688
    :param alt_gene_ids: An additional dictionary specifying alternate Hugo_Symbol: Entrez_Gene_ID mappings
689
    :param subset_ids: An interable specifying a list of Entrez IDs. Only mutations within these Entrez IDs will be output
690
    :return: A list specifying all samples analyzed for mutations
691
    """
692
    logging.info("Processing MAF file")
693
694
    # Non-synonmymous mutation types:
695
    non_syn = {"In_Frame_Del", "In_Frame_Ins", "Missense_Mutation"}
696
    trunc_mut = {"Frame_Shift_Del", "Frame_Shift_Ins", "Nonsense_Mutation", "Nonstop_Mutation", "Splice_Site", "Translation_Start_Site"}
697
    synon_mut = {"Silent", "5'UTR", "5'Flank", "Intron", "3'UTR"}  # Use for the "Synon" mutation type
698
699
    # Which samples are we analyzing?
700
    sample_list = SortedSet()  # Used to ensure the output files are in somewhat sorted order
701
702
    required_cols = ["Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode"]
703
    optional_cols = ["NCBI_Build", "Start_Position", "Tumor_Seq_Allele2"]
704
    header_cols = {}
705
    out_header_written = False
706
    out_header_cols = ["Sample", "ENTREZ.ID", "Type"]
707
    if alt_gene_ids is None:
708
        alt_gene_ids = {}
709
710
    # For targeted sequencing, which genes have we sequenced and seen mutations in?
711
    genes_seen = {}
712
    skipped_mut = []  # Store mutations in genes with no Entrez ID
713
714
    i = 0
715
    with open(in_maf) as f, open(out_mut_flat, "w") as o:
716
        for line in f:
717
            i += 1
718
            # Ignore comment lines
719
            if line[0] == "#":
720
                continue
721
            line = line.rstrip("\n").rstrip("\r")
722
            cols = line.split("\t")
723
724
            # Skip empty lines
725
            if not line:
726
                continue
727
728
            # If we haven't parsed the header yet, assume we are parsing the first line of the file, and that is the header
729
            if not header_cols:
730
                j = 0
731
                for col in cols:
732
                    if col in required_cols:
733
                        header_cols[col] = j
734
                    elif col in optional_cols:
735
                        header_cols[col] = j
736
                    j += 1
737
738
                # Check that we have all required columns
739
                for col in required_cols:
740
                    if col not in header_cols:
741
                        raise AttributeError("Unable to locate a column specifying \'%s\' in the MAF file header" % col)
742
                # See which optional columns we have
743
                if "Start_Position" in header_cols:
744
                    out_header_cols.append("Location")
745
                    logging.info("\'Start_Position\' column found in MAF file. Output mutation flat file will be annotated with \'Position\'")
746
                if "Tumor_Seq_Allele2" in header_cols:
747
                    logging.info(
748
                        "\'Tumor_Seq_Allele2\' column found in MAF file. MYD88 hotspot mutations will be annotated as \'L265P\'")
749
                continue
750
751
            # Process mutations
752
            mut_attributes = {x: cols[y] for x, y in header_cols.items()}
753
754
            # Check genome build, as only GRCh37 is supported
755
            if "NCBI_Build" in mut_attributes and "Start_Position" in mut_attributes and mut_attributes["NCBI_Build"] != "GRCh37":
756
                raise NotImplementedError("Only GRCh37 is currently supported as a reference genome")
757
758
            # Lets get the easiest stuff out of the way first. Specify the output sample name and gene ID
759
            sample_name = mut_attributes["Tumor_Sample_Barcode"]
760
            if sample_name == "" or sample_name == ".":
761
                raise AttributeError("No tumor_sample_barcode was found when processing line %s of the MAF file" % i)
762
763
            hugo_name = mut_attributes["Hugo_Symbol"]
764
            try:
765
                # Skip intergenic mutations
766
                if hugo_name == "Unknown":
767
                    continue
768
                entrez_id = gene_ids[hugo_name]
769
            except KeyError:
770
                # If this gene does not have a Entrez ID, are we prehaps using an older Hugo_Symbol?
771
                if hugo_name in alt_gene_ids:
772
                    entrez_id = alt_gene_ids[hugo_name]
773
                else:
774
                    # Skip this mutation if there is no Entrez ID
775
                    skipped_mut.append(line)
776
                    continue
777
778
            # Obtain position (if Start_Position is specified)
779
            if "Start_Position" in mut_attributes:
780
                position = mut_attributes["Start_Position"]
781
            else:
782
                position = None
783
784
            # Finally, specify the variant type
785
            if mut_attributes["Variant_Classification"] in trunc_mut:
786
                genes_seen[entrez_id] = hugo_name  # This gene has a non-synonmymous mutation, so lets assume we have sequenced it
787
                type = "TRUNC"
788
            elif mut_attributes["Variant_Classification"] in non_syn:
789
                genes_seen[entrez_id] = hugo_name  # This gene has a non-synonmymous mutation, so lets assume we have sequenced it
790
                type = "MUTATION"
791
            elif mut_attributes["Variant_Classification"] in synon_mut:
792
                genes_seen[entrez_id] = hugo_name
793
                type = "Synon"
794
            else:
795
                # Ignore intronic mutations, 3'UTRs etc
796
                continue
797
798
            # If this mutation is in MYD88, does it affect the hotspot?
799
            if hugo_name == "MYD88" and "Tumor_Seq_Allele2" in mut_attributes:
800
                if mut_attributes["Start_Position"] == "38182641" and mut_attributes["Tumor_Seq_Allele2"] == "C":
801
                    type = "L265P"
802
803
            if sample_name not in sample_list:
804
                sample_list.add(sample_name)
805
806
            # If this gene isn't in the subset list provide it, skip it
807
            if subset_ids is not None and entrez_id not in subset_ids:
808
                continue
809
810
            # Finally, write this variant
811
            # Write a header line, if we have not done so already
812
            if not out_header_written:
813
                out_header_written = True
814
                o.write("\t".join(out_header_cols))
815
                o.write(os.linesep)
816
817
            out_line = [sample_name, entrez_id, type, position + os.linesep]
818
            o.write("\t".join(out_line))
819
820
    # Check to see that we actually converted mutations
821
    if len(genes_seen) == 0:
822
        raise AttributeError("No mutations were successfully converted. Check that the --entrez_ids file has valid Hugo Symbols matching the --maf file")
823
    elif skipped_mut:
824
        logging.warning("%s mutations in the MAF file were not converted. These either don't have a valid Entrez ID, or they were not in the --lymphgen_gene file" % len(skipped_mut))
825
826
    logging.info("Finished processing MAF file")
827
    logging.info("Generating gene list")
828
    # Generate the gene list file
829
    # This file a single column with the Entrez ID, as that is all LymphGen currently supports
830
    with open(out_gene_list, "w") as o:
831
        # Write header
832
        o.write("\"ENTREZ.ID\"" + os.linesep)
833
834
        # Write out all genes we have seen a mutation in
835
        for entrez_id in genes_seen.keys():
836
            if subset_ids is not None and entrez_id not in subset_ids:
837
                continue  # Skip these gene, as it wasn't in the --lymphgen_gene list
838
            o.write(entrez_id)
839
            o.write(os.linesep)
840
841
        if seq_type == "exome" or seq_type == "genome":
842
            # We have sequenced (effectively) all genes in the human genome. Write out all remaining genes
843
            for entrez_id in gene_ids.values():
844
                if entrez_id in genes_seen:  # We have already written out this gene. Skip it
845
                    continue
846
                if subset_ids is not None and entrez_id not in subset_ids:
847
                    continue  # Skip these gene, as it wasn't in the --lymphgen_gene list
848
                o.write(entrez_id)
849
                o.write(os.linesep)
850
851
    return list(sample_list)
852
853
854
def load_gene_coords_bed(bed_file, gene_ids, alt_gene_ids=None):
855
    """
856
    Load in the genomic coordinates of genes in the human genome from the specified BED4+ file.
857
858
    The following columns are required (no header)
859
    chrom   start   end gene
860
861
    A gene can be split across multiple BED entries (ex. exonic coordinates). In this case, the start and end of the first and last exon,
862
    respectively, will be used as the gene start/end coordinates
863
864
    :param bed_file: A string containing a filepath to a BED4+ file listing the positions of genes
865
    :return: A dictionary storing {gene_name: Gene()}
866
    """
867
    if alt_gene_ids is None:
868
        alt_gene_ids = {}
869
    gene_coords = {}
870
    skipped_genes = []
871
872
    i = 0
873
    with open(bed_file) as f:
874
        for line in f:
875
            i += 1
876
            line = line.rstrip("\n").rstrip("\r")  # Remove line endings
877
            cols = line.split("\t")
878
879
            # Skip empty lines
880
            if not line:
881
                continue
882
883
            # Since this BED file could contain more than 4 columns (ex. a BED6 or BED12 file), manually unpack and inspect the first
884
            # four columns, since those are the columns we care about
885
            try:
886
                chrom = cols[0]
887
                start = cols[1]
888
                end = cols[2]
889
                gene = cols[3]
890
            except IndexError:
891
                # This BED entry was trucated
892
                raise AttributeError("Unable to parse line %s of %s as it contains less than four columns (chrom, start, end, gene)" % (i, bed_file))
893
894
            # Check that the start and end are actually genomic coordinates
895
            if not start.isdigit():
896
                raise TypeError("When parsing line %s of %s, the start position \'%s\' is not a valid genomic coordinate" % (i, bed_file, start))
897
            if not end.isdigit():
898
                raise TypeError("When parsing line %s of %s, the end position \'%s\' is not a valid genomic coordinate" % (i, bed_file, end))
899
900
            start = int(start)
901
            end = int(end)
902
            chrom = chrom.replace("chr", "")
903
904
            # Is the gene name the Hugo Symbol or the Entrez gene ID?
905
            # If its an integer, lets assume its the Entrez ID
906
            # If its not, assume it is the Hugo Symbol and convert it to the Entrez ID
907
            if not gene.isdigit():
908
                try:
909
                    entrez_id = gene_ids[gene]
910
                except KeyError:
911
                    # If we can't find this Hugo Symbol in the default mappings, check the alt gene IDs
912
                    if gene in alt_gene_ids:
913
                        entrez_id = alt_gene_ids[gene]
914
                    else:
915
                        # This gene doesn't have an Entrez ID. Skip it
916
                        skipped_genes.append(gene)
917
                        continue
918
            else:
919
                entrez_id = gene
920
921
            # Have we seen this chromosome before?
922
            if chrom not in gene_coords:
923
                gene_coords[chrom] = {}
924
            # Have we seen this gene before?
925
            if entrez_id in gene_coords[chrom]:
926
                # If so, then we need to update the start/end of this gene based on this new BED entry
927
                existing_gene_entry = gene_coords[chrom][entrez_id]
928
929
                # Sanity check
930
                if chrom != existing_gene_entry.chrom:
931
                    raise AttributeError("We found two entries for gene \'%s\'. One is found on chromosome \'%s\', while the other is found on \'%s\'"
932
                                         % (gene, chrom, existing_gene_entry.chrom))
933
                if start < existing_gene_entry.start:
934
                    existing_gene_entry.start = start
935
                if end > existing_gene_entry.end:
936
                    existing_gene_entry.end = end
937
            else:
938
                # If we haven't seen this gene before, create a new entry for it
939
                gene_coords[chrom][entrez_id] = Gene(chrom, start, end, entrez_id)
940
941
    # Now that we have processed all genes, make sure we actually found Entrez IDs for a handful of genes
942
    # If we haven't, it means the input file likely didn't contain Hugo Symbols
943
    if len(gene_coords) == 0:
944
        raise AttributeError("No Hugo Symbols from the --genes file (column 4) were found in the --entrez_ids file. An example gene is %s" % (skipped_genes[0]))
945
946
    return gene_coords
947
948
949
def load_chrom_arm(arm_file):
950
    """
951
    Loads in the genomic coordinates corresponding to the arm of each chromosome
952
953
    The input should be a tab-delimited file with the following columns
954
    chromosome  start   end arm
955
956
    :param arm_file: A string containing a filepath to a tab-delimited file containing genomic coordinates for chromosome arms
957
    :return: A dictionary storing the genomic range of each chromosome, and each individual arm
958
    """
959
960
    arm_chrom = {}
961
    required_cols = ["chromosome", "start", "end", "arm"]
962
    header_cols = {}
963
964
    i = 0
965
    with open(arm_file) as f:
966
        for line in f:
967
            i += 1
968
            line = line.rstrip("\n").rstrip("\r")  # Remove line endings
969
            cols = line.split("\t")
970
971
            # Skip empty lines
972
            if not line:
973
                continue
974
975
            # If we haven't parsed the header yet, assume this is the first line of the file (aka the header)
976
            if not header_cols:
977
                j = 0
978
                for col in cols:
979
                    if col in required_cols:
980
                        header_cols[col] = j
981
                    j += 1
982
983
                # Check to make sure all required columns are found
984
                for col in required_cols:
985
                    if col not in header_cols:
986
                        raise AttributeError("Unable to locate column %s in the chromosome arm positions file \'%s\'" % (col, arm_file))
987
                # If we get this far, the header is valid
988
                continue
989
990
            try:
991
                arm_attributes = {x: cols[y] for x, y in header_cols.items()}
992
            except IndexError:
993
                # We were unable to find a required column. This line is likely truncated
994
                raise AttributeError("Unable to parse line %s of the chromosome arm positions file %s: The line appears truncated" % (i, arm_file))
995
996
            # Have we seen this chromosome before? If not, lets create an object to store its genomic features
997
            chrom = arm_attributes["chromosome"]
998
999
            # Strip chr prefix
1000
            chrom = chrom.replace("chr", "")
1001
1002
            if chrom not in arm_chrom:
1003
                arm_chrom[chrom] = Chromosome(chrom)
1004
1005
            # Save the arm coordinates in the chromosome
1006
            try:
1007
                arm_chrom[chrom].add(int(arm_attributes["start"]), int(arm_attributes["end"]), arm_attributes["arm"])
1008
            except ValueError as e:
1009
                raise TypeError("Unable to process line %s of \'%s\': start and end must be integers") from e
1010
1011
    return arm_chrom
1012
1013
1014
def get_overlap_genes(chrom: str, start: int, end: int, copy_num: int, gene_cords: dict):
1015
1016
    """
1017
    Find the genes which overlap a given segment
1018
1019
    This is a very brute-force approach which will check all genes on the target chromosome for overlap. Pre-sorting and bisecting genomic
1020
    regions would be significantly faster, but 1) You need to handle overlapping genes, and 2) I am assuming the performance penalty won't
1021
    matter too much.
1022
1023
    Note that gains and losses are handled differently if a segment partially overlaps a gene. If the event is a loss, the gene is included.
1024
    If the event is a gain, the gene is NOT included, as that copy is likely not functional
1025
1026
    :param chrom: A string coresponding to the contig name
1027
    :param start: An int specifying the start of the segment
1028
    :param end: An int specifying the end of the segment
1029
    :param copy_num: An integer specifying the copy number of this segment
1030
    :param gene_cords: A dictionary storing the positions of genes, in the format {chromosome: {gene1: attr, gene2: attr...}}
1031
    :return:
1032
    """
1033
1034
    # Handle chr-prefix shenanigans
1035
    is_chr_prefixed = next(iter(gene_cords.keys())).startswith("chr")
1036
    if is_chr_prefixed and not chrom.startswith("chr"):
1037
        chrom = "chr" + chrom
1038
    elif not is_chr_prefixed and chrom.startswith("chr"):
1039
        chrom = chrom.replace("chr", "")
1040
1041
    try:
1042
        genes_on_chrom = gene_cords[chrom]
1043
    except KeyError:
1044
        # No genes are on this contig. This could be a user error, or this a contig with no annotated genes
1045
        return []
1046
1047
    # Go through each gene on this chromosome and see if the coordinates overlap with our regions
1048
    olap_genes = []
1049
    for entrez_id, gene_info in genes_on_chrom.items():
1050
        if start < gene_info.end and end > gene_info.start:
1051
            # Overlap found
1052
            # Now for the tricky part
1053
            # If the segment partially overlaps this gene, then we need to handle gains and losses differently
1054
            # Deleting or gaining half of a gene will likely cause it to no longer function
1055
            # Thus, partial deletions of genes could be drivers, while partial gains of genes are likely never drivers
1056
            if copy_num < 2:
1057
                olap_genes.append(entrez_id)
1058
            elif copy_num > 2:
1059
                if start > gene_info.start or end < gene_info.end:
1060
                    continue  # Partially overlapping
1061
                else:
1062
                    olap_genes.append(entrez_id)
1063
            else:
1064
                raise NotImplementedError("Not calculating overlapping genes for copy-neutral segments")
1065
1066
    return olap_genes
1067
1068
1069
def generate_cnv_files(cnv_segs, gene_regions_bed, arm_regions, gene_ids, out_cnv_gene, out_cnv_arm, sample_ids, alt_gene_ids=None, subset_ids=None, input_log2: bool= False, focal_cn_thresh:int = 30000000):
1070
    """
1071
    Characterize focal and arm-level copy number events, and summarize them in the respective output files.
1072
1073
    For focal events (i.e. events smaller than the specified threshold), identify all genes which overlap the even, and save them to the out_cnv_gene file
1074
    All events are used to flag chromosomes or individual chromosomal arms which are gained or amplified.
1075
1076
    The cnv_segs file should have the following columns (extra columns are ignored):
1077
    Tumor_Sample_Barcode    chromosome  start"  end CN
1078
1079
    Where CN is the absolute copy number
1080
1081
    The gene_regions_bed can have multiple entries for each gene (ex. exons). The maximum and minimum of the cumulative entries is used to define the
1082
    gene boundaries
1083
1084
    The arm_regions file should contain the following columns:
1085
    chromosome  start   end arm
1086
1087
    :param cnv_segs: A string containing a filepath to a tab-delimited file containing copy number events
1088
    :param gene_regions_bed: A string specifying a filepath to a BED4+ file containing gene positions
1089
    :param arm_regions: A string specifying a filepath to a tab-delimited file specifying the positions of chromosomal arms
1090
    :param gene_ids: A dictionary mapping {Hugo_Symbol: Entrez_ID}
1091
    :param out_cnv_gene: A string specifying the cnv_flat output file
1092
    :param out_cnv_arm: A string specifying the cnv_arm output file
1093
    :param sample_ids: A list containing samples which have one or more somatic mutations
1094
    :param alt_gene_ids: An additional dictionary mapping {Hugo_Symbol: Entrez_IDs}. Used if previous Hugo_Symbols were assigned to a gene
1095
    :param subset_ids: An iterable listing a set of Entrez IDs. Only CNVs overlapping these genes will be output
1096
    :param input_log2: Does the input file contain log2 ratios instead of absolute CN?
1097
    :param focal_cn_thresh: The maximum size of an event for it to be considered "focal", in bp
1098
    :return: A list of samples which have CNV information
1099
    """
1100
    logging.info("Processing copy-number segments")
1101
    if input_log2:
1102
        logging.debug("Converting from log2 ratios")
1103
    # First things first, lets load in the gene regions file and figure out where each gene is
1104
    gene_coords = load_gene_coords_bed(gene_regions_bed, gene_ids, alt_gene_ids=alt_gene_ids)
1105
1106
    # Load in the chromosome arm regions
1107
    arm_coods = load_chrom_arm(arm_regions)
1108
1109
    # Process copy number segments
1110
    required_cols = ["Tumor_Sample_Barcode", "chromosome", "start", "end", "CN"]
1111
    header_cols = {}
1112
1113
    sample_cnvs = {}
1114
1115
    i = 0
1116
    with open(cnv_segs) as f:
1117
        for line in f:
1118
            i += 1
1119
            line = line.rstrip("\n").rstrip("\r")  # Remove line endings
1120
            cols = line.split("\t")
1121
1122
            # Skip empty lines
1123
            if not line:
1124
                continue
1125
1126
            # If we haven't parsed the header yet, assume this is the first line of the file (aka the header)
1127
            if not header_cols:
1128
                j = 0
1129
                for col in cols:
1130
                    if col in required_cols:
1131
                        header_cols[col] = j
1132
                    j += 1
1133
1134
                # Check to make sure all required columns are found
1135
                for col in required_cols:
1136
                    if col not in header_cols:
1137
                        raise AttributeError("Unable to locate column \'%s\' in the CNV segments file \'%s\'" % (col, cnv_segs))
1138
                # If we get this far, the header is valid
1139
                continue
1140
1141
            # Process this CNV entry
1142
            cnv_attributes = {x: cols[y] for x, y in header_cols.items()}
1143
1144
            # Have we processed CNVs from this sample before?
1145
            if cnv_attributes["Tumor_Sample_Barcode"] not in sample_cnvs:
1146
                sample_cnvs[cnv_attributes["Tumor_Sample_Barcode"]] = SampleCNVs()
1147
            # Sterilize input and ensure it is valid, and store these events
1148
            try:
1149
                start = int(cnv_attributes["start"])
1150
                end = int(cnv_attributes["end"])
1151
                # Remove segments which are of length 0
1152
                if start == end:
1153
                    continue
1154
                # Is the CN state absolute CN or log2 ratios? If log2 ratios, convert to absolute CN
1155
                cn_state = float(cnv_attributes["CN"])
1156
                if input_log2:
1157
                    cn_state = math.pow(2, cn_state + 1)
1158
                    if cn_state < 0:
1159
                        cn_state = 0
1160
                elif cn_state < 0:  # Sanity check that the user didn't accidentally provide log2 ratios
1161
                    raise AttributeError("Unable to process line %s of \'%s\': \'%s\': Negative copy number segment. Did you mean to specify the --log2 flag?" % (i, cnv_segs, line))
1162
1163
                cnv_attributes["CN"] = round(cn_state)
1164
1165
                sample_cnvs[cnv_attributes["Tumor_Sample_Barcode"]].add(
1166
                    cnv_attributes["chromosome"],
1167
                    start,
1168
                    end,
1169
                    cnv_attributes["CN"])
1170
            except ValueError as e:
1171
                raise TypeError("Unable to process line %s of \'%s\': start, end, and CN must be numeric" % (i, cnv_segs)) from e
1172
            except TypeError as e:
1173
                raise AttributeError("Unable to process line %s of \'%s\': \'%s\': End coordinate of segment occurs before start" % (i, cnv_segs, line)) from e
1174
1175
    # Now adjust for ploidy
1176
    logging.info("Adjusting sample ploidy")
1177
    for sample, cnvs in sample_cnvs.items():
1178
        cnvs.adjust_ploidy(arm_coods, sample)
1179
1180
    # Now that we have processed all CNVs, lets see which genes have events, and write those out
1181
    logging.info("Calculating gene overlap with CNVs")
1182
    with open(out_cnv_gene, "w") as o:
1183
1184
        # Write output file header
1185
        out_header = ["Sample", "ENTREZ.ID", "Type"]
1186
        o.write("\t".join(out_header))
1187
        o.write(os.linesep)
1188
1189
        # Process segments
1190
        for sample, cnvs in sample_cnvs.items():
1191
            for chrom in cnvs.cn_states.keys():
1192
                # parse each segment
1193
                for start, end, cn_state in zip(cnvs.starts[chrom], cnvs.ends[chrom], cnvs.cn_states[chrom]):
1194
1195
                    # Ignore copy-neutral events
1196
                    if cn_state == 2:
1197
                        continue
1198
                    # Is this event focal? If so, lets find the genes it overlaps
1199
                    if end - start < focal_cn_thresh:
1200
                        # What type of event is this?
1201
                        if cn_state > 3:
1202
                            event_type = "AMP"
1203
                        elif cn_state > 2:
1204
                            event_type = "GAIN"
1205
                        elif cn_state < 1:
1206
                            event_type = "HOMDEL"
1207
                        elif cn_state < 2:
1208
                            event_type = "HETLOSS"
1209
                        else:
1210
                            raise TypeError("Invalid copy number state \'%s\'" % cn_state)
1211
1212
                        olap_genes = get_overlap_genes(chrom, start, end, cn_state, gene_coords)
1213
                        # If any genes overlap, write out these genes
1214
                        for gene in olap_genes:
1215
1216
                            # If a list of genes to subset to was provided, subset those genes
1217
                            if subset_ids is not None and gene not in subset_ids:
1218
                                continue  # Skip this gene, as it was not in the list provided
1219
                            out_line = [sample,
1220
                                        gene,
1221
                                        event_type
1222
                                        ]
1223
                            o.write("\t".join(out_line))
1224
                            o.write(os.linesep)
1225
1226
    # Now that we have processed all the CNVs, identify which samples have arm-level and whole chromosomal copy number changes
1227
    logging.info("Calculating arm-level CNVs")
1228
    with open(out_cnv_arm, "w") as o:
1229
        # Write output header
1230
        out_header = ["Sample", "Arm", "Type"]
1231
        o.write("\t".join(out_header))
1232
        o.write(os.linesep)
1233
1234
        for sample, cnvs in sample_cnvs.items():
1235
            if sample not in sample_ids:
1236
                logging.warning("Sample \'%s\' was provided in the input CNVs file, but no SNVs were found. Skipping..." % sample)
1237
                continue
1238
            for chrom, chrom_info in arm_coods.items():
1239
1240
                # For now, skip chromosome X and Y as those aren't supported?
1241
                if chrom == "X" or chrom == "chrX" or chrom == "Y" or chrom == "chrY":
1242
                    continue
1243
1244
                events = cnvs.overlap_chrom(chrom_info)
1245
                # If there are large-scale CNVs in this sample, output them to the Arm flat file
1246
                for arm, c_type in events.items():
1247
                    out_line = [
1248
                        sample,
1249
                        arm,
1250
                        c_type
1251
                    ]
1252
                    o.write("\t".join(out_line))
1253
                    o.write(os.linesep)
1254
1255
    return list(sample_cnvs.keys())
1256
1257
1258
def generate_sample_annot(samples: iter, cnv_samples: iter, out_sample_annot: str):
1259
1260
    logging.info("Generating sample annotation file")
1261
    with open(out_sample_annot , "w") as o:
1262
        # Write sample annotation file header
1263
        out_header_cols = ["Sample.ID", "Copy.Number", "BCL2.transloc","BCL6.transloc"]
1264
        o.write("\t".join(out_header_cols))
1265
        o.write(os.linesep)
1266
1267
        for sample in samples:
1268
            # Were CNVs provided for this sample?
1269
            if sample in cnv_samples:
1270
                has_cn = "1"
1271
            else:
1272
                has_cn = "0"
1273
            out_line = [sample,
1274
                        has_cn,
1275
                        "NA",
1276
                        "NA"]
1277
            o.write("\t".join(out_line))
1278
            o.write(os.linesep)
1279
1280
1281
def main(args=None):
1282
1283
    # Obtain input
1284
    if args is None:
1285
        args = get_args()
1286
1287
    # First, load in the mapping of Gene names and NCBI/Entrez IDs
1288
    gene_ids, alt_gene_ids = load_entrez_ids(args.entrez_ids)
1289
1290
    # If a --lymphgen_genes file was provided, load those genes
1291
    subset_ids = None
1292
    if args.lymphgen_genes:
1293
        subset_ids = load_subset_ids(args.lymphgen_genes)
1294
1295
    # Generate the mutation flat file and gene list using the input MAF file and Entrez IDs
1296
    out_mut_flat = args.outdir + os.sep + args.outprefix + "_mutation_flat.tsv"
1297
    out_gene_list = args.outdir + os.sep + args.outprefix + "_gene_list.txt"
1298
    sample_list = generate_mut_flat(args.maf, args.sequencing_type, gene_ids, out_mut_flat, out_gene_list, alt_gene_ids = alt_gene_ids, subset_ids=subset_ids)
1299
1300
    # Generate the copy number gene list file and arm flat file
1301
    if args.cnvs:
1302
        out_cnv_gene = args.outdir + os.sep + args.outprefix + "_cnv_flat.tsv"
1303
        out_cnv_arm = args.outdir + os.sep + args.outprefix + "_cnv_arm.tsv"
1304
        cnv_sample_list = generate_cnv_files(args.cnvs, args.genes, args.arms, gene_ids, out_cnv_gene, out_cnv_arm, sample_list, alt_gene_ids=alt_gene_ids, subset_ids=subset_ids, input_log2=args.log2)
1305
    else:
1306
        cnv_sample_list = set()
1307
1308
    # Generate a sample annotation file
1309
    out_sample_annot = args.outdir + os.sep + args.outprefix + "_sample_annotation.tsv"
1310
    generate_sample_annot(sample_list, set(cnv_sample_list), out_sample_annot)
1311
1312
1313
if __name__ == "__main__":
1314
    main()