--- a +++ b/generate_input.py @@ -0,0 +1,1314 @@ +#!/usr/bin/env python + +# Created by Christopher Rushton 2020/04/20 +# This converts a MAF file and copy number matrix (optional) into the input files used be the LymphGen lymphoma +# classifier. See https://llmpp.nih.gov/lymphgen/index.php for more information + +import argparse +import math +import os +import bisect +import logging +from collections import Counter +from sortedcontainers import SortedSet + + +class Gene: + """ + A simple class storing the chromosome, start, end, and name of a gene + """ + __slots__ = ["name", "chrom", "start", "end"] + + def __init__(self, chrom, start, end, name): + self.name = name + self.chrom = chrom + self.start = start + self.end = end + + +class Chromosome: + """ + A simple class storying the genomic range of each chromosome, as well as the coordiates of the p and q arm + """ + + def __init__(self, chrom): + self.chrom = chrom + self.p_start = None + self.p_end = None + self.q_start = None + self.q_end = None + self.q_length = 0.0 + self.p_length = 0.0 + + def add(self, start, end, arm): + + if arm == "p": + # Check to see if we already have genomic coordinates for this arm + if self.p_start is not None or self.p_end is not None: + raise AttributeError("Coordinates already assigned for %s arm of chromosome \'%s\'" % (arm, self.chrom)) + self.p_start = start + self.p_end = end + self.p_length = end - start + + # Sanity check this does not overlap the q arm + if self.q_start is not None and self.p_end > self.q_start: + raise AttributeError("For chromosome \'%s\'. The q arm starts before the end of the p arm" % self.chrom) + + elif arm == "q": + # Check to see if we already have genomic coordinates for this arm + if self.q_start is not None or self.q_end is not None: + raise AttributeError("Coordinates already assigned for %s arm of chromosome \'%s\'" % (arm, self.chrom)) + self.q_start = start + self.q_end = end + self.q_length = end - start + + # Sanity check this does not overlap the p arm + if self.p_end is not None and self.q_start < self.p_end: + raise AttributeError("For chromosome \'%s\'. The q arm starts before the end of the p arm" % self.chrom) + else: + # Not the p or q arm. Invalid + raise AttributeError("Unknown arm specified for chromosome \'%s\':\'%s\'" % (self.chrom, arm)) + + +class SampleCNVs: + """ + Stores all the copy number events within a given class + I was going to write this in a function, but having a class is a lot cleaner + """ + + def __init__(self): + self.starts = {} + self.ends = {} + self.cn_states = {} + self.len_seg = {} + self.is_chr_prefixed = None + self.ploidy = None + + def merge_overlapping_cnvs(self, existStarts: iter, existEnds: iter, existCNs: iter, newStart: int, newEnd: int, newCN:int): + """ + Handle overlapping copy number segments, merging and editing existing segments as necessary + + In essence, if this segment overlaps an existing segment with the same copy number state, those segments are merged + together. If the copy number state is different, the largest increase is kept, + truncating or even breaking apart the copy-neutral segment in the process. In the worse case senario, + this will lead to three different segments being created + + :param existStarts: The start position of the existing segment + :param existEnds: The end position of the existing segment + :param existCNs: The copy number state of the existing segment + :param newStart: The start position of the new segment + :param newEnd: The end position of the new segment + :param newCN: The copy number state of the new segment + :return: A tuple containing the new and old events. Ex {[newEvent1, newEvent2], [oldEvent1, oldEvent2]} + """ + + def reduce_new_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd): + + # The existing event has higher priority than the old event + # Truncate the newer event + # Since this could break the newer event apart into two pieces, lets do that now, and see + # if the output segments are valid + outStart1 = newStart + outEnd1 = oldStart + outStart2 = oldEnd + outEnd2 = newEnd + if outStart1 < outEnd1: # Valid segment, process + existStarts, existEnds, existCNs = self.merge_overlapping_cnvs(existStarts, existEnds, existCNs, + outStart1, outEnd1, newCN) + if outStart2 < outEnd2: # Valid segment + existStarts, existEnds, existCNs = self.merge_overlapping_cnvs(existStarts, existEnds, existCNs, + outStart2, outEnd2, newCN) + return existStarts, existEnds, existCNs + + def reduce_old_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd, oldCN): + + # The newer event is a "higher"-level deletion + # Split/Truncate the older event + outStart1 = oldStart + outEnd1 = newStart + outStart2 = newEnd + outEnd2 = oldEnd + outCN = oldCN + # Delete existing event + existStarts.pop(start_bisect) + existEnds.pop(start_bisect) + existCNs.pop(start_bisect) + + if outStart2 < outEnd2: # Valid segment, do the last one first to maintain sorted order + existStarts.insert(start_bisect, outStart2) + existEnds.insert(start_bisect, outEnd2) + existCNs.insert(start_bisect, outCN) + if outStart1 < outEnd1: # Also valid + existStarts.insert(start_bisect, outStart1) + existEnds.insert(start_bisect, outEnd1) + existCNs.insert(start_bisect, outCN) + + # Check for any more overlaps + return self.merge_overlapping_cnvs(existStarts, existEnds, existCNs, newStart, newEnd, newCN) + + # First, does this new segment actually overlap anything? + start_bisect = bisect.bisect_right(existEnds, newStart) + end_bisect = bisect.bisect_left(existStarts, newEnd) + + if start_bisect == end_bisect: + # There are no overlaps. Simply add this new event in, and return it + existStarts.insert(start_bisect, newStart) + existEnds.insert(start_bisect, newEnd) + existCNs.insert(start_bisect, newCN) + return existStarts, existEnds, existCNs + + # Grab the first overlap + oldStart = existStarts[start_bisect] + oldEnd = existEnds[start_bisect] + oldCN = existCNs[start_bisect] + + # Simplest case first. If both these events have the same CN state, lets merge them together + if oldCN == newCN: + + outStart = oldStart if oldStart < newStart else newStart + outEnd = oldEnd if oldEnd > newEnd else newEnd + # Delete the existing event + existStarts.pop(start_bisect) + existEnds.pop(start_bisect) + existCNs.pop(start_bisect) + + # Check for any more overlaps + return self.merge_overlapping_cnvs(existStarts, existEnds, existCNs, outStart, outEnd, newCN) + else: + # These segments overlap and have different CN states. + # Lets keep the highest-level event + if newCN <= 2 and oldCN <= 2: # Deletion + if oldCN < newCN: + # The older event is a "higher"-level deletion + return reduce_new_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd) + else: + return reduce_old_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd, oldCN) + if newCN >= 2 and oldCN >= 2: # Gain/Amp + if oldCN > newCN: + # The older event is a higher level gain. Split/truncate the new event + return reduce_new_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, + oldEnd) + else: + return reduce_old_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd, + oldCN) + else: + # One event must be a gain/amp, while the other is a deletion. In this case, keep both, and subset the + # larger event by the smaller one + if oldEnd - oldStart < newEnd - newStart: + # The older event is smaller. Split the newer event + return reduce_new_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, + oldEnd) + else: + # The newer event is smaller. We should split the older event + return reduce_old_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd, + oldCN) + + + def add(self, chrom: str, start: int, end: int, cn: int): + + # For LymphGen compatability: Strip "chr" prefix + chrom = chrom.replace("chr", "") + + if end <= start: + if start == end: + # This segment has a length of 0?!. Skip this I guesss + return None + else: + # This segment has a negative length?? + raise TypeError("Invalid CNV segment: \'%s:%s-%s\'" % (chrom, start, end)) + + # Have we seen any events for this chromosome before? + if chrom not in self.len_seg: + # If not, just store this segment. Simple! + self.starts[chrom] = [start] + self.ends[chrom] = [end] + self.cn_states[chrom] = [cn] + self.len_seg[chrom] = 1 + + # Are we using chr-prefixed contig names? + if self.is_chr_prefixed is None: + self.is_chr_prefixed = True if chrom.startswith("chr") else False + else: + # If we have seen events on this chromosome before, we need to compare this new segment with existing segments + # In theory, events should be non-overlapping, but I don't want to assume that because then things will break + # horribly later + + self.starts[chrom], self.ends[chrom], self.cn_states[chrom] = \ + self.merge_overlapping_cnvs(self.starts[chrom], self.ends[chrom], self.cn_states[chrom], start, end, cn) + self.len_seg[chrom] = len(self.cn_states) + + + def overlap_chrom(self, chromosome: Chromosome, threshold: float = 0.8): + + """ + Calculates the overlap between the segments stored here and a given chromosome + :param chromosome: A Chromosome() object defining the start and end of the p and q arms, respectively + :return: A dictionary containing + """ + + # Do the CNVs within this chromosome encompass a given chromosome more than the specified threshold? + chrom_name = chromosome.chrom + + # Handle "chr" prefix nonsense + if self.is_chr_prefixed and not chrom_name.startswith("chr"): + chrom_name = "chr" + chrom_name + elif not self.is_chr_prefixed and chrom_name.startswith("chr"): + chrom_name = chrom_name.replace("chr", "") + + try: + chrom_starts = self.starts[chrom_name] + except KeyError: + # No CN events were provided for this chromosome, hence there can be no overlap + return {} + chrom_ends = self.ends[chrom_name] + chrom_cn = self.cn_states[chrom_name] + + homdel_sum = {"p": 0, "q": 0, "Chrom": 0} + del_sum = {"p": 0, "q": 0, "Chrom": 0} + gain_sum = {"p": 0, "q": 0, "Chrom": 0} + amp_sum = {"p": 0, "q": 0, "Chrom": 0} + + # Check for overlapping segments for each arm + for (start, end, cn) in zip(chrom_starts, chrom_ends, chrom_cn): + + # Ignore copy-neutral segments + if cn == 2: + continue + + if end < chromosome.p_start or start > chromosome.q_end: + # Segment falls out of range + continue + if start < chromosome.p_end: + olap_start = start if start > chromosome.p_start else chromosome.p_start + olap_end = end if end < chromosome.p_end else chromosome.p_end + if cn < 2: + del_sum["p"] += olap_end - olap_start + del_sum["Chrom"] += olap_end - olap_start + if cn < 1: + homdel_sum["p"] += olap_end - olap_start + homdel_sum["Chrom"] += olap_end - olap_start + elif cn > 2: + gain_sum["p"] += olap_end - olap_start + gain_sum["Chrom"] += olap_end - olap_start + if cn > 3: + amp_sum["p"] += olap_end - olap_start + amp_sum["Chrom"] += olap_end - olap_start + + if end > chromosome.q_start: # We use an if, not elif, in case a segment overlaps both the p and q arm + olap_start = start if start > chromosome.q_start else chromosome.q_start + olap_end = end if end < chromosome.q_end else chromosome.q_end + if cn < 2: + del_sum["q"] += olap_end - olap_start + del_sum["Chrom"] += olap_end - olap_start + if cn < 1: + homdel_sum["q"] += olap_end - olap_start + homdel_sum["Chrom"] += olap_end - olap_start + elif cn > 2: + gain_sum["q"] += olap_end - olap_start + gain_sum["Chrom"] += olap_end - olap_start + if cn > 3: + amp_sum["q"] += olap_end - olap_start + amp_sum["Chrom"] += olap_end - olap_start + + events = {} + # Now, calculate the fraction of each overlap + # Start from the highest level and biggest event, and work our way down/smaller + # Amplifications/gains + if amp_sum["Chrom"] / (chromosome.p_length + chromosome.q_length) > threshold: # Whole chromosome Amp + events[chrom_name + "Chrom"] = "AMP" + # Now check for arm level events + else: + if amp_sum["p"] / chromosome.p_length > threshold: # p Amp + events[chrom_name + "p"] = "AMP" + elif amp_sum["q"] / chromosome.q_length > threshold: # q Amp + events[chrom_name + "q"] = "AMP" + if chrom_name + "Chrom" not in events and gain_sum["Chrom"] / (chromosome.p_length + chromosome.q_length) > threshold: # Whole chromosome gain + events[chrom_name + "Chrom"] = "GAIN" + else: + if chrom_name + "p" not in events and gain_sum["p"] / chromosome.p_length > threshold: # p Gain + events[chrom_name + "p"] = "GAIN" + if chrom_name + "q" not in events and gain_sum["q"] / chromosome.q_length > threshold: # q Gain + events[chrom_name + "q"] = "GAIN" + + # Homozygous and heterozygous deletions + if homdel_sum["Chrom"] / (chromosome.p_length + chromosome.q_length) > threshold: # Whole chromosome Homozygous deletion + events[chrom_name + "Chrom"] = "HOMDEL" + # Now check for arm level events + else: + if homdel_sum["p"] / chromosome.p_length > threshold: # p HOMDEL + events[chrom_name + "p"] = "HOMDEL" + elif homdel_sum["q"] / chromosome.q_length > threshold: # q HOMDEL + events[chrom_name + "q"] = "HOMDEL" + if chrom_name + "Chrom" not in events and del_sum["Chrom"] / (chromosome.p_length + chromosome.q_length) > threshold: # Whole chromosome deletions + events[chrom_name + "Chrom"] = "HETLOSS" + else: + if chrom_name + "p" not in events and del_sum["p"] / chromosome.p_length > threshold: # p deletion + events[chrom_name + "p"] = "HETLOSS" + if chrom_name + "q" not in events and del_sum["q"] / chromosome.q_length > threshold: # q deletions + events[chrom_name + "q"] = "HETLOSS" + + return events + + + def adjust_ploidy(self, arm_coords, sample_name, redo=False): + """ + Adjust a sample's CN states based upon ploidy + + Calculate the average copy number state across the entire genome. If a sample is not diploid, normalize all the CN + changes + :param arm_coords: A dictionary containing {"chromosome_name": Chromosome()} objects which list chromosomal coordinates + :param sample_name: A string specifying the sample name. Used for debugging/error purposes + :param redo: Should we override any existing ploidy state for this sample? + :return: None + """ + + if self.ploidy is not None and not redo: + # Ploidy for this sample has already been calculated. Don't do anything + return None + + # Store the length affected for each ploidy state + ploidy_cov = {} + genome_size = 0 + + for chromosome in arm_coords.values(): + # Find overlapping CNVs for this arm + + # if the p or q arm coordinates are not set, use a placeholder + if chromosome.p_start is None: + chromosome.p_start = -100000 + chromosome.p_end = -100000 + chromosome.p_length = 1 + if chromosome.q_start is None: + chromosome.q_start = -100000 + chromosome.q_end = -100000 + chromosome.q_length = 1 + + # Save the size of this chromosome + genome_size += chromosome.p_length + genome_size += chromosome.q_length + + chrom_name = chromosome.chrom + # Handle "chr" prefix nonsense + if self.is_chr_prefixed and not chrom_name.startswith("chr"): + chrom_name = "chr" + chrom_name + elif not self.is_chr_prefixed and chrom_name.startswith("chr"): + chrom_name = chrom_name.replace("chr", "") + try: + chrom_starts = self.starts[chrom_name] + except KeyError: + # No CN events were provided for this chromosome, hence there can be no overlap + continue + chrom_ends = self.ends[chrom_name] + chrom_cn = self.cn_states[chrom_name] + + # Check for overlapping segments for each arm + for (start, end, cn) in zip(chrom_starts, chrom_ends, chrom_cn): + + # Check p-arm overlap + if end < chromosome.p_start or start > chromosome.q_end: + # Segment falls out of range + continue + if start < chromosome.p_end: + olap_start = start if start > chromosome.p_start else chromosome.p_start + olap_end = end if end < chromosome.p_end else chromosome.p_end + + # Store the length of this overlap and associated CN + if cn not in ploidy_cov: + ploidy_cov[cn] = 0 + ploidy_cov[cn] += olap_end - olap_start + + if end > chromosome.q_start: # We use an if, not elif, in case a segment overlaps both the p and q arm + olap_start = start if start > chromosome.q_start else chromosome.q_start + + olap_end = end if end < chromosome.q_end else chromosome.q_end + # Store the length of this overlap and associated CN + if cn not in ploidy_cov: + ploidy_cov[cn] = 0 + ploidy_cov[cn] += olap_end - olap_start + + # Now that we have calculated the number of bases affected by each CN state, calculate the ploidy + x = 0 + for cn, bases in ploidy_cov.items(): + x += cn * bases + + ploidy = x / genome_size + av_ploidy = round(ploidy) + + # Sanity check + if av_ploidy < 1: + raise TypeError("Ploidy of sample \'%s\' was calculated to be below 1!" % sample_name) + + # If this tumour is not diploid, adjust the ploidy + if av_ploidy != 2: + logging.debug("\'%s\' was calculated to have a ploidy of %s" % (sample_name, av_ploidy)) + ploidy_dif = av_ploidy - 2 + new_cns = {} + for chrom, cn in self.cn_states.items(): + x = list(y - ploidy_dif for y in cn) + new_cns[chrom] = x + self.cn_states = new_cns + + self.ploidy = av_ploidy + + +def get_args(): + + def is_valid_dir(path, parser): + """ + Checks to ensure the directory path exists + :param path: A string containing a filepath to a directory + :param parser: An argparse.ArgumentParser() object + :return: path, if the string is a valid directory + :raises: parser.error() if the directory does not exist + """ + if os.path.exists(path) and os.path.isdir(path): + return path + else: + raise parser.error("Unable to set \'%s\' as the output directory: Not a valid directory" % path) + + def is_valid_file(path, parser): + """ + Checks to ensure the specified file exists + :param path: A string containing a filepath + :param parser: An argparse.ArgumentParser() object + :return: path, if the string is a valid directory + :raises: parser.error() if the file does not exist + """ + if os.path.exists(path) and os.path.isfile(path): + return path + else: + raise parser.error("Unable to locate \'%s\': No such file or directory" % path) + + epilog = os.linesep.join(["The --arms, --entrez_ids, and --lymphgen_genes files can be found in the \'resources\' folder. ", + "Note that genome and exome sequencing types are handled exactly the same. ", + "The --entrez-ids file must have the Hugo_Symbol under the column \"Approved Symbol\" and the Entrez ID under the column \"NCBI Gene ID(supplied by NCBI)\". ", + "The --cnvs file should contain the following colummns: Tumor_Sample_Barcode, chromosome, start, end, CN. ", + "The --arms file should contain the following columns: chromosome, start, end, arm. " + ]) + parser = argparse.ArgumentParser(description="Generates input files for the LymphGen classifier\nVersion 1.0.0", epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter) + snvs_args = parser.add_argument_group("Input mutation files") + snvs_args.add_argument("-m", "--maf", metavar="MAF", required=True, type=lambda x: is_valid_file(x, parser), help="Input MAF file listing somatic mutations") + snvs_args.add_argument("-e", "--entrez_ids", metavar="TSV", required=True, type=lambda x: is_valid_file(x, parser), help="A tab-delimited file containing gene names (Hugo Symbol) and the corresponding Entrez gene ID") + cnv_args = parser.add_argument_group("(Optional) Input CNV files") + cnv_args.add_argument("-c", "--cnvs", metavar="TSV", default=None, type=lambda x: is_valid_file(x, parser), help="Input tab-delimited file summarizing copy number events") + cnv_args.add_argument("--log2", action="store_true", help="Does the input CNV file use log2 ratios? (i.e. log2(absoluteCN) - 1)") + cnv_args.add_argument("-g", "--genes", metavar="BED", default=None, type=lambda x: is_valid_file(x, parser), help="Input BED4+ file listing start and end positions of genes/exons") + cnv_args.add_argument("-a", "--arms", metavar="TSV", default=None, type=lambda x: is_valid_file(x, parser), help="Input tab-delimited file listing the positions of chromosome arms") + + parser.add_argument("-l", "--lymphgen_genes", metavar="TXT", default=None, type=lambda x: is_valid_file(x, parser), required=False, help="An optional file listing all Entrez IDs considered by the LymphGen classifier. Output will be subset to these genes") + parser.add_argument("-s", "--sequencing_type", metavar="SEQ", choices=["targeted", "exome", "genome"], required=True, help="Sequencing type used to obtain somatic mutations") + parser.add_argument("-o", "--outdir", metavar="PATH", required=True, type=lambda x: is_valid_dir(x, parser), help="Output directory for LymphGen input files") + parser.add_argument("--outprefix", metavar="STRING", default=None, help="Output files will be prefixed using this string [Default: Use the base name of the MAF file]") + parser.add_argument("-v", "--verbose", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], default="INFO", help="Set logging verbosity") + + args = parser.parse_args() + # If no outprefix was set, use the basename of the maf file as the output prefix + if args.outprefix is None: + args.outprefix = os.path.basename(args.maf).split(".")[0] + + # If --cnvs is specified, the annotation files are also required + if args.cnvs: + if not args.genes or not args.arms: + raise parser.error("--genes and --arms are required if --cnvs is specified") + + logging.basicConfig(level=getattr(logging, args.verbose), format='%(levelname)s:%(message)s') + + return args + + +def load_entrez_ids(entrez_file: str, hugo_column: str="Approved symbol", entrez_column: str="NCBI Gene ID(supplied by NCBI)", + status_column: str="Status", prev_name_column: str="Previous symbols"): + """ + Creates a dictionary which contains the gene name (Hugo Symbol) and corresponding entrez ID (NCBI ID) from the + specified text file + + Note the input file must have a header. An example file can be obtained from https://www.genenames.org/download/custom/ + Ensure the "Approved symbol" and "NCBI Gene ID" columns are selected. The default column names reflect those obtained + from this source. + The "Approval Status" column is optional, but will be used to remove gene names which have since been withdrawn. + If the "Previous symbols" solumn is included, these gene names will also be assigned to that Entrez ID + The subset_file should list one Entrez ID per line + All extra columns are ignored + + :param entrez_file: A string containing a filepath to a tsv file containing the gene names and gene IDs + :param hugo_column: A string specifying the name of the column which contains + :param entrez_column: A string specifying the name of the column which contains the Entrez gene ID + :param status_column: A string specifying the name of the column which contains the Approval Status. Optional. + :param prev_name_column: A string specifting the name of the column which specifies previously used names for each gene. Optional + :return: Two dictionaries containing {"Gene Name": "Entrez_ID"} + """ + hugo_name_to_id = {} + alt_hugo_name_to_id = {} # If Previous symbols are included + skipped_genes = [] + + hugo_col_num = None + entrez_col_num = None + status_col_num = None # Optional + prev_name_col_num = None # Optional + i = 0 + + with open(entrez_file) as f: + for line in f: + i += 1 + line = line.rstrip("\n").rstrip("\r") # Remove line endings + cols = line.split("\t") + if len(cols) < 2: # Sanity check this file has multiple columns + raise TypeError("Input file %s does not appear to be a tab-delimited file" % entrez_file) + + # If we have not assigned column IDs yet, then we have not processed the file header + # Load the file header + if hugo_col_num is None or entrez_col_num is None: + j = 0 + for col in cols: + if col == hugo_column: + hugo_col_num = j + if col == entrez_column: + entrez_col_num = j + if status_column is not None and col == status_column: + status_col_num = j + if prev_name_column is not None and col == prev_name_column: + prev_name_col_num = j + + j += 1 + # Sanity check that we have found all the columns we need + if hugo_col_num is None: + raise AttributeError("Unable to locate a column corresponding to \'%s\' in the input file \'%s\'" % (hugo_column, entrez_file)) + elif entrez_col_num is None: + raise AttributeError("Unable to locate a column corresponding to \'%s\' in the input file \'%s\'" % (entrez_column, entrez_file)) + # If we have made it this far, then the header was valid + continue + + # Check the Approval Status column to see if the gene name has been withdrawn + if status_col_num is not None: + if cols[status_col_num] != "Approved": + continue + + # Parse the gene name and ID from the input file + try: + hugo_name = cols[hugo_col_num] + entrez_id = cols[entrez_col_num] + # Sanity check: The entrez ID is ALWAYS a number + if not entrez_id.isdigit(): + # If there is no entrez ID, skip this gene + if entrez_id == "": + skipped_genes.append(hugo_name) + continue + else: + raise TypeError("When parsing line %s of \'%s\', the Entrez gene ID \'%s\' does not appear to be a valid Entrez Gene ID" % (i, entrez_file, entrez_id)) + + except IndexError as e: + # If we end up in here, then there were too few columns in the current line to parse the IDs + raise AttributeError("Unable to parse line %s of \'%s\': Line is truncated?" % (i, entrez_file)) from e + + # Sanity check: Make sure we haven't seen this gene name before + if hugo_name in hugo_name_to_id: + raise AttributeError("Gene \'%s\' appears duplicated in \'%s\'" % (hugo_name, entrez_file)) + hugo_name_to_id[hugo_name] = entrez_id + + # If the Previous symbols column was found, annotate these gene names with the same ID + previous_names = cols[prev_name_col_num].split(",") + if previous_names[0] == "": + continue # There are no previous names for this gene + previous_names = [x.replace(" ", "") for x in previous_names] # Remove extra spaces + for hugo_name in previous_names: + # If we have seen this gene name before, then don't overwrite it. Just assume that the first instance + # encountered is correct. This likely isn't the case, but these cases should be so rare that they are + # not worth worrying about + if hugo_name in alt_hugo_name_to_id: + continue + alt_hugo_name_to_id[hugo_name] = entrez_id + + # Final sanity check: As Hugo Symbols and Entrez IDs have a 1-1 mapping, make sure there are no duplicate Entrez IDs + if len(hugo_name_to_id) != len(set(hugo_name_to_id.values())): + num_ids = Counter(hugo_name_to_id.values()) + # For simplicity, just print out one Entrez ID which is duplicated + bad_id = num_ids.most_common(1)[0][0] + bad_count = num_ids.most_common(1)[0][1] + raise AttributeError("Entrez gene ID \'\%s\' appears %s times in the input file" % (bad_id, bad_count)) + + return hugo_name_to_id, alt_hugo_name_to_id + + +def load_subset_ids(id_file): + """ + Loads a set of Entrez IDs from a specified file. One ID/line + + :param id_file: + """ + + subset_ids = [] + with open(id_file) as f: + for line in f: + line = line.rstrip("\n").rstrip("\r") + if not line.isdigit(): + raise TypeError("Invalid Entrez ID when processing the --lyphmgen_genes file: %s" % line) + subset_ids.append(line) + + subset_ids = set(subset_ids) + return subset_ids + + +def generate_mut_flat(in_maf: str, seq_type: str, gene_ids: dict, out_mut_flat: str, out_gene_list: str, alt_gene_ids: dict = None, subset_ids: iter = None): + """ + Converts a MAF file into the mutation flat file and gene list used by LymphGen + WARNING: GRCh37/hg19 is currently only supported! + + An example of the output mutation FLAT file: + Sample ENTREZ.ID Type Location + Sample1 604 MUTATION 187451395 + Sample1 604 MUTATION 187451408 + Sample1 613 MUTATION 23523596 + Sample1 970 TRUNC 6590925 + Sample1 1840 Synon 113495932 + + An example of the output gene list: + "ENTREZ.ID" + 60 + 355 + 567 + 596 + 604 + 605 + 613 + 639 + 673 + + If targeted sequencing is specified, only the genes with at least one non-synonmymous mutation in the MAF file are used for + the gene list. Otherwise, ALL genes are used. + + The following MAF columns are required: Hugo_Symbol, Variant_Classification, Tumor_Sample_Barcode. If NCBI_Build and + Start_Position are found, the output mutation flat file will contain an additional column specifying "Location". + If Tumor_Seq_Allele2 is provided, the MYD88 hotspot mutations will be annotated as "L265P" + + :param in_maf: A string containing a filepath to a MAF file containing the variants of interest + :param seq_type: A string specifying the sequencing type used to identify mutations. Only "targeted", "exome", and "genome" are supported + :param gene_ids: A dictionary containing the mapping between Hugo_Symbol: Entrez_Gene_ID. + :param out_mut_flat: A string specifying the output path to the mutation flat file + :param out_gene_list: A string specifying the output path to the gene list + :param alt_gene_ids: An additional dictionary specifying alternate Hugo_Symbol: Entrez_Gene_ID mappings + :param subset_ids: An interable specifying a list of Entrez IDs. Only mutations within these Entrez IDs will be output + :return: A list specifying all samples analyzed for mutations + """ + logging.info("Processing MAF file") + + # Non-synonmymous mutation types: + non_syn = {"In_Frame_Del", "In_Frame_Ins", "Missense_Mutation"} + trunc_mut = {"Frame_Shift_Del", "Frame_Shift_Ins", "Nonsense_Mutation", "Nonstop_Mutation", "Splice_Site", "Translation_Start_Site"} + synon_mut = {"Silent", "5'UTR", "5'Flank", "Intron", "3'UTR"} # Use for the "Synon" mutation type + + # Which samples are we analyzing? + sample_list = SortedSet() # Used to ensure the output files are in somewhat sorted order + + required_cols = ["Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode"] + optional_cols = ["NCBI_Build", "Start_Position", "Tumor_Seq_Allele2"] + header_cols = {} + out_header_written = False + out_header_cols = ["Sample", "ENTREZ.ID", "Type"] + if alt_gene_ids is None: + alt_gene_ids = {} + + # For targeted sequencing, which genes have we sequenced and seen mutations in? + genes_seen = {} + skipped_mut = [] # Store mutations in genes with no Entrez ID + + i = 0 + with open(in_maf) as f, open(out_mut_flat, "w") as o: + for line in f: + i += 1 + # Ignore comment lines + if line[0] == "#": + continue + line = line.rstrip("\n").rstrip("\r") + cols = line.split("\t") + + # Skip empty lines + if not line: + continue + + # If we haven't parsed the header yet, assume we are parsing the first line of the file, and that is the header + if not header_cols: + j = 0 + for col in cols: + if col in required_cols: + header_cols[col] = j + elif col in optional_cols: + header_cols[col] = j + j += 1 + + # Check that we have all required columns + for col in required_cols: + if col not in header_cols: + raise AttributeError("Unable to locate a column specifying \'%s\' in the MAF file header" % col) + # See which optional columns we have + if "Start_Position" in header_cols: + out_header_cols.append("Location") + logging.info("\'Start_Position\' column found in MAF file. Output mutation flat file will be annotated with \'Position\'") + if "Tumor_Seq_Allele2" in header_cols: + logging.info( + "\'Tumor_Seq_Allele2\' column found in MAF file. MYD88 hotspot mutations will be annotated as \'L265P\'") + continue + + # Process mutations + mut_attributes = {x: cols[y] for x, y in header_cols.items()} + + # Check genome build, as only GRCh37 is supported + if "NCBI_Build" in mut_attributes and "Start_Position" in mut_attributes and mut_attributes["NCBI_Build"] != "GRCh37": + raise NotImplementedError("Only GRCh37 is currently supported as a reference genome") + + # Lets get the easiest stuff out of the way first. Specify the output sample name and gene ID + sample_name = mut_attributes["Tumor_Sample_Barcode"] + if sample_name == "" or sample_name == ".": + raise AttributeError("No tumor_sample_barcode was found when processing line %s of the MAF file" % i) + + hugo_name = mut_attributes["Hugo_Symbol"] + try: + # Skip intergenic mutations + if hugo_name == "Unknown": + continue + entrez_id = gene_ids[hugo_name] + except KeyError: + # If this gene does not have a Entrez ID, are we prehaps using an older Hugo_Symbol? + if hugo_name in alt_gene_ids: + entrez_id = alt_gene_ids[hugo_name] + else: + # Skip this mutation if there is no Entrez ID + skipped_mut.append(line) + continue + + # Obtain position (if Start_Position is specified) + if "Start_Position" in mut_attributes: + position = mut_attributes["Start_Position"] + else: + position = None + + # Finally, specify the variant type + if mut_attributes["Variant_Classification"] in trunc_mut: + genes_seen[entrez_id] = hugo_name # This gene has a non-synonmymous mutation, so lets assume we have sequenced it + type = "TRUNC" + elif mut_attributes["Variant_Classification"] in non_syn: + genes_seen[entrez_id] = hugo_name # This gene has a non-synonmymous mutation, so lets assume we have sequenced it + type = "MUTATION" + elif mut_attributes["Variant_Classification"] in synon_mut: + genes_seen[entrez_id] = hugo_name + type = "Synon" + else: + # Ignore intronic mutations, 3'UTRs etc + continue + + # If this mutation is in MYD88, does it affect the hotspot? + if hugo_name == "MYD88" and "Tumor_Seq_Allele2" in mut_attributes: + if mut_attributes["Start_Position"] == "38182641" and mut_attributes["Tumor_Seq_Allele2"] == "C": + type = "L265P" + + if sample_name not in sample_list: + sample_list.add(sample_name) + + # If this gene isn't in the subset list provide it, skip it + if subset_ids is not None and entrez_id not in subset_ids: + continue + + # Finally, write this variant + # Write a header line, if we have not done so already + if not out_header_written: + out_header_written = True + o.write("\t".join(out_header_cols)) + o.write(os.linesep) + + out_line = [sample_name, entrez_id, type, position + os.linesep] + o.write("\t".join(out_line)) + + # Check to see that we actually converted mutations + if len(genes_seen) == 0: + raise AttributeError("No mutations were successfully converted. Check that the --entrez_ids file has valid Hugo Symbols matching the --maf file") + elif skipped_mut: + logging.warning("%s mutations in the MAF file were not converted. These either don't have a valid Entrez ID, or they were not in the --lymphgen_gene file" % len(skipped_mut)) + + logging.info("Finished processing MAF file") + logging.info("Generating gene list") + # Generate the gene list file + # This file a single column with the Entrez ID, as that is all LymphGen currently supports + with open(out_gene_list, "w") as o: + # Write header + o.write("\"ENTREZ.ID\"" + os.linesep) + + # Write out all genes we have seen a mutation in + for entrez_id in genes_seen.keys(): + if subset_ids is not None and entrez_id not in subset_ids: + continue # Skip these gene, as it wasn't in the --lymphgen_gene list + o.write(entrez_id) + o.write(os.linesep) + + if seq_type == "exome" or seq_type == "genome": + # We have sequenced (effectively) all genes in the human genome. Write out all remaining genes + for entrez_id in gene_ids.values(): + if entrez_id in genes_seen: # We have already written out this gene. Skip it + continue + if subset_ids is not None and entrez_id not in subset_ids: + continue # Skip these gene, as it wasn't in the --lymphgen_gene list + o.write(entrez_id) + o.write(os.linesep) + + return list(sample_list) + + +def load_gene_coords_bed(bed_file, gene_ids, alt_gene_ids=None): + """ + Load in the genomic coordinates of genes in the human genome from the specified BED4+ file. + + The following columns are required (no header) + chrom start end gene + + A gene can be split across multiple BED entries (ex. exonic coordinates). In this case, the start and end of the first and last exon, + respectively, will be used as the gene start/end coordinates + + :param bed_file: A string containing a filepath to a BED4+ file listing the positions of genes + :return: A dictionary storing {gene_name: Gene()} + """ + if alt_gene_ids is None: + alt_gene_ids = {} + gene_coords = {} + skipped_genes = [] + + i = 0 + with open(bed_file) as f: + for line in f: + i += 1 + line = line.rstrip("\n").rstrip("\r") # Remove line endings + cols = line.split("\t") + + # Skip empty lines + if not line: + continue + + # Since this BED file could contain more than 4 columns (ex. a BED6 or BED12 file), manually unpack and inspect the first + # four columns, since those are the columns we care about + try: + chrom = cols[0] + start = cols[1] + end = cols[2] + gene = cols[3] + except IndexError: + # This BED entry was trucated + raise AttributeError("Unable to parse line %s of %s as it contains less than four columns (chrom, start, end, gene)" % (i, bed_file)) + + # Check that the start and end are actually genomic coordinates + if not start.isdigit(): + raise TypeError("When parsing line %s of %s, the start position \'%s\' is not a valid genomic coordinate" % (i, bed_file, start)) + if not end.isdigit(): + raise TypeError("When parsing line %s of %s, the end position \'%s\' is not a valid genomic coordinate" % (i, bed_file, end)) + + start = int(start) + end = int(end) + chrom = chrom.replace("chr", "") + + # Is the gene name the Hugo Symbol or the Entrez gene ID? + # If its an integer, lets assume its the Entrez ID + # If its not, assume it is the Hugo Symbol and convert it to the Entrez ID + if not gene.isdigit(): + try: + entrez_id = gene_ids[gene] + except KeyError: + # If we can't find this Hugo Symbol in the default mappings, check the alt gene IDs + if gene in alt_gene_ids: + entrez_id = alt_gene_ids[gene] + else: + # This gene doesn't have an Entrez ID. Skip it + skipped_genes.append(gene) + continue + else: + entrez_id = gene + + # Have we seen this chromosome before? + if chrom not in gene_coords: + gene_coords[chrom] = {} + # Have we seen this gene before? + if entrez_id in gene_coords[chrom]: + # If so, then we need to update the start/end of this gene based on this new BED entry + existing_gene_entry = gene_coords[chrom][entrez_id] + + # Sanity check + if chrom != existing_gene_entry.chrom: + raise AttributeError("We found two entries for gene \'%s\'. One is found on chromosome \'%s\', while the other is found on \'%s\'" + % (gene, chrom, existing_gene_entry.chrom)) + if start < existing_gene_entry.start: + existing_gene_entry.start = start + if end > existing_gene_entry.end: + existing_gene_entry.end = end + else: + # If we haven't seen this gene before, create a new entry for it + gene_coords[chrom][entrez_id] = Gene(chrom, start, end, entrez_id) + + # Now that we have processed all genes, make sure we actually found Entrez IDs for a handful of genes + # If we haven't, it means the input file likely didn't contain Hugo Symbols + if len(gene_coords) == 0: + raise AttributeError("No Hugo Symbols from the --genes file (column 4) were found in the --entrez_ids file. An example gene is %s" % (skipped_genes[0])) + + return gene_coords + + +def load_chrom_arm(arm_file): + """ + Loads in the genomic coordinates corresponding to the arm of each chromosome + + The input should be a tab-delimited file with the following columns + chromosome start end arm + + :param arm_file: A string containing a filepath to a tab-delimited file containing genomic coordinates for chromosome arms + :return: A dictionary storing the genomic range of each chromosome, and each individual arm + """ + + arm_chrom = {} + required_cols = ["chromosome", "start", "end", "arm"] + header_cols = {} + + i = 0 + with open(arm_file) as f: + for line in f: + i += 1 + line = line.rstrip("\n").rstrip("\r") # Remove line endings + cols = line.split("\t") + + # Skip empty lines + if not line: + continue + + # If we haven't parsed the header yet, assume this is the first line of the file (aka the header) + if not header_cols: + j = 0 + for col in cols: + if col in required_cols: + header_cols[col] = j + j += 1 + + # Check to make sure all required columns are found + for col in required_cols: + if col not in header_cols: + raise AttributeError("Unable to locate column %s in the chromosome arm positions file \'%s\'" % (col, arm_file)) + # If we get this far, the header is valid + continue + + try: + arm_attributes = {x: cols[y] for x, y in header_cols.items()} + except IndexError: + # We were unable to find a required column. This line is likely truncated + raise AttributeError("Unable to parse line %s of the chromosome arm positions file %s: The line appears truncated" % (i, arm_file)) + + # Have we seen this chromosome before? If not, lets create an object to store its genomic features + chrom = arm_attributes["chromosome"] + + # Strip chr prefix + chrom = chrom.replace("chr", "") + + if chrom not in arm_chrom: + arm_chrom[chrom] = Chromosome(chrom) + + # Save the arm coordinates in the chromosome + try: + arm_chrom[chrom].add(int(arm_attributes["start"]), int(arm_attributes["end"]), arm_attributes["arm"]) + except ValueError as e: + raise TypeError("Unable to process line %s of \'%s\': start and end must be integers") from e + + return arm_chrom + + +def get_overlap_genes(chrom: str, start: int, end: int, copy_num: int, gene_cords: dict): + + """ + Find the genes which overlap a given segment + + This is a very brute-force approach which will check all genes on the target chromosome for overlap. Pre-sorting and bisecting genomic + regions would be significantly faster, but 1) You need to handle overlapping genes, and 2) I am assuming the performance penalty won't + matter too much. + + Note that gains and losses are handled differently if a segment partially overlaps a gene. If the event is a loss, the gene is included. + If the event is a gain, the gene is NOT included, as that copy is likely not functional + + :param chrom: A string coresponding to the contig name + :param start: An int specifying the start of the segment + :param end: An int specifying the end of the segment + :param copy_num: An integer specifying the copy number of this segment + :param gene_cords: A dictionary storing the positions of genes, in the format {chromosome: {gene1: attr, gene2: attr...}} + :return: + """ + + # Handle chr-prefix shenanigans + is_chr_prefixed = next(iter(gene_cords.keys())).startswith("chr") + if is_chr_prefixed and not chrom.startswith("chr"): + chrom = "chr" + chrom + elif not is_chr_prefixed and chrom.startswith("chr"): + chrom = chrom.replace("chr", "") + + try: + genes_on_chrom = gene_cords[chrom] + except KeyError: + # No genes are on this contig. This could be a user error, or this a contig with no annotated genes + return [] + + # Go through each gene on this chromosome and see if the coordinates overlap with our regions + olap_genes = [] + for entrez_id, gene_info in genes_on_chrom.items(): + if start < gene_info.end and end > gene_info.start: + # Overlap found + # Now for the tricky part + # If the segment partially overlaps this gene, then we need to handle gains and losses differently + # Deleting or gaining half of a gene will likely cause it to no longer function + # Thus, partial deletions of genes could be drivers, while partial gains of genes are likely never drivers + if copy_num < 2: + olap_genes.append(entrez_id) + elif copy_num > 2: + if start > gene_info.start or end < gene_info.end: + continue # Partially overlapping + else: + olap_genes.append(entrez_id) + else: + raise NotImplementedError("Not calculating overlapping genes for copy-neutral segments") + + return olap_genes + + +def generate_cnv_files(cnv_segs, gene_regions_bed, arm_regions, gene_ids, out_cnv_gene, out_cnv_arm, sample_ids, alt_gene_ids=None, subset_ids=None, input_log2: bool= False, focal_cn_thresh:int = 30000000): + """ + Characterize focal and arm-level copy number events, and summarize them in the respective output files. + + For focal events (i.e. events smaller than the specified threshold), identify all genes which overlap the even, and save them to the out_cnv_gene file + All events are used to flag chromosomes or individual chromosomal arms which are gained or amplified. + + The cnv_segs file should have the following columns (extra columns are ignored): + Tumor_Sample_Barcode chromosome start" end CN + + Where CN is the absolute copy number + + The gene_regions_bed can have multiple entries for each gene (ex. exons). The maximum and minimum of the cumulative entries is used to define the + gene boundaries + + The arm_regions file should contain the following columns: + chromosome start end arm + + :param cnv_segs: A string containing a filepath to a tab-delimited file containing copy number events + :param gene_regions_bed: A string specifying a filepath to a BED4+ file containing gene positions + :param arm_regions: A string specifying a filepath to a tab-delimited file specifying the positions of chromosomal arms + :param gene_ids: A dictionary mapping {Hugo_Symbol: Entrez_ID} + :param out_cnv_gene: A string specifying the cnv_flat output file + :param out_cnv_arm: A string specifying the cnv_arm output file + :param sample_ids: A list containing samples which have one or more somatic mutations + :param alt_gene_ids: An additional dictionary mapping {Hugo_Symbol: Entrez_IDs}. Used if previous Hugo_Symbols were assigned to a gene + :param subset_ids: An iterable listing a set of Entrez IDs. Only CNVs overlapping these genes will be output + :param input_log2: Does the input file contain log2 ratios instead of absolute CN? + :param focal_cn_thresh: The maximum size of an event for it to be considered "focal", in bp + :return: A list of samples which have CNV information + """ + logging.info("Processing copy-number segments") + if input_log2: + logging.debug("Converting from log2 ratios") + # First things first, lets load in the gene regions file and figure out where each gene is + gene_coords = load_gene_coords_bed(gene_regions_bed, gene_ids, alt_gene_ids=alt_gene_ids) + + # Load in the chromosome arm regions + arm_coods = load_chrom_arm(arm_regions) + + # Process copy number segments + required_cols = ["Tumor_Sample_Barcode", "chromosome", "start", "end", "CN"] + header_cols = {} + + sample_cnvs = {} + + i = 0 + with open(cnv_segs) as f: + for line in f: + i += 1 + line = line.rstrip("\n").rstrip("\r") # Remove line endings + cols = line.split("\t") + + # Skip empty lines + if not line: + continue + + # If we haven't parsed the header yet, assume this is the first line of the file (aka the header) + if not header_cols: + j = 0 + for col in cols: + if col in required_cols: + header_cols[col] = j + j += 1 + + # Check to make sure all required columns are found + for col in required_cols: + if col not in header_cols: + raise AttributeError("Unable to locate column \'%s\' in the CNV segments file \'%s\'" % (col, cnv_segs)) + # If we get this far, the header is valid + continue + + # Process this CNV entry + cnv_attributes = {x: cols[y] for x, y in header_cols.items()} + + # Have we processed CNVs from this sample before? + if cnv_attributes["Tumor_Sample_Barcode"] not in sample_cnvs: + sample_cnvs[cnv_attributes["Tumor_Sample_Barcode"]] = SampleCNVs() + # Sterilize input and ensure it is valid, and store these events + try: + start = int(cnv_attributes["start"]) + end = int(cnv_attributes["end"]) + # Remove segments which are of length 0 + if start == end: + continue + # Is the CN state absolute CN or log2 ratios? If log2 ratios, convert to absolute CN + cn_state = float(cnv_attributes["CN"]) + if input_log2: + cn_state = math.pow(2, cn_state + 1) + if cn_state < 0: + cn_state = 0 + elif cn_state < 0: # Sanity check that the user didn't accidentally provide log2 ratios + raise AttributeError("Unable to process line %s of \'%s\': \'%s\': Negative copy number segment. Did you mean to specify the --log2 flag?" % (i, cnv_segs, line)) + + cnv_attributes["CN"] = round(cn_state) + + sample_cnvs[cnv_attributes["Tumor_Sample_Barcode"]].add( + cnv_attributes["chromosome"], + start, + end, + cnv_attributes["CN"]) + except ValueError as e: + raise TypeError("Unable to process line %s of \'%s\': start, end, and CN must be numeric" % (i, cnv_segs)) from e + except TypeError as e: + raise AttributeError("Unable to process line %s of \'%s\': \'%s\': End coordinate of segment occurs before start" % (i, cnv_segs, line)) from e + + # Now adjust for ploidy + logging.info("Adjusting sample ploidy") + for sample, cnvs in sample_cnvs.items(): + cnvs.adjust_ploidy(arm_coods, sample) + + # Now that we have processed all CNVs, lets see which genes have events, and write those out + logging.info("Calculating gene overlap with CNVs") + with open(out_cnv_gene, "w") as o: + + # Write output file header + out_header = ["Sample", "ENTREZ.ID", "Type"] + o.write("\t".join(out_header)) + o.write(os.linesep) + + # Process segments + for sample, cnvs in sample_cnvs.items(): + for chrom in cnvs.cn_states.keys(): + # parse each segment + for start, end, cn_state in zip(cnvs.starts[chrom], cnvs.ends[chrom], cnvs.cn_states[chrom]): + + # Ignore copy-neutral events + if cn_state == 2: + continue + # Is this event focal? If so, lets find the genes it overlaps + if end - start < focal_cn_thresh: + # What type of event is this? + if cn_state > 3: + event_type = "AMP" + elif cn_state > 2: + event_type = "GAIN" + elif cn_state < 1: + event_type = "HOMDEL" + elif cn_state < 2: + event_type = "HETLOSS" + else: + raise TypeError("Invalid copy number state \'%s\'" % cn_state) + + olap_genes = get_overlap_genes(chrom, start, end, cn_state, gene_coords) + # If any genes overlap, write out these genes + for gene in olap_genes: + + # If a list of genes to subset to was provided, subset those genes + if subset_ids is not None and gene not in subset_ids: + continue # Skip this gene, as it was not in the list provided + out_line = [sample, + gene, + event_type + ] + o.write("\t".join(out_line)) + o.write(os.linesep) + + # Now that we have processed all the CNVs, identify which samples have arm-level and whole chromosomal copy number changes + logging.info("Calculating arm-level CNVs") + with open(out_cnv_arm, "w") as o: + # Write output header + out_header = ["Sample", "Arm", "Type"] + o.write("\t".join(out_header)) + o.write(os.linesep) + + for sample, cnvs in sample_cnvs.items(): + if sample not in sample_ids: + logging.warning("Sample \'%s\' was provided in the input CNVs file, but no SNVs were found. Skipping..." % sample) + continue + for chrom, chrom_info in arm_coods.items(): + + # For now, skip chromosome X and Y as those aren't supported? + if chrom == "X" or chrom == "chrX" or chrom == "Y" or chrom == "chrY": + continue + + events = cnvs.overlap_chrom(chrom_info) + # If there are large-scale CNVs in this sample, output them to the Arm flat file + for arm, c_type in events.items(): + out_line = [ + sample, + arm, + c_type + ] + o.write("\t".join(out_line)) + o.write(os.linesep) + + return list(sample_cnvs.keys()) + + +def generate_sample_annot(samples: iter, cnv_samples: iter, out_sample_annot: str): + + logging.info("Generating sample annotation file") + with open(out_sample_annot , "w") as o: + # Write sample annotation file header + out_header_cols = ["Sample.ID", "Copy.Number", "BCL2.transloc","BCL6.transloc"] + o.write("\t".join(out_header_cols)) + o.write(os.linesep) + + for sample in samples: + # Were CNVs provided for this sample? + if sample in cnv_samples: + has_cn = "1" + else: + has_cn = "0" + out_line = [sample, + has_cn, + "NA", + "NA"] + o.write("\t".join(out_line)) + o.write(os.linesep) + + +def main(args=None): + + # Obtain input + if args is None: + args = get_args() + + # First, load in the mapping of Gene names and NCBI/Entrez IDs + gene_ids, alt_gene_ids = load_entrez_ids(args.entrez_ids) + + # If a --lymphgen_genes file was provided, load those genes + subset_ids = None + if args.lymphgen_genes: + subset_ids = load_subset_ids(args.lymphgen_genes) + + # Generate the mutation flat file and gene list using the input MAF file and Entrez IDs + out_mut_flat = args.outdir + os.sep + args.outprefix + "_mutation_flat.tsv" + out_gene_list = args.outdir + os.sep + args.outprefix + "_gene_list.txt" + sample_list = generate_mut_flat(args.maf, args.sequencing_type, gene_ids, out_mut_flat, out_gene_list, alt_gene_ids = alt_gene_ids, subset_ids=subset_ids) + + # Generate the copy number gene list file and arm flat file + if args.cnvs: + out_cnv_gene = args.outdir + os.sep + args.outprefix + "_cnv_flat.tsv" + out_cnv_arm = args.outdir + os.sep + args.outprefix + "_cnv_arm.tsv" + cnv_sample_list = generate_cnv_files(args.cnvs, args.genes, args.arms, gene_ids, out_cnv_gene, out_cnv_arm, sample_list, alt_gene_ids=alt_gene_ids, subset_ids=subset_ids, input_log2=args.log2) + else: + cnv_sample_list = set() + + # Generate a sample annotation file + out_sample_annot = args.outdir + os.sep + args.outprefix + "_sample_annotation.tsv" + generate_sample_annot(sample_list, set(cnv_sample_list), out_sample_annot) + + +if __name__ == "__main__": + main()