|
a |
|
b/generate_input.py |
|
|
1 |
#!/usr/bin/env python |
|
|
2 |
|
|
|
3 |
# Created by Christopher Rushton 2020/04/20 |
|
|
4 |
# This converts a MAF file and copy number matrix (optional) into the input files used be the LymphGen lymphoma |
|
|
5 |
# classifier. See https://llmpp.nih.gov/lymphgen/index.php for more information |
|
|
6 |
|
|
|
7 |
import argparse |
|
|
8 |
import math |
|
|
9 |
import os |
|
|
10 |
import bisect |
|
|
11 |
import logging |
|
|
12 |
from collections import Counter |
|
|
13 |
from sortedcontainers import SortedSet |
|
|
14 |
|
|
|
15 |
|
|
|
16 |
class Gene: |
|
|
17 |
""" |
|
|
18 |
A simple class storing the chromosome, start, end, and name of a gene |
|
|
19 |
""" |
|
|
20 |
__slots__ = ["name", "chrom", "start", "end"] |
|
|
21 |
|
|
|
22 |
def __init__(self, chrom, start, end, name): |
|
|
23 |
self.name = name |
|
|
24 |
self.chrom = chrom |
|
|
25 |
self.start = start |
|
|
26 |
self.end = end |
|
|
27 |
|
|
|
28 |
|
|
|
29 |
class Chromosome: |
|
|
30 |
""" |
|
|
31 |
A simple class storying the genomic range of each chromosome, as well as the coordiates of the p and q arm |
|
|
32 |
""" |
|
|
33 |
|
|
|
34 |
def __init__(self, chrom): |
|
|
35 |
self.chrom = chrom |
|
|
36 |
self.p_start = None |
|
|
37 |
self.p_end = None |
|
|
38 |
self.q_start = None |
|
|
39 |
self.q_end = None |
|
|
40 |
self.q_length = 0.0 |
|
|
41 |
self.p_length = 0.0 |
|
|
42 |
|
|
|
43 |
def add(self, start, end, arm): |
|
|
44 |
|
|
|
45 |
if arm == "p": |
|
|
46 |
# Check to see if we already have genomic coordinates for this arm |
|
|
47 |
if self.p_start is not None or self.p_end is not None: |
|
|
48 |
raise AttributeError("Coordinates already assigned for %s arm of chromosome \'%s\'" % (arm, self.chrom)) |
|
|
49 |
self.p_start = start |
|
|
50 |
self.p_end = end |
|
|
51 |
self.p_length = end - start |
|
|
52 |
|
|
|
53 |
# Sanity check this does not overlap the q arm |
|
|
54 |
if self.q_start is not None and self.p_end > self.q_start: |
|
|
55 |
raise AttributeError("For chromosome \'%s\'. The q arm starts before the end of the p arm" % self.chrom) |
|
|
56 |
|
|
|
57 |
elif arm == "q": |
|
|
58 |
# Check to see if we already have genomic coordinates for this arm |
|
|
59 |
if self.q_start is not None or self.q_end is not None: |
|
|
60 |
raise AttributeError("Coordinates already assigned for %s arm of chromosome \'%s\'" % (arm, self.chrom)) |
|
|
61 |
self.q_start = start |
|
|
62 |
self.q_end = end |
|
|
63 |
self.q_length = end - start |
|
|
64 |
|
|
|
65 |
# Sanity check this does not overlap the p arm |
|
|
66 |
if self.p_end is not None and self.q_start < self.p_end: |
|
|
67 |
raise AttributeError("For chromosome \'%s\'. The q arm starts before the end of the p arm" % self.chrom) |
|
|
68 |
else: |
|
|
69 |
# Not the p or q arm. Invalid |
|
|
70 |
raise AttributeError("Unknown arm specified for chromosome \'%s\':\'%s\'" % (self.chrom, arm)) |
|
|
71 |
|
|
|
72 |
|
|
|
73 |
class SampleCNVs: |
|
|
74 |
""" |
|
|
75 |
Stores all the copy number events within a given class |
|
|
76 |
I was going to write this in a function, but having a class is a lot cleaner |
|
|
77 |
""" |
|
|
78 |
|
|
|
79 |
def __init__(self): |
|
|
80 |
self.starts = {} |
|
|
81 |
self.ends = {} |
|
|
82 |
self.cn_states = {} |
|
|
83 |
self.len_seg = {} |
|
|
84 |
self.is_chr_prefixed = None |
|
|
85 |
self.ploidy = None |
|
|
86 |
|
|
|
87 |
def merge_overlapping_cnvs(self, existStarts: iter, existEnds: iter, existCNs: iter, newStart: int, newEnd: int, newCN:int): |
|
|
88 |
""" |
|
|
89 |
Handle overlapping copy number segments, merging and editing existing segments as necessary |
|
|
90 |
|
|
|
91 |
In essence, if this segment overlaps an existing segment with the same copy number state, those segments are merged |
|
|
92 |
together. If the copy number state is different, the largest increase is kept, |
|
|
93 |
truncating or even breaking apart the copy-neutral segment in the process. In the worse case senario, |
|
|
94 |
this will lead to three different segments being created |
|
|
95 |
|
|
|
96 |
:param existStarts: The start position of the existing segment |
|
|
97 |
:param existEnds: The end position of the existing segment |
|
|
98 |
:param existCNs: The copy number state of the existing segment |
|
|
99 |
:param newStart: The start position of the new segment |
|
|
100 |
:param newEnd: The end position of the new segment |
|
|
101 |
:param newCN: The copy number state of the new segment |
|
|
102 |
:return: A tuple containing the new and old events. Ex {[newEvent1, newEvent2], [oldEvent1, oldEvent2]} |
|
|
103 |
""" |
|
|
104 |
|
|
|
105 |
def reduce_new_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd): |
|
|
106 |
|
|
|
107 |
# The existing event has higher priority than the old event |
|
|
108 |
# Truncate the newer event |
|
|
109 |
# Since this could break the newer event apart into two pieces, lets do that now, and see |
|
|
110 |
# if the output segments are valid |
|
|
111 |
outStart1 = newStart |
|
|
112 |
outEnd1 = oldStart |
|
|
113 |
outStart2 = oldEnd |
|
|
114 |
outEnd2 = newEnd |
|
|
115 |
if outStart1 < outEnd1: # Valid segment, process |
|
|
116 |
existStarts, existEnds, existCNs = self.merge_overlapping_cnvs(existStarts, existEnds, existCNs, |
|
|
117 |
outStart1, outEnd1, newCN) |
|
|
118 |
if outStart2 < outEnd2: # Valid segment |
|
|
119 |
existStarts, existEnds, existCNs = self.merge_overlapping_cnvs(existStarts, existEnds, existCNs, |
|
|
120 |
outStart2, outEnd2, newCN) |
|
|
121 |
return existStarts, existEnds, existCNs |
|
|
122 |
|
|
|
123 |
def reduce_old_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd, oldCN): |
|
|
124 |
|
|
|
125 |
# The newer event is a "higher"-level deletion |
|
|
126 |
# Split/Truncate the older event |
|
|
127 |
outStart1 = oldStart |
|
|
128 |
outEnd1 = newStart |
|
|
129 |
outStart2 = newEnd |
|
|
130 |
outEnd2 = oldEnd |
|
|
131 |
outCN = oldCN |
|
|
132 |
# Delete existing event |
|
|
133 |
existStarts.pop(start_bisect) |
|
|
134 |
existEnds.pop(start_bisect) |
|
|
135 |
existCNs.pop(start_bisect) |
|
|
136 |
|
|
|
137 |
if outStart2 < outEnd2: # Valid segment, do the last one first to maintain sorted order |
|
|
138 |
existStarts.insert(start_bisect, outStart2) |
|
|
139 |
existEnds.insert(start_bisect, outEnd2) |
|
|
140 |
existCNs.insert(start_bisect, outCN) |
|
|
141 |
if outStart1 < outEnd1: # Also valid |
|
|
142 |
existStarts.insert(start_bisect, outStart1) |
|
|
143 |
existEnds.insert(start_bisect, outEnd1) |
|
|
144 |
existCNs.insert(start_bisect, outCN) |
|
|
145 |
|
|
|
146 |
# Check for any more overlaps |
|
|
147 |
return self.merge_overlapping_cnvs(existStarts, existEnds, existCNs, newStart, newEnd, newCN) |
|
|
148 |
|
|
|
149 |
# First, does this new segment actually overlap anything? |
|
|
150 |
start_bisect = bisect.bisect_right(existEnds, newStart) |
|
|
151 |
end_bisect = bisect.bisect_left(existStarts, newEnd) |
|
|
152 |
|
|
|
153 |
if start_bisect == end_bisect: |
|
|
154 |
# There are no overlaps. Simply add this new event in, and return it |
|
|
155 |
existStarts.insert(start_bisect, newStart) |
|
|
156 |
existEnds.insert(start_bisect, newEnd) |
|
|
157 |
existCNs.insert(start_bisect, newCN) |
|
|
158 |
return existStarts, existEnds, existCNs |
|
|
159 |
|
|
|
160 |
# Grab the first overlap |
|
|
161 |
oldStart = existStarts[start_bisect] |
|
|
162 |
oldEnd = existEnds[start_bisect] |
|
|
163 |
oldCN = existCNs[start_bisect] |
|
|
164 |
|
|
|
165 |
# Simplest case first. If both these events have the same CN state, lets merge them together |
|
|
166 |
if oldCN == newCN: |
|
|
167 |
|
|
|
168 |
outStart = oldStart if oldStart < newStart else newStart |
|
|
169 |
outEnd = oldEnd if oldEnd > newEnd else newEnd |
|
|
170 |
# Delete the existing event |
|
|
171 |
existStarts.pop(start_bisect) |
|
|
172 |
existEnds.pop(start_bisect) |
|
|
173 |
existCNs.pop(start_bisect) |
|
|
174 |
|
|
|
175 |
# Check for any more overlaps |
|
|
176 |
return self.merge_overlapping_cnvs(existStarts, existEnds, existCNs, outStart, outEnd, newCN) |
|
|
177 |
else: |
|
|
178 |
# These segments overlap and have different CN states. |
|
|
179 |
# Lets keep the highest-level event |
|
|
180 |
if newCN <= 2 and oldCN <= 2: # Deletion |
|
|
181 |
if oldCN < newCN: |
|
|
182 |
# The older event is a "higher"-level deletion |
|
|
183 |
return reduce_new_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd) |
|
|
184 |
else: |
|
|
185 |
return reduce_old_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd, oldCN) |
|
|
186 |
if newCN >= 2 and oldCN >= 2: # Gain/Amp |
|
|
187 |
if oldCN > newCN: |
|
|
188 |
# The older event is a higher level gain. Split/truncate the new event |
|
|
189 |
return reduce_new_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, |
|
|
190 |
oldEnd) |
|
|
191 |
else: |
|
|
192 |
return reduce_old_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd, |
|
|
193 |
oldCN) |
|
|
194 |
else: |
|
|
195 |
# One event must be a gain/amp, while the other is a deletion. In this case, keep both, and subset the |
|
|
196 |
# larger event by the smaller one |
|
|
197 |
if oldEnd - oldStart < newEnd - newStart: |
|
|
198 |
# The older event is smaller. Split the newer event |
|
|
199 |
return reduce_new_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, |
|
|
200 |
oldEnd) |
|
|
201 |
else: |
|
|
202 |
# The newer event is smaller. We should split the older event |
|
|
203 |
return reduce_old_event(existStarts, existEnds, existCNs, newStart, newEnd, newCN, oldStart, oldEnd, |
|
|
204 |
oldCN) |
|
|
205 |
|
|
|
206 |
|
|
|
207 |
def add(self, chrom: str, start: int, end: int, cn: int): |
|
|
208 |
|
|
|
209 |
# For LymphGen compatability: Strip "chr" prefix |
|
|
210 |
chrom = chrom.replace("chr", "") |
|
|
211 |
|
|
|
212 |
if end <= start: |
|
|
213 |
if start == end: |
|
|
214 |
# This segment has a length of 0?!. Skip this I guesss |
|
|
215 |
return None |
|
|
216 |
else: |
|
|
217 |
# This segment has a negative length?? |
|
|
218 |
raise TypeError("Invalid CNV segment: \'%s:%s-%s\'" % (chrom, start, end)) |
|
|
219 |
|
|
|
220 |
# Have we seen any events for this chromosome before? |
|
|
221 |
if chrom not in self.len_seg: |
|
|
222 |
# If not, just store this segment. Simple! |
|
|
223 |
self.starts[chrom] = [start] |
|
|
224 |
self.ends[chrom] = [end] |
|
|
225 |
self.cn_states[chrom] = [cn] |
|
|
226 |
self.len_seg[chrom] = 1 |
|
|
227 |
|
|
|
228 |
# Are we using chr-prefixed contig names? |
|
|
229 |
if self.is_chr_prefixed is None: |
|
|
230 |
self.is_chr_prefixed = True if chrom.startswith("chr") else False |
|
|
231 |
else: |
|
|
232 |
# If we have seen events on this chromosome before, we need to compare this new segment with existing segments |
|
|
233 |
# In theory, events should be non-overlapping, but I don't want to assume that because then things will break |
|
|
234 |
# horribly later |
|
|
235 |
|
|
|
236 |
self.starts[chrom], self.ends[chrom], self.cn_states[chrom] = \ |
|
|
237 |
self.merge_overlapping_cnvs(self.starts[chrom], self.ends[chrom], self.cn_states[chrom], start, end, cn) |
|
|
238 |
self.len_seg[chrom] = len(self.cn_states) |
|
|
239 |
|
|
|
240 |
|
|
|
241 |
def overlap_chrom(self, chromosome: Chromosome, threshold: float = 0.8): |
|
|
242 |
|
|
|
243 |
""" |
|
|
244 |
Calculates the overlap between the segments stored here and a given chromosome |
|
|
245 |
:param chromosome: A Chromosome() object defining the start and end of the p and q arms, respectively |
|
|
246 |
:return: A dictionary containing |
|
|
247 |
""" |
|
|
248 |
|
|
|
249 |
# Do the CNVs within this chromosome encompass a given chromosome more than the specified threshold? |
|
|
250 |
chrom_name = chromosome.chrom |
|
|
251 |
|
|
|
252 |
# Handle "chr" prefix nonsense |
|
|
253 |
if self.is_chr_prefixed and not chrom_name.startswith("chr"): |
|
|
254 |
chrom_name = "chr" + chrom_name |
|
|
255 |
elif not self.is_chr_prefixed and chrom_name.startswith("chr"): |
|
|
256 |
chrom_name = chrom_name.replace("chr", "") |
|
|
257 |
|
|
|
258 |
try: |
|
|
259 |
chrom_starts = self.starts[chrom_name] |
|
|
260 |
except KeyError: |
|
|
261 |
# No CN events were provided for this chromosome, hence there can be no overlap |
|
|
262 |
return {} |
|
|
263 |
chrom_ends = self.ends[chrom_name] |
|
|
264 |
chrom_cn = self.cn_states[chrom_name] |
|
|
265 |
|
|
|
266 |
homdel_sum = {"p": 0, "q": 0, "Chrom": 0} |
|
|
267 |
del_sum = {"p": 0, "q": 0, "Chrom": 0} |
|
|
268 |
gain_sum = {"p": 0, "q": 0, "Chrom": 0} |
|
|
269 |
amp_sum = {"p": 0, "q": 0, "Chrom": 0} |
|
|
270 |
|
|
|
271 |
# Check for overlapping segments for each arm |
|
|
272 |
for (start, end, cn) in zip(chrom_starts, chrom_ends, chrom_cn): |
|
|
273 |
|
|
|
274 |
# Ignore copy-neutral segments |
|
|
275 |
if cn == 2: |
|
|
276 |
continue |
|
|
277 |
|
|
|
278 |
if end < chromosome.p_start or start > chromosome.q_end: |
|
|
279 |
# Segment falls out of range |
|
|
280 |
continue |
|
|
281 |
if start < chromosome.p_end: |
|
|
282 |
olap_start = start if start > chromosome.p_start else chromosome.p_start |
|
|
283 |
olap_end = end if end < chromosome.p_end else chromosome.p_end |
|
|
284 |
if cn < 2: |
|
|
285 |
del_sum["p"] += olap_end - olap_start |
|
|
286 |
del_sum["Chrom"] += olap_end - olap_start |
|
|
287 |
if cn < 1: |
|
|
288 |
homdel_sum["p"] += olap_end - olap_start |
|
|
289 |
homdel_sum["Chrom"] += olap_end - olap_start |
|
|
290 |
elif cn > 2: |
|
|
291 |
gain_sum["p"] += olap_end - olap_start |
|
|
292 |
gain_sum["Chrom"] += olap_end - olap_start |
|
|
293 |
if cn > 3: |
|
|
294 |
amp_sum["p"] += olap_end - olap_start |
|
|
295 |
amp_sum["Chrom"] += olap_end - olap_start |
|
|
296 |
|
|
|
297 |
if end > chromosome.q_start: # We use an if, not elif, in case a segment overlaps both the p and q arm |
|
|
298 |
olap_start = start if start > chromosome.q_start else chromosome.q_start |
|
|
299 |
olap_end = end if end < chromosome.q_end else chromosome.q_end |
|
|
300 |
if cn < 2: |
|
|
301 |
del_sum["q"] += olap_end - olap_start |
|
|
302 |
del_sum["Chrom"] += olap_end - olap_start |
|
|
303 |
if cn < 1: |
|
|
304 |
homdel_sum["q"] += olap_end - olap_start |
|
|
305 |
homdel_sum["Chrom"] += olap_end - olap_start |
|
|
306 |
elif cn > 2: |
|
|
307 |
gain_sum["q"] += olap_end - olap_start |
|
|
308 |
gain_sum["Chrom"] += olap_end - olap_start |
|
|
309 |
if cn > 3: |
|
|
310 |
amp_sum["q"] += olap_end - olap_start |
|
|
311 |
amp_sum["Chrom"] += olap_end - olap_start |
|
|
312 |
|
|
|
313 |
events = {} |
|
|
314 |
# Now, calculate the fraction of each overlap |
|
|
315 |
# Start from the highest level and biggest event, and work our way down/smaller |
|
|
316 |
# Amplifications/gains |
|
|
317 |
if amp_sum["Chrom"] / (chromosome.p_length + chromosome.q_length) > threshold: # Whole chromosome Amp |
|
|
318 |
events[chrom_name + "Chrom"] = "AMP" |
|
|
319 |
# Now check for arm level events |
|
|
320 |
else: |
|
|
321 |
if amp_sum["p"] / chromosome.p_length > threshold: # p Amp |
|
|
322 |
events[chrom_name + "p"] = "AMP" |
|
|
323 |
elif amp_sum["q"] / chromosome.q_length > threshold: # q Amp |
|
|
324 |
events[chrom_name + "q"] = "AMP" |
|
|
325 |
if chrom_name + "Chrom" not in events and gain_sum["Chrom"] / (chromosome.p_length + chromosome.q_length) > threshold: # Whole chromosome gain |
|
|
326 |
events[chrom_name + "Chrom"] = "GAIN" |
|
|
327 |
else: |
|
|
328 |
if chrom_name + "p" not in events and gain_sum["p"] / chromosome.p_length > threshold: # p Gain |
|
|
329 |
events[chrom_name + "p"] = "GAIN" |
|
|
330 |
if chrom_name + "q" not in events and gain_sum["q"] / chromosome.q_length > threshold: # q Gain |
|
|
331 |
events[chrom_name + "q"] = "GAIN" |
|
|
332 |
|
|
|
333 |
# Homozygous and heterozygous deletions |
|
|
334 |
if homdel_sum["Chrom"] / (chromosome.p_length + chromosome.q_length) > threshold: # Whole chromosome Homozygous deletion |
|
|
335 |
events[chrom_name + "Chrom"] = "HOMDEL" |
|
|
336 |
# Now check for arm level events |
|
|
337 |
else: |
|
|
338 |
if homdel_sum["p"] / chromosome.p_length > threshold: # p HOMDEL |
|
|
339 |
events[chrom_name + "p"] = "HOMDEL" |
|
|
340 |
elif homdel_sum["q"] / chromosome.q_length > threshold: # q HOMDEL |
|
|
341 |
events[chrom_name + "q"] = "HOMDEL" |
|
|
342 |
if chrom_name + "Chrom" not in events and del_sum["Chrom"] / (chromosome.p_length + chromosome.q_length) > threshold: # Whole chromosome deletions |
|
|
343 |
events[chrom_name + "Chrom"] = "HETLOSS" |
|
|
344 |
else: |
|
|
345 |
if chrom_name + "p" not in events and del_sum["p"] / chromosome.p_length > threshold: # p deletion |
|
|
346 |
events[chrom_name + "p"] = "HETLOSS" |
|
|
347 |
if chrom_name + "q" not in events and del_sum["q"] / chromosome.q_length > threshold: # q deletions |
|
|
348 |
events[chrom_name + "q"] = "HETLOSS" |
|
|
349 |
|
|
|
350 |
return events |
|
|
351 |
|
|
|
352 |
|
|
|
353 |
def adjust_ploidy(self, arm_coords, sample_name, redo=False): |
|
|
354 |
""" |
|
|
355 |
Adjust a sample's CN states based upon ploidy |
|
|
356 |
|
|
|
357 |
Calculate the average copy number state across the entire genome. If a sample is not diploid, normalize all the CN |
|
|
358 |
changes |
|
|
359 |
:param arm_coords: A dictionary containing {"chromosome_name": Chromosome()} objects which list chromosomal coordinates |
|
|
360 |
:param sample_name: A string specifying the sample name. Used for debugging/error purposes |
|
|
361 |
:param redo: Should we override any existing ploidy state for this sample? |
|
|
362 |
:return: None |
|
|
363 |
""" |
|
|
364 |
|
|
|
365 |
if self.ploidy is not None and not redo: |
|
|
366 |
# Ploidy for this sample has already been calculated. Don't do anything |
|
|
367 |
return None |
|
|
368 |
|
|
|
369 |
# Store the length affected for each ploidy state |
|
|
370 |
ploidy_cov = {} |
|
|
371 |
genome_size = 0 |
|
|
372 |
|
|
|
373 |
for chromosome in arm_coords.values(): |
|
|
374 |
# Find overlapping CNVs for this arm |
|
|
375 |
|
|
|
376 |
# if the p or q arm coordinates are not set, use a placeholder |
|
|
377 |
if chromosome.p_start is None: |
|
|
378 |
chromosome.p_start = -100000 |
|
|
379 |
chromosome.p_end = -100000 |
|
|
380 |
chromosome.p_length = 1 |
|
|
381 |
if chromosome.q_start is None: |
|
|
382 |
chromosome.q_start = -100000 |
|
|
383 |
chromosome.q_end = -100000 |
|
|
384 |
chromosome.q_length = 1 |
|
|
385 |
|
|
|
386 |
# Save the size of this chromosome |
|
|
387 |
genome_size += chromosome.p_length |
|
|
388 |
genome_size += chromosome.q_length |
|
|
389 |
|
|
|
390 |
chrom_name = chromosome.chrom |
|
|
391 |
# Handle "chr" prefix nonsense |
|
|
392 |
if self.is_chr_prefixed and not chrom_name.startswith("chr"): |
|
|
393 |
chrom_name = "chr" + chrom_name |
|
|
394 |
elif not self.is_chr_prefixed and chrom_name.startswith("chr"): |
|
|
395 |
chrom_name = chrom_name.replace("chr", "") |
|
|
396 |
try: |
|
|
397 |
chrom_starts = self.starts[chrom_name] |
|
|
398 |
except KeyError: |
|
|
399 |
# No CN events were provided for this chromosome, hence there can be no overlap |
|
|
400 |
continue |
|
|
401 |
chrom_ends = self.ends[chrom_name] |
|
|
402 |
chrom_cn = self.cn_states[chrom_name] |
|
|
403 |
|
|
|
404 |
# Check for overlapping segments for each arm |
|
|
405 |
for (start, end, cn) in zip(chrom_starts, chrom_ends, chrom_cn): |
|
|
406 |
|
|
|
407 |
# Check p-arm overlap |
|
|
408 |
if end < chromosome.p_start or start > chromosome.q_end: |
|
|
409 |
# Segment falls out of range |
|
|
410 |
continue |
|
|
411 |
if start < chromosome.p_end: |
|
|
412 |
olap_start = start if start > chromosome.p_start else chromosome.p_start |
|
|
413 |
olap_end = end if end < chromosome.p_end else chromosome.p_end |
|
|
414 |
|
|
|
415 |
# Store the length of this overlap and associated CN |
|
|
416 |
if cn not in ploidy_cov: |
|
|
417 |
ploidy_cov[cn] = 0 |
|
|
418 |
ploidy_cov[cn] += olap_end - olap_start |
|
|
419 |
|
|
|
420 |
if end > chromosome.q_start: # We use an if, not elif, in case a segment overlaps both the p and q arm |
|
|
421 |
olap_start = start if start > chromosome.q_start else chromosome.q_start |
|
|
422 |
|
|
|
423 |
olap_end = end if end < chromosome.q_end else chromosome.q_end |
|
|
424 |
# Store the length of this overlap and associated CN |
|
|
425 |
if cn not in ploidy_cov: |
|
|
426 |
ploidy_cov[cn] = 0 |
|
|
427 |
ploidy_cov[cn] += olap_end - olap_start |
|
|
428 |
|
|
|
429 |
# Now that we have calculated the number of bases affected by each CN state, calculate the ploidy |
|
|
430 |
x = 0 |
|
|
431 |
for cn, bases in ploidy_cov.items(): |
|
|
432 |
x += cn * bases |
|
|
433 |
|
|
|
434 |
ploidy = x / genome_size |
|
|
435 |
av_ploidy = round(ploidy) |
|
|
436 |
|
|
|
437 |
# Sanity check |
|
|
438 |
if av_ploidy < 1: |
|
|
439 |
raise TypeError("Ploidy of sample \'%s\' was calculated to be below 1!" % sample_name) |
|
|
440 |
|
|
|
441 |
# If this tumour is not diploid, adjust the ploidy |
|
|
442 |
if av_ploidy != 2: |
|
|
443 |
logging.debug("\'%s\' was calculated to have a ploidy of %s" % (sample_name, av_ploidy)) |
|
|
444 |
ploidy_dif = av_ploidy - 2 |
|
|
445 |
new_cns = {} |
|
|
446 |
for chrom, cn in self.cn_states.items(): |
|
|
447 |
x = list(y - ploidy_dif for y in cn) |
|
|
448 |
new_cns[chrom] = x |
|
|
449 |
self.cn_states = new_cns |
|
|
450 |
|
|
|
451 |
self.ploidy = av_ploidy |
|
|
452 |
|
|
|
453 |
|
|
|
454 |
def get_args(): |
|
|
455 |
|
|
|
456 |
def is_valid_dir(path, parser): |
|
|
457 |
""" |
|
|
458 |
Checks to ensure the directory path exists |
|
|
459 |
:param path: A string containing a filepath to a directory |
|
|
460 |
:param parser: An argparse.ArgumentParser() object |
|
|
461 |
:return: path, if the string is a valid directory |
|
|
462 |
:raises: parser.error() if the directory does not exist |
|
|
463 |
""" |
|
|
464 |
if os.path.exists(path) and os.path.isdir(path): |
|
|
465 |
return path |
|
|
466 |
else: |
|
|
467 |
raise parser.error("Unable to set \'%s\' as the output directory: Not a valid directory" % path) |
|
|
468 |
|
|
|
469 |
def is_valid_file(path, parser): |
|
|
470 |
""" |
|
|
471 |
Checks to ensure the specified file exists |
|
|
472 |
:param path: A string containing a filepath |
|
|
473 |
:param parser: An argparse.ArgumentParser() object |
|
|
474 |
:return: path, if the string is a valid directory |
|
|
475 |
:raises: parser.error() if the file does not exist |
|
|
476 |
""" |
|
|
477 |
if os.path.exists(path) and os.path.isfile(path): |
|
|
478 |
return path |
|
|
479 |
else: |
|
|
480 |
raise parser.error("Unable to locate \'%s\': No such file or directory" % path) |
|
|
481 |
|
|
|
482 |
epilog = os.linesep.join(["The --arms, --entrez_ids, and --lymphgen_genes files can be found in the \'resources\' folder. ", |
|
|
483 |
"Note that genome and exome sequencing types are handled exactly the same. ", |
|
|
484 |
"The --entrez-ids file must have the Hugo_Symbol under the column \"Approved Symbol\" and the Entrez ID under the column \"NCBI Gene ID(supplied by NCBI)\". ", |
|
|
485 |
"The --cnvs file should contain the following colummns: Tumor_Sample_Barcode, chromosome, start, end, CN. ", |
|
|
486 |
"The --arms file should contain the following columns: chromosome, start, end, arm. " |
|
|
487 |
]) |
|
|
488 |
parser = argparse.ArgumentParser(description="Generates input files for the LymphGen classifier\nVersion 1.0.0", epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter) |
|
|
489 |
snvs_args = parser.add_argument_group("Input mutation files") |
|
|
490 |
snvs_args.add_argument("-m", "--maf", metavar="MAF", required=True, type=lambda x: is_valid_file(x, parser), help="Input MAF file listing somatic mutations") |
|
|
491 |
snvs_args.add_argument("-e", "--entrez_ids", metavar="TSV", required=True, type=lambda x: is_valid_file(x, parser), help="A tab-delimited file containing gene names (Hugo Symbol) and the corresponding Entrez gene ID") |
|
|
492 |
cnv_args = parser.add_argument_group("(Optional) Input CNV files") |
|
|
493 |
cnv_args.add_argument("-c", "--cnvs", metavar="TSV", default=None, type=lambda x: is_valid_file(x, parser), help="Input tab-delimited file summarizing copy number events") |
|
|
494 |
cnv_args.add_argument("--log2", action="store_true", help="Does the input CNV file use log2 ratios? (i.e. log2(absoluteCN) - 1)") |
|
|
495 |
cnv_args.add_argument("-g", "--genes", metavar="BED", default=None, type=lambda x: is_valid_file(x, parser), help="Input BED4+ file listing start and end positions of genes/exons") |
|
|
496 |
cnv_args.add_argument("-a", "--arms", metavar="TSV", default=None, type=lambda x: is_valid_file(x, parser), help="Input tab-delimited file listing the positions of chromosome arms") |
|
|
497 |
|
|
|
498 |
parser.add_argument("-l", "--lymphgen_genes", metavar="TXT", default=None, type=lambda x: is_valid_file(x, parser), required=False, help="An optional file listing all Entrez IDs considered by the LymphGen classifier. Output will be subset to these genes") |
|
|
499 |
parser.add_argument("-s", "--sequencing_type", metavar="SEQ", choices=["targeted", "exome", "genome"], required=True, help="Sequencing type used to obtain somatic mutations") |
|
|
500 |
parser.add_argument("-o", "--outdir", metavar="PATH", required=True, type=lambda x: is_valid_dir(x, parser), help="Output directory for LymphGen input files") |
|
|
501 |
parser.add_argument("--outprefix", metavar="STRING", default=None, help="Output files will be prefixed using this string [Default: Use the base name of the MAF file]") |
|
|
502 |
parser.add_argument("-v", "--verbose", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], default="INFO", help="Set logging verbosity") |
|
|
503 |
|
|
|
504 |
args = parser.parse_args() |
|
|
505 |
# If no outprefix was set, use the basename of the maf file as the output prefix |
|
|
506 |
if args.outprefix is None: |
|
|
507 |
args.outprefix = os.path.basename(args.maf).split(".")[0] |
|
|
508 |
|
|
|
509 |
# If --cnvs is specified, the annotation files are also required |
|
|
510 |
if args.cnvs: |
|
|
511 |
if not args.genes or not args.arms: |
|
|
512 |
raise parser.error("--genes and --arms are required if --cnvs is specified") |
|
|
513 |
|
|
|
514 |
logging.basicConfig(level=getattr(logging, args.verbose), format='%(levelname)s:%(message)s') |
|
|
515 |
|
|
|
516 |
return args |
|
|
517 |
|
|
|
518 |
|
|
|
519 |
def load_entrez_ids(entrez_file: str, hugo_column: str="Approved symbol", entrez_column: str="NCBI Gene ID(supplied by NCBI)", |
|
|
520 |
status_column: str="Status", prev_name_column: str="Previous symbols"): |
|
|
521 |
""" |
|
|
522 |
Creates a dictionary which contains the gene name (Hugo Symbol) and corresponding entrez ID (NCBI ID) from the |
|
|
523 |
specified text file |
|
|
524 |
|
|
|
525 |
Note the input file must have a header. An example file can be obtained from https://www.genenames.org/download/custom/ |
|
|
526 |
Ensure the "Approved symbol" and "NCBI Gene ID" columns are selected. The default column names reflect those obtained |
|
|
527 |
from this source. |
|
|
528 |
The "Approval Status" column is optional, but will be used to remove gene names which have since been withdrawn. |
|
|
529 |
If the "Previous symbols" solumn is included, these gene names will also be assigned to that Entrez ID |
|
|
530 |
The subset_file should list one Entrez ID per line |
|
|
531 |
All extra columns are ignored |
|
|
532 |
|
|
|
533 |
:param entrez_file: A string containing a filepath to a tsv file containing the gene names and gene IDs |
|
|
534 |
:param hugo_column: A string specifying the name of the column which contains |
|
|
535 |
:param entrez_column: A string specifying the name of the column which contains the Entrez gene ID |
|
|
536 |
:param status_column: A string specifying the name of the column which contains the Approval Status. Optional. |
|
|
537 |
:param prev_name_column: A string specifting the name of the column which specifies previously used names for each gene. Optional |
|
|
538 |
:return: Two dictionaries containing {"Gene Name": "Entrez_ID"} |
|
|
539 |
""" |
|
|
540 |
hugo_name_to_id = {} |
|
|
541 |
alt_hugo_name_to_id = {} # If Previous symbols are included |
|
|
542 |
skipped_genes = [] |
|
|
543 |
|
|
|
544 |
hugo_col_num = None |
|
|
545 |
entrez_col_num = None |
|
|
546 |
status_col_num = None # Optional |
|
|
547 |
prev_name_col_num = None # Optional |
|
|
548 |
i = 0 |
|
|
549 |
|
|
|
550 |
with open(entrez_file) as f: |
|
|
551 |
for line in f: |
|
|
552 |
i += 1 |
|
|
553 |
line = line.rstrip("\n").rstrip("\r") # Remove line endings |
|
|
554 |
cols = line.split("\t") |
|
|
555 |
if len(cols) < 2: # Sanity check this file has multiple columns |
|
|
556 |
raise TypeError("Input file %s does not appear to be a tab-delimited file" % entrez_file) |
|
|
557 |
|
|
|
558 |
# If we have not assigned column IDs yet, then we have not processed the file header |
|
|
559 |
# Load the file header |
|
|
560 |
if hugo_col_num is None or entrez_col_num is None: |
|
|
561 |
j = 0 |
|
|
562 |
for col in cols: |
|
|
563 |
if col == hugo_column: |
|
|
564 |
hugo_col_num = j |
|
|
565 |
if col == entrez_column: |
|
|
566 |
entrez_col_num = j |
|
|
567 |
if status_column is not None and col == status_column: |
|
|
568 |
status_col_num = j |
|
|
569 |
if prev_name_column is not None and col == prev_name_column: |
|
|
570 |
prev_name_col_num = j |
|
|
571 |
|
|
|
572 |
j += 1 |
|
|
573 |
# Sanity check that we have found all the columns we need |
|
|
574 |
if hugo_col_num is None: |
|
|
575 |
raise AttributeError("Unable to locate a column corresponding to \'%s\' in the input file \'%s\'" % (hugo_column, entrez_file)) |
|
|
576 |
elif entrez_col_num is None: |
|
|
577 |
raise AttributeError("Unable to locate a column corresponding to \'%s\' in the input file \'%s\'" % (entrez_column, entrez_file)) |
|
|
578 |
# If we have made it this far, then the header was valid |
|
|
579 |
continue |
|
|
580 |
|
|
|
581 |
# Check the Approval Status column to see if the gene name has been withdrawn |
|
|
582 |
if status_col_num is not None: |
|
|
583 |
if cols[status_col_num] != "Approved": |
|
|
584 |
continue |
|
|
585 |
|
|
|
586 |
# Parse the gene name and ID from the input file |
|
|
587 |
try: |
|
|
588 |
hugo_name = cols[hugo_col_num] |
|
|
589 |
entrez_id = cols[entrez_col_num] |
|
|
590 |
# Sanity check: The entrez ID is ALWAYS a number |
|
|
591 |
if not entrez_id.isdigit(): |
|
|
592 |
# If there is no entrez ID, skip this gene |
|
|
593 |
if entrez_id == "": |
|
|
594 |
skipped_genes.append(hugo_name) |
|
|
595 |
continue |
|
|
596 |
else: |
|
|
597 |
raise TypeError("When parsing line %s of \'%s\', the Entrez gene ID \'%s\' does not appear to be a valid Entrez Gene ID" % (i, entrez_file, entrez_id)) |
|
|
598 |
|
|
|
599 |
except IndexError as e: |
|
|
600 |
# If we end up in here, then there were too few columns in the current line to parse the IDs |
|
|
601 |
raise AttributeError("Unable to parse line %s of \'%s\': Line is truncated?" % (i, entrez_file)) from e |
|
|
602 |
|
|
|
603 |
# Sanity check: Make sure we haven't seen this gene name before |
|
|
604 |
if hugo_name in hugo_name_to_id: |
|
|
605 |
raise AttributeError("Gene \'%s\' appears duplicated in \'%s\'" % (hugo_name, entrez_file)) |
|
|
606 |
hugo_name_to_id[hugo_name] = entrez_id |
|
|
607 |
|
|
|
608 |
# If the Previous symbols column was found, annotate these gene names with the same ID |
|
|
609 |
previous_names = cols[prev_name_col_num].split(",") |
|
|
610 |
if previous_names[0] == "": |
|
|
611 |
continue # There are no previous names for this gene |
|
|
612 |
previous_names = [x.replace(" ", "") for x in previous_names] # Remove extra spaces |
|
|
613 |
for hugo_name in previous_names: |
|
|
614 |
# If we have seen this gene name before, then don't overwrite it. Just assume that the first instance |
|
|
615 |
# encountered is correct. This likely isn't the case, but these cases should be so rare that they are |
|
|
616 |
# not worth worrying about |
|
|
617 |
if hugo_name in alt_hugo_name_to_id: |
|
|
618 |
continue |
|
|
619 |
alt_hugo_name_to_id[hugo_name] = entrez_id |
|
|
620 |
|
|
|
621 |
# Final sanity check: As Hugo Symbols and Entrez IDs have a 1-1 mapping, make sure there are no duplicate Entrez IDs |
|
|
622 |
if len(hugo_name_to_id) != len(set(hugo_name_to_id.values())): |
|
|
623 |
num_ids = Counter(hugo_name_to_id.values()) |
|
|
624 |
# For simplicity, just print out one Entrez ID which is duplicated |
|
|
625 |
bad_id = num_ids.most_common(1)[0][0] |
|
|
626 |
bad_count = num_ids.most_common(1)[0][1] |
|
|
627 |
raise AttributeError("Entrez gene ID \'\%s\' appears %s times in the input file" % (bad_id, bad_count)) |
|
|
628 |
|
|
|
629 |
return hugo_name_to_id, alt_hugo_name_to_id |
|
|
630 |
|
|
|
631 |
|
|
|
632 |
def load_subset_ids(id_file): |
|
|
633 |
""" |
|
|
634 |
Loads a set of Entrez IDs from a specified file. One ID/line |
|
|
635 |
|
|
|
636 |
:param id_file: |
|
|
637 |
""" |
|
|
638 |
|
|
|
639 |
subset_ids = [] |
|
|
640 |
with open(id_file) as f: |
|
|
641 |
for line in f: |
|
|
642 |
line = line.rstrip("\n").rstrip("\r") |
|
|
643 |
if not line.isdigit(): |
|
|
644 |
raise TypeError("Invalid Entrez ID when processing the --lyphmgen_genes file: %s" % line) |
|
|
645 |
subset_ids.append(line) |
|
|
646 |
|
|
|
647 |
subset_ids = set(subset_ids) |
|
|
648 |
return subset_ids |
|
|
649 |
|
|
|
650 |
|
|
|
651 |
def generate_mut_flat(in_maf: str, seq_type: str, gene_ids: dict, out_mut_flat: str, out_gene_list: str, alt_gene_ids: dict = None, subset_ids: iter = None): |
|
|
652 |
""" |
|
|
653 |
Converts a MAF file into the mutation flat file and gene list used by LymphGen |
|
|
654 |
WARNING: GRCh37/hg19 is currently only supported! |
|
|
655 |
|
|
|
656 |
An example of the output mutation FLAT file: |
|
|
657 |
Sample ENTREZ.ID Type Location |
|
|
658 |
Sample1 604 MUTATION 187451395 |
|
|
659 |
Sample1 604 MUTATION 187451408 |
|
|
660 |
Sample1 613 MUTATION 23523596 |
|
|
661 |
Sample1 970 TRUNC 6590925 |
|
|
662 |
Sample1 1840 Synon 113495932 |
|
|
663 |
|
|
|
664 |
An example of the output gene list: |
|
|
665 |
"ENTREZ.ID" |
|
|
666 |
60 |
|
|
667 |
355 |
|
|
668 |
567 |
|
|
669 |
596 |
|
|
670 |
604 |
|
|
671 |
605 |
|
|
672 |
613 |
|
|
673 |
639 |
|
|
674 |
673 |
|
|
675 |
|
|
|
676 |
If targeted sequencing is specified, only the genes with at least one non-synonmymous mutation in the MAF file are used for |
|
|
677 |
the gene list. Otherwise, ALL genes are used. |
|
|
678 |
|
|
|
679 |
The following MAF columns are required: Hugo_Symbol, Variant_Classification, Tumor_Sample_Barcode. If NCBI_Build and |
|
|
680 |
Start_Position are found, the output mutation flat file will contain an additional column specifying "Location". |
|
|
681 |
If Tumor_Seq_Allele2 is provided, the MYD88 hotspot mutations will be annotated as "L265P" |
|
|
682 |
|
|
|
683 |
:param in_maf: A string containing a filepath to a MAF file containing the variants of interest |
|
|
684 |
:param seq_type: A string specifying the sequencing type used to identify mutations. Only "targeted", "exome", and "genome" are supported |
|
|
685 |
:param gene_ids: A dictionary containing the mapping between Hugo_Symbol: Entrez_Gene_ID. |
|
|
686 |
:param out_mut_flat: A string specifying the output path to the mutation flat file |
|
|
687 |
:param out_gene_list: A string specifying the output path to the gene list |
|
|
688 |
:param alt_gene_ids: An additional dictionary specifying alternate Hugo_Symbol: Entrez_Gene_ID mappings |
|
|
689 |
:param subset_ids: An interable specifying a list of Entrez IDs. Only mutations within these Entrez IDs will be output |
|
|
690 |
:return: A list specifying all samples analyzed for mutations |
|
|
691 |
""" |
|
|
692 |
logging.info("Processing MAF file") |
|
|
693 |
|
|
|
694 |
# Non-synonmymous mutation types: |
|
|
695 |
non_syn = {"In_Frame_Del", "In_Frame_Ins", "Missense_Mutation"} |
|
|
696 |
trunc_mut = {"Frame_Shift_Del", "Frame_Shift_Ins", "Nonsense_Mutation", "Nonstop_Mutation", "Splice_Site", "Translation_Start_Site"} |
|
|
697 |
synon_mut = {"Silent", "5'UTR", "5'Flank", "Intron", "3'UTR"} # Use for the "Synon" mutation type |
|
|
698 |
|
|
|
699 |
# Which samples are we analyzing? |
|
|
700 |
sample_list = SortedSet() # Used to ensure the output files are in somewhat sorted order |
|
|
701 |
|
|
|
702 |
required_cols = ["Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode"] |
|
|
703 |
optional_cols = ["NCBI_Build", "Start_Position", "Tumor_Seq_Allele2"] |
|
|
704 |
header_cols = {} |
|
|
705 |
out_header_written = False |
|
|
706 |
out_header_cols = ["Sample", "ENTREZ.ID", "Type"] |
|
|
707 |
if alt_gene_ids is None: |
|
|
708 |
alt_gene_ids = {} |
|
|
709 |
|
|
|
710 |
# For targeted sequencing, which genes have we sequenced and seen mutations in? |
|
|
711 |
genes_seen = {} |
|
|
712 |
skipped_mut = [] # Store mutations in genes with no Entrez ID |
|
|
713 |
|
|
|
714 |
i = 0 |
|
|
715 |
with open(in_maf) as f, open(out_mut_flat, "w") as o: |
|
|
716 |
for line in f: |
|
|
717 |
i += 1 |
|
|
718 |
# Ignore comment lines |
|
|
719 |
if line[0] == "#": |
|
|
720 |
continue |
|
|
721 |
line = line.rstrip("\n").rstrip("\r") |
|
|
722 |
cols = line.split("\t") |
|
|
723 |
|
|
|
724 |
# Skip empty lines |
|
|
725 |
if not line: |
|
|
726 |
continue |
|
|
727 |
|
|
|
728 |
# If we haven't parsed the header yet, assume we are parsing the first line of the file, and that is the header |
|
|
729 |
if not header_cols: |
|
|
730 |
j = 0 |
|
|
731 |
for col in cols: |
|
|
732 |
if col in required_cols: |
|
|
733 |
header_cols[col] = j |
|
|
734 |
elif col in optional_cols: |
|
|
735 |
header_cols[col] = j |
|
|
736 |
j += 1 |
|
|
737 |
|
|
|
738 |
# Check that we have all required columns |
|
|
739 |
for col in required_cols: |
|
|
740 |
if col not in header_cols: |
|
|
741 |
raise AttributeError("Unable to locate a column specifying \'%s\' in the MAF file header" % col) |
|
|
742 |
# See which optional columns we have |
|
|
743 |
if "Start_Position" in header_cols: |
|
|
744 |
out_header_cols.append("Location") |
|
|
745 |
logging.info("\'Start_Position\' column found in MAF file. Output mutation flat file will be annotated with \'Position\'") |
|
|
746 |
if "Tumor_Seq_Allele2" in header_cols: |
|
|
747 |
logging.info( |
|
|
748 |
"\'Tumor_Seq_Allele2\' column found in MAF file. MYD88 hotspot mutations will be annotated as \'L265P\'") |
|
|
749 |
continue |
|
|
750 |
|
|
|
751 |
# Process mutations |
|
|
752 |
mut_attributes = {x: cols[y] for x, y in header_cols.items()} |
|
|
753 |
|
|
|
754 |
# Check genome build, as only GRCh37 is supported |
|
|
755 |
if "NCBI_Build" in mut_attributes and "Start_Position" in mut_attributes and mut_attributes["NCBI_Build"] != "GRCh37": |
|
|
756 |
raise NotImplementedError("Only GRCh37 is currently supported as a reference genome") |
|
|
757 |
|
|
|
758 |
# Lets get the easiest stuff out of the way first. Specify the output sample name and gene ID |
|
|
759 |
sample_name = mut_attributes["Tumor_Sample_Barcode"] |
|
|
760 |
if sample_name == "" or sample_name == ".": |
|
|
761 |
raise AttributeError("No tumor_sample_barcode was found when processing line %s of the MAF file" % i) |
|
|
762 |
|
|
|
763 |
hugo_name = mut_attributes["Hugo_Symbol"] |
|
|
764 |
try: |
|
|
765 |
# Skip intergenic mutations |
|
|
766 |
if hugo_name == "Unknown": |
|
|
767 |
continue |
|
|
768 |
entrez_id = gene_ids[hugo_name] |
|
|
769 |
except KeyError: |
|
|
770 |
# If this gene does not have a Entrez ID, are we prehaps using an older Hugo_Symbol? |
|
|
771 |
if hugo_name in alt_gene_ids: |
|
|
772 |
entrez_id = alt_gene_ids[hugo_name] |
|
|
773 |
else: |
|
|
774 |
# Skip this mutation if there is no Entrez ID |
|
|
775 |
skipped_mut.append(line) |
|
|
776 |
continue |
|
|
777 |
|
|
|
778 |
# Obtain position (if Start_Position is specified) |
|
|
779 |
if "Start_Position" in mut_attributes: |
|
|
780 |
position = mut_attributes["Start_Position"] |
|
|
781 |
else: |
|
|
782 |
position = None |
|
|
783 |
|
|
|
784 |
# Finally, specify the variant type |
|
|
785 |
if mut_attributes["Variant_Classification"] in trunc_mut: |
|
|
786 |
genes_seen[entrez_id] = hugo_name # This gene has a non-synonmymous mutation, so lets assume we have sequenced it |
|
|
787 |
type = "TRUNC" |
|
|
788 |
elif mut_attributes["Variant_Classification"] in non_syn: |
|
|
789 |
genes_seen[entrez_id] = hugo_name # This gene has a non-synonmymous mutation, so lets assume we have sequenced it |
|
|
790 |
type = "MUTATION" |
|
|
791 |
elif mut_attributes["Variant_Classification"] in synon_mut: |
|
|
792 |
genes_seen[entrez_id] = hugo_name |
|
|
793 |
type = "Synon" |
|
|
794 |
else: |
|
|
795 |
# Ignore intronic mutations, 3'UTRs etc |
|
|
796 |
continue |
|
|
797 |
|
|
|
798 |
# If this mutation is in MYD88, does it affect the hotspot? |
|
|
799 |
if hugo_name == "MYD88" and "Tumor_Seq_Allele2" in mut_attributes: |
|
|
800 |
if mut_attributes["Start_Position"] == "38182641" and mut_attributes["Tumor_Seq_Allele2"] == "C": |
|
|
801 |
type = "L265P" |
|
|
802 |
|
|
|
803 |
if sample_name not in sample_list: |
|
|
804 |
sample_list.add(sample_name) |
|
|
805 |
|
|
|
806 |
# If this gene isn't in the subset list provide it, skip it |
|
|
807 |
if subset_ids is not None and entrez_id not in subset_ids: |
|
|
808 |
continue |
|
|
809 |
|
|
|
810 |
# Finally, write this variant |
|
|
811 |
# Write a header line, if we have not done so already |
|
|
812 |
if not out_header_written: |
|
|
813 |
out_header_written = True |
|
|
814 |
o.write("\t".join(out_header_cols)) |
|
|
815 |
o.write(os.linesep) |
|
|
816 |
|
|
|
817 |
out_line = [sample_name, entrez_id, type, position + os.linesep] |
|
|
818 |
o.write("\t".join(out_line)) |
|
|
819 |
|
|
|
820 |
# Check to see that we actually converted mutations |
|
|
821 |
if len(genes_seen) == 0: |
|
|
822 |
raise AttributeError("No mutations were successfully converted. Check that the --entrez_ids file has valid Hugo Symbols matching the --maf file") |
|
|
823 |
elif skipped_mut: |
|
|
824 |
logging.warning("%s mutations in the MAF file were not converted. These either don't have a valid Entrez ID, or they were not in the --lymphgen_gene file" % len(skipped_mut)) |
|
|
825 |
|
|
|
826 |
logging.info("Finished processing MAF file") |
|
|
827 |
logging.info("Generating gene list") |
|
|
828 |
# Generate the gene list file |
|
|
829 |
# This file a single column with the Entrez ID, as that is all LymphGen currently supports |
|
|
830 |
with open(out_gene_list, "w") as o: |
|
|
831 |
# Write header |
|
|
832 |
o.write("\"ENTREZ.ID\"" + os.linesep) |
|
|
833 |
|
|
|
834 |
# Write out all genes we have seen a mutation in |
|
|
835 |
for entrez_id in genes_seen.keys(): |
|
|
836 |
if subset_ids is not None and entrez_id not in subset_ids: |
|
|
837 |
continue # Skip these gene, as it wasn't in the --lymphgen_gene list |
|
|
838 |
o.write(entrez_id) |
|
|
839 |
o.write(os.linesep) |
|
|
840 |
|
|
|
841 |
if seq_type == "exome" or seq_type == "genome": |
|
|
842 |
# We have sequenced (effectively) all genes in the human genome. Write out all remaining genes |
|
|
843 |
for entrez_id in gene_ids.values(): |
|
|
844 |
if entrez_id in genes_seen: # We have already written out this gene. Skip it |
|
|
845 |
continue |
|
|
846 |
if subset_ids is not None and entrez_id not in subset_ids: |
|
|
847 |
continue # Skip these gene, as it wasn't in the --lymphgen_gene list |
|
|
848 |
o.write(entrez_id) |
|
|
849 |
o.write(os.linesep) |
|
|
850 |
|
|
|
851 |
return list(sample_list) |
|
|
852 |
|
|
|
853 |
|
|
|
854 |
def load_gene_coords_bed(bed_file, gene_ids, alt_gene_ids=None): |
|
|
855 |
""" |
|
|
856 |
Load in the genomic coordinates of genes in the human genome from the specified BED4+ file. |
|
|
857 |
|
|
|
858 |
The following columns are required (no header) |
|
|
859 |
chrom start end gene |
|
|
860 |
|
|
|
861 |
A gene can be split across multiple BED entries (ex. exonic coordinates). In this case, the start and end of the first and last exon, |
|
|
862 |
respectively, will be used as the gene start/end coordinates |
|
|
863 |
|
|
|
864 |
:param bed_file: A string containing a filepath to a BED4+ file listing the positions of genes |
|
|
865 |
:return: A dictionary storing {gene_name: Gene()} |
|
|
866 |
""" |
|
|
867 |
if alt_gene_ids is None: |
|
|
868 |
alt_gene_ids = {} |
|
|
869 |
gene_coords = {} |
|
|
870 |
skipped_genes = [] |
|
|
871 |
|
|
|
872 |
i = 0 |
|
|
873 |
with open(bed_file) as f: |
|
|
874 |
for line in f: |
|
|
875 |
i += 1 |
|
|
876 |
line = line.rstrip("\n").rstrip("\r") # Remove line endings |
|
|
877 |
cols = line.split("\t") |
|
|
878 |
|
|
|
879 |
# Skip empty lines |
|
|
880 |
if not line: |
|
|
881 |
continue |
|
|
882 |
|
|
|
883 |
# Since this BED file could contain more than 4 columns (ex. a BED6 or BED12 file), manually unpack and inspect the first |
|
|
884 |
# four columns, since those are the columns we care about |
|
|
885 |
try: |
|
|
886 |
chrom = cols[0] |
|
|
887 |
start = cols[1] |
|
|
888 |
end = cols[2] |
|
|
889 |
gene = cols[3] |
|
|
890 |
except IndexError: |
|
|
891 |
# This BED entry was trucated |
|
|
892 |
raise AttributeError("Unable to parse line %s of %s as it contains less than four columns (chrom, start, end, gene)" % (i, bed_file)) |
|
|
893 |
|
|
|
894 |
# Check that the start and end are actually genomic coordinates |
|
|
895 |
if not start.isdigit(): |
|
|
896 |
raise TypeError("When parsing line %s of %s, the start position \'%s\' is not a valid genomic coordinate" % (i, bed_file, start)) |
|
|
897 |
if not end.isdigit(): |
|
|
898 |
raise TypeError("When parsing line %s of %s, the end position \'%s\' is not a valid genomic coordinate" % (i, bed_file, end)) |
|
|
899 |
|
|
|
900 |
start = int(start) |
|
|
901 |
end = int(end) |
|
|
902 |
chrom = chrom.replace("chr", "") |
|
|
903 |
|
|
|
904 |
# Is the gene name the Hugo Symbol or the Entrez gene ID? |
|
|
905 |
# If its an integer, lets assume its the Entrez ID |
|
|
906 |
# If its not, assume it is the Hugo Symbol and convert it to the Entrez ID |
|
|
907 |
if not gene.isdigit(): |
|
|
908 |
try: |
|
|
909 |
entrez_id = gene_ids[gene] |
|
|
910 |
except KeyError: |
|
|
911 |
# If we can't find this Hugo Symbol in the default mappings, check the alt gene IDs |
|
|
912 |
if gene in alt_gene_ids: |
|
|
913 |
entrez_id = alt_gene_ids[gene] |
|
|
914 |
else: |
|
|
915 |
# This gene doesn't have an Entrez ID. Skip it |
|
|
916 |
skipped_genes.append(gene) |
|
|
917 |
continue |
|
|
918 |
else: |
|
|
919 |
entrez_id = gene |
|
|
920 |
|
|
|
921 |
# Have we seen this chromosome before? |
|
|
922 |
if chrom not in gene_coords: |
|
|
923 |
gene_coords[chrom] = {} |
|
|
924 |
# Have we seen this gene before? |
|
|
925 |
if entrez_id in gene_coords[chrom]: |
|
|
926 |
# If so, then we need to update the start/end of this gene based on this new BED entry |
|
|
927 |
existing_gene_entry = gene_coords[chrom][entrez_id] |
|
|
928 |
|
|
|
929 |
# Sanity check |
|
|
930 |
if chrom != existing_gene_entry.chrom: |
|
|
931 |
raise AttributeError("We found two entries for gene \'%s\'. One is found on chromosome \'%s\', while the other is found on \'%s\'" |
|
|
932 |
% (gene, chrom, existing_gene_entry.chrom)) |
|
|
933 |
if start < existing_gene_entry.start: |
|
|
934 |
existing_gene_entry.start = start |
|
|
935 |
if end > existing_gene_entry.end: |
|
|
936 |
existing_gene_entry.end = end |
|
|
937 |
else: |
|
|
938 |
# If we haven't seen this gene before, create a new entry for it |
|
|
939 |
gene_coords[chrom][entrez_id] = Gene(chrom, start, end, entrez_id) |
|
|
940 |
|
|
|
941 |
# Now that we have processed all genes, make sure we actually found Entrez IDs for a handful of genes |
|
|
942 |
# If we haven't, it means the input file likely didn't contain Hugo Symbols |
|
|
943 |
if len(gene_coords) == 0: |
|
|
944 |
raise AttributeError("No Hugo Symbols from the --genes file (column 4) were found in the --entrez_ids file. An example gene is %s" % (skipped_genes[0])) |
|
|
945 |
|
|
|
946 |
return gene_coords |
|
|
947 |
|
|
|
948 |
|
|
|
949 |
def load_chrom_arm(arm_file): |
|
|
950 |
""" |
|
|
951 |
Loads in the genomic coordinates corresponding to the arm of each chromosome |
|
|
952 |
|
|
|
953 |
The input should be a tab-delimited file with the following columns |
|
|
954 |
chromosome start end arm |
|
|
955 |
|
|
|
956 |
:param arm_file: A string containing a filepath to a tab-delimited file containing genomic coordinates for chromosome arms |
|
|
957 |
:return: A dictionary storing the genomic range of each chromosome, and each individual arm |
|
|
958 |
""" |
|
|
959 |
|
|
|
960 |
arm_chrom = {} |
|
|
961 |
required_cols = ["chromosome", "start", "end", "arm"] |
|
|
962 |
header_cols = {} |
|
|
963 |
|
|
|
964 |
i = 0 |
|
|
965 |
with open(arm_file) as f: |
|
|
966 |
for line in f: |
|
|
967 |
i += 1 |
|
|
968 |
line = line.rstrip("\n").rstrip("\r") # Remove line endings |
|
|
969 |
cols = line.split("\t") |
|
|
970 |
|
|
|
971 |
# Skip empty lines |
|
|
972 |
if not line: |
|
|
973 |
continue |
|
|
974 |
|
|
|
975 |
# If we haven't parsed the header yet, assume this is the first line of the file (aka the header) |
|
|
976 |
if not header_cols: |
|
|
977 |
j = 0 |
|
|
978 |
for col in cols: |
|
|
979 |
if col in required_cols: |
|
|
980 |
header_cols[col] = j |
|
|
981 |
j += 1 |
|
|
982 |
|
|
|
983 |
# Check to make sure all required columns are found |
|
|
984 |
for col in required_cols: |
|
|
985 |
if col not in header_cols: |
|
|
986 |
raise AttributeError("Unable to locate column %s in the chromosome arm positions file \'%s\'" % (col, arm_file)) |
|
|
987 |
# If we get this far, the header is valid |
|
|
988 |
continue |
|
|
989 |
|
|
|
990 |
try: |
|
|
991 |
arm_attributes = {x: cols[y] for x, y in header_cols.items()} |
|
|
992 |
except IndexError: |
|
|
993 |
# We were unable to find a required column. This line is likely truncated |
|
|
994 |
raise AttributeError("Unable to parse line %s of the chromosome arm positions file %s: The line appears truncated" % (i, arm_file)) |
|
|
995 |
|
|
|
996 |
# Have we seen this chromosome before? If not, lets create an object to store its genomic features |
|
|
997 |
chrom = arm_attributes["chromosome"] |
|
|
998 |
|
|
|
999 |
# Strip chr prefix |
|
|
1000 |
chrom = chrom.replace("chr", "") |
|
|
1001 |
|
|
|
1002 |
if chrom not in arm_chrom: |
|
|
1003 |
arm_chrom[chrom] = Chromosome(chrom) |
|
|
1004 |
|
|
|
1005 |
# Save the arm coordinates in the chromosome |
|
|
1006 |
try: |
|
|
1007 |
arm_chrom[chrom].add(int(arm_attributes["start"]), int(arm_attributes["end"]), arm_attributes["arm"]) |
|
|
1008 |
except ValueError as e: |
|
|
1009 |
raise TypeError("Unable to process line %s of \'%s\': start and end must be integers") from e |
|
|
1010 |
|
|
|
1011 |
return arm_chrom |
|
|
1012 |
|
|
|
1013 |
|
|
|
1014 |
def get_overlap_genes(chrom: str, start: int, end: int, copy_num: int, gene_cords: dict): |
|
|
1015 |
|
|
|
1016 |
""" |
|
|
1017 |
Find the genes which overlap a given segment |
|
|
1018 |
|
|
|
1019 |
This is a very brute-force approach which will check all genes on the target chromosome for overlap. Pre-sorting and bisecting genomic |
|
|
1020 |
regions would be significantly faster, but 1) You need to handle overlapping genes, and 2) I am assuming the performance penalty won't |
|
|
1021 |
matter too much. |
|
|
1022 |
|
|
|
1023 |
Note that gains and losses are handled differently if a segment partially overlaps a gene. If the event is a loss, the gene is included. |
|
|
1024 |
If the event is a gain, the gene is NOT included, as that copy is likely not functional |
|
|
1025 |
|
|
|
1026 |
:param chrom: A string coresponding to the contig name |
|
|
1027 |
:param start: An int specifying the start of the segment |
|
|
1028 |
:param end: An int specifying the end of the segment |
|
|
1029 |
:param copy_num: An integer specifying the copy number of this segment |
|
|
1030 |
:param gene_cords: A dictionary storing the positions of genes, in the format {chromosome: {gene1: attr, gene2: attr...}} |
|
|
1031 |
:return: |
|
|
1032 |
""" |
|
|
1033 |
|
|
|
1034 |
# Handle chr-prefix shenanigans |
|
|
1035 |
is_chr_prefixed = next(iter(gene_cords.keys())).startswith("chr") |
|
|
1036 |
if is_chr_prefixed and not chrom.startswith("chr"): |
|
|
1037 |
chrom = "chr" + chrom |
|
|
1038 |
elif not is_chr_prefixed and chrom.startswith("chr"): |
|
|
1039 |
chrom = chrom.replace("chr", "") |
|
|
1040 |
|
|
|
1041 |
try: |
|
|
1042 |
genes_on_chrom = gene_cords[chrom] |
|
|
1043 |
except KeyError: |
|
|
1044 |
# No genes are on this contig. This could be a user error, or this a contig with no annotated genes |
|
|
1045 |
return [] |
|
|
1046 |
|
|
|
1047 |
# Go through each gene on this chromosome and see if the coordinates overlap with our regions |
|
|
1048 |
olap_genes = [] |
|
|
1049 |
for entrez_id, gene_info in genes_on_chrom.items(): |
|
|
1050 |
if start < gene_info.end and end > gene_info.start: |
|
|
1051 |
# Overlap found |
|
|
1052 |
# Now for the tricky part |
|
|
1053 |
# If the segment partially overlaps this gene, then we need to handle gains and losses differently |
|
|
1054 |
# Deleting or gaining half of a gene will likely cause it to no longer function |
|
|
1055 |
# Thus, partial deletions of genes could be drivers, while partial gains of genes are likely never drivers |
|
|
1056 |
if copy_num < 2: |
|
|
1057 |
olap_genes.append(entrez_id) |
|
|
1058 |
elif copy_num > 2: |
|
|
1059 |
if start > gene_info.start or end < gene_info.end: |
|
|
1060 |
continue # Partially overlapping |
|
|
1061 |
else: |
|
|
1062 |
olap_genes.append(entrez_id) |
|
|
1063 |
else: |
|
|
1064 |
raise NotImplementedError("Not calculating overlapping genes for copy-neutral segments") |
|
|
1065 |
|
|
|
1066 |
return olap_genes |
|
|
1067 |
|
|
|
1068 |
|
|
|
1069 |
def generate_cnv_files(cnv_segs, gene_regions_bed, arm_regions, gene_ids, out_cnv_gene, out_cnv_arm, sample_ids, alt_gene_ids=None, subset_ids=None, input_log2: bool= False, focal_cn_thresh:int = 30000000): |
|
|
1070 |
""" |
|
|
1071 |
Characterize focal and arm-level copy number events, and summarize them in the respective output files. |
|
|
1072 |
|
|
|
1073 |
For focal events (i.e. events smaller than the specified threshold), identify all genes which overlap the even, and save them to the out_cnv_gene file |
|
|
1074 |
All events are used to flag chromosomes or individual chromosomal arms which are gained or amplified. |
|
|
1075 |
|
|
|
1076 |
The cnv_segs file should have the following columns (extra columns are ignored): |
|
|
1077 |
Tumor_Sample_Barcode chromosome start" end CN |
|
|
1078 |
|
|
|
1079 |
Where CN is the absolute copy number |
|
|
1080 |
|
|
|
1081 |
The gene_regions_bed can have multiple entries for each gene (ex. exons). The maximum and minimum of the cumulative entries is used to define the |
|
|
1082 |
gene boundaries |
|
|
1083 |
|
|
|
1084 |
The arm_regions file should contain the following columns: |
|
|
1085 |
chromosome start end arm |
|
|
1086 |
|
|
|
1087 |
:param cnv_segs: A string containing a filepath to a tab-delimited file containing copy number events |
|
|
1088 |
:param gene_regions_bed: A string specifying a filepath to a BED4+ file containing gene positions |
|
|
1089 |
:param arm_regions: A string specifying a filepath to a tab-delimited file specifying the positions of chromosomal arms |
|
|
1090 |
:param gene_ids: A dictionary mapping {Hugo_Symbol: Entrez_ID} |
|
|
1091 |
:param out_cnv_gene: A string specifying the cnv_flat output file |
|
|
1092 |
:param out_cnv_arm: A string specifying the cnv_arm output file |
|
|
1093 |
:param sample_ids: A list containing samples which have one or more somatic mutations |
|
|
1094 |
:param alt_gene_ids: An additional dictionary mapping {Hugo_Symbol: Entrez_IDs}. Used if previous Hugo_Symbols were assigned to a gene |
|
|
1095 |
:param subset_ids: An iterable listing a set of Entrez IDs. Only CNVs overlapping these genes will be output |
|
|
1096 |
:param input_log2: Does the input file contain log2 ratios instead of absolute CN? |
|
|
1097 |
:param focal_cn_thresh: The maximum size of an event for it to be considered "focal", in bp |
|
|
1098 |
:return: A list of samples which have CNV information |
|
|
1099 |
""" |
|
|
1100 |
logging.info("Processing copy-number segments") |
|
|
1101 |
if input_log2: |
|
|
1102 |
logging.debug("Converting from log2 ratios") |
|
|
1103 |
# First things first, lets load in the gene regions file and figure out where each gene is |
|
|
1104 |
gene_coords = load_gene_coords_bed(gene_regions_bed, gene_ids, alt_gene_ids=alt_gene_ids) |
|
|
1105 |
|
|
|
1106 |
# Load in the chromosome arm regions |
|
|
1107 |
arm_coods = load_chrom_arm(arm_regions) |
|
|
1108 |
|
|
|
1109 |
# Process copy number segments |
|
|
1110 |
required_cols = ["Tumor_Sample_Barcode", "chromosome", "start", "end", "CN"] |
|
|
1111 |
header_cols = {} |
|
|
1112 |
|
|
|
1113 |
sample_cnvs = {} |
|
|
1114 |
|
|
|
1115 |
i = 0 |
|
|
1116 |
with open(cnv_segs) as f: |
|
|
1117 |
for line in f: |
|
|
1118 |
i += 1 |
|
|
1119 |
line = line.rstrip("\n").rstrip("\r") # Remove line endings |
|
|
1120 |
cols = line.split("\t") |
|
|
1121 |
|
|
|
1122 |
# Skip empty lines |
|
|
1123 |
if not line: |
|
|
1124 |
continue |
|
|
1125 |
|
|
|
1126 |
# If we haven't parsed the header yet, assume this is the first line of the file (aka the header) |
|
|
1127 |
if not header_cols: |
|
|
1128 |
j = 0 |
|
|
1129 |
for col in cols: |
|
|
1130 |
if col in required_cols: |
|
|
1131 |
header_cols[col] = j |
|
|
1132 |
j += 1 |
|
|
1133 |
|
|
|
1134 |
# Check to make sure all required columns are found |
|
|
1135 |
for col in required_cols: |
|
|
1136 |
if col not in header_cols: |
|
|
1137 |
raise AttributeError("Unable to locate column \'%s\' in the CNV segments file \'%s\'" % (col, cnv_segs)) |
|
|
1138 |
# If we get this far, the header is valid |
|
|
1139 |
continue |
|
|
1140 |
|
|
|
1141 |
# Process this CNV entry |
|
|
1142 |
cnv_attributes = {x: cols[y] for x, y in header_cols.items()} |
|
|
1143 |
|
|
|
1144 |
# Have we processed CNVs from this sample before? |
|
|
1145 |
if cnv_attributes["Tumor_Sample_Barcode"] not in sample_cnvs: |
|
|
1146 |
sample_cnvs[cnv_attributes["Tumor_Sample_Barcode"]] = SampleCNVs() |
|
|
1147 |
# Sterilize input and ensure it is valid, and store these events |
|
|
1148 |
try: |
|
|
1149 |
start = int(cnv_attributes["start"]) |
|
|
1150 |
end = int(cnv_attributes["end"]) |
|
|
1151 |
# Remove segments which are of length 0 |
|
|
1152 |
if start == end: |
|
|
1153 |
continue |
|
|
1154 |
# Is the CN state absolute CN or log2 ratios? If log2 ratios, convert to absolute CN |
|
|
1155 |
cn_state = float(cnv_attributes["CN"]) |
|
|
1156 |
if input_log2: |
|
|
1157 |
cn_state = math.pow(2, cn_state + 1) |
|
|
1158 |
if cn_state < 0: |
|
|
1159 |
cn_state = 0 |
|
|
1160 |
elif cn_state < 0: # Sanity check that the user didn't accidentally provide log2 ratios |
|
|
1161 |
raise AttributeError("Unable to process line %s of \'%s\': \'%s\': Negative copy number segment. Did you mean to specify the --log2 flag?" % (i, cnv_segs, line)) |
|
|
1162 |
|
|
|
1163 |
cnv_attributes["CN"] = round(cn_state) |
|
|
1164 |
|
|
|
1165 |
sample_cnvs[cnv_attributes["Tumor_Sample_Barcode"]].add( |
|
|
1166 |
cnv_attributes["chromosome"], |
|
|
1167 |
start, |
|
|
1168 |
end, |
|
|
1169 |
cnv_attributes["CN"]) |
|
|
1170 |
except ValueError as e: |
|
|
1171 |
raise TypeError("Unable to process line %s of \'%s\': start, end, and CN must be numeric" % (i, cnv_segs)) from e |
|
|
1172 |
except TypeError as e: |
|
|
1173 |
raise AttributeError("Unable to process line %s of \'%s\': \'%s\': End coordinate of segment occurs before start" % (i, cnv_segs, line)) from e |
|
|
1174 |
|
|
|
1175 |
# Now adjust for ploidy |
|
|
1176 |
logging.info("Adjusting sample ploidy") |
|
|
1177 |
for sample, cnvs in sample_cnvs.items(): |
|
|
1178 |
cnvs.adjust_ploidy(arm_coods, sample) |
|
|
1179 |
|
|
|
1180 |
# Now that we have processed all CNVs, lets see which genes have events, and write those out |
|
|
1181 |
logging.info("Calculating gene overlap with CNVs") |
|
|
1182 |
with open(out_cnv_gene, "w") as o: |
|
|
1183 |
|
|
|
1184 |
# Write output file header |
|
|
1185 |
out_header = ["Sample", "ENTREZ.ID", "Type"] |
|
|
1186 |
o.write("\t".join(out_header)) |
|
|
1187 |
o.write(os.linesep) |
|
|
1188 |
|
|
|
1189 |
# Process segments |
|
|
1190 |
for sample, cnvs in sample_cnvs.items(): |
|
|
1191 |
for chrom in cnvs.cn_states.keys(): |
|
|
1192 |
# parse each segment |
|
|
1193 |
for start, end, cn_state in zip(cnvs.starts[chrom], cnvs.ends[chrom], cnvs.cn_states[chrom]): |
|
|
1194 |
|
|
|
1195 |
# Ignore copy-neutral events |
|
|
1196 |
if cn_state == 2: |
|
|
1197 |
continue |
|
|
1198 |
# Is this event focal? If so, lets find the genes it overlaps |
|
|
1199 |
if end - start < focal_cn_thresh: |
|
|
1200 |
# What type of event is this? |
|
|
1201 |
if cn_state > 3: |
|
|
1202 |
event_type = "AMP" |
|
|
1203 |
elif cn_state > 2: |
|
|
1204 |
event_type = "GAIN" |
|
|
1205 |
elif cn_state < 1: |
|
|
1206 |
event_type = "HOMDEL" |
|
|
1207 |
elif cn_state < 2: |
|
|
1208 |
event_type = "HETLOSS" |
|
|
1209 |
else: |
|
|
1210 |
raise TypeError("Invalid copy number state \'%s\'" % cn_state) |
|
|
1211 |
|
|
|
1212 |
olap_genes = get_overlap_genes(chrom, start, end, cn_state, gene_coords) |
|
|
1213 |
# If any genes overlap, write out these genes |
|
|
1214 |
for gene in olap_genes: |
|
|
1215 |
|
|
|
1216 |
# If a list of genes to subset to was provided, subset those genes |
|
|
1217 |
if subset_ids is not None and gene not in subset_ids: |
|
|
1218 |
continue # Skip this gene, as it was not in the list provided |
|
|
1219 |
out_line = [sample, |
|
|
1220 |
gene, |
|
|
1221 |
event_type |
|
|
1222 |
] |
|
|
1223 |
o.write("\t".join(out_line)) |
|
|
1224 |
o.write(os.linesep) |
|
|
1225 |
|
|
|
1226 |
# Now that we have processed all the CNVs, identify which samples have arm-level and whole chromosomal copy number changes |
|
|
1227 |
logging.info("Calculating arm-level CNVs") |
|
|
1228 |
with open(out_cnv_arm, "w") as o: |
|
|
1229 |
# Write output header |
|
|
1230 |
out_header = ["Sample", "Arm", "Type"] |
|
|
1231 |
o.write("\t".join(out_header)) |
|
|
1232 |
o.write(os.linesep) |
|
|
1233 |
|
|
|
1234 |
for sample, cnvs in sample_cnvs.items(): |
|
|
1235 |
if sample not in sample_ids: |
|
|
1236 |
logging.warning("Sample \'%s\' was provided in the input CNVs file, but no SNVs were found. Skipping..." % sample) |
|
|
1237 |
continue |
|
|
1238 |
for chrom, chrom_info in arm_coods.items(): |
|
|
1239 |
|
|
|
1240 |
# For now, skip chromosome X and Y as those aren't supported? |
|
|
1241 |
if chrom == "X" or chrom == "chrX" or chrom == "Y" or chrom == "chrY": |
|
|
1242 |
continue |
|
|
1243 |
|
|
|
1244 |
events = cnvs.overlap_chrom(chrom_info) |
|
|
1245 |
# If there are large-scale CNVs in this sample, output them to the Arm flat file |
|
|
1246 |
for arm, c_type in events.items(): |
|
|
1247 |
out_line = [ |
|
|
1248 |
sample, |
|
|
1249 |
arm, |
|
|
1250 |
c_type |
|
|
1251 |
] |
|
|
1252 |
o.write("\t".join(out_line)) |
|
|
1253 |
o.write(os.linesep) |
|
|
1254 |
|
|
|
1255 |
return list(sample_cnvs.keys()) |
|
|
1256 |
|
|
|
1257 |
|
|
|
1258 |
def generate_sample_annot(samples: iter, cnv_samples: iter, out_sample_annot: str): |
|
|
1259 |
|
|
|
1260 |
logging.info("Generating sample annotation file") |
|
|
1261 |
with open(out_sample_annot , "w") as o: |
|
|
1262 |
# Write sample annotation file header |
|
|
1263 |
out_header_cols = ["Sample.ID", "Copy.Number", "BCL2.transloc","BCL6.transloc"] |
|
|
1264 |
o.write("\t".join(out_header_cols)) |
|
|
1265 |
o.write(os.linesep) |
|
|
1266 |
|
|
|
1267 |
for sample in samples: |
|
|
1268 |
# Were CNVs provided for this sample? |
|
|
1269 |
if sample in cnv_samples: |
|
|
1270 |
has_cn = "1" |
|
|
1271 |
else: |
|
|
1272 |
has_cn = "0" |
|
|
1273 |
out_line = [sample, |
|
|
1274 |
has_cn, |
|
|
1275 |
"NA", |
|
|
1276 |
"NA"] |
|
|
1277 |
o.write("\t".join(out_line)) |
|
|
1278 |
o.write(os.linesep) |
|
|
1279 |
|
|
|
1280 |
|
|
|
1281 |
def main(args=None): |
|
|
1282 |
|
|
|
1283 |
# Obtain input |
|
|
1284 |
if args is None: |
|
|
1285 |
args = get_args() |
|
|
1286 |
|
|
|
1287 |
# First, load in the mapping of Gene names and NCBI/Entrez IDs |
|
|
1288 |
gene_ids, alt_gene_ids = load_entrez_ids(args.entrez_ids) |
|
|
1289 |
|
|
|
1290 |
# If a --lymphgen_genes file was provided, load those genes |
|
|
1291 |
subset_ids = None |
|
|
1292 |
if args.lymphgen_genes: |
|
|
1293 |
subset_ids = load_subset_ids(args.lymphgen_genes) |
|
|
1294 |
|
|
|
1295 |
# Generate the mutation flat file and gene list using the input MAF file and Entrez IDs |
|
|
1296 |
out_mut_flat = args.outdir + os.sep + args.outprefix + "_mutation_flat.tsv" |
|
|
1297 |
out_gene_list = args.outdir + os.sep + args.outprefix + "_gene_list.txt" |
|
|
1298 |
sample_list = generate_mut_flat(args.maf, args.sequencing_type, gene_ids, out_mut_flat, out_gene_list, alt_gene_ids = alt_gene_ids, subset_ids=subset_ids) |
|
|
1299 |
|
|
|
1300 |
# Generate the copy number gene list file and arm flat file |
|
|
1301 |
if args.cnvs: |
|
|
1302 |
out_cnv_gene = args.outdir + os.sep + args.outprefix + "_cnv_flat.tsv" |
|
|
1303 |
out_cnv_arm = args.outdir + os.sep + args.outprefix + "_cnv_arm.tsv" |
|
|
1304 |
cnv_sample_list = generate_cnv_files(args.cnvs, args.genes, args.arms, gene_ids, out_cnv_gene, out_cnv_arm, sample_list, alt_gene_ids=alt_gene_ids, subset_ids=subset_ids, input_log2=args.log2) |
|
|
1305 |
else: |
|
|
1306 |
cnv_sample_list = set() |
|
|
1307 |
|
|
|
1308 |
# Generate a sample annotation file |
|
|
1309 |
out_sample_annot = args.outdir + os.sep + args.outprefix + "_sample_annotation.tsv" |
|
|
1310 |
generate_sample_annot(sample_list, set(cnv_sample_list), out_sample_annot) |
|
|
1311 |
|
|
|
1312 |
|
|
|
1313 |
if __name__ == "__main__": |
|
|
1314 |
main() |