|
a |
|
b/scripts/count_ashm_freq.R |
|
|
1 |
library(GAMBLR) |
|
|
2 |
library(tidyverse) |
|
|
3 |
|
|
|
4 |
setwd("/projects/rmorin/projects/gambl-repos/gambl-kdreval/") |
|
|
5 |
|
|
|
6 |
pathologies <- c("DLBCL", "FL", "CLL", "BL") |
|
|
7 |
|
|
|
8 |
|
|
|
9 |
metadata <- get_gambl_metadata(seq_type_filter = "genome") %>% |
|
|
10 |
filter(pathology %in% pathologies) |
|
|
11 |
|
|
|
12 |
generate_table <- function( |
|
|
13 |
this_meta, |
|
|
14 |
projection = "grch37" |
|
|
15 |
){ |
|
|
16 |
maf <- get_ssm_by_samples( |
|
|
17 |
these_samples_metadata = this_meta, |
|
|
18 |
projection = projection, |
|
|
19 |
subset_from_merge = TRUE |
|
|
20 |
) |
|
|
21 |
|
|
|
22 |
if(projection == "grch37"){ |
|
|
23 |
regions <- grch37_ashm_regions %>% |
|
|
24 |
mutate( |
|
|
25 |
chr_name = gsub("chr", "", chr_name) |
|
|
26 |
) |
|
|
27 |
}else{ |
|
|
28 |
regions <- hg38_ashm_regions |
|
|
29 |
} |
|
|
30 |
|
|
|
31 |
annotated <- cool_overlaps( |
|
|
32 |
maf, |
|
|
33 |
regions %>% |
|
|
34 |
mutate( |
|
|
35 |
name = paste(gene, region, sep = "_") |
|
|
36 |
), |
|
|
37 |
columns2 = colnames(regions)[1:3] |
|
|
38 |
) |
|
|
39 |
|
|
|
40 |
mutated_counts <- annotated %>% |
|
|
41 |
count( |
|
|
42 |
gene, name, Tumor_Sample_Barcode |
|
|
43 |
) |
|
|
44 |
|
|
|
45 |
pathology_counts <- metadata %>% |
|
|
46 |
select(Tumor_Sample_Barcode, pathology) %>% |
|
|
47 |
group_by(pathology) %>% |
|
|
48 |
mutate(total = n()) %>% |
|
|
49 |
ungroup |
|
|
50 |
|
|
|
51 |
all_counts <- left_join( |
|
|
52 |
mutated_counts, |
|
|
53 |
pathology_counts |
|
|
54 |
) |
|
|
55 |
|
|
|
56 |
all_counts <- all_counts %>% |
|
|
57 |
group_by(pathology, name) %>% |
|
|
58 |
mutate(mut = n()) %>% |
|
|
59 |
ungroup %>% |
|
|
60 |
select(-c(Tumor_Sample_Barcode, n)) %>% |
|
|
61 |
distinct |
|
|
62 |
|
|
|
63 |
all_counts <- all_counts %>% |
|
|
64 |
mutate(pc = round((mut/total)*100, 2)) %>% |
|
|
65 |
pivot_wider( |
|
|
66 |
names_from = pathology, |
|
|
67 |
names_glue = "{pathology}_{.value}", |
|
|
68 |
values_from = c(total, mut, pc) |
|
|
69 |
) %>% select(-gene) |
|
|
70 |
|
|
|
71 |
output <- left_join( |
|
|
72 |
regions %>% |
|
|
73 |
mutate( |
|
|
74 |
name = paste(gene, region, sep = "_") |
|
|
75 |
) %>% |
|
|
76 |
select(all_of(c("name", colnames(regions)[1:3]))), |
|
|
77 |
all_counts |
|
|
78 |
) %>% |
|
|
79 |
replace_na( |
|
|
80 |
list( |
|
|
81 |
DLBCL_total = pull(unique(pathology_counts[pathology_counts$pathology=="DLBCL", "total"])), |
|
|
82 |
FL_total = pull(unique(pathology_counts[pathology_counts$pathology=="FL", "total"])), |
|
|
83 |
CLL_total = pull(unique(pathology_counts[pathology_counts$pathology=="CLL", "total"])), |
|
|
84 |
BL_total = pull(unique(pathology_counts[pathology_counts$pathology=="BL", "total"])) |
|
|
85 |
) |
|
|
86 |
) %>% |
|
|
87 |
replace(is.na(.), 0) |
|
|
88 |
|
|
|
89 |
output <- output %>% |
|
|
90 |
select( |
|
|
91 |
name, |
|
|
92 |
colnames(regions)[1:3], |
|
|
93 |
contains("DLBCL"), |
|
|
94 |
contains("FL"), |
|
|
95 |
contains("CLL"), |
|
|
96 |
contains("BL") |
|
|
97 |
) |
|
|
98 |
} |
|
|
99 |
|
|
|
100 |
grch37_table <- generate_table( |
|
|
101 |
this_meta = metadata |
|
|
102 |
) |
|
|
103 |
|
|
|
104 |
hg38_table <- generate_table( |
|
|
105 |
this_meta = metadata, |
|
|
106 |
projection = "hg38" |
|
|
107 |
) |
|
|
108 |
|
|
|
109 |
write_tsv( |
|
|
110 |
grch37_table, |
|
|
111 |
"~/my_dir/repos/LLMPP/resources/curated/somatic_hypermutation_locations_with_DLBCL_frequencies_grch37.tsv" |
|
|
112 |
) |
|
|
113 |
|
|
|
114 |
write_tsv( |
|
|
115 |
hg38_table, |
|
|
116 |
"~/my_dir/repos/LLMPP/resources/curated/somatic_hypermutation_locations_with_DLBCL_frequencies_hg38.tsv" |
|
|
117 |
) |