[fcb5af]: / scripts / count_ashm_freq.R

Download this file

118 lines (99 with data), 3.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
library(GAMBLR)
library(tidyverse)
# Work from the GAMBL repository checkout.
# NOTE(review): presumably GAMBLR resolves its configuration relative to the
# working directory -- confirm before removing this call.
setwd("/projects/rmorin/projects/gambl-repos/gambl-kdreval/")

# Pathologies whose samples are kept in the metadata below.
pathologies <- c("DLBCL", "FL", "CLL", "BL")

# Metadata requested with seq_type_filter = "genome", restricted to the
# pathologies listed above.
metadata <- filter(
  get_gambl_metadata(seq_type_filter = "genome"),
  pathology %in% pathologies
)
#' Tabulate per-pathology mutation frequencies of aSHM regions
#'
#' Fetches SSMs for the supplied samples, overlaps them with the aSHM region
#' table matching `projection`, and reports for every region the cohort size
#' (`*_total`), the number of samples with at least one overlapping mutation
#' (`*_mut`) and the percentage of mutated samples (`*_pc`) for the DLBCL, FL,
#' CLL and BL pathologies.
#'
#' `get_ssm_by_samples()`, `cool_overlaps()`, `grch37_ashm_regions` and
#' `hg38_ashm_regions` are not defined in this file (presumably provided by
#' GAMBLR -- confirm).
#'
#' @param this_meta Sample metadata; must contain the columns
#'   `Tumor_Sample_Barcode` and `pathology`. It is used both to fetch the
#'   mutations and to compute the per-pathology cohort sizes.
#' @param projection Genome build. `"grch37"` (default) uses
#'   `grch37_ashm_regions` with "chr" removed from `chr_name`; any other value
#'   uses `hg38_ashm_regions` unchanged.
#' @return A data frame with one row per region: `name` (`gene_region`), the
#'   first three columns of the region table, then the `total`/`mut`/`pc`
#'   columns for DLBCL, FL, CLL and BL (in that order). Regions with no
#'   mutated sample in a pathology get the cohort size in `*_total` and 0 in
#'   the remaining columns.
generate_table <- function(
  this_meta,
  projection = "grch37"
) {
  maf <- get_ssm_by_samples(
    these_samples_metadata = this_meta,
    projection = projection,
    subset_from_merge = TRUE
  )

  if (projection == "grch37") {
    regions <- grch37_ashm_regions %>%
      mutate(
        chr_name = gsub("chr", "", chr_name)
      )
  } else {
    regions <- hg38_ashm_regions
  }

  # The first three columns of the region table are used as its coordinate
  # columns for the overlap and are carried through to the output.
  coord_cols <- colnames(regions)[1:3]
  named_regions <- regions %>%
    mutate(
      name = paste(gene, region, sep = "_")
    )

  annotated <- cool_overlaps(
    maf,
    named_regions,
    columns2 = coord_cols
  )

  # One row per (region, sample) that has at least one overlapping mutation.
  mutated_counts <- annotated %>%
    count(
      gene, name, Tumor_Sample_Barcode
    )

  # Cohort size per pathology, taken from the metadata passed to this function
  # (not from a global) so the denominators match the samples queried.
  pathology_counts <- this_meta %>%
    select(Tumor_Sample_Barcode, pathology) %>%
    group_by(pathology) %>%
    mutate(total = n()) %>%
    ungroup()

  all_counts <- left_join(
    mutated_counts,
    pathology_counts,
    by = "Tumor_Sample_Barcode"
  )

  # Collapse to one row per (pathology, region) with the mutated-sample count.
  all_counts <- all_counts %>%
    group_by(pathology, name) %>%
    mutate(mut = n()) %>%
    ungroup() %>%
    select(-c(Tumor_Sample_Barcode, n)) %>%
    distinct()

  all_counts <- all_counts %>%
    mutate(pc = round((mut / total) * 100, 2)) %>%
    pivot_wider(
      names_from = pathology,
      names_glue = "{pathology}_{.value}",
      values_from = c(total, mut, pc)
    ) %>%
    select(-gene)

  # Regions without any mutated sample in a pathology still need that
  # pathology's cohort size; build the "<pathology>_total" fill values once.
  cohort_sizes <- distinct(pathology_counts, pathology, total)
  total_fill <- setNames(
    as.list(cohort_sizes$total),
    paste0(cohort_sizes$pathology, "_total")
  )

  output <- left_join(
    named_regions %>%
      select(all_of(c("name", coord_cols))),
    all_counts,
    by = "name"
  ) %>%
    replace_na(total_fill) %>%
    # Every remaining NA is a region/pathology pair with no mutated samples.
    replace(is.na(.), 0)

  # Prefix matching keeps each pathology's columns together; a substring match
  # on "BL" would also hit the DLBCL columns.
  output %>%
    select(
      name,
      all_of(coord_cols),
      starts_with("DLBCL_"),
      starts_with("FL_"),
      starts_with("CLL_"),
      starts_with("BL_")
    )
}
# Build the frequency table once per genome build ("grch37" is also the
# default projection of generate_table()).
grch37_table <- generate_table(metadata, projection = "grch37")
hg38_table <- generate_table(metadata, projection = "hg38")

# Write each table as a tab-separated file into the curated resources folder.
grch37_table %>%
  write_tsv(
    "~/my_dir/repos/LLMPP/resources/curated/somatic_hypermutation_locations_with_DLBCL_frequencies_grch37.tsv"
  )
hg38_table %>%
  write_tsv(
    "~/my_dir/repos/LLMPP/resources/curated/somatic_hypermutation_locations_with_DLBCL_frequencies_hg38.tsv"
  )