bc_ml_pub / Git / [3e731f] /R_scripts/Signatures.R

Models:
joseph-gordon/
bc_ml_pub
Downloads: 1
[3e731f]: / R_scripts / Signatures.R
History
Download this file
1239 lines (1084 with data), 62.2 kB

# This script is used to compare our signature with other published signatures

# Libraries #####
library(dplyr)
library(ggplot2)
library(openxlsx)
library(genefu)
library(org.Hs.eg.db)

# Annotation: files and function ######
# ID_Map
official = org.Hs.egSYMBOL
mapped_genes_official = mappedkeys(official)
official_df = as.data.frame(official[mapped_genes_official])
official_df = official_df %>% dplyr::rename(EntrezGene.ID = gene_id, Gene.Symbol = symbol)
official_df$HGNC_Official = "Yes"
official_df = official_df[-which(duplicated(official_df$Gene.Symbol)==T),]

alias = org.Hs.egALIAS2EG
mapped_genes_alias = mappedkeys(alias)
alias_df = as.data.frame(alias[mapped_genes_alias])
alias_df = alias_df %>% dplyr::rename(EntrezGene.ID = gene_id, Gene.Symbol = alias_symbol)
alias_df = alias_df[-which(alias_df$Gene.Symbol %in% official_df$Gene.Symbol),]
alias_df$HGNC_Official = "No"

ID_Map = rbind(official_df, alias_df) %>% distinct()
ID_Map$EntrezGene.ID = as.numeric(ID_Map$EntrezGene.ID)
ID_Map = ID_Map[order(ID_Map$EntrezGene.ID),] %>%
  dplyr::rename(probe=Gene.Symbol) %>%
  dplyr::select(probe, EntrezGene.ID, HGNC_Official)

# Aliases
aliases_for_join = alias_df %>% dplyr::rename(Alias = Gene.Symbol)
Aliases = official_df %>% inner_join(aliases_for_join,
                                     by = "EntrezGene.ID", multiple = "all") %>%
  dplyr::select(Alias, Gene.Symbol, EntrezGene.ID) %>%
  dplyr::rename(probe = Alias, HGNC_Symbol = Gene.Symbol,
                Entrez = EntrezGene.ID) %>%
  distinct()

rm(alias, alias_df, aliases_for_join, official, mapped_genes_alias, mapped_genes_official)

# Import our gene expression signature #####
NRS = read.xlsx("data/Output sets/Downstream/DGEA.xlsx") %>%
  dplyr::filter(adj.P.Val < 0.05) %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)

# Genefu signatures #####

# ClaudinLow
# Entrez ID's are matched to official gene symbols. Unmapped Entrez ID's are discarded
data("claudinLowData"); claudinLow = as.data.frame(list(EntrezGene.ID = claudinLowData$fnames)) %>%
  inner_join(official_df, by = "EntrezGene.ID") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)
rm(claudinLowData)

# pam50
data("pam50"); pam50 = pam50$centroids.map %>% 
  dplyr::select(Gene.Symbol = probe.centroids, EntrezGene.ID)

# scmgene is easy to generate manually
scmgene = as.data.frame(rbind(c("ESR1", 2099), c("ERBB2", 2064), c("AURKA", 6790)))
colnames(scmgene) = c("Gene.Symbol", "EntrezGene.ID")

# scmod1
data("scmod1.robust")
scmod1_ESR1_module = as.data.frame(scmod1.robust$mod$ESR1) %>%
  inner_join(official_df, by = "EntrezGene.ID") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)

scmod1_ERBB2_module = as.data.frame(scmod1.robust$mod$ERBB2) %>%
  inner_join(official_df, by = "EntrezGene.ID") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)

scmod1_AURKA_module = as.data.frame(scmod1.robust$mod$AURKA) %>%
  inner_join(official_df, by = "EntrezGene.ID") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)
rm(scmod1.robust)

# scmod2
data("scmod2.robust")
scmod2_ESR1_module = as.data.frame(scmod2.robust$mod$ESR1) %>%
  inner_join(official_df, by = "EntrezGene.ID") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)

scmod2_ERBB2_module = as.data.frame(scmod2.robust$mod$ERBB2) %>%
  inner_join(official_df, by = "EntrezGene.ID") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)

scmod2_AURKA_module = as.data.frame(scmod2.robust$mod$AURKA) %>%
  inner_join(official_df, by = "EntrezGene.ID") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)
rm(scmod2.robust)

# endoPredict
data("sig.endoPredict"); endoPredict = sig.endoPredict %>% 
  dplyr::select(Gene.Symbol = symbol, EntrezGene.ID)
rm(sig.endoPredict)

# OncotypeDX
data("sig.oncotypedx"); OncotypeDX = sig.oncotypedx %>% 
  dplyr::select(Gene.Symbol = symbol, EntrezGene.ID)
rm(sig.oncotypedx)

# Mammaprint
data("sig.gene70"); Mammaprint = sig.gene70 %>%
  dplyr::select(Gene.Symbol = NCBI.gene.symbol, EntrezGene.ID) %>%
  dplyr::filter(!is.na(Gene.Symbol))
rm(sig.gene70)

# GGI
data("sig.ggi"); GGI = sig.ggi%>%
  dplyr::select(Gene.Symbol = NCBI.gene.symbol, EntrezGene.ID) %>%
  dplyr::filter(!is.na(Gene.Symbol))
rm(sig.ggi)

# PIK3CA
data("sig.pik3cags"); sig.pik3cags$EntrezGene.ID = as.character(sig.pik3cags$EntrezGene.ID)
PIK3CA = sig.pik3cags %>%
  inner_join(official_df, by = "EntrezGene.ID") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)
rm(sig.pik3cags)

# METABRIC Integrative Clusters
library(iC10)
data("Map.Exp"); iC10 = Map.Exp %>% dplyr::rename(Gene.Symbol = Gene_symbol) %>%
  left_join(official_df, by = "Gene.Symbol") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)
rm(Map.Exp)

# IHC4 (ESR1, ERBB2, PGR, MKI67)
IHC4 = as.data.frame(rbind(c("ESR1", 2099), c("ERBB2", 2064), 
                           c("PGR", 5241), c("MKI67", 4288)))
colnames(IHC4) = c("Gene.Symbol", "EntrezGene.ID")

# MetaGX (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6584731/)
MetaGX = read.xlsx("Signatures/MetaGX.xlsx") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)

# Danaher (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5319024/)
# Supplementary Material
Danaher = read.xlsx("Signatures/Danaher.xlsx", sheet = 4) %>%
  dplyr::rename(Gene.Symbol = Gene) %>%
  left_join(official_df, by = "Gene.Symbol") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)

# MCP (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5073889/)
# Genes retrieved from GitHub repo 
# (https://github.com/ebecht/MCPcounter/blob/master/Signatures/genes.txt)
MCP = read.table("Signatures/MCP.txt")
colnames(MCP) = MCP[1,]
MCP = MCP[-1,] %>% 
  dplyr::select(Gene.Symbol = `HUGO symbols`, EntrezGene.ID = ENTREZID)

# Immunophenoscore (https://pubmed.ncbi.nlm.nih.gov/28052254/)
# Supplementary Material
Immunophenoscore = read.xlsx("Signatures/Immunophenoscore.xlsx")
colnames(Immunophenoscore) = Immunophenoscore[1,]
Immunophenoscore = Immunophenoscore[-1,] %>%
  dplyr::rename(Gene.Symbol = Metagene) %>%
  left_join(official_df, by = "Gene.Symbol") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)

# Create a signature list #####
Signatures = list(pam50 = pam50,
                  scmod1_AURKA = scmod1_AURKA_module,
                  scmod1_ERBB2_module = scmod1_ERBB2_module,
                  scmod1_ESR1_module = scmod1_ESR1_module,
                  scmod2_AURKA = scmod2_AURKA_module,
                  scmod2_ERBB2_module = scmod2_ERBB2_module,
                  scmod2_ESR1_module = scmod2_ESR1_module,
                  scmgene = scmgene,
                  IHC4 = IHC4,
                  claudinLow = claudinLow,
                  endoPredict = endoPredict,
                  OncotypeDX = OncotypeDX,
                  Mammaprint = Mammaprint,
                  GGI = GGI,
                  PIK3CA = PIK3CA,
                  iC10 = iC10,
                  MetaGX = MetaGX,
                  Danaher = Danaher,
                  MCP = MCP,
                  Immunophenoscore = Immunophenoscore)

# Cleaning up
rm(pam50, scmod1_AURKA_module, scmod1_ERBB2_module, scmod1_ESR1_module,
   scmod2_AURKA_module, scmod2_ERBB2_module, scmod2_ESR1_module, scmgene,
   IHC4, claudinLow, endoPredict, OncotypeDX, Mammaprint, GGI, PIK3CA, iC10,
   MetaGX, Danaher, MCP, Immunophenoscore)

# Overlaps with our signature (NRS) #####

# Preparing output
GS_overlap = as.data.frame(list(NRSGS = NRS$Gene.Symbol))
Entrez_overlap = as.data.frame(list(NRSE = NRS$EntrezGene.ID))

# Overlaps
for (i in 1:length(Signatures)){
  GS_overlap[,i+1] = "No"
  Entrez_overlap[,i+1] = "No"
  yes_ind_gs = which(NRS$Gene.Symbol %in% Signatures[[i]]$Gene.Symbol)
  yes_ind_entrez = which(NRS$EntrezGene.ID %in% Signatures[[i]]$EntrezGene.ID)
  GS_overlap[yes_ind_gs, i+1] = "Yes"
  Entrez_overlap[yes_ind_entrez, i+1] = "Yes"
  rm(yes_ind_entrez, yes_ind_gs)
}

rm(i); gc()
colnames(GS_overlap) = colnames(Entrez_overlap) = c("NRS", names(Signatures))

# Add total overlaps in the last row of both output sets
GS_overlap[nrow(GS_overlap) + 1,] = c("Total", 
                                     lapply(GS_overlap[,2:ncol(GS_overlap)], function(x) sum(x == "Yes")))
Entrez_overlap[nrow(Entrez_overlap) + 1,] = c("Total", 
                                     lapply(Entrez_overlap[,2:ncol(GS_overlap)], function(x) sum(x == "Yes")))

# Heatmap of NRS vs signatures
# Load required libraries
library(ComplexHeatmap)
library(circlize)
library(rcartocolor)

# Convert your dataset's relevant columns to a matrix
overlap_mat <- GS_overlap[1:166,2:ncol(GS_overlap)]

# Convert 'Yes' and 'No' to numeric values
overlap_mat[overlap_mat == 'Yes'] <- 1
overlap_mat[overlap_mat == 'No'] <- 0
overlap_mat[, colnames(overlap_mat)] = lapply(overlap_mat[, colnames(overlap_mat)], as.numeric)
overlap_mat = as.matrix(overlap_mat)

# Convert the matrix back to numeric
overlap_mat <- apply(overlap_mat, 2, as.numeric)
rownames(overlap_mat) <- GS_overlap$NRS[1:166]
overlap_mat <- t(overlap_mat)  # Transpose matrix before setting row names

# Generate the categories for each row according to your dataset
categories_vector <- c(rep("Breast cancer", 16), "Multi-cancer", rep("Immune system", 3))

# Transform the categories vector into a factor
categories_factor <- factor(categories_vector, levels = c("Breast cancer", "Multi-cancer", "Immune system"))

# Generate annotation dataframe
annotation_df_overlap <- data.frame(Annotations = categories_factor)
rownames(annotation_df_overlap) <- rownames(overlap_mat)  # Adjust row names of the annotation to match the matrix

# Generate annotation colors for heatmap
annotation_colors_overlap <- list(Annotations = c("Breast cancer" = rcartocolor::carto_pal(n = 7, name = "Burg")[6], 
                                                  "Multi-cancer" = rcartocolor::carto_pal(n = 7, name = "Teal")[7], 
                                                  "Immune system" = rcartocolor::carto_pal(n = 7, name = "Peach")[6]))

# Configure the color scheme and annotation
col_fun <- colorRamp2(c(0, 1), c("grey95", "dodgerblue4"))

# Heatmap annotation
ha = rowAnnotation(df = annotation_df_overlap, 
                   col = annotation_colors_overlap, 
                   show_annotation_name = FALSE)

# Generate the heatmap
heatmap <- Heatmap(overlap_mat,
                   name = "Overlaps",
                   show_row_names = TRUE,
                   show_column_names = TRUE,
                   cluster_rows = FALSE,
                   cluster_columns = FALSE,
                   row_title = "Signatures",
                   row_title_gp = gpar(fontface = "bold", fontsize = 15),
                   column_title = "NRS genes",
                   column_title_gp = gpar(fontface = "bold", fontsize = 15),
                   column_title_side = "bottom",
                   heatmap_legend_param = list(legend_direction = "horizontal",
                                               legend_width = unit(4, "cm"),
                                               color_bar = "discrete",
                                               at = c(0, 1),
                                               labels = c("Not overlapping", "Overlapping")),
                   col = col_fun,
                   border = "black",
                   row_names_side = "left",
                   row_names_gp = gpar(fontsize = 8),
                   column_names_gp = gpar(fontsize = 4),
                   left_annotation = ha)
png("Signatures/NRS_published_signatures_heatmap.png", res = 600, height = 3300, width = 6750)
plot.new()
heatmap
# Draw the heatmap
# draw(heatmap, heatmap_legend_side = "bottom")
dev.off()

# Venn diagrams #####
library(ggVennDiagram)
GS_Venn = c(list(NRS = NRS$Gene.Symbol), 
            lapply(Signatures, function(x) {x <- x %>% dplyr::select(Gene.Symbol)
            x <- as.character(unlist(x))}))
Entrez_Venn = c(list(NRS = NRS$EntrezGene.ID), 
                lapply(Signatures, function(x) {x <- x %>% dplyr::select(EntrezGene.ID)
                x <- as.character(unlist(x))}))

# Venn diagram list
GS_Venn_diagram_list = list()
Entrez_Venn_diagram_list = list()

for(i in 2:length(GS_Venn)){
  diagram = ggVennDiagram(
    GS_Venn[c(1,i)], label_alpha = 0,
    category.names = names(GS_Venn[c(1,i)]), set_size = 0.65, label_size = 1) +
    scale_color_manual(values = c("grey10", "grey15")) +
    scale_fill_gradient(low = "white", high = "grey30") + 
    theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 2.5),
          panel.background = element_rect(fill = "white", 
                                          colour = "white"),
          legend.key.size = unit(1.5, units = "mm"),
          legend.text = element_text(size = 2),
          legend.title = element_text(face = "bold", size = 3),
          legend.margin = ggplot2::margin(0, 0, 0, 0, unit = "mm"),
          legend.spacing.y = unit(0.5, units = "mm"),
          legend.spacing.x = unit(0.5, units = "mm"),
          legend.background = element_blank())+
    labs(title = paste0("Venn diagram of NRS and the ", names(GS_Venn)[i], " signature"))
  GS_Venn_diagram_list = c(GS_Venn_diagram_list, list(diagram)); rm(diagram)
  
  diagram = ggVennDiagram(
    Entrez_Venn[c(1,i)], label_alpha = 0,
    category.names = names(Entrez_Venn[c(1,i)]), set_size = 0.65, label_size = 1) +
    scale_color_manual(values = c("grey10", "grey15")) +
    scale_fill_gradient(low = "white", high = "grey30") + 
    theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 2.5),
          panel.background = element_rect(fill = "white", 
                                          colour = "white"),
          legend.key.size = unit(1.5, units = "mm"),
          legend.text = element_text(size = 2),
          legend.title = element_text(face = "bold", size = 3),
          legend.margin = ggplot2::margin(0, 0, 0, 0, unit = "mm"),
          legend.spacing.y = unit(0.5, units = "mm"),
          legend.spacing.x = unit(0.5, units = "mm"),
          legend.background = element_blank())+
    labs(title = paste0("Venn diagram of NRS and the ", names(Entrez_Venn)[i], " signature"))
  Entrez_Venn_diagram_list = c(Entrez_Venn_diagram_list, list(diagram)); rm(diagram)
}
rm(i); gc()
names(Entrez_Venn_diagram_list) = names(GS_Venn_diagram_list) = names(Signatures)

# Defining a multiplot function #####
# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)
  
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  
  numPlots = length(plots)
  
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  
  if (numPlots==1) {
    print(plots[[1]])
    
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

# Multiplot of Gene Symbol Venn diagrams
tiff("Signatures/Gene_Symbol_Venn_multiplot.tiff", 
     width = 3840, height = 2160, res = 700, compression = "lzw")
multiplot(GS_Venn_diagram_list[[1]], GS_Venn_diagram_list[[2]], GS_Venn_diagram_list[[3]],
          GS_Venn_diagram_list[[4]], GS_Venn_diagram_list[[5]], GS_Venn_diagram_list[[6]],
          GS_Venn_diagram_list[[7]], GS_Venn_diagram_list[[8]], GS_Venn_diagram_list[[9]],
          GS_Venn_diagram_list[[10]], GS_Venn_diagram_list[[11]], GS_Venn_diagram_list[[12]],
          GS_Venn_diagram_list[[13]], GS_Venn_diagram_list[[14]], GS_Venn_diagram_list[[15]],
          GS_Venn_diagram_list[[16]], GS_Venn_diagram_list[[17]], GS_Venn_diagram_list[[18]],
          GS_Venn_diagram_list[[19]], GS_Venn_diagram_list[[20]], cols = 4)
m = ggplot(multiplot(GS_Venn_diagram_list[[1]], GS_Venn_diagram_list[[2]], GS_Venn_diagram_list[[3]],
                     GS_Venn_diagram_list[[4]], GS_Venn_diagram_list[[5]], GS_Venn_diagram_list[[6]],
                     GS_Venn_diagram_list[[7]], GS_Venn_diagram_list[[8]], GS_Venn_diagram_list[[9]],
                     GS_Venn_diagram_list[[10]], GS_Venn_diagram_list[[11]], GS_Venn_diagram_list[[12]],
                     GS_Venn_diagram_list[[13]], GS_Venn_diagram_list[[14]], GS_Venn_diagram_list[[15]],
                     GS_Venn_diagram_list[[16]], GS_Venn_diagram_list[[17]], GS_Venn_diagram_list[[18]],
                     GS_Venn_diagram_list[[19]], GS_Venn_diagram_list[[20]], cols = 4))
dev.off(); rm(m)

# Multiplot of Entrez ID Venn diagrams
tiff("Signatures/Entrez_Venn_multiplot.tiff", 
     width = 3840, height = 2160, res = 700, compression = "lzw")
multiplot(Entrez_Venn_diagram_list[[1]], Entrez_Venn_diagram_list[[2]], Entrez_Venn_diagram_list[[3]],
          Entrez_Venn_diagram_list[[4]], Entrez_Venn_diagram_list[[5]], Entrez_Venn_diagram_list[[6]],
          Entrez_Venn_diagram_list[[7]], Entrez_Venn_diagram_list[[8]], Entrez_Venn_diagram_list[[9]],
          Entrez_Venn_diagram_list[[10]], Entrez_Venn_diagram_list[[11]], Entrez_Venn_diagram_list[[12]],
          Entrez_Venn_diagram_list[[13]], Entrez_Venn_diagram_list[[14]], Entrez_Venn_diagram_list[[15]],
          Entrez_Venn_diagram_list[[16]], Entrez_Venn_diagram_list[[17]], Entrez_Venn_diagram_list[[18]],
          Entrez_Venn_diagram_list[[19]], Entrez_Venn_diagram_list[[20]], cols = 4)
m = ggplot(multiplot(Entrez_Venn_diagram_list[[1]], Entrez_Venn_diagram_list[[2]], Entrez_Venn_diagram_list[[3]],
                     Entrez_Venn_diagram_list[[4]], Entrez_Venn_diagram_list[[5]], Entrez_Venn_diagram_list[[6]],
                     Entrez_Venn_diagram_list[[7]], Entrez_Venn_diagram_list[[8]], Entrez_Venn_diagram_list[[9]],
                     Entrez_Venn_diagram_list[[10]], Entrez_Venn_diagram_list[[11]], Entrez_Venn_diagram_list[[12]],
                     Entrez_Venn_diagram_list[[13]], Entrez_Venn_diagram_list[[14]], Entrez_Venn_diagram_list[[15]],
                     Entrez_Venn_diagram_list[[16]], Entrez_Venn_diagram_list[[17]], Entrez_Venn_diagram_list[[18]],
                     Entrez_Venn_diagram_list[[19]], Entrez_Venn_diagram_list[[20]], cols = 4))
dev.off(); rm(m)

# Venn diagrams of the intersections of signatures and our background genes (5,672) #####
background_genes = read.xlsx("data/Output sets/Downstream/DGEA.xlsx") %>%
  dplyr::select(Gene.Symbol, EntrezGene.ID)

GS_Venn_intersections = c(list(NRS = NRS$Gene.Symbol), 
                          lapply(Signatures, function(x) {x <- x$Gene.Symbol
                          x <- intersect(as.character(x), background_genes$Gene.Symbol)
                          x <- as.character(unlist(x))}))
Entrez_Venn_intersections = c(list(NRS = NRS$EntrezGene.ID), 
                              lapply(Signatures, function(x) {x <- x $EntrezGene.ID
                              x <- intersect(as.character(x), background_genes$EntrezGene.ID)
                              x <- as.character(unlist(x))}))

# Venn diagram list
GS_Venn_intersections_diagram_list = list()
Entrez_Venn_intersections_diagram_list = list()

for(i in 2:length(GS_Venn_intersections)){
  diagram = ggVennDiagram(
    GS_Venn_intersections[c(1,i)], label_alpha = 0,
    category.names = names(GS_Venn_intersections[c(1,i)]), set_size = 0.65, label_size = 1) +
    scale_color_manual(values = c("grey10", "grey15")) +
    scale_fill_gradient(low = "white", high = "grey30") + 
    theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 2.5),
          panel.background = element_rect(fill = "white", 
                                          colour = "white"),
          legend.key.size = unit(1.5, units = "mm"),
          legend.text = element_text(size = 2),
          legend.title = element_text(face = "bold", size = 3),
          legend.margin = ggplot2::margin(0, 0, 0, 0, unit = "mm"),
          legend.spacing.y = unit(0.5, units = "mm"),
          legend.spacing.x = unit(0.5, units = "mm"),
          legend.background = element_blank())+
    labs(title = paste0("Venn diagram of NRS and the ", names(GS_Venn_intersections)[i], " signature"))
  GS_Venn_intersections_diagram_list = c(GS_Venn_intersections_diagram_list, list(diagram)); rm(diagram)
  
  diagram = ggVennDiagram(
    Entrez_Venn_intersections[c(1,i)], label_alpha = 0,
    category.names = names(Entrez_Venn_intersections[c(1,i)]), set_size = 0.65, label_size = 1) +
    scale_color_manual(values = c("grey10", "grey15")) +
    scale_fill_gradient(low = "white", high = "grey30") + 
    theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 2.5),
          panel.background = element_rect(fill = "white", 
                                          colour = "white"),
          legend.key.size = unit(1.5, units = "mm"),
          legend.text = element_text(size = 2),
          legend.title = element_text(face = "bold", size = 3),
          legend.margin = ggplot2::margin(0, 0, 0, 0, unit = "mm"),
          legend.spacing.y = unit(0.5, units = "mm"),
          legend.spacing.x = unit(0.5, units = "mm"),
          legend.background = element_blank())+
    labs(title = paste0("Venn diagram of NRS and the ", names(Entrez_Venn_intersections)[i], " signature"))
  Entrez_Venn_intersections_diagram_list = c(Entrez_Venn_intersections_diagram_list, list(diagram)); rm(diagram)
}
rm(i); gc()
names(Entrez_Venn_intersections_diagram_list) = names(GS_Venn_intersections_diagram_list) = names(Signatures)

# Multiplot of Gene Symbol Venn diagrams
tiff("Signatures/Gene_Symbol_Venn_intersections_multiplot.tiff", 
     width = 3840, height = 2160, res = 700, compression = "lzw")
multiplot(GS_Venn_intersections_diagram_list[[1]], GS_Venn_intersections_diagram_list[[2]], GS_Venn_intersections_diagram_list[[3]],
          GS_Venn_intersections_diagram_list[[4]], GS_Venn_intersections_diagram_list[[5]], GS_Venn_intersections_diagram_list[[6]],
          GS_Venn_intersections_diagram_list[[7]], GS_Venn_intersections_diagram_list[[8]], GS_Venn_intersections_diagram_list[[9]],
          GS_Venn_intersections_diagram_list[[10]], GS_Venn_intersections_diagram_list[[11]], GS_Venn_intersections_diagram_list[[12]],
          GS_Venn_intersections_diagram_list[[13]], GS_Venn_intersections_diagram_list[[14]], GS_Venn_intersections_diagram_list[[15]],
          GS_Venn_intersections_diagram_list[[16]], GS_Venn_intersections_diagram_list[[17]], GS_Venn_intersections_diagram_list[[18]],
          GS_Venn_intersections_diagram_list[[19]], GS_Venn_intersections_diagram_list[[20]], cols = 4)
m = ggplot(multiplot(GS_Venn_intersections_diagram_list[[1]], GS_Venn_intersections_diagram_list[[2]], GS_Venn_intersections_diagram_list[[3]],
                     GS_Venn_intersections_diagram_list[[4]], GS_Venn_intersections_diagram_list[[5]], GS_Venn_intersections_diagram_list[[6]],
                     GS_Venn_intersections_diagram_list[[7]], GS_Venn_intersections_diagram_list[[8]], GS_Venn_intersections_diagram_list[[9]],
                     GS_Venn_intersections_diagram_list[[10]], GS_Venn_intersections_diagram_list[[11]], GS_Venn_intersections_diagram_list[[12]],
                     GS_Venn_intersections_diagram_list[[13]], GS_Venn_intersections_diagram_list[[14]], GS_Venn_intersections_diagram_list[[15]],
                     GS_Venn_intersections_diagram_list[[16]], GS_Venn_intersections_diagram_list[[17]], GS_Venn_intersections_diagram_list[[18]],
                     GS_Venn_intersections_diagram_list[[19]], GS_Venn_intersections_diagram_list[[20]], cols = 4))
dev.off(); rm(m)

# Multiplot of Entrez ID Venn diagrams
tiff("Signatures/Entrez_Venn_intersections_multiplot.tiff", 
     width = 3840, height = 2160, res = 700, compression = "lzw")
multiplot(Entrez_Venn_intersections_diagram_list[[1]], Entrez_Venn_intersections_diagram_list[[2]], Entrez_Venn_intersections_diagram_list[[3]],
          Entrez_Venn_intersections_diagram_list[[4]], Entrez_Venn_intersections_diagram_list[[5]], Entrez_Venn_intersections_diagram_list[[6]],
          Entrez_Venn_intersections_diagram_list[[7]], Entrez_Venn_intersections_diagram_list[[8]], Entrez_Venn_intersections_diagram_list[[9]],
          Entrez_Venn_intersections_diagram_list[[10]], Entrez_Venn_intersections_diagram_list[[11]], Entrez_Venn_intersections_diagram_list[[12]],
          Entrez_Venn_intersections_diagram_list[[13]], Entrez_Venn_intersections_diagram_list[[14]], Entrez_Venn_intersections_diagram_list[[15]],
          Entrez_Venn_intersections_diagram_list[[16]], Entrez_Venn_intersections_diagram_list[[17]], Entrez_Venn_intersections_diagram_list[[18]],
          Entrez_Venn_intersections_diagram_list[[19]], Entrez_Venn_intersections_diagram_list[[20]], cols = 4)
m = ggplot(multiplot(Entrez_Venn_intersections_diagram_list[[1]], Entrez_Venn_intersections_diagram_list[[2]], Entrez_Venn_intersections_diagram_list[[3]],
                     Entrez_Venn_intersections_diagram_list[[4]], Entrez_Venn_intersections_diagram_list[[5]], Entrez_Venn_intersections_diagram_list[[6]],
                     Entrez_Venn_intersections_diagram_list[[7]], Entrez_Venn_intersections_diagram_list[[8]], Entrez_Venn_intersections_diagram_list[[9]],
                     Entrez_Venn_intersections_diagram_list[[10]], Entrez_Venn_intersections_diagram_list[[11]], Entrez_Venn_intersections_diagram_list[[12]],
                     Entrez_Venn_intersections_diagram_list[[13]], Entrez_Venn_intersections_diagram_list[[14]], Entrez_Venn_intersections_diagram_list[[15]],
                     Entrez_Venn_intersections_diagram_list[[16]], Entrez_Venn_intersections_diagram_list[[17]], Entrez_Venn_intersections_diagram_list[[18]],
                     Entrez_Venn_intersections_diagram_list[[19]], Entrez_Venn_intersections_diagram_list[[20]], cols = 4))
dev.off(); rm(m)

# Venn diagrams of selected lists only #####
# Full lists
# Venn diagrams of the intersections of signatures and our background genes (5,672) #####
GS_Venn_selected = c(list(NRS = NRS$Gene.Symbol), 
                                   lapply(Signatures[c("pam50", "Mammaprint", 
                                                       "iC10", "Immunophenoscore")], function(x) {x <- x$Gene.Symbol
                                                       x <- as.character(x)}))
Entrez_Venn_selected = c(list(NRS = NRS$EntrezGene.ID), 
                                       lapply(Signatures[c("pam50", "Mammaprint", 
                                                           "iC10", "Immunophenoscore")], function(x) {x <- x $EntrezGene.ID
                                                           x <- as.character(x)}))

# Venn diagram list
GS_Venn_selected_diagram_list = list()
Entrez_Venn_selected_diagram_list = list()

for(i in 2:length(GS_Venn_selected)){
  diagram = ggVennDiagram(
    GS_Venn_selected[c(1,i)], label_alpha = 0,
    category.names = names(GS_Venn_selected[c(1,i)]),
    label = "count", set_size = 2, label_size = 3) +
    scale_color_manual(values = c("grey10", "grey15")) +
    scale_fill_gradient(low = "white", high = "grey30") + 
    theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 6),
          panel.background = element_rect(fill = "white", 
                                          colour = "white"),
          legend.key.size = unit(3, units = "mm"),
          legend.text = element_text(size = 4),
          legend.title = element_text(face = "bold", size = 6),
          legend.margin = ggplot2::margin(0, 0, 0, 0, unit = "mm"),
          legend.spacing.y = unit(2, units = "mm"),
          legend.spacing.x = unit(2, units = "mm"),
          legend.background = element_blank())+
    labs(title = paste0("Venn diagram of NRS and the ", names(GS_Venn_selected)[i], " signature"))
  GS_Venn_selected_diagram_list = c(GS_Venn_selected_diagram_list, list(diagram)); rm(diagram)
  
  diagram = ggVennDiagram(
    Entrez_Venn_selected[c(1,i)], label_alpha = 0,
    category.names = names(Entrez_Venn_selected[c(1,i)]),
    label = "count", set_size = 2, label_size = 3) +
    scale_color_manual(values = c("grey10", "grey15")) +
    scale_fill_gradient(low = "white", high = "grey30") + 
    theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 6),
          panel.background = element_rect(fill = "white", 
                                          colour = "white"),
          legend.key.size = unit(3, units = "mm"),
          legend.text = element_text(size = 4),
          legend.title = element_text(face = "bold", size = 6),
          legend.margin = ggplot2::margin(0, 0, 0, 0, unit = "mm"),
          legend.spacing.y = unit(2, units = "mm"),
          legend.spacing.x = unit(2, units = "mm"),
          legend.background = element_blank())+
    labs(title = paste0("Venn diagram of NRS and the ", names(Entrez_Venn_selected)[i], " signature"))
  Entrez_Venn_selected_diagram_list = c(Entrez_Venn_selected_diagram_list, list(diagram)); rm(diagram)
}
rm(i); gc()
names(Entrez_Venn_selected_diagram_list) = names(GS_Venn_selected_diagram_list) =
  names(Signatures[c("pam50", "Mammaprint", 
                     "iC10", "Immunophenoscore")])

# Multiplot of Gene Symbol Venn diagrams
tiff("Signatures/Gene_Symbol_Venn_selected_multiplot.tiff", 
     width = 3840, height = 2160, res = 700, compression ="lzw")
multiplot(GS_Venn_selected_diagram_list[[1]], GS_Venn_selected_diagram_list[[2]], 
          GS_Venn_selected_diagram_list[[3]], GS_Venn_selected_diagram_list[[4]], 
          cols = 2)
m = ggplot(multiplot(GS_Venn_selected_diagram_list[[1]], 
                     GS_Venn_selected_diagram_list[[2]],
                     GS_Venn_selected_diagram_list[[3]], 
                     GS_Venn_selected_diagram_list[[4]], cols = 2))
dev.off(); rm(m)

# Multiplot of Entrez ID Venn diagrams
tiff("Signatures/Entrez_Venn_selected_multiplot.tiff", 
     width = 3840, height = 2160, res = 700, compression = "lzw")
multiplot(Entrez_Venn_selected_diagram_list[[1]], Entrez_Venn_selected_diagram_list[[2]],
          Entrez_Venn_selected_diagram_list[[3]], 
          Entrez_Venn_selected_diagram_list[[4]], cols = 2)
m = ggplot(multiplot(Entrez_Venn_selected_diagram_list[[1]], Entrez_Venn_selected_diagram_list[[2]],
                     Entrez_Venn_selected_diagram_list[[3]], 
                     Entrez_Venn_selected_diagram_list[[4]], cols = 2))
dev.off(); rm(m)

# Intersections
# Venn diagrams of the intersections of signatures and our background genes (5,672) #####
GS_Venn_selected_intersections = c(list(NRS = NRS$Gene.Symbol), 
                                   lapply(Signatures[c("pam50", "Mammaprint", 
                                                       "iC10", "Immunophenoscore")], function(x) {x <- x$Gene.Symbol
                                                       x <- intersect(as.character(x), background_genes$Gene.Symbol)
                                                       x <- as.character(unlist(x))}))
Entrez_Venn_selected_intersections = c(list(NRS = NRS$EntrezGene.ID), 
                                       lapply(Signatures[c("pam50", "Mammaprint", 
                                                           "iC10", "Immunophenoscore")], function(x) {x <- x $EntrezGene.ID
                                                           x <- intersect(as.character(x), background_genes$EntrezGene.ID)
                                                           x <- as.character(unlist(x))}))

# Venn diagram list
GS_Venn_selected_intersections_diagram_list = list()
Entrez_Venn_selected_intersections_diagram_list = list()

for(i in 2:length(GS_Venn_selected_intersections)){
  diagram = ggVennDiagram(
    GS_Venn_selected_intersections[c(1,i)], label_alpha = 0,
    category.names = names(GS_Venn_selected_intersections[c(1,i)]),
    label = "count", set_size = 2, label_size = 3) +
    scale_color_manual(values = c("grey10", "grey15")) +
    scale_fill_gradient(low = "white", high = "grey30") + 
    theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 6),
          panel.background = element_rect(fill = "white", 
                                          colour = "white"),
          legend.key.size = unit(3, units = "mm"),
          legend.text = element_text(size = 4),
          legend.title = element_text(face = "bold", size = 6),
          legend.margin = ggplot2::margin(0, 0, 0, 0, unit = "mm"),
          legend.spacing.y = unit(2, units = "mm"),
          legend.spacing.x = unit(2, units = "mm"),
          legend.background = element_blank())+
    labs(title = paste0("Venn diagram of NRS and the ", names(GS_Venn_selected_intersections)[i], " signature"))
  GS_Venn_selected_intersections_diagram_list = c(GS_Venn_selected_intersections_diagram_list, list(diagram)); rm(diagram)
  
  diagram = ggVennDiagram(
    Entrez_Venn_selected_intersections[c(1,i)], label_alpha = 0,
    category.names = names(Entrez_Venn_selected_intersections[c(1,i)]),
    label = "count", set_size = 2, label_size = 3) +
    scale_color_manual(values = c("grey10", "grey15")) +
    scale_fill_gradient(low = "white", high = "grey30") + 
    theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 6),
          panel.background = element_rect(fill = "white", 
                                          colour = "white"),
          legend.key.size = unit(3, units = "mm"),
          legend.text = element_text(size = 4),
          legend.title = element_text(face = "bold", size = 6),
          legend.margin = ggplot2::margin(0, 0, 0, 0, unit = "mm"),
          legend.spacing.y = unit(2, units = "mm"),
          legend.spacing.x = unit(2, units = "mm"),
          legend.background = element_blank())+
    labs(title = paste0("Venn diagram of NRS and the ", names(Entrez_Venn_selected_intersections)[i], " signature"))
  Entrez_Venn_selected_intersections_diagram_list = c(Entrez_Venn_selected_intersections_diagram_list, list(diagram)); rm(diagram)
}
rm(i); gc()
names(Entrez_Venn_selected_intersections_diagram_list) = names(GS_Venn_selected_intersections_diagram_list) =
  names(Signatures[c("pam50", "Mammaprint", 
                     "iC10", "Immunophenoscore")])

# Multiplot of Gene Symbol Venn diagrams
tiff("Signatures/Gene_Symbol_Venn_selected_intersections_multiplot.tiff", 
     width = 3840, height = 2160, res = 700, compression = "lzw")
multiplot(GS_Venn_selected_intersections_diagram_list[[1]], 
          GS_Venn_selected_intersections_diagram_list[[2]], 
          GS_Venn_selected_intersections_diagram_list[[3]], 
          GS_Venn_selected_intersections_diagram_list[[4]], cols = 2)
m = ggplot(multiplot(GS_Venn_selected_intersections_diagram_list[[1]], 
                     GS_Venn_selected_intersections_diagram_list[[2]], 
                     GS_Venn_selected_intersections_diagram_list[[3]], 
                     GS_Venn_selected_intersections_diagram_list[[4]], cols = 2))
dev.off(); rm(m)

# Multiplot of Entrez ID Venn diagrams
tiff("Signatures/Entrez_Venn_selected_intersections_multiplot.tiff", 
     width = 3840, height = 2160, res = 700, compression = "lzw")
multiplot(Entrez_Venn_selected_intersections_diagram_list[[1]], 
          Entrez_Venn_selected_intersections_diagram_list[[2]], 
          Entrez_Venn_selected_intersections_diagram_list[[3]],
          Entrez_Venn_selected_intersections_diagram_list[[4]], cols = 2)
m = ggplot(multiplot(Entrez_Venn_selected_intersections_diagram_list[[1]],
                     Entrez_Venn_selected_intersections_diagram_list[[2]],
                     Entrez_Venn_selected_intersections_diagram_list[[3]],
                     Entrez_Venn_selected_intersections_diagram_list[[4]],  cols = 2))
dev.off(); rm(m)

# Immunophenoscore heatmap #####
library(pheatmap)
save_pheatmap_tiff = function(x, filename, width = 174, height = 120.8333, res = 650,
                              units = "mm") {
  tiff(filename, width = width, height = height, res = res, units = units, 
       compression = "lzw")
  grid::grid.newpage()
  grid::grid.draw(x$gtable)
  dev.off()
}

# Import RDS objects
annotation_for_heatmap = readRDS("Signatures/annotation_for_heatmap.rds")
z_exprs = readRDS("Signatures/normalised_expression.rds")

# Samples ordered by response: Non-responders first
response_ordered = rownames(annotation_for_heatmap[order(annotation_for_heatmap$Response, 
                                                         decreasing = TRUE),])

# Genes ordered by Immunophenoscore group
Immunophenoscore = read.xlsx("Signatures/Immunophenoscore.xlsx")[-1,]
colnames(Immunophenoscore) = c("Gene.Symbol", "Category", "Immunity")
Immunophenoscore = Immunophenoscore[Immunophenoscore$Gene.Symbol %in% NRS$Gene.Symbol,] %>%
  dplyr::arrange(Category)
Immunophenoscore$Category = gsub("Effector memeory CD4 T cell", 
                                 "Effector memory CD4 T cell",
                                 Immunophenoscore$Category)

mock = NRS; rownames(mock) = mock$Gene.Symbol
genes_ordered = mock[Immunophenoscore$Gene.Symbol, ]
rm(mock)

plot_matrix = z_exprs[which(rownames(z_exprs) %in% genes_ordered$EntrezGene.ID), 
                      response_ordered]
rownames(plot_matrix) = Immunophenoscore$Gene.Symbol
plot_matrix = plot_matrix[genes_ordered$Gene.Symbol,]
plot_matrix = t(plot_matrix)

hmcol = rev(colorRampPalette(RColorBrewer::brewer.pal(9, "RdBu"))(255))
immune_categories = unlist(RColorBrewer::brewer.pal(10, "Paired"))
names(immune_categories) = unique(Immunophenoscore$Category)

annotation_for_heatmap = annotation_for_heatmap %>%
  dplyr::select(-scmod1, -IC10, -Mammaprint_risk, -rorS_risk, -Dataset)
ann_colors <- list(
  Response = c(Responder = "dodgerblue4", Non_responder = "deeppink4"),
  Timepoint = c(T1 = "goldenrod2", T2 = "purple4"),#, T2.5 = "green", T3 = "blue"),
  Treatment = c(Chemotherapy = "paleturquoise3", Endocrine_treatment = "sienna2"),
  pam50 = c(Basal = "red4", LumB = "skyblue",
            LumA = "darkblue", Her2 = "purple4", Normal = "lightgreen"),
  Cluster = c(`Cluster 1` = "orange", `Cluster 2` = "skyblue"),
  Category = immune_categories,
  Immunity = c(Innate = "cadetblue", Adaptive = "darkmagenta")
)

annotation_for_heatmap = annotation_for_heatmap %>%
  dplyr::select(Response, Cluster, Treatment, Timepoint, pam50)
rownames(Immunophenoscore) = Immunophenoscore$Gene.Symbol
Immunophenoscore = Immunophenoscore %>% dplyr::select(-Gene.Symbol)
heatmap = pheatmap(plot_matrix, col = (hmcol),
                   breaks = seq(-2, 2, 4/256),
                   annotation_col = Immunophenoscore,
                   annotation_row = annotation_for_heatmap,
                   annotation_colors = ann_colors,
                   cluster_cols = F,
                   cluster_rows = F,
                   gaps_row = 444,
                   treeheight_col =  0,
                   legend = TRUE,
                   show_rownames = F,
                   show_colnames = T,
                   fontsize = 5,
                   fontsize_col = 5,
                   legend_breaks = c(-2, # min(plot_matrix, na.rm = TRUE), 
                                     2), # max(plot_matrix, na.rm = TRUE)), 
                   legend_labels = (c("lower expression", "higher expression")),
                   main = "Immunophenoscore overlap with the NRS")
save_pheatmap_tiff(heatmap, "Signatures/Immunophenoscore_heatmap.tiff")

# Write out overlaps #####
write.xlsx(GS_overlap, "Signatures/Gene_Symbol_Signature_Overlaps.xlsx")
write.xlsx(Entrez_overlap, "Signatures/Entrez_Signature_Overlaps.xlsx")

# MCPcounter #####
library(MCPcounter)
library(openxlsx)
library(tidyverse)
library(broom)

trainval = read.xlsx("data/Output sets/Pheno.xlsx", sheet = 2)
responders_T1 = trainval$Sample.ID[trainval$Response == "Responder" &
                                     trainval$Timepoint_coded == "T1"]
responders_T2 = trainval$Sample.ID[trainval$Response == "Responder" &
                                     trainval$Timepoint_coded == "T2"]
non_responders_T1 = trainval$Sample.ID[trainval$Response == "Non_responder" &
                                         trainval$Timepoint_coded == "T1"]
non_responders_T2 = trainval$Sample.ID[trainval$Response == "Non_responder" &
                                         trainval$Timepoint_coded == "T2"]

MCP = list()
MCP[["Full"]] = as.data.frame(MCPcounter.estimate(z_exprs, featuresType = "ENTREZ_ID"))
# Found no markers for population(s): CD8 T cells, NK cells
MCP[["Responders"]] = MCP[["Full"]][, colnames(MCP[["Full"]]) %in% c(responders_T1, responders_T2)]
MCP[["Non_responders"]] = MCP[["Full"]][, colnames(MCP[["Full"]]) %in% c(non_responders_T1, non_responders_T2)]
MCP[["Responders_T1"]] = MCP[["Full"]][, colnames(MCP[["Full"]]) %in% responders_T1]
MCP[["Non_responders_T1"]] = MCP[["Full"]][, colnames(MCP[["Full"]]) %in% non_responders_T1]
MCP[["Responders_T2"]] = MCP[["Full"]][, colnames(MCP[["Full"]]) %in% responders_T2]
MCP[["Non_responders_T2"]] = MCP[["Full"]][, colnames(MCP[["Full"]]) %in% non_responders_T2]

# Calculate mean scores for each cell type in each dataset
MCP <- lapply(MCP, function(df) {
  df$mean_row <- rowMeans(df, na.rm = TRUE)
  return(df)
})

# Boxplots
# Converting the MCP dataframe into a long format
MCP_long <- lapply(MCP, function(df) {
  df$cell_type <- rownames(df)
  df_long <- df %>%
    tidyr::pivot_longer(-cell_type, names_to = "sample_id", values_to = "value")
  return(df_long)
})

# Assign response and timepoint groups
for (i in 1:length(MCP_long)) {
  MCP_long[[i]]$Response <- factor(ifelse(MCP_long[[i]]$sample_id %in% c(responders_T1, responders_T2),
                                                "Responder", "Non_responder"))
  MCP_long[[i]]$Timepoint <- factor(ifelse(MCP_long[[i]]$sample_id %in% c(responders_T1, non_responders_T1),
                                                 "T1", "T2"))
}

# Titles for the boxplots
titles = c("Responders vs. Non-responders",
            "Responders T1 vs. Non-responders T1",
            "Responders T2 vs. Non-responders T2",
            "Responders T1 vs. Responders T2",
            "Non_responders T1 vs. Non-responders T2")

titles2 = c("Responders_vs_Non_responders",
           "Responders_T1_vs_Non_responders_T1",
           "Responders_T2_vs_Non_responders_T2",
           "Responders_T1_vs_Responders_T2",
           "Non_responders_T1_vs_Non_responders_T2")

# Create boxplots in a loop
boxplots = list(RespvsNonresp = list(), RespT1vsNonrespT1 = list(),
                RespT2vsNonrespT2 = list(), RespT1vsRespT2 = list(),
                NonrespT1vsNonrespT2 = list())
box_datasets = list(MCP_long[["Full"]], 
                    rbind(MCP_long[["Responders_T1"]], MCP_long[["Non_responders_T1"]]),
                    rbind(MCP_long[["Responders_T2"]], MCP_long[["Non_responders_T2"]]),
                    rbind(MCP_long[["Responders_T1"]], MCP_long[["Responders_T2"]]),
                    rbind(MCP_long[["Non_responders_T1"]], MCP_long[["Non_responders_T2"]]))
x_variables = c("Response", "Response","Response", "Timepoint", "Timepoint")
cell_types = sort(unique(MCP_long[["Full"]]$cell_type))

# Function to conduct t-tests between timepoints for each group and each cell type
perform_t_tests_timepoints <- function(data, filter) {
  result = data[filter, ] %>%
    dplyr::group_by(cell_type) %>%
    dplyr::do(tidy(t.test(value ~ Timepoint, data = .)))
  return(result)
}

perform_t_tests_response <- function(data, filter) {
  result = data[filter, ] %>%
    dplyr::group_by(cell_type) %>%
    dplyr::do(tidy(t.test(value ~ Response, data = .)))
  return(result)
}

# Create significance labels (*) for the plots
add_significance_labels <- function(t_test_result) {
  t_test_result %>%
    mutate(significance_label = case_when(
      p.value > 0.05 ~ "n.s.",
      p.value <= 0.05 & p.value > 0.01 ~ "*",
      p.value <= 0.01 & p.value > 0.001 ~ "**",
      p.value <= 0.001 & p.value > 0.0001 ~ "***",
      p.value <= 0.0001 ~ "****"
    )) %>%
    dplyr::arrange(cell_type)
}

# Generate t-test results
t_test_results = list()

t_test_results[["Resp.vs.Nonresp"]] = perform_t_tests_response(MCP_long[["Full"]], 
                                                               1:nrow(MCP_long[["Full"]]))
t_test_results[["Resp.vs.Nonresp"]] = add_significance_labels(as.data.frame(t_test_results[["Resp.vs.Nonresp"]]))

t_test_results[["RespT1.vs.Nonresp_T1"]] = perform_t_tests_response(MCP_long[["Full"]], 
                                                                    which(MCP_long[["Full"]]$Timepoint == "T1"))
t_test_results[["RespT1.vs.Nonresp_T1"]] = add_significance_labels(as.data.frame(t_test_results[["RespT1.vs.Nonresp_T1"]]))

t_test_results[["RespT2.vs.Nonresp_T2"]] = perform_t_tests_response(MCP_long[["Full"]], 
                                                                    which(MCP_long[["Full"]]$Timepoint == "T2"))
t_test_results[["RespT2.vs.Nonresp_T2"]] = add_significance_labels(as.data.frame(t_test_results[["RespT2.vs.Nonresp_T2"]]))

t_test_results[["RespT1.vs.RespT2"]] = perform_t_tests_timepoints(MCP_long[["Responders"]], 
                                                                  1:nrow(MCP_long[["Responders"]]))
t_test_results[["RespT1.vs.RespT2"]] = add_significance_labels(as.data.frame(t_test_results[["RespT1.vs.RespT2"]]))

t_test_results[["NonrespT1.vs.NonrespT2"]] = perform_t_tests_timepoints(MCP_long[["Non_responders"]], 
                                                                        1:nrow(MCP_long[["Non_responders"]]))
t_test_results[["NonrespT1.vs.NonrespT2"]] = add_significance_labels(as.data.frame(t_test_results[["NonrespT1.vs.NonrespT2"]]))


combined_results = rbind(t_test_results[[1]],
                         t_test_results[[2]],
                         t_test_results[[3]],
                         t_test_results[[4]],
                         t_test_results[[5]])

for (i in 1:length(titles)) {
  for (j in 1:nrow(MCP[["Full"]])) {
    box_dataset = box_datasets[[i]] %>% dplyr::filter(cell_type == cell_types[j])
    sig_label = t_test_results[[i]][["significance_label"]][t_test_results[[i]]$cell_type == cell_types[j]]
    box_dataset$sig_label = ifelse(nrow(box_dataset) > 0, sig_label, NA)
    boxplots[[i]][[j]] = ggplot(data = box_dataset, 
                                aes(x = !!sym(x_variables[i]), y = value, 
                                    fill = !!sym(x_variables[i]))) +
      geom_boxplot(alpha = 0.8, outlier.shape = NA, width = 0.3) +
      geom_text(aes(x = 1.5, y = max(value, na.rm = TRUE), 
                    label = paste(as.character(sig_label))), 
                size = 3, vjust = 1) +
      scale_fill_manual(values = c("Responder" = "dodgerblue4", "Non_responder" = "deeppink4",
                                   "T1" = "goldenrod2", "T2" = "purple4")) +
      labs(title = NULL, #paste0(cell_types[j], ": ", titles[i])
           x = NULL, y = cell_types[j]) +
      theme_classic() +
      theme(panel.background = element_rect(fill = "white", 
                                            colour = "white"),
            panel.grid = element_blank(),
            axis.line = element_line(),
            axis.title.y = element_text(size = 7, face = "bold"),
            axis.text.x = element_text(size = 5),
            legend.position = "none")
  }
  names(boxplots[[i]]) = cell_types
}
rm(box_dataset)

# Create and export ggarrange() objects
library(ggpubr)
library(ggtext)

# Create output directory
dir.create("Signatures/MCPcounter")
MCPcounter_full_plots = list()
for (i in 1:length(titles)) {
  MCPcounter_full_plots[[i]] = ggarrange(plotlist = boxplots[[i]], ncol = 2, 
                                         nrow = 4, labels = NULL)
  print(MCPcounter_full_plots[[i]])
  ggsave(filename = paste0("MCPcounter_", titles2[i], ".tiff"),
         path = "Signatures/MCPcounter", 
         width = 3612, height = 6500, device = 'tiff', units = "px",
         dpi = 700, compression = "lzw")
  dev.off()
}

# Create a comprehensive plot of significant and meaningful results
# i.e. results that are different between the groups of response/timepoint
# It is going to be a 3x1 grid of three 1x3 ggarrange() object (a 3x3 grid ultimately)

# A legend plot
p9 = ggplot() +
  geom_rect(aes(xmin = -1, xmax = 1, ymin = -1, ymax = 1), fill = NA) +
  annotate("richtext", x = -0.5, y = 0, 
           label = "<b>a, b, c:</b> All samples<br><br>
                    <b>d, e, f:</b> T2 samples<br><br>
                    <b>g:</b> Responders (T1 vs. T2)<br><br>
                    <b>h:</b> Non-responders (T1 vs. T2)",
           hjust = 0, vjust = 0.5, fill = NA, label.color = NA, label.size = 0, size = 2) +
  theme_void() +
  coord_cartesian(xlim = c(-1, 1), ylim = c(-1, 1))

ggarrange1 = ggarrange(boxplots$RespvsNonresp$`Endothelial cells`,
                       boxplots$RespvsNonresp$Fibroblasts,
                       boxplots$RespvsNonresp$Neutrophils,
                       ncol = 3, nrow = 1, labels = c("a", "b", "c"),
                       font.label = list(size = 8.5))
ggarrange2 = ggarrange(boxplots$RespT2vsNonrespT2$`Endothelial cells`,
                       boxplots$RespT2vsNonrespT2$Fibroblasts,
                       boxplots$RespT2vsNonrespT2$Neutrophils,
                       ncol = 3, nrow = 1, labels = c("d", "e", "f"),
                       font.label = list(size = 8.5))
ggarrange3 = ggarrange(boxplots$RespT1vsRespT2$`Endothelial cells`,
                       boxplots$NonrespT1vsNonrespT2$`Endothelial cells`,
                       p9,
                       ncol = 3, nrow = 1, labels = c("g", "h", "Legend"),
                       font.label = list(size = 8.5))
tiff("Signatures/MCPcounter/MCPcounter_results.tiff", width = 3500, height = 4800,
     res = 700, compression = "lzw")
ggarrange(ggarrange1, ggarrange2, ggarrange3,
          ncol = 1, nrow = 3, labels = NULL)
dev.off()

# Danaher #####

# List of Danaher gene symbols
danaher_genes = read.xlsx("Signatures/Danaher.xlsx", sheet = 4) %>%
  dplyr::rename(Gene.Symbol = Gene)
dgea = read.xlsx("data/Output sets/Downstream/DGEA.xlsx")
z_exprs_gs = z_exprs[dgea$EntrezGene.ID,]
rownames(z_exprs_gs) = dgea$Gene.Symbol

# Convert the matrix to a data frame for easier manipulation
z_exprs_gs = as.data.frame(z_exprs_gs)

# Create the output data frame with cell types and overlap sizes
danaher_scores = as.data.frame(unique(danaher_genes$Cell.Type))
danaher_scores$overlap_size = NA
overlap_lists = list()
colnames(danaher_scores) = c("Cell.Type", "overlap_size")

for (i in 1:nrow(danaher_scores)) {
  danaher_subset = danaher_genes[danaher_genes$Cell.Type == danaher_scores$Cell.Type[i], ]
  gene_subset = danaher_subset$Gene.Symbol
  
  # Overlap of cell type signatures with our gene pool
  danaher_scores$overlap_size[i] = length(intersect(rownames(z_exprs_gs), gene_subset))
  overlap_lists[[i]] = intersect(rownames(z_exprs_gs), gene_subset)
}
names(overlap_lists) = danaher_scores$Cell.Type

# Keeping only cell types with non-zero overlaps
danaher_scores = danaher_scores[danaher_scores$overlap_size > 0, ]
danaher_scores = as.data.frame(cbind(danaher_scores, matrix(NA, nrow(danaher_scores),
                                                            ncol(z_exprs_gs))))
colnames(danaher_scores)[3:ncol(danaher_scores)] = colnames(z_exprs_gs)

# Estimate scores for each cell type
for (i in 1:ncol(z_exprs_gs)) {
  for (j in 1:nrow(danaher_scores)) {
    z_exprs_gs_filt = z_exprs_gs[overlap_lists[[danaher_scores$Cell.Type[j]]], ]
    means = colMeans(z_exprs_gs_filt, na.rm = TRUE)
    danaher_scores[j, 3:ncol(danaher_scores)] = means
  }
}

# Plotting preparations
rownames(danaher_scores) = danaher_scores$Cell.Type
Danaher_list_df = list()
Danaher_list_df[["Full"]] = danaher_scores %>% dplyr::select(-Cell.Type, -overlap_size)
# Found no markers for population(s): Neutrophils, NK CD56dim cells, NK cells, Th1 cells, Treg
Danaher_list_df[["Responders"]] = Danaher_list_df[["Full"]][, colnames(Danaher_list_df[["Full"]]) %in% c(responders_T1, responders_T2)]
Danaher_list_df[["Non_responders"]] = Danaher_list_df[["Full"]][, colnames(Danaher_list_df[["Full"]]) %in% c(non_responders_T1, non_responders_T2)]
Danaher_list_df[["Responders_T1"]] = Danaher_list_df[["Full"]][, colnames(Danaher_list_df[["Full"]]) %in% responders_T1]
Danaher_list_df[["Non_responders_T1"]] = Danaher_list_df[["Full"]][, colnames(Danaher_list_df[["Full"]]) %in% non_responders_T1]
Danaher_list_df[["Responders_T2"]] = Danaher_list_df[["Full"]][, colnames(Danaher_list_df[["Full"]]) %in% responders_T2]
Danaher_list_df[["Non_responders_T2"]] = Danaher_list_df[["Full"]][, colnames(Danaher_list_df[["Full"]]) %in% non_responders_T2]

# Boxplots
# Converting the Danaher dataframe into a long format
Danaher_long <- lapply(Danaher_list_df, function(df) {
  df$cell_type <- rownames(df)
  df_long <- df %>%
    tidyr::pivot_longer(-cell_type, names_to = "sample_id", values_to = "value")
  return(df_long)
})

# Assign response and timepoint groups
for (i in 1:length(Danaher_long)) {
  Danaher_long[[i]]$Response <- factor(ifelse(Danaher_long[[i]]$sample_id %in% c(responders_T1, responders_T2),
                                              "Responder", "Non_responder"))
  Danaher_long[[i]]$Timepoint <- factor(ifelse(Danaher_long[[i]]$sample_id %in% c(responders_T1, non_responders_T1),
                                               "T1", "T2"))
}

# Create boxplots in a loop
boxplots2 = list(RespvsNonresp = list(), RespT1vsNonrespT1 = list(),
                 RespT2vsNonrespT2 = list(), RespT1vsRespT2 = list(),
                 NonrespT1vsNonrespT2 = list())
box_datasets2 = list(Danaher_long[["Full"]], 
                     rbind(Danaher_long[["Responders_T1"]], Danaher_long[["Non_responders_T1"]]),
                     rbind(Danaher_long[["Responders_T2"]], Danaher_long[["Non_responders_T2"]]),
                     rbind(Danaher_long[["Responders_T1"]], Danaher_long[["Responders_T2"]]),
                     rbind(Danaher_long[["Non_responders_T1"]], Danaher_long[["Non_responders_T2"]]))
x_variables2 = c("Response", "Response","Response", "Timepoint", "Timepoint")
cell_types2 = sort(unique(Danaher_long[["Full"]]$cell_type))

# Generate t-test results
t_test_results2 = list()

t_test_results2[["Resp.vs.Nonresp"]] = perform_t_tests_response(Danaher_long[["Full"]], 
                                                                1:nrow(Danaher_long[["Full"]]))
t_test_results2[["Resp.vs.Nonresp"]] = add_significance_labels(as.data.frame(t_test_results2[["Resp.vs.Nonresp"]]))

t_test_results2[["RespT1.vs.Nonresp_T1"]] = perform_t_tests_response(Danaher_long[["Full"]], 
                                                                     which(Danaher_long[["Full"]]$Timepoint == "T1"))
t_test_results2[["RespT1.vs.Nonresp_T1"]] = add_significance_labels(as.data.frame(t_test_results2[["RespT1.vs.Nonresp_T1"]]))

t_test_results2[["RespT2.vs.Nonresp_T2"]] = perform_t_tests_response(Danaher_long[["Full"]], 
                                                                     which(Danaher_long[["Full"]]$Timepoint == "T2"))
t_test_results2[["RespT2.vs.Nonresp_T2"]] = add_significance_labels(as.data.frame(t_test_results2[["RespT2.vs.Nonresp_T2"]]))

t_test_results2[["RespT1.vs.RespT2"]] = perform_t_tests_timepoints(Danaher_long[["Responders"]], 
                                                                   1:nrow(Danaher_long[["Responders"]]))
t_test_results2[["RespT1.vs.RespT2"]] = add_significance_labels(as.data.frame(t_test_results2[["RespT1.vs.RespT2"]]))

t_test_results2[["NonrespT1.vs.NonrespT2"]] = perform_t_tests_timepoints(Danaher_long[["Non_responders"]], 
                                                                         1:nrow(Danaher_long[["Non_responders"]]))
t_test_results2[["NonrespT1.vs.NonrespT2"]] = add_significance_labels(as.data.frame(t_test_results2[["NonrespT1.vs.NonrespT2"]]))

combined_results2 = rbind(t_test_results2[[1]],
                          t_test_results2[[2]],
                          t_test_results2[[3]],
                          t_test_results2[[4]],
                          t_test_results2[[5]])

for (i in 1:length(titles)) {
  for (j in 1:nrow(Danaher_list_df[["Full"]])) {
    box_dataset = box_datasets2[[i]] %>% dplyr::filter(cell_type == cell_types2[j])
    sig_label = t_test_results2[[i]][["significance_label"]][t_test_results2[[i]]$cell_type == cell_types2[j]]
    box_dataset$sig_label = ifelse(nrow(box_dataset) > 0, sig_label, NA)
    boxplots2[[i]][[j]] = ggplot(data = box_dataset, 
                                 aes(x = !!sym(x_variables2[i]), y = value, 
                                     fill = !!sym(x_variables2[i]))) +
      geom_boxplot(alpha = 0.8, outlier.shape = NA, width = 0.3) +
      geom_text(aes(x = 1.5, y = max(value, na.rm = TRUE), 
                    label = paste(as.character(sig_label))), 
                size = 3, vjust = 1) +
      scale_fill_manual(values = c("Responder" = "dodgerblue4", "Non_responder" = "deeppink4",
                                   "T1" = "goldenrod2", "T2" = "purple4")) +
      labs(title = NULL, #paste0(cell_types[j], ": ", titles[i])
           x = NULL, y = cell_types2[j]) +
      theme_classic() +
      theme(panel.background = element_rect(fill = "white", 
                                            colour = "white"),
            panel.grid = element_blank(),
            axis.line = element_line(),
            axis.title.y = element_text(size = 7, face = "bold"),
            axis.text.x = element_text(size = 5),
            legend.position = "none")
  }
  names(boxplots2[[i]]) = cell_types2
}
rm(box_dataset)

# Create output directory
dir.create("Signatures/Danaher")
Danaher_full_plots = list()
for (i in 1:length(titles)) {
  Danaher_full_plots[[i]] = ggarrange(plotlist = boxplots2[[i]], ncol = 3, 
                                      nrow = 3, labels = NULL)
  print(Danaher_full_plots[[i]])
  ggsave(filename = paste0("Danaher_", titles2[i], ".tiff"),
         path = "Signatures/Danaher", 
         width = 3612, height = 6500, device = 'tiff', units = "px",
         dpi = 700, compression = "lzw")
  dev.off()
}


# Create a comprehensive plot of significant and meaningful results
# i.e. results that are different between the groups of response/timepoint
# It is going to be a 2x1 grid of three 1x3 ggarrange() object (a 2x3 grid ultimately)
# Plus: a horizontal legend underneath

# A legend plot
p10 = ggplot() +
  geom_rect(aes(xmin = -1, xmax = 1, ymin = -0.5, ymax = 1), fill = NA) +
  annotate("richtext", x = -0.5, y = 0, 
           label = "<b>a, b, c:</b> Responders (T1 vs. T2), || <b>d, e, f:</b> Non-responders (T1 vs. T2)",
           hjust = 0.2, vjust = 0.5, fill = NA, label.color = NA, label.size = 0, size = 3) +
  theme_void() +
  coord_cartesian(xlim = c(-1, 1), ylim = c(-0.5, 1))

ggarrange1 = ggarrange(boxplots2$RespT1vsRespT2$CD45,
                       boxplots2$RespT1vsRespT2$`B-cells`,
                       boxplots2$RespT1vsRespT2$Macrophages,
                       ncol = 3, nrow = 1, labels = c("a", "b", "c"),
                       font.label = list(size = 8.5))
ggarrange2 = ggarrange(boxplots2$NonrespT1vsNonrespT2$CD45,
                       boxplots2$NonrespT1vsNonrespT2$`B-cells`,
                       boxplots2$NonrespT1vsNonrespT2$Macrophages,
                       ncol = 3, nrow = 1, labels = c("d", "e", "f"),
                       font.label = list(size = 8.5))
ggarrange3 = ggarrange(p10, labels = c("Legend"),
                       font.label = list(size = 8.5), hjust = -5.5, vjust = 3)
tiff("Signatures/Danaher/Danaher_results.tiff", width = 3500, height = 4800,
     res = 700, compression = "lzw")
ggarrange(ggarrange1, ggarrange2, ggarrange3,
          ncol = 1, nrow = 3, labels = NULL, heights = c(3, 3, 1))
dev.off()

# Immunophenoscore #####


# MSigDB Oncogenic Collection Enrichment #####
library(msigdb)
library(ExperimentHub)
library(GSEABase)

# Query the database
eh = ExperimentHub()
query(eh , 'msigdb')

# Custom download
msigdb.hs = getMsigdb(org = 'hs', id = 'EZID', version = '7.2')
msigdb.hs

# Oncogenic signatures
onco = subsetCollection(msigdb.hs, 'c6')
onco_ids = geneIds(onco)

# Prepare for limma::fry or limma::camera
camera_indices = ids2indices(onco_ids, rownames(z_exprs))
z_design = readRDS("Signatures/design_matrix.rds") # design matrix
z_cm = readRDS("Signatures/contrast_matrix.rds") # contrast matrix

onco_enr = limma::camera(z_exprs, index = camera_indices, 
                  design = z_design, contrast = z_cm)
enriched_oncosets = onco_enr[onco_enr$FDR < 0.05,] # 5 sets

# Enriched oncogenic sets metadata: descriptions
onco_descriptions_long = list()
onco_descriptions_short = list()
for(i in rownames(onco_enr)){
  onco_descriptions_long[[i]] = msigdb.hs[[i]]@longDescription
  onco_descriptions_short[[i]] = msigdb.hs[[i]]@shortDescription
}
rm(i); gc()

onco_enr$Terms = rownames(onco_enr)
onco_enr$Long.Desc = onco_descriptions_long
onco_enr$Short.Desc = onco_descriptions_short

onco_descriptions_short[[1]]; onco_descriptions_long[[1]]
onco_descriptions_short[[2]]; onco_descriptions_long[[2]]
onco_descriptions_short[[3]]; onco_descriptions_long[[3]]
onco_descriptions_short[[4]]; onco_descriptions_long[[4]]
onco_descriptions_short[[5]]; onco_descriptions_long[[5]]

onco_enr = onco_enr %>%
  dplyr::select(Terms, NGenes, Direction, PValue, FDR, Short.Desc, Long.Desc)

# Write out enrichment results
write.xlsx(onco_enr, "Signatures/MSigDB_Oncogenic_Collection_Enrichment.xlsx",
           overwrite = TRUE)

# Session Info #####
sessionInfo()