--- a
+++ b/code_final/ctcl_visium_figures.R
@@ -0,0 +1,378 @@
+#!/usr/bin/env R
+
+# ------------------------------------------------------------------------------
+# title: Spatial analysis of CTCL data.
+# purpose: This script creates visium figures for the CTCL project.
+#          It takes the cell2location output.
+# created: 2024-07-16 Tue 14:11:45 BST
+# updated: 2024-08-02 Fri 13:23:37 BST
+# version: 0.0.9
+# status: Prototype
+#
+# maintainer: Ciro Ramírez-Suástegui
+# author:
+#   - name: Ciro Ramírez-Suástegui
+#     affiliation: The Wellcome Sanger Institute
+#     email: cs59@sanger.ac.uk, cramsuig@gmail.com
+# contributor:
+#   - name: Pasha Mazin
+#     affiliation: The Wellcome Sanger Institute
+#     email: pm19@sanger.ac.uk
+# ------------------------------------------------------------------------------
+
+## Environment setup ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Dependencies: functions and packages
+# basic ----------------------------------------------------
+library(plyr)
+if (requireNamespace("crayon", quietly = TRUE)) {
+  red <- crayon::red
+  yellow <- crayon::yellow
+  cyan <- crayon::cyan
+}else{
+  red <- yellow <- cyan = c
+}
+logging::basicConfig()
+section = function (i, tail_n = 60, color = cyan) {
+  tail_n <- max(c(tail_n, nchar(i) + 1))
+  y <- paste("##", color(i), "##", base::strrep("%", tail_n-nchar(i)), "\n")
+  logging::loginfo(y)
+}
+# tools ----------------------------------------------------
+library(Seurat)
+library(NMF)
+library(visutils)
+# in-house/developing --------------------------------------
+source(paste0(here::here(), "/code/visutils.R"))
+
+## Global variables and paths ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+data_dir <- "/nfs/cellgeni/pasham/projects/2211.adult.skin/data.nfs"
+figures_dir <- paste0(here::here(), "/figures/ctcl_visium")
+# CTCL as reference, original results
+c2l_file <- paste0(data_dir, "/visium/ctcl/c2l.v2.rds")
+sufix <- "ref-ctcl_original"
+# CTCL as reference
+c2l_file <- paste0(here::here(), "/results/ref_ctcl-viss_ctcl_disease.20/predmodel")
+sufix <- "ref-ctcl"
+# CTCL+Healthy as reference
+c2l_file <- paste0(
+  here::here(), c("/results/ref_ctcl_h-viss_ctcl_all.20/predmodel",
+  "/results/ref_ctcl_h-viss_bayanne_healthy.20/predmodel"))
+sufix <- "ref-ctcl-h"
+
+{ section("Loading data") ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+  c2l <- unlist(lapply(c2l_file, function(x){
+    if (grepl("rds$", x)){
+      y <- readRDS(x)
+      split.data.frame(y, splitSub(rownames(y), "|", 1))
+    }else loadC2L(x)
+  }), recursive = FALSE)
+  logging::loginfo("Filtering to just the tissue and enhancing images")
+  visium <- lapply(
+    X = readRDS(paste0(data_dir, "/visium/ctcl/vs.v2.rds")),
+    function(v){
+      y <- v[, v$is.tissue == 1]
+      y@images$slice1@image <- enhanceImage( # More contrasted images
+        y@images$slice1@image, wb = TRUE, qs = c(0.1, 0.9)
+      )
+      return(y)
+  })
+  mdata <- readRDS(paste0(data_dir, "/visium/ctcl/meta.v2.rds"))
+}
+
+## Pre-processing ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+{
+  samples_subset <- intersect(
+    grep("CTCL|WS_D", names(visium), value = TRUE),
+    grep("CTCL|WS_D", names(visium), value = TRUE)
+  )
+  # Setting up cell type colours
+  ct_color <- c(
+    "Tumour cell" = "#E41A1C",
+    "F3" = "#377EB8",
+    "F2" = "#4DAF4A",
+    "B cell" = "#FF7F00",
+    "MoDC3" = "#F781BF",
+    "VE3" = "#984EA3",
+    "DC2" = "#be5ff5"
+  )
+  for (i in grep("Healthy", names(ct_color), value = TRUE, invert = TRUE)) {
+    ct_color[paste("Healthy", i)] <- grDevices::adjustcolor(
+      ct_color[i], alpha.f = 0.3
+    )
+  }
+
+  logging::loginfo("Matching cell names")
+  # tumourcell > tumor_cell
+  # MoDC3 > moDC_3
+  # B/plasma > B_cell
+  ct_names = setNames(nm = unique(unlist(lapply(c2l, colnames))))
+  ct_names["tumor_cell"] <- "Tumour cell"
+  ct_names["moDC_3"] <- "MoDC3"
+  ct_names["B_cell"] <- "B cell"
+  for (sample_i in samples_subset) {
+    cellnames <- colnames(visium[[sample_i]])
+    rownames(c2l[[sample_i]]) <- gsub(".*\\|", "", rownames(c2l[[sample_i]]))
+    cellint <- intersect(cellnames, rownames(c2l[[sample_i]]))
+    logging::loginfo(paste0(
+      sample_i, ": ", length(cellint), "/(c2l:",
+      nrow(c2l[[sample_i]]), ",vis:", length(cellnames), ")"
+    ))
+    if (length(cellnames) > 0) {
+      c2l[[sample_i]] <- c2l[[sample_i]][cellnames, ]
+    }
+    colnames(c2l[[sample_i]]) <- ct_names[colnames(c2l[[sample_i]])]
+  }
+  # trend line and NMF need the matrix with everything together
+  c2lm <- as.data.frame(do.call(rbind, lapply(X = samples_subset, function(x){
+    y <- c2l[[x]]; rownames(y) <- paste0(x, ".", rownames(y)); y
+  })))
+  # visium[["HCA_sCTCL13876503"]] <- rotateVisium(
+  #   visium[["HCA_sCTCL13876503"]], n = 1, mirror = TRUE
+  # )
+}
+
+## Main ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+{ section("Microenvironments (NMF analysis)") ## %%%%%%%%%%%
+  c2lmt = as.matrix(c2lm)
+  # we will use per-spot normalised celltype abundancies
+  c2lmt = sweep(c2lmt, 1, rowSums(c2lmt), '/')
+  # we will run nmf N times to assess results stability
+  N = 50 # consider to increase it
+  # number of factors is almost arbitrary and to be set manually depending
+  # on desired granularity of microenvironments.
+  for (rank in c(4, 5, 6)) {
+    p_name = paste0(file.path(
+      figures_dir, paste(
+        "mfigure1f",
+        paste(c("nmf", rank, "microenvironments"), collapse="-"),
+        sufix, sep="_"
+      )
+    ), ".pdf")
+    logging::loginfo(paste(basename(p_name)))
+    start <- Sys.time()
+    set.seed(1234)
+    doMC::registerDoMC(2)
+    nmf = plyr::llply(
+      .data = 1:N,
+      .fun = function(i){
+        nmf(c2lmt, rank = rank)
+      }, .parallel = TRUE
+    )
+    logging::loginfo(paste("Elapsed:", format(difftime(Sys.time(), start, unit="min"))))
+    best.nmf = nmfGetBest(nmf, getNMFNormFs('max'))
+    # plotting
+    pdf(gsub(".pdf", "_summary.pdf", p_name), w = 7-2, h = 7+2)
+    plotNMFConsSummary(best.nmf$coefn, best.nmf$cons/N, ylab.cex = 0.7, max.cex = 2)
+    dev.off()
+    pdf(p_name, w = 7+5, h = 7+3)
+    plotNMFCons(best.nmf$coefn, best.nmf$cons/N, ylab.cex = 0.7, max.cex = 2)
+    dev.off()
+  }
+}
+
+# Configuring data going into each figure
+fconfigs <- list(
+  list(
+    name = "mfigure3d",
+    celltypes = c("Tumour cell", "VE3"), # F3
+    samples = c("5" = "HCA_sCTCL13876505", "4" = "HCA_sCTCL13876504")
+  ),
+  list(
+    name = "mfigure3j",
+    celltypes = c("Tumour cell", "DC2"), # MoDC3
+    samples = c("7" = "HCA_sCTCL13787193", "4" = "HCA_sCTCL13876504")
+  ),
+  list(
+    name = "mfigure3j",
+    celltypes = c("Tumour cell", "DC2", "MoDC3"),
+    samples = c("7" = "HCA_sCTCL13787193", "4" = "HCA_sCTCL13876504")
+  ),
+  list(
+    name = "mfigure4d",
+    celltypes = c("Tumour cell", "B cell"),
+    samples = c("5" = "HCA_sCTCL13876505", "8" = "HCA_sCTCL13787192")
+  ),
+  list(
+    name = "efigure4d", # efigure4f
+    celltypes = c(rep(list(c("Tumour cell", "F2")), 3), list(c("Tumour cell", "VE3"))),
+    samples = c(
+      "2" = "HCA_sCTCL13787190", "3" = "HCA_sCTCL13876502",
+      "1" = "HCA_sCTCL13787191", "2" = "HCA_sCTCL13787190"
+    )
+  ),
+  list(
+    name = "efigure5d", # efigure5f
+    celltypes = c("Tumour cell", "DC2", "MoDC3"), # +DC2
+    samples = c("5" = "HCA_sCTCL13876505", "6" = "HCA_sCTCL13876503")
+  ),
+  list(
+    name = "efigure6i",
+    celltypes = c("Tumour cell", "B cell"),
+    samples = c("2" = "HCA_sCTCL13787190", "3" = "HCA_sCTCL13876502")
+  ),
+  list(
+    name = "xfigureNx",
+    celltypes = c("Tumour cell", "F2", "VE3"),
+    samples = c("7" = "HCA_sCTCL13787193", "4" = "HCA_sCTCL13876504")
+  ),
+  list(
+    name = "xfigureNx", # efigure5f
+    celltypes = c("Tumour cell", "F2", "VE3"),
+    samples = c("5" = "HCA_sCTCL13876505", "6" = "HCA_sCTCL13876503")
+  ),
+  list(
+    name = "efigure6i",
+    celltypes = c("Tumour cell", "F2", "VE3", "B cell", "MoDC3"),
+    samples = c("5" = "HCA_sCTCL13876505", "6" = "HCA_sCTCL13876503")
+  ),
+  list(
+    name = "xfigureNx",
+    celltypes = c("Tumour cell", "F2", "VE3"),
+    samples = c(
+      "2" = "HCA_sCTCL13787190", "3" = "HCA_sCTCL13876502",
+      "1" = "HCA_sCTCL13787191", "2" = "HCA_sCTCL13787190"
+    )
+  )
+)
+
+{ section("Abudance plot on tissue image") ## %%%%%%%%%%%%%%
+  for (i in seq_along(fconfigs)) {
+    fconfigs[[i]]$he.grayscale = FALSE
+    fconfigs[[i]]$img.alpha = 0.2
+    fconfigs[[i]]$he.img.width = 300
+  }
+
+  for (fconfig in fconfigs) {
+    p_name = paste0(file.path(
+      figures_dir, paste(
+        fconfig$name, "spatial-abundance",
+        paste(make.names(unique(unlist(fconfig$celltypes))), collapse="-"),
+        paste(names(fconfig$samples), collapse="-"),
+        sufix, sep="_"
+      )
+    ), ".pdf")
+    nrows <- ceiling(sqrt(length(fconfig$samples)))
+    ncols <- ceiling(length(fconfig$samples) / nrows)
+    # create folder
+    logging::loginfo(paste(basename(p_name), nrows, ncols))
+    pdf(p_name, w = ncols * 5, h = nrows * 3.5)
+    par(mfcol = c(nrows, ncols), mar = c(0, 0, 1, 10), bty = "n")
+    if (!is.list(fconfig$celltypes)) {
+      fconfig$celltypes <- list(fconfig$celltypes)
+    }
+    if (length(fconfig$celltypes) == 1) {
+      fconfig$celltypes <- rep(fconfig$celltypes, length(fconfig$samples))
+    }
+    for (index_i in seq_along(fconfig$samples)) {
+      sample_i <- fconfig$samples[index_i]
+      celltypes <- fconfig$celltypes[[index_i]]
+      ftitle <- paste0(
+        mdata[sample_i, "stage"], ": ",
+        mdata[sample_i, "donor_id"]
+      )
+      ct_proportions <- c2l[[sample_i]][colnames(visium[[sample_i]]), celltypes]
+      p <- plotVisiumMultyColours(
+        visium[[sample_i]], ct_proportions,
+        zfun = function(x) x^2, scale.per.colour = TRUE,
+        col = ct_color[colnames(ct_proportions)], mode = "mean",
+        he.grayscale = fconfig$he.grayscale, img.alpha = fconfig$img.alpha,
+        main = ftitle, legend.ncol = 2,
+        min.opacity=0
+      )
+    }
+    dev.off()
+  }
+}
+
+{ section("Abundance along dermis-epidermis axis") ## %%%%%%
+  # define distance ----------------------------------------
+  for (sample_i in samples_subset) {
+    d <- as.matrix(dist(
+      visium[[sample_i]]@images$slice1@coordinates[, c("imagerow", "imagecol")]
+    ))
+    spot_dist <- min(d[upper.tri(d)])
+    visium[[sample_i]]$dist2junction <- -abs(
+      apply(d[, visium[[sample_i]]$is.surface], 1, min) / spot_dist
+    )
+  }
+  columns_l <- lapply(visium, function(x) colnames(x@meta.data) )
+  columns_u <- unique(unlist(columns_l))
+  columns_s <- sapply(columns_l, function(x) columns_u %in% x ) # shared columns
+  columns_u <- columns_u[rowSums(columns_s) == length(columns_l)]
+  spots <- as.data.frame(data.table::rbindlist(
+    lapply(samples_subset, function(x) visium[[x]]@meta.data[, columns_u] ))
+  )
+  rownames(spots) <- paste0(spots$sample_id, ".", spots$barcode)
+  # binarise the distance ----------------------------------
+  spots$dist2junction = round(as.numeric(spots$dist2surf))
+  # cor(spots$dist2junction, spots$dist2surf) # 0.9988894
+  # mean(spots$dist2junction == spots$dist2surf) # 0.06986634
+  # Calculating matrix with average ------------------------
+  # celltype abundances for each distance bin and each sample
+  dfsmtx.c2l = makeDistFeatureSampleTable(
+    dist = spots$dist2junction,
+    sample = spots$sample_id,
+    data = c2lm,
+    per.spot.norm = TRUE
+  )
+  # dfsmtx.c2l["0", fconfigs_trends[[1]]$celltypes[1], tail(unique(spots$sample_id), 1)]
+  # mean(c2lm_condition[, fconfigs_trends[[1]]$celltypes[1]])
+
+  
+  fselect <- c("mfigure3d", "mfigure3j", "efigure5d", "mfigure4d", "xfigureNx")
+  fnames <- sapply(fconfigs, "[[", "name" )
+  temp <- sapply(fselect, function(x) min(which(fnames %in% x)) )
+  fconfigs_trends <- fconfigs[temp] # [c(1, 2, 4, 5)]
+  fconfigs_trends[[1]]$name = "mfigure3e"
+  fconfigs_trends[[2]]$name = "mfigure3k"
+  ftitle <- "All CTCL combined"
+  if (any(grepl("WS_D", samples_subset))) {
+    ftitle <- "CTCL vs Healthy"
+    for (i in seq_along(fconfigs_trends)) {
+      fconfigs_trends[[i]]$name <- paste0(fconfigs_trends[[i]]$name, "-healthy")
+      fconfigs_trends[[i]]$celltypes <- c(
+        fconfigs_trends[[i]]$celltypes,
+        paste("Healthy", grep("umour", fconfigs_trends[[i]]$celltypes, value = TRUE, invert = TRUE))
+      )
+      fconfigs_trends[[i]]$features_facet <- sapply(
+        X = fconfigs_trends[[i]]$celltypes,
+        FUN = function(x) {
+        if (any(grepl("Healthy", x))){
+          grep("WS_D", samples_subset, value = TRUE)
+        }else{
+          grep("CTCL", samples_subset, value = TRUE)
+        }
+      }, simplify = FALSE)
+    }
+  }
+
+  for (fconfig in fconfigs_trends) {
+    p_name = paste0(file.path(
+      figures_dir, paste(
+        fconfig$name, "trend-abundance",
+        paste(make.names(unique(unlist(fconfig$celltypes))), collapse="-"),
+        sufix, sep="_"
+      )
+    ), ".pdf")
+    logging::loginfo(paste(basename(p_name)))
+    if (is.list(fconfig$celltypes)) next
+    pdf(p_name, w = 7-2, h = 7-3)
+    par(mar = c(4, 4, 2, 8), bty = "n")
+    plotFeatureProfiles_fun(
+      dfsmtx.c2l,
+      features = fconfig$celltypes,
+      features_facet = fconfig$features_facet,
+      cols = ct_color[fconfig$celltypes],
+      lwd = 5,
+      sd.mult = 1,
+      ylim = c(0, 1.3),
+      xlim = range(-15:0),
+      main = ftitle,
+      xlab = "Distance to surface interface (spots)",
+    )
+    dev.off()
+  }
+}
+
+## Conclusions ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+## Save ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%