CITEseqLN-Tcells / Git / [9905a0] /R/Functions.R

Models:
WandaB/
CITEseqLN-Tcells
Downloads: 1
[9905a0]: / R / Functions.R
History
Download this file
167 lines (125 with data), 5.6 kB

# glmnet function part a
cv.glmnet.balanced = function(x, y, nfolds = 10, ...) {
  
  foldid = rep(NA_integer_, length(y))
  sink("/dev/null")
  ll = pamr:::balanced.folds(y, nfolds) 
  sink()
  for (i in seq_along(ll)) { foldid[ ll[[i]] ] = i }
  stopifnot(!any(is.na(foldid)), nrow(x) == length(y))
  
  weights = rep(NA_real_, length(y))
  tab = table(y)
  stopifnot(setequal(names(tab), levels(y)))
  for (nm in names(tab)) weights[ y == nm ] = 1/tab[nm]

  cv.glmnet(x = x, y = y, foldid = foldid, weights = weights, ...) 
}

entities = c("rLN", "MZL", "FL", "MCL", "DLBCL") 

# glmnet function part b
my_glmnet = function(df) {
  require("glmnet")
  require("pamr")
  x = dplyr::select(df, all_of(cell_types)) |> as.matrix()
  y = factor(df$Entity, levels = entities)
  
  ## estimate prediction performance by LOO CV
  confusion_table = lapply(seq_along(y), function(i) {
    pr = cv.glmnet.balanced(x[-i, ], y[-i], family = "multinomial") |>
      predict(newx = x[i,, drop = FALSE], type = "response")
    tibble(truth   = y[i],
           predicted = factor(names(which.max(pr[1,,1])), levels = entities))
  }) |> bind_rows() |> table()
  
  ## final fit
  fit  = cv.glmnet.balanced(x, y, family = "multinomial")
  cfit = coef(fit)
  coefs = lapply(names(cfit), function(ent) { 
    m = cfit[[ent]]
    tibble(Entity    = ent,
           cell_type = rownames(m), 
           beta      =  m[, 1]) 
  }) |> bind_rows() |> mutate(Entity = factor(Entity, levels = entities))
  
  list(fit = fit, 
       confusion_table = confusion_table,
       coefs = coefs)
}

# This function reads and handles TCR data
readTCR <- function(files=NULL){
  
  lapply(files, data.table::fread) %>% 
    bind_rows() %>% 
    as_tibble() %>% 
    mutate(PatientID=strsplit(barcode, split = "_") %>% sapply("[[", 2)) %>% 
    select(PatientID, 1:(ncol(.)-1)) %>% 
    rename(Barcode_full=barcode)

} 

# This function adds entity to df
add_entity <- 
  function(data){
      data %>% 
        left_join(., df_meta %>% select(PatientID, Entity) %>% distinct, by="PatientID") %>% 
        dplyr::mutate(Entity=gsub(Entity, pattern = ", GCB|, non-GCB", replacement = ""))
  }


# This function calculates the proportion of variables
add_prop <- 
  function(data=NULL, vars=NULL, group.vars=NULL, ungroup=TRUE, keep.n=FALSE, prop.name=NULL) {
    
    group.vars <-  vars[group.vars]
    
    dftmp <- 
      data %>% 
        dplyr::count(!!!syms(unique(c(vars, group.vars)))) %>% 
        group_by(!!!syms(c(group.vars))) %>% 
        dplyr::mutate(Prop=n/sum(n))
    
    if(keep.n==FALSE){
      dftmp <- dftmp %>% select(-n)
    }
    
    if(ungroup==TRUE){
      dftmp <- dftmp %>% ungroup()
    }
    
    if(!is.null(prop.name)){
      colnames(dftmp)[which(colnames(dftmp)=="Prop")] <- prop.name
      
    }
    
    return(dftmp)
  }

# This functions expands data frames and fills missing values with 0
fill_zeros <- 
  function(data=NULL ,names_from=NULL, values_from=NULL) {
    data %>% 
      pivot_wider(names_from = all_of(names_from), values_from = all_of(values_from), values_fill = 0) %>% 
      pivot_longer(cols = (ncol(data)-1):ncol(.), names_to = names_from, values_to = values_from)
  }
  
# Run standard Seurat processing pipeline
SeuratProc_T <- 
  function(sobj, verbose=FALSE, dims.clustering=NULL, resolution.clustering=NULL, dims.umap=NULL) {
    
    # Remove 
    sobj <- DietSeurat(sobj)
    DefaultAssay(sobj) <- "RNA"
    
    # Filter data set based on RNA
    sobj <- FindVariableFeatures(sobj, selection.method = "vst", nfeatures = 2000, verbose=verbose)
    
    # Scale data (RNA and ADT)
    sobj <- ScaleData(sobj, features = rownames(sobj), verbose=verbose)
    
    # Assess cell cycle
    sobj <- CellCycleScoring(sobj, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes, set.ident = TRUE)
    sobj <- ScaleData(sobj, vars.to.regress = c("S.Score", "G2M.Score", "percent.mt"), verbose=verbose)
    
    # Run PCA
    sobj <- RunPCA(sobj, features = VariableFeatures(sobj), nfeatures.print=5, ndims.print=1:2,
                   reduction.name = "pcaRNA", reduction.key = "pcaRNA_")
    
    # Run clustering based on transcriptome
    sobj <- FindNeighbors(sobj, dims = dims.clustering, verbose=verbose, reduction = "pcaRNA")
    sobj <- FindClusters(sobj, resolution = resolution.clustering, verbose=verbose)
    
    # Run UMAP based on transcriptome
    sobj <- RunUMAP(sobj, dims = dims.umap, verbose=verbose, reduction.key = "umapRNA_",
                    reduction.name = "umapRNA", reduction = "pcaRNA")
    
    return(sobj)
    
  }

SeuratProcADT_T <- 
  function(sobj, verbose=FALSE, dims.clustering=NULL, resolution.clustering=NULL, dims.umap=NULL) {
    
    DefaultAssay(sobj) <- "ADT"
    VariableFeatures(sobj, assay="ADT") <- rownames(sobj@assays$ADT)
    
    sobj <- ScaleData(sobj, assay = "ADT", verbose=verbose)
    
    #### Run PCA and print ElbowPlot
    sobj <- RunPCA(sobj, features = rownames(sobj@assays$ADT), nfeatures.print=5, ndims.print=1:2, 
                   reduction.name = "pcaADT", reduction.key = "pcaADT_")
    
    #### Run clustering based on ADT
    sobj <- FindNeighbors(sobj, dims = dims.clustering, verbose=verbose,  reduction = "pcaADT")
    sobj <- FindClusters(sobj, resolution = resolution.clustering, verbose=verbose)
    
    #### Run UMAP based on ADT
    sobj <- RunUMAP(sobj, dims = dims.umap, verbose=verbose, reduction = "pcaADT",
                    reduction.name = "umapADT", reduction.key = "umapADT_")
    
    DefaultAssay(sobj) <- "RNA"
    
    return(sobj)
    
  }