# glmnet function part a
cv.glmnet.balanced = function(x, y, nfolds = 10, ...) {
foldid = rep(NA_integer_, length(y))
sink("/dev/null")
ll = pamr:::balanced.folds(y, nfolds)
sink()
for (i in seq_along(ll)) { foldid[ ll[[i]] ] = i }
stopifnot(!any(is.na(foldid)), nrow(x) == length(y))
weights = rep(NA_real_, length(y))
tab = table(y)
stopifnot(setequal(names(tab), levels(y)))
for (nm in names(tab)) weights[ y == nm ] = 1/tab[nm]
cv.glmnet(x = x, y = y, foldid = foldid, weights = weights, ...)
}
entities = c("rLN", "MZL", "FL", "MCL", "DLBCL")
# glmnet function part b
my_glmnet = function(df) {
require("glmnet")
require("pamr")
x = dplyr::select(df, all_of(cell_types)) |> as.matrix()
y = factor(df$Entity, levels = entities)
## estimate prediction performance by LOO CV
confusion_table = lapply(seq_along(y), function(i) {
pr = cv.glmnet.balanced(x[-i, ], y[-i], family = "multinomial") |>
predict(newx = x[i,, drop = FALSE], type = "response")
tibble(truth = y[i],
predicted = factor(names(which.max(pr[1,,1])), levels = entities))
}) |> bind_rows() |> table()
## final fit
fit = cv.glmnet.balanced(x, y, family = "multinomial")
cfit = coef(fit)
coefs = lapply(names(cfit), function(ent) {
m = cfit[[ent]]
tibble(Entity = ent,
cell_type = rownames(m),
beta = m[, 1])
}) |> bind_rows() |> mutate(Entity = factor(Entity, levels = entities))
list(fit = fit,
confusion_table = confusion_table,
coefs = coefs)
}
# This function reads and handles TCR data
readTCR <- function(files=NULL){
lapply(files, data.table::fread) %>%
bind_rows() %>%
as_tibble() %>%
mutate(PatientID=strsplit(barcode, split = "_") %>% sapply("[[", 2)) %>%
select(PatientID, 1:(ncol(.)-1)) %>%
rename(Barcode_full=barcode)
}
# This function adds entity to df
add_entity <-
function(data){
data %>%
left_join(., df_meta %>% select(PatientID, Entity) %>% distinct, by="PatientID") %>%
dplyr::mutate(Entity=gsub(Entity, pattern = ", GCB|, non-GCB", replacement = ""))
}
# This function calculates the proportion of variables
add_prop <-
function(data=NULL, vars=NULL, group.vars=NULL, ungroup=TRUE, keep.n=FALSE, prop.name=NULL) {
group.vars <- vars[group.vars]
dftmp <-
data %>%
dplyr::count(!!!syms(unique(c(vars, group.vars)))) %>%
group_by(!!!syms(c(group.vars))) %>%
dplyr::mutate(Prop=n/sum(n))
if(keep.n==FALSE){
dftmp <- dftmp %>% select(-n)
}
if(ungroup==TRUE){
dftmp <- dftmp %>% ungroup()
}
if(!is.null(prop.name)){
colnames(dftmp)[which(colnames(dftmp)=="Prop")] <- prop.name
}
return(dftmp)
}
# This functions expands data frames and fills missing values with 0
fill_zeros <-
function(data=NULL ,names_from=NULL, values_from=NULL) {
data %>%
pivot_wider(names_from = all_of(names_from), values_from = all_of(values_from), values_fill = 0) %>%
pivot_longer(cols = (ncol(data)-1):ncol(.), names_to = names_from, values_to = values_from)
}
# Run standard Seurat processing pipeline
SeuratProc_T <-
function(sobj, verbose=FALSE, dims.clustering=NULL, resolution.clustering=NULL, dims.umap=NULL) {
# Remove
sobj <- DietSeurat(sobj)
DefaultAssay(sobj) <- "RNA"
# Filter data set based on RNA
sobj <- FindVariableFeatures(sobj, selection.method = "vst", nfeatures = 2000, verbose=verbose)
# Scale data (RNA and ADT)
sobj <- ScaleData(sobj, features = rownames(sobj), verbose=verbose)
# Assess cell cycle
sobj <- CellCycleScoring(sobj, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes, set.ident = TRUE)
sobj <- ScaleData(sobj, vars.to.regress = c("S.Score", "G2M.Score", "percent.mt"), verbose=verbose)
# Run PCA
sobj <- RunPCA(sobj, features = VariableFeatures(sobj), nfeatures.print=5, ndims.print=1:2,
reduction.name = "pcaRNA", reduction.key = "pcaRNA_")
# Run clustering based on transcriptome
sobj <- FindNeighbors(sobj, dims = dims.clustering, verbose=verbose, reduction = "pcaRNA")
sobj <- FindClusters(sobj, resolution = resolution.clustering, verbose=verbose)
# Run UMAP based on transcriptome
sobj <- RunUMAP(sobj, dims = dims.umap, verbose=verbose, reduction.key = "umapRNA_",
reduction.name = "umapRNA", reduction = "pcaRNA")
return(sobj)
}
SeuratProcADT_T <-
function(sobj, verbose=FALSE, dims.clustering=NULL, resolution.clustering=NULL, dims.umap=NULL) {
DefaultAssay(sobj) <- "ADT"
VariableFeatures(sobj, assay="ADT") <- rownames(sobj@assays$ADT)
sobj <- ScaleData(sobj, assay = "ADT", verbose=verbose)
#### Run PCA and print ElbowPlot
sobj <- RunPCA(sobj, features = rownames(sobj@assays$ADT), nfeatures.print=5, ndims.print=1:2,
reduction.name = "pcaADT", reduction.key = "pcaADT_")
#### Run clustering based on ADT
sobj <- FindNeighbors(sobj, dims = dims.clustering, verbose=verbose, reduction = "pcaADT")
sobj <- FindClusters(sobj, resolution = resolution.clustering, verbose=verbose)
#### Run UMAP based on ADT
sobj <- RunUMAP(sobj, dims = dims.umap, verbose=verbose, reduction = "pcaADT",
reduction.name = "umapADT", reduction.key = "umapADT_")
DefaultAssay(sobj) <- "RNA"
return(sobj)
}