a b/R/Functions.R
1
# glmnet function part a
2
cv.glmnet.balanced = function(x, y, nfolds = 10, ...) {
3
  
4
  foldid = rep(NA_integer_, length(y))
5
  sink("/dev/null")
6
  ll = pamr:::balanced.folds(y, nfolds) 
7
  sink()
8
  for (i in seq_along(ll)) { foldid[ ll[[i]] ] = i }
9
  stopifnot(!any(is.na(foldid)), nrow(x) == length(y))
10
  
11
  weights = rep(NA_real_, length(y))
12
  tab = table(y)
13
  stopifnot(setequal(names(tab), levels(y)))
14
  for (nm in names(tab)) weights[ y == nm ] = 1/tab[nm]
15
16
  cv.glmnet(x = x, y = y, foldid = foldid, weights = weights, ...) 
17
}
18
19
entities = c("rLN", "MZL", "FL", "MCL", "DLBCL") 
20
21
# glmnet function part b
22
my_glmnet = function(df) {
23
  require("glmnet")
24
  require("pamr")
25
  x = dplyr::select(df, all_of(cell_types)) |> as.matrix()
26
  y = factor(df$Entity, levels = entities)
27
  
28
  ## estimate prediction performance by LOO CV
29
  confusion_table = lapply(seq_along(y), function(i) {
30
    pr = cv.glmnet.balanced(x[-i, ], y[-i], family = "multinomial") |>
31
      predict(newx = x[i,, drop = FALSE], type = "response")
32
    tibble(truth   = y[i],
33
           predicted = factor(names(which.max(pr[1,,1])), levels = entities))
34
  }) |> bind_rows() |> table()
35
  
36
  ## final fit
37
  fit  = cv.glmnet.balanced(x, y, family = "multinomial")
38
  cfit = coef(fit)
39
  coefs = lapply(names(cfit), function(ent) { 
40
    m = cfit[[ent]]
41
    tibble(Entity    = ent,
42
           cell_type = rownames(m), 
43
           beta      =  m[, 1]) 
44
  }) |> bind_rows() |> mutate(Entity = factor(Entity, levels = entities))
45
  
46
  list(fit = fit, 
47
       confusion_table = confusion_table,
48
       coefs = coefs)
49
}
50
51
# This function reads and handles TCR data
52
readTCR <- function(files=NULL){
53
  
54
  lapply(files, data.table::fread) %>% 
55
    bind_rows() %>% 
56
    as_tibble() %>% 
57
    mutate(PatientID=strsplit(barcode, split = "_") %>% sapply("[[", 2)) %>% 
58
    select(PatientID, 1:(ncol(.)-1)) %>% 
59
    rename(Barcode_full=barcode)
60
61
} 
62
63
# This function adds entity to df
64
add_entity <- 
65
  function(data){
66
      data %>% 
67
        left_join(., df_meta %>% select(PatientID, Entity) %>% distinct, by="PatientID") %>% 
68
        dplyr::mutate(Entity=gsub(Entity, pattern = ", GCB|, non-GCB", replacement = ""))
69
  }
70
71
72
# This function calculates the proportion of variables
73
add_prop <- 
74
  function(data=NULL, vars=NULL, group.vars=NULL, ungroup=TRUE, keep.n=FALSE, prop.name=NULL) {
75
    
76
    group.vars <-  vars[group.vars]
77
    
78
    dftmp <- 
79
      data %>% 
80
        dplyr::count(!!!syms(unique(c(vars, group.vars)))) %>% 
81
        group_by(!!!syms(c(group.vars))) %>% 
82
        dplyr::mutate(Prop=n/sum(n))
83
    
84
    if(keep.n==FALSE){
85
      dftmp <- dftmp %>% select(-n)
86
    }
87
    
88
    if(ungroup==TRUE){
89
      dftmp <- dftmp %>% ungroup()
90
    }
91
    
92
    if(!is.null(prop.name)){
93
      colnames(dftmp)[which(colnames(dftmp)=="Prop")] <- prop.name
94
      
95
    }
96
    
97
    return(dftmp)
98
  }
99
100
# This functions expands data frames and fills missing values with 0
101
fill_zeros <- 
102
  function(data=NULL ,names_from=NULL, values_from=NULL) {
103
    data %>% 
104
      pivot_wider(names_from = all_of(names_from), values_from = all_of(values_from), values_fill = 0) %>% 
105
      pivot_longer(cols = (ncol(data)-1):ncol(.), names_to = names_from, values_to = values_from)
106
  }
107
  
108
# Run standard Seurat processing pipeline
109
SeuratProc_T <- 
110
  function(sobj, verbose=FALSE, dims.clustering=NULL, resolution.clustering=NULL, dims.umap=NULL) {
111
    
112
    # Remove 
113
    sobj <- DietSeurat(sobj)
114
    DefaultAssay(sobj) <- "RNA"
115
    
116
    # Filter data set based on RNA
117
    sobj <- FindVariableFeatures(sobj, selection.method = "vst", nfeatures = 2000, verbose=verbose)
118
    
119
    # Scale data (RNA and ADT)
120
    sobj <- ScaleData(sobj, features = rownames(sobj), verbose=verbose)
121
    
122
    # Assess cell cycle
123
    sobj <- CellCycleScoring(sobj, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes, set.ident = TRUE)
124
    sobj <- ScaleData(sobj, vars.to.regress = c("S.Score", "G2M.Score", "percent.mt"), verbose=verbose)
125
    
126
    # Run PCA
127
    sobj <- RunPCA(sobj, features = VariableFeatures(sobj), nfeatures.print=5, ndims.print=1:2,
128
                   reduction.name = "pcaRNA", reduction.key = "pcaRNA_")
129
    
130
    # Run clustering based on transcriptome
131
    sobj <- FindNeighbors(sobj, dims = dims.clustering, verbose=verbose, reduction = "pcaRNA")
132
    sobj <- FindClusters(sobj, resolution = resolution.clustering, verbose=verbose)
133
    
134
    # Run UMAP based on transcriptome
135
    sobj <- RunUMAP(sobj, dims = dims.umap, verbose=verbose, reduction.key = "umapRNA_",
136
                    reduction.name = "umapRNA", reduction = "pcaRNA")
137
    
138
    return(sobj)
139
    
140
  }
141
142
SeuratProcADT_T <- 
143
  function(sobj, verbose=FALSE, dims.clustering=NULL, resolution.clustering=NULL, dims.umap=NULL) {
144
    
145
    DefaultAssay(sobj) <- "ADT"
146
    VariableFeatures(sobj, assay="ADT") <- rownames(sobj@assays$ADT)
147
    
148
    sobj <- ScaleData(sobj, assay = "ADT", verbose=verbose)
149
    
150
    #### Run PCA and print ElbowPlot
151
    sobj <- RunPCA(sobj, features = rownames(sobj@assays$ADT), nfeatures.print=5, ndims.print=1:2, 
152
                   reduction.name = "pcaADT", reduction.key = "pcaADT_")
153
    
154
    #### Run clustering based on ADT
155
    sobj <- FindNeighbors(sobj, dims = dims.clustering, verbose=verbose,  reduction = "pcaADT")
156
    sobj <- FindClusters(sobj, resolution = resolution.clustering, verbose=verbose)
157
    
158
    #### Run UMAP based on ADT
159
    sobj <- RunUMAP(sobj, dims = dims.umap, verbose=verbose, reduction = "pcaADT",
160
                    reduction.name = "umapADT", reduction.key = "umapADT_")
161
    
162
    DefaultAssay(sobj) <- "RNA"
163
    
164
    return(sobj)
165
    
166
  }