|
a |
|
b/R/Functions.R |
|
|
1 |
# glmnet function part a |
|
|
2 |
cv.glmnet.balanced = function(x, y, nfolds = 10, ...) { |
|
|
3 |
|
|
|
4 |
foldid = rep(NA_integer_, length(y)) |
|
|
5 |
sink("/dev/null") |
|
|
6 |
ll = pamr:::balanced.folds(y, nfolds) |
|
|
7 |
sink() |
|
|
8 |
for (i in seq_along(ll)) { foldid[ ll[[i]] ] = i } |
|
|
9 |
stopifnot(!any(is.na(foldid)), nrow(x) == length(y)) |
|
|
10 |
|
|
|
11 |
weights = rep(NA_real_, length(y)) |
|
|
12 |
tab = table(y) |
|
|
13 |
stopifnot(setequal(names(tab), levels(y))) |
|
|
14 |
for (nm in names(tab)) weights[ y == nm ] = 1/tab[nm] |
|
|
15 |
|
|
|
16 |
cv.glmnet(x = x, y = y, foldid = foldid, weights = weights, ...) |
|
|
17 |
} |
|
|
18 |
|
|
|
19 |
entities = c("rLN", "MZL", "FL", "MCL", "DLBCL") |
|
|
20 |
|
|
|
21 |
# glmnet function part b |
|
|
22 |
my_glmnet = function(df) { |
|
|
23 |
require("glmnet") |
|
|
24 |
require("pamr") |
|
|
25 |
x = dplyr::select(df, all_of(cell_types)) |> as.matrix() |
|
|
26 |
y = factor(df$Entity, levels = entities) |
|
|
27 |
|
|
|
28 |
## estimate prediction performance by LOO CV |
|
|
29 |
confusion_table = lapply(seq_along(y), function(i) { |
|
|
30 |
pr = cv.glmnet.balanced(x[-i, ], y[-i], family = "multinomial") |> |
|
|
31 |
predict(newx = x[i,, drop = FALSE], type = "response") |
|
|
32 |
tibble(truth = y[i], |
|
|
33 |
predicted = factor(names(which.max(pr[1,,1])), levels = entities)) |
|
|
34 |
}) |> bind_rows() |> table() |
|
|
35 |
|
|
|
36 |
## final fit |
|
|
37 |
fit = cv.glmnet.balanced(x, y, family = "multinomial") |
|
|
38 |
cfit = coef(fit) |
|
|
39 |
coefs = lapply(names(cfit), function(ent) { |
|
|
40 |
m = cfit[[ent]] |
|
|
41 |
tibble(Entity = ent, |
|
|
42 |
cell_type = rownames(m), |
|
|
43 |
beta = m[, 1]) |
|
|
44 |
}) |> bind_rows() |> mutate(Entity = factor(Entity, levels = entities)) |
|
|
45 |
|
|
|
46 |
list(fit = fit, |
|
|
47 |
confusion_table = confusion_table, |
|
|
48 |
coefs = coefs) |
|
|
49 |
} |
|
|
50 |
|
|
|
51 |
# This function reads and handles TCR data |
|
|
52 |
readTCR <- function(files=NULL){ |
|
|
53 |
|
|
|
54 |
lapply(files, data.table::fread) %>% |
|
|
55 |
bind_rows() %>% |
|
|
56 |
as_tibble() %>% |
|
|
57 |
mutate(PatientID=strsplit(barcode, split = "_") %>% sapply("[[", 2)) %>% |
|
|
58 |
select(PatientID, 1:(ncol(.)-1)) %>% |
|
|
59 |
rename(Barcode_full=barcode) |
|
|
60 |
|
|
|
61 |
} |
|
|
62 |
|
|
|
63 |
# This function adds entity to df |
|
|
64 |
add_entity <- |
|
|
65 |
function(data){ |
|
|
66 |
data %>% |
|
|
67 |
left_join(., df_meta %>% select(PatientID, Entity) %>% distinct, by="PatientID") %>% |
|
|
68 |
dplyr::mutate(Entity=gsub(Entity, pattern = ", GCB|, non-GCB", replacement = "")) |
|
|
69 |
} |
|
|
70 |
|
|
|
71 |
|
|
|
72 |
# This function calculates the proportion of variables |
|
|
73 |
add_prop <- |
|
|
74 |
function(data=NULL, vars=NULL, group.vars=NULL, ungroup=TRUE, keep.n=FALSE, prop.name=NULL) { |
|
|
75 |
|
|
|
76 |
group.vars <- vars[group.vars] |
|
|
77 |
|
|
|
78 |
dftmp <- |
|
|
79 |
data %>% |
|
|
80 |
dplyr::count(!!!syms(unique(c(vars, group.vars)))) %>% |
|
|
81 |
group_by(!!!syms(c(group.vars))) %>% |
|
|
82 |
dplyr::mutate(Prop=n/sum(n)) |
|
|
83 |
|
|
|
84 |
if(keep.n==FALSE){ |
|
|
85 |
dftmp <- dftmp %>% select(-n) |
|
|
86 |
} |
|
|
87 |
|
|
|
88 |
if(ungroup==TRUE){ |
|
|
89 |
dftmp <- dftmp %>% ungroup() |
|
|
90 |
} |
|
|
91 |
|
|
|
92 |
if(!is.null(prop.name)){ |
|
|
93 |
colnames(dftmp)[which(colnames(dftmp)=="Prop")] <- prop.name |
|
|
94 |
|
|
|
95 |
} |
|
|
96 |
|
|
|
97 |
return(dftmp) |
|
|
98 |
} |
|
|
99 |
|
|
|
100 |
# This functions expands data frames and fills missing values with 0 |
|
|
101 |
fill_zeros <- |
|
|
102 |
function(data=NULL ,names_from=NULL, values_from=NULL) { |
|
|
103 |
data %>% |
|
|
104 |
pivot_wider(names_from = all_of(names_from), values_from = all_of(values_from), values_fill = 0) %>% |
|
|
105 |
pivot_longer(cols = (ncol(data)-1):ncol(.), names_to = names_from, values_to = values_from) |
|
|
106 |
} |
|
|
107 |
|
|
|
108 |
# Run standard Seurat processing pipeline |
|
|
109 |
SeuratProc_T <- |
|
|
110 |
function(sobj, verbose=FALSE, dims.clustering=NULL, resolution.clustering=NULL, dims.umap=NULL) { |
|
|
111 |
|
|
|
112 |
# Remove |
|
|
113 |
sobj <- DietSeurat(sobj) |
|
|
114 |
DefaultAssay(sobj) <- "RNA" |
|
|
115 |
|
|
|
116 |
# Filter data set based on RNA |
|
|
117 |
sobj <- FindVariableFeatures(sobj, selection.method = "vst", nfeatures = 2000, verbose=verbose) |
|
|
118 |
|
|
|
119 |
# Scale data (RNA and ADT) |
|
|
120 |
sobj <- ScaleData(sobj, features = rownames(sobj), verbose=verbose) |
|
|
121 |
|
|
|
122 |
# Assess cell cycle |
|
|
123 |
sobj <- CellCycleScoring(sobj, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes, set.ident = TRUE) |
|
|
124 |
sobj <- ScaleData(sobj, vars.to.regress = c("S.Score", "G2M.Score", "percent.mt"), verbose=verbose) |
|
|
125 |
|
|
|
126 |
# Run PCA |
|
|
127 |
sobj <- RunPCA(sobj, features = VariableFeatures(sobj), nfeatures.print=5, ndims.print=1:2, |
|
|
128 |
reduction.name = "pcaRNA", reduction.key = "pcaRNA_") |
|
|
129 |
|
|
|
130 |
# Run clustering based on transcriptome |
|
|
131 |
sobj <- FindNeighbors(sobj, dims = dims.clustering, verbose=verbose, reduction = "pcaRNA") |
|
|
132 |
sobj <- FindClusters(sobj, resolution = resolution.clustering, verbose=verbose) |
|
|
133 |
|
|
|
134 |
# Run UMAP based on transcriptome |
|
|
135 |
sobj <- RunUMAP(sobj, dims = dims.umap, verbose=verbose, reduction.key = "umapRNA_", |
|
|
136 |
reduction.name = "umapRNA", reduction = "pcaRNA") |
|
|
137 |
|
|
|
138 |
return(sobj) |
|
|
139 |
|
|
|
140 |
} |
|
|
141 |
|
|
|
142 |
SeuratProcADT_T <- |
|
|
143 |
function(sobj, verbose=FALSE, dims.clustering=NULL, resolution.clustering=NULL, dims.umap=NULL) { |
|
|
144 |
|
|
|
145 |
DefaultAssay(sobj) <- "ADT" |
|
|
146 |
VariableFeatures(sobj, assay="ADT") <- rownames(sobj@assays$ADT) |
|
|
147 |
|
|
|
148 |
sobj <- ScaleData(sobj, assay = "ADT", verbose=verbose) |
|
|
149 |
|
|
|
150 |
#### Run PCA and print ElbowPlot |
|
|
151 |
sobj <- RunPCA(sobj, features = rownames(sobj@assays$ADT), nfeatures.print=5, ndims.print=1:2, |
|
|
152 |
reduction.name = "pcaADT", reduction.key = "pcaADT_") |
|
|
153 |
|
|
|
154 |
#### Run clustering based on ADT |
|
|
155 |
sobj <- FindNeighbors(sobj, dims = dims.clustering, verbose=verbose, reduction = "pcaADT") |
|
|
156 |
sobj <- FindClusters(sobj, resolution = resolution.clustering, verbose=verbose) |
|
|
157 |
|
|
|
158 |
#### Run UMAP based on ADT |
|
|
159 |
sobj <- RunUMAP(sobj, dims = dims.umap, verbose=verbose, reduction = "pcaADT", |
|
|
160 |
reduction.name = "umapADT", reduction.key = "umapADT_") |
|
|
161 |
|
|
|
162 |
DefaultAssay(sobj) <- "RNA" |
|
|
163 |
|
|
|
164 |
return(sobj) |
|
|
165 |
|
|
|
166 |
} |