a b/TcellAnalysis/notebooks/COVID19_Tcell_DA.Rmd
1
---
2
title: "COVID19: T cell differential abundance"
3
output: html_notebook
4
---
5
6
Performing differential abundance testing using a negative binomial GLM. Cell counts per donor sample are used as input, and the total number of cells 
7
captured (after QC) are used to normalize the model counts.
8
9
10
```{r, warning=FALSE, message=FALSE}
11
library(ggplot2)
12
library(ggsci)
13
library(cowplot)
14
library(RColorBrewer)
15
library(reshape2)
16
library(colorspace)
17
library(ggthemes)
18
library(edgeR)
19
library(scales)
20
library(ggrepel)
21
library(dplyr)
22
```
23
24
25
```{r}
26
covid.meta <- read.csv("~/Dropbox/COVID19/Data/Metadata FINAL 10122020.csv",
27
                        header=TRUE, stringsAsFactors=FALSE)
28
29
covid.meta$Days_from_onset[covid.meta$Days_from_onset %in% c("Not_known", "Healthy")] <- NA
30
covid.meta$Days_from_onset <- as.numeric(covid.meta$Days_from_onset)
31
32
cell.freq.merge <- read.table("~/Dropbox/COVID19/Data/Tcell_proportions.tsv",
33
                              sep="\t", header=TRUE, stringsAsFactors=FALSE)
34
35
cell.freq.merge <- cell.freq.merge[!cell.freq.merge$D0_status_summary %in% c("Non_covid"), ]
36
cell.freq.merge <- cell.freq.merge[!cell.freq.merge$patient_id %in% c("CV0198"), ]
37
# remove doublets, etc
38
cell.freq.merge <- cell.freq.merge[!cell.freq.merge$CellType %in% c("Doublets:Bcell", "Doublets:Platelet", "Doublets", "NK", "ILCs"), ]
39
```
40
41
42
```{r, warning=FALSE, message=FALSE}
43
all.meta <- read.table("~/Dropbox/COVID19/Data/COVID19_scMeta-data.tsv",
44
                       sep="\t", header=TRUE, stringsAsFactors=FALSE)
45
rownames(all.meta) <- all.meta$CellID
46
47
# remove BGCV01_CV0209 and CV0198
48
all.meta <- all.meta[!all.meta$sample_id %in% c("BGCV01_CV0902"), ]
49
all.meta <- all.meta[!all.meta$patient_id %in% c("CV0198"), ]
50
all.meta$Days_from_onset[all.meta$Days_from_onset %in% c("Not_known", "Healthy")] <- NA
51
all.meta$Days_from_onset <- as.numeric(all.meta$Days_from_onset)
52
53
n.cell.vecc <- table(all.meta$sample_id)
54
```
55
56
57
```{r}
58
tcell.annots <- read.table("~/Dropbox/COVID19/Data/Tcell_annotations_ext.tsv",
59
                           sep="\t", header=TRUE, stringsAsFactors=FALSE)
60
tcell.df.merge <- merge(tcell.annots, all.meta, by='CellID')
61
```
62
63
## Cluster-baed DA analysis: Infected vs. controls
64
65
```{r}
66
# set up testing model
67
rownames(covid.meta) <- covid.meta$sample_id
68
tcell.meta <- covid.meta[!covid.meta$D0_status_summary %in% c("Non_covid", "LPS"), ]
69
tcell.meta$OrderedSeverity <- ordered(tcell.meta$D0_status_summary,
70
                                      levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
71
tcell.meta$Infected <- ordered(tcell.meta$Status,
72
                               levels=c("Healthy", "Covid"))
73
74
tcell.model <- model.matrix(~ Sex + Age + Infected, data=tcell.meta[tcell.meta$Collection_Day %in% c("D0"), ])
75
76
# count cells
77
cell.freq.tab <- t(table(tcell.df.merge$sample_id[tcell.df.merge$Collection_Day %in% c("D0") &
78
                                                        !tcell.df.merge$D0_status_summary %in% c("LPS", "Non_covid")],
79
                         tcell.df.merge$Sub.Annotation[tcell.df.merge$Collection_Day %in% c("D0") &
80
                                                        !tcell.df.merge$D0_status_summary %in% c("LPS", "Non_covid")]))
81
cell.freq.tab <- cell.freq.tab[!rownames(cell.freq.tab) %in% c("NK", "ILCs", "Doublets", "Doublets:Bcell", "Doublets:Platelet"), ]
82
test.samps <- intersect(colnames(cell.freq.tab), names(n.cell.vecc))
83
cell.freq.tab <- cell.freq.tab[, test.samps]
84
tcell.model <- tcell.model[colnames(cell.freq.tab), ]
85
86
tcell.dge <- DGEList(cell.freq.tab, lib.size=log(n.cell.vecc[test.samps]))
87
88
#estimate dispersions
89
tcell.dge <- estimateDisp(tcell.dge, design=tcell.model)
90
tcell.linear.fit <- glmQLFit(tcell.dge, tcell.model, robust=TRUE)
91
tcell.res <- as.data.frame(topTags(glmQLFTest(tcell.linear.fit, coef=4), sort.by='none', n=Inf))
92
tcell.res$CellType <- rownames(tcell.res)
93
94
tcell.res$Sig <- as.numeric(tcell.res$FDR < 0.1)
95
tcell.res$Diff <- sign(tcell.res$logFC)
96
tcell.res$Diff[tcell.res$FDR > 0.1] <- 0
97
table(tcell.res$Diff)
98
```
99
100
101
102
```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
103
mx.lfc <- max(abs(tcell.res$logFC))
104
eps <- mx.lfc * 0.05
105
106
infect.resplot.labels <- tcell.res$CellType
107
infect.resplot.labels[tcell.res$Sig == 0] <- ""
108
109
ggplot(tcell.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
110
    geom_point(shape=21, size=4) +
111
    theme_cowplot() +
112
    scale_fill_manual(values=c("black", "red")) +
113
    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
114
    guides(fill=guide_legend(title="FDR < 0.1")) +
115
    geom_text_repel(aes(label=infect.resplot.labels),
116
                     fill='white') +
117
    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
118
    ggsave("~/Dropbox/COVID19/plot.dir/Tcell_edgeR_volcano_caseVscontrol-linear.pdf",
119
           height=2.95, width=4.15, useDingbats=FALSE) +
120
    NULL
121
```
122
123
124
## Cluster-baed DA analysis: COVID19 severity
125
126
```{r}
127
# set up testing model
128
rownames(covid.meta) <- covid.meta$sample_id
129
tcell.meta <- covid.meta[!covid.meta$D0_status_summary %in% c("Non_covid", "LPS", "Healthy"), ]
130
tcell.meta$OrderedSeverity <- ordered(tcell.meta$D0_status_summary,
131
                                      levels=c("Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
132
133
tcell.model <- model.matrix(~ Sex + Age + OrderedSeverity, data=tcell.meta[tcell.meta$Collection_Day %in% c("D0"), ])
134
135
# count cells
136
cell.freq.tab <- t(table(tcell.df.merge$sample_id[tcell.df.merge$Collection_Day %in% c("D0") &
137
                                                        !tcell.df.merge$D0_status_summary %in% c("LPS", "Non_covid", "Healthy")],
138
                         tcell.df.merge$Sub.Annotation[tcell.df.merge$Collection_Day %in% c("D0") &
139
                                                        !tcell.df.merge$D0_status_summary %in% c("LPS", "Non_covid", "Healthy")]))
140
cell.freq.tab <- cell.freq.tab[!rownames(cell.freq.tab) %in% c("NK", "ILCs", "Doublets", "Doublets:Bcell", "Doublets:Platelet"), ]
141
test.samps <- intersect(colnames(cell.freq.tab), names(n.cell.vecc))
142
cell.freq.tab <- cell.freq.tab[, test.samps]
143
tcell.model <- tcell.model[colnames(cell.freq.tab), ]
144
145
tcell.dge <- DGEList(cell.freq.tab, lib.size=log(n.cell.vecc[test.samps]))
146
147
#estimate dispersions
148
tcell.dge <- estimateDisp(tcell.dge, design=tcell.model)
149
tcell.linear.fit <- glmQLFit(tcell.dge, tcell.model, robust=TRUE)
150
tcell.res <- as.data.frame(topTags(glmQLFTest(tcell.linear.fit, coef=4), sort.by='none', n=Inf))
151
tcell.res$CellType <- rownames(tcell.res)
152
153
tcell.res$Sig <- as.numeric(tcell.res$FDR < 0.1)
154
tcell.res$Diff <- sign(tcell.res$logFC)
155
tcell.res$Diff[tcell.res$FDR > 0.1] <- 0
156
table(tcell.res$Diff)
157
```
158
159
160
161
```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
162
mx.lfc <- max(abs(tcell.res$logFC))
163
eps <- mx.lfc * 0.05
164
165
tcell.resplot.labels <- tcell.res$CellType
166
tcell.resplot.labels[tcell.res$Sig == 0] <- ""
167
168
ggplot(tcell.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
169
    geom_point(shape=21, size=4) +
170
    theme_cowplot() +
171
    scale_fill_manual(values=c("black", "red")) +
172
    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
173
    guides(fill=guide_legend(title="FDR < 0.1")) +
174
    geom_text_repel(aes(label=tcell.resplot.labels),
175
                     fill='white') +
176
    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
177
    ggsave("~/Dropbox/COVID19/plot.dir/Tcell_edgeR_volcano_caseOnly-linear.pdf",
178
           height=2.95, width=4.15, useDingbats=FALSE) +
179
    NULL
180
```
181
182
183
Quadratic changes.
184
185
186
```{r}
187
tcell.quad.res <- as.data.frame(topTags(glmQLFTest(tcell.linear.fit, coef=5), sort.by='none', n=Inf))
188
tcell.quad.res$CellType <- rownames(tcell.quad.res)
189
190
tcell.quad.res$Sig <- as.numeric(tcell.quad.res$FDR < 0.1)
191
tcell.quad.res$Diff <- sign(tcell.quad.res$logFC)
192
tcell.quad.res$Diff[tcell.quad.res$FDR > 0.1] <- 0
193
table(tcell.quad.res$Diff)
194
```
195
196
197
```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
198
mx.lfc <- max(abs(tcell.quad.res$logFC))
199
eps <- mx.lfc * 0.05
200
201
tcell.quad.resplot.labels <- tcell.quad.res$CellType
202
tcell.quad.resplot.labels[tcell.quad.res$Sig == 0] <- ""
203
204
ggplot(tcell.quad.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
205
    geom_point(shape=21, size=4) +
206
    theme_cowplot() +
207
    scale_fill_manual(values=c("black", "red")) +
208
    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
209
    guides(fill=guide_legend(title="FDR < 0.1")) +
210
    geom_text_repel(aes(label=tcell.quad.resplot.labels),
211
                    force=10,
212
                    fill='white') +
213
    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
214
    ggsave("~/Dropbox/COVID19/plot.dir/Tcell_edgeR_volcano_caseOnly-quadratic.pdf",
215
           height=2.95, width=4.15, useDingbats=FALSE) +
216
    NULL
217
```
218
219
Show a plot of just the DA clusters.
220
221
```{r, warning=FALSE, message=FALSE, fig.height=4.15, fig.width=9.95}
222
da.clusters <- setdiff(unique(c(tcell.resplot.labels, tcell.quad.resplot.labels)), "CD4.Th17")
223
224
group.cols <- c("#2ca02c", "#1f77b4", "#9467bd", "#fed976", "#fd8d3c", "#e31a1c", "#800026", "#252525")
225
names(group.cols) <- c("Healthy", "LPS", "Non_covid", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical")
226
227
# censor outliers to the 95th quantile for each cell type.
228
plotting.df <- cell.freq.merge[cell.freq.merge$Collection_Day %in% c("D0") &
229
                           !cell.freq.merge$D0_status_summary %in% c("LPS") &
230
                           cell.freq.merge$CellType %in% da.clusters, ]
231
232
q95.ct <- sapply(unique(plotting.df$CellType),
233
                 FUN=function(x) quantile(plotting.df[plotting.df$CellType %in% x, ]$Freq, c(0.05, 0.95), na.rm=TRUE)[2])
234
q95.df <- data.frame("CellType"=gsub(names(q95.ct), pattern="\\.95%", replacement=""), "Q95"=q95.ct)
235
236
plotting.df <- merge(plotting.df, q95.df, by='CellType')
237
plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Freq <- plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Q95
238
239
plotting.df$D0_status_summary <- ordered(plotting.df$D0_status_summary,
240
                                         levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
241
242
ggplot(plotting.df[!plotting.df$D0_status_summary %in% c("Healthy"), ],
243
       aes(x=D0_status_summary, y=Freq, fill=D0_status_summary)) +
244
    geom_boxplot(outlier.size=0.5, coef=1.5) +
245
    theme_cowplot() +
246
    facet_wrap(~CellType, scales="free_y", nrow=3) +
247
    scale_fill_manual(values=group.cols) +
248
    expand_limits(y=c(0)) +
249
    theme(aspect=1/1.3,
250
          axis.text.x=element_blank(),
251
          axis.ticks.x=element_blank(),
252
          legend.key.size=unit(0.4, "cm"),
253
          strip.background=element_rect(fill='white', colour='white'),
254
          strip.text=element_text(size=12)) +
255
    labs(x="Severity", y="Proportion") +
256
    guides(fill=guide_legend(title="Severity")) +
257
    ggsave("~/Dropbox/COVID19/plot.dir/Tcells_proportions-DA_CasesOnly-boxplot.pdf",
258
           height=4.15, width=9.95, useDingbats=FALSE) +
259
    NULL
260
```
261
262
Also show the equivalent plot of healthy vs. infected.
263
264
265
```{r, warning=FALSE, message=FALSE, fig.height=4.15, fig.width=9.95}
266
infect.da.clusters <- setdiff(unique(c(infect.resplot.labels)), "CD4.Th17")
267
268
group.cols <- c("#2ca02c", "#252525")
269
names(group.cols) <- c("Healthy", "COVID-19")
270
271
# censor outliers to the 95th quantile for each cell type.
272
plotting.df <- cell.freq.merge[cell.freq.merge$Collection_Day %in% c("D0") &
273
                           !cell.freq.merge$D0_status_summary %in% c("LPS") &
274
                           cell.freq.merge$CellType %in% infect.da.clusters, ]
275
276
q95.ct <- sapply(unique(plotting.df$CellType),
277
                 FUN=function(x) quantile(plotting.df[plotting.df$CellType %in% x, ]$Freq, c(0.05, 0.95), na.rm=TRUE)[2])
278
q95.df <- data.frame("CellType"=gsub(names(q95.ct), pattern="\\.95%", replacement=""), "Q95"=q95.ct)
279
280
plotting.df <- merge(plotting.df, q95.df, by='CellType')
281
plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Freq <- plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Q95
282
283
plotting.df$Status <- factor(plotting.df$Status,
284
                             levels=c("Healthy", "Covid"),
285
                             labels=c("Healthy", "COVID-19"))
286
287
ggplot(plotting.df,
288
       aes(x=Status, y=Freq, fill=Status)) +
289
    geom_boxplot(outlier.size=0.5, coef=1.5) +
290
    theme_cowplot() +
291
    facet_wrap(~CellType, scales="free_y", nrow=3) +
292
    scale_fill_manual(values=group.cols) +
293
    expand_limits(y=c(0)) +
294
    theme(aspect=1/1.3,
295
          axis.text.x=element_blank(),
296
          axis.ticks.x=element_blank(),
297
          legend.key.size=unit(0.4, "cm"),
298
          strip.background=element_rect(fill='white', colour='white'),
299
          strip.text=element_text(size=12)) +
300
    labs(x="", y="Proportion") +
301
    guides(fill=guide_legend(title="Status")) +
302
    ggsave("~/Dropbox/COVID19/plot.dir/Tcells_proportions-DA_CaseVsControls-boxplot.pdf",
303
           height=4.15, width=9.95, useDingbats=FALSE) +
304
    NULL
305
```
306
307
308
## Cluster-based DA analysis: days since symptom onset
309
310
```{r}
311
# set up testing model
312
rownames(covid.meta) <- covid.meta$sample_id
313
tcell.meta <- covid.meta[!covid.meta$D0_status_summary %in% c("Non_covid", "LPS", "Healthy"), ]
314
tcell.meta$OrderedSeverity <- ordered(tcell.meta$D0_status_summary,
315
                                      levels=c("Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
316
tcell.meta <- tcell.meta[tcell.meta$D0_status_summary %in% c("Mild", "Moderate", "Severe", "Critical"), ]
317
318
tcell.meta$Days_from_onset[tcell.meta$Days_from_onset %in% c("Not_known")] <- NA
319
tcell.meta$Days_from_onset <- as.numeric(tcell.meta$Days_from_onset)
320
tcell.meta <- tcell.meta[!is.na(tcell.meta$Days_from_onset), ]
321
322
323
tcell.model <- model.matrix(~ Sex + Age + Days_from_onset, 
324
                            data=tcell.meta[tcell.meta$Collection_Day %in% c("D0"), ])
325
326
# count cells
327
cell.freq.tab <- t(table(tcell.df.merge$sample_id[tcell.df.merge$Collection_Day %in% c("D0") &
328
                                                      !is.na(tcell.df.merge$Days_from_onset) &
329
                                                        !tcell.df.merge$D0_status_summary %in% c("LPS", "Non_covid", "Healthy", "Asymptomatic")],
330
                         tcell.df.merge$Sub.Annotation[tcell.df.merge$Collection_Day %in% c("D0") &
331
                                                           !is.na(tcell.df.merge$Days_from_onset) &
332
                                                        !tcell.df.merge$D0_status_summary %in% c("LPS", "Non_covid", "Healthy", "Asymptomatic")]))
333
cell.freq.tab <- cell.freq.tab[!rownames(cell.freq.tab) %in% c("NK", "ILCs", "Doublets", "Doublets:Bcell", "Doublets:Platelet"), ]
334
test.samps <- intersect(colnames(cell.freq.tab), names(n.cell.vecc))
335
cell.freq.tab <- cell.freq.tab[, test.samps]
336
tcell.model <- tcell.model[colnames(cell.freq.tab), ]
337
338
tcell.dge <- DGEList(cell.freq.tab, lib.size=log(n.cell.vecc[test.samps]))
339
340
#estimate dispersions
341
tcell.dge <- estimateDisp(tcell.dge, design=tcell.model)
342
tcell.linear.fit <- glmQLFit(tcell.dge, tcell.model, robust=TRUE)
343
tcell.res <- as.data.frame(topTags(glmQLFTest(tcell.linear.fit, coef=4), sort.by='none', n=Inf))
344
tcell.res$CellType <- rownames(tcell.res)
345
346
tcell.res$Sig <- as.numeric(tcell.res$FDR < 0.1)
347
tcell.res$Diff <- sign(tcell.res$logFC)
348
tcell.res$Diff[tcell.res$FDR > 0.1] <- 0
349
table(tcell.res$Diff)
350
```
351
352
353
354
```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
355
mx.lfc <- max(abs(tcell.res$logFC))
356
eps <- mx.lfc * 0.05
357
358
tcell.resplot.labels <- tcell.res$CellType
359
tcell.resplot.labels[tcell.res$Sig == 0] <- ""
360
361
ggplot(tcell.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
362
    geom_point(shape=21, size=4) +
363
    theme_cowplot() +
364
    scale_fill_manual(values=c("black", "red")) +
365
    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
366
    guides(fill=guide_legend(title="FDR < 0.1")) +
367
    geom_text_repel(aes(label=tcell.resplot.labels),
368
                     fill='white') +
369
    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
370
    ggsave("~/Dropbox/COVID19/plot.dir/Tcell_edgeR_volcano_daysOnset-linear.pdf",
371
           height=2.95, width=4.15, useDingbats=FALSE) +
372
    NULL
373
```
374
375
376
Show a plot of just the DA clusters.
377
378
```{r, warning=FALSE, message=FALSE, fig.height=4.15, fig.width=9.95}
379
da.clusters <- setdiff(unique(c(tcell.resplot.labels, tcell.quad.resplot.labels)), "CD4.Th17")
380
381
group.cols <- c("#2ca02c", "#1f77b4", "#9467bd", "#fed976", "#fd8d3c", "#e31a1c", "#800026", "#252525")
382
names(group.cols) <- c("Healthy", "LPS", "Non_covid", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical")
383
384
# censor outliers to the 95th quantile for each cell type.
385
plotting.df <- cell.freq.merge[cell.freq.merge$Collection_Day %in% c("D0") &
386
                                   !cell.freq.merge$D0_status_summary %in% c("LPS") &
387
                                   !is.na(cell.freq.merge$Days_from_onset) &
388
                                   cell.freq.merge$CellType %in% da.clusters, ]
389
390
q95.ct <- sapply(unique(plotting.df$CellType),
391
                 FUN=function(x) quantile(plotting.df[plotting.df$CellType %in% x, ]$Freq, c(0.05, 0.95), na.rm=TRUE)[2])
392
q95.df <- data.frame("CellType"=gsub(names(q95.ct), pattern="\\.95%", replacement=""), "Q95"=q95.ct)
393
394
plotting.df <- merge(plotting.df, q95.df, by='CellType')
395
plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Freq <- plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Q95
396
397
plotting.df$D0_status_summary <- ordered(plotting.df$D0_status_summary,
398
                                         levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
399
400
ggplot(plotting.df[!plotting.df$D0_status_summary %in% c("Healthy", "Asymptomatic"), ],
401
       aes(x=Days_from_onset, y=Freq)) +
402
    #geom_boxplot(outlier.size=0.5, coef=1.5) +
403
    #geom_point(size=1) +
404
    stat_smooth() +
405
    theme_cowplot() +
406
    facet_wrap(~CellType, scales="free_y", nrow=3) +
407
    #scale_colour_manual(values=group.cols) +
408
    expand_limits(y=c(0)) +
409
    theme(aspect=1/1.3,
410
          axis.text.x=element_blank(),
411
          axis.ticks.x=element_blank(),
412
          legend.key.size=unit(0.4, "cm"),
413
          strip.background=element_rect(fill='white', colour='white'),
414
          strip.text=element_text(size=12)) +
415
    labs(x="Severity", y="Proportion") +
416
    guides(fill=guide_legend(title="Severity")) +
417
    ggsave("~/Dropbox/COVID19/plot.dir/Tcells_proportions-DA_daysOnset-points.pdf",
418
           height=4.15, width=9.95, useDingbats=FALSE) +
419
    NULL
420
```
421
422
423