Switch to unified view

a b/Notebooks/DA_analysis-symptom_onset - Updated.Rmd
1
---
2
title: "COVID19: Differential abundance based on symptom onset - Updated"
3
output: html_notebook
4
---
5
6
Performing differential abundance testing using a negative binomial GLM. Cell counts per donor sample are used as input, and the total number of cells 
7
captured (after QC) are used to normalize the model counts. This notebook is concerned with using the initial clustering results and their differential 
8
abundance according to symptom onset in hospitalised patients.
9
10
11
```{r, warning=FALSE, message=FALSE}
12
library(ggplot2)
13
library(ggsci)
14
library(cowplot)
15
library(RColorBrewer)
16
library(reshape2)
17
library(colorspace)
18
library(ggthemes)
19
library(edgeR)
20
library(scales)
21
library(ggrepel)
22
library(dplyr)
23
library(kableExtra)
24
```
25
26
27
```{r}
28
covid.meta <- read.csv("~/Dropbox/COVID19/Data/basic_COVID19_PBMC_metadata_160212.csv",
29
                        header=TRUE, stringsAsFactors=FALSE)
30
old.meta <- read.csv("~/Dropbox/COVID19/Data/Metadata FINAL 10122020.csv",
31
                     header=TRUE, stringsAsFactors=FALSE)
32
old.meta$sample_id[old.meta$sample_id %in% c("BGCV13_CV0201")] <- "BGCV06_CV0201"
33
old.meta$sample_id[old.meta$sample_id %in% c("BGCV06_CV0326")] <- "BGCV13_CV0326"
34
35
covid.meta$AgeRange <- covid.meta$Age
36
covid.meta$Status_on_day_collection_summary[covid.meta$Status_on_day_collection_summary %in% c("LPS_10hours")] <- "LPS"
37
covid.meta$Status_on_day_collection_summary[covid.meta$Status_on_day_collection_summary %in% c("LPS_90mins")] <- "LPS"
38
39
covid.meta <- merge(covid.meta[, !colnames(covid.meta) %in% c("Age")], old.meta[, c("sample_id", "patient_id", "Age")],
40
                    by='sample_id')
41
42
covid.meta$Days_from_onset[covid.meta$Days_from_onset %in% c("Not_known", "Healthy")] <- NA
43
covid.meta$Days_from_onset <- as.numeric(covid.meta$Days_from_onset)
44
```
45
46
Plot days since symptom onset by disease severity and Site.
47
48
```{r, fig.height=2.95, fig.width=3.95,}
49
covid.meta$OrderedSeverity <- ordered(covid.meta$Status_on_day_collection_summary,
50
                                      levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
51
52
ggplot(covid.meta[covid.meta$Collection_Day %in% c("D0") &
53
                      !covid.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Asymptomatic", "Healthy") &
54
                      !is.na(covid.meta$Days_from_onset), ],
55
       aes(x=OrderedSeverity, y=Days_from_onset)) +
56
    geom_boxplot() +
57
    labs(x="Disease severity", y="Days from\nsymptom onset") +
58
    theme_cowplot()  +
59
    theme(aspect=1,
60
          axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
61
    ggsave("~/Dropbox/COVID19/Updated_plots/SymptomOnset_severity-boxplot.pdf",
62
           height=2.95, width=3.95, useDingbats=FALSE) +
63
    NULL
64
```
65
66
67
```{r, fig.height=2.95, fig.width=3.95,}
68
covid.meta$OrderedSeverity <- ordered(covid.meta$Status_on_day_collection_summary,
69
                                      levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
70
71
ggplot(covid.meta[covid.meta$Collection_Day %in% c("D0") &
72
                      !covid.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Asymptomatic", "Healthy") &
73
                      !is.na(covid.meta$Days_from_onset), ],
74
       aes(x=Site, y=Days_from_onset)) +
75
    geom_boxplot() +
76
    labs(x="Disease severity", y="Days from\nsymptom onset") +
77
    theme_cowplot()  +
78
    theme(aspect=1,
79
          axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
80
    ggsave("~/Dropbox/COVID19/Updated_plots/SymptomOnset_Site-boxplot.pdf",
81
           height=2.95, width=3.95, useDingbats=FALSE) +
82
    NULL
83
```
84
85
86
87
```{r, warning=FALSE, message=FALSE}
88
all.meta <- read.table("~/Dropbox/COVID19/Data/Updated/COVID19_scMeta-data.tsv",
89
                       sep="\t", header=TRUE, stringsAsFactors=FALSE)
90
rownames(all.meta) <- all.meta$CellID
91
92
doublet.remove <- read.table("~/Dropbox/COVID19/Data/Updated/All_doublets.tsv",
93
                             sep="\t", header=FALSE, stringsAsFactors=FALSE)
94
95
# remove BGCV01_CV0209 and CV0198
96
all.meta <- all.meta[!all.meta$sample_id %in% c("BGCV01_CV0902"), ]
97
all.meta <- all.meta[!all.meta$patient_id %in% c("CV0198"), ]
98
all.meta$Days_from_onset[all.meta$Days_from_onset %in% c("Not_known", "Healthy")] <- NA
99
all.meta$Days_from_onset <- as.numeric(all.meta$Days_from_onset)
100
101
n.cell.vecc <- table(all.meta$sample_id)
102
103
all.meta <- all.meta[!all.meta$CellID %in% doublet.remove$V1, ]
104
```
105
106
107
```{r}
108
cell.xtab <- as.data.frame(xtabs( ~ sample_id +  initial_clustering, data=all.meta))
109
cell.cast <- dcast(cell.xtab, sample_id ~ initial_clustering, value.var='Freq')
110
rownames(cell.cast) <- cell.cast$sample_id
111
cell.cast <- cell.cast[, -1]
112
cell.freq <- as.data.frame(t(sapply(rownames(cell.cast), FUN=function(X) as.numeric(cell.cast[X, ]/n.cell.vecc[X]),
113
                                    simplify=TRUE)))
114
115
rownames(cell.freq) <- rownames(cell.cast)
116
colnames(cell.freq) <- colnames(cell.cast)
117
cell.freq$sample_id <- rownames(cell.cast)
118
cell.melt <- melt(cell.freq, id.vars=c('sample_id'))
119
colnames(cell.melt) <- c("sample_id", "CellType", "Freq")
120
cell.melt$CellType <- as.character(cell.melt$CellType)
121
122
cell.freq.merge <- merge(cell.melt, covid.meta, by='sample_id')
123
```
124
125
126
## Cluster-based DA analysis: days since symptom onset
127
128
```{r}
129
# set up testing model
130
rownames(covid.meta) <- covid.meta$sample_id
131
init.meta <- covid.meta[!covid.meta$Status_on_day_collection_summary %in% c("Non_covid", "LPS", "Healthy"), ]
132
init.meta$OrderedSeverity <- ordered(init.meta$Status_on_day_collection_summary,
133
                                      levels=c("Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
134
init.meta <- init.meta[init.meta$Status_on_day_collection_summary %in% c("Mild", "Moderate", "Severe", "Critical"), ]
135
136
init.meta$Days_from_onset[init.meta$Days_from_onset %in% c("Not_known")] <- NA
137
init.meta$Days_from_onset <- as.numeric(init.meta$Days_from_onset)
138
init.meta <- init.meta[!is.na(init.meta$Days_from_onset), ]
139
140
init.model <- model.matrix(~ Sex + Age + Site + Days_from_onset, 
141
                            data=init.meta[init.meta$Collection_Day %in% c("D0"), ])
142
143
# count cells
144
cell.freq.tab <- t(table(all.meta$sample_id[all.meta$Collection_Day %in% c("D0") &
145
                                                !is.na(all.meta$Days_from_onset) &
146
                                                !all.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Healthy", "Asymptomatic")],
147
                         all.meta$initial_clustering[all.meta$Collection_Day %in% c("D0") &
148
                                                         !is.na(all.meta$Days_from_onset) &
149
                                                         !all.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Healthy", "Asymptomatic")]))
150
151
cell.freq.tab <- cell.freq.tab[!rownames(cell.freq.tab) %in% c("Doublet", "Doublets", "Doublets:Bcell", "Doublets:Platelet"), ]
152
test.samps <- intersect(colnames(cell.freq.tab), names(n.cell.vecc))
153
cell.freq.tab <- cell.freq.tab[, test.samps]
154
init.model <- init.model[colnames(cell.freq.tab), ]
155
156
init.dge <- DGEList(cell.freq.tab, lib.size=log(n.cell.vecc[test.samps]))
157
158
#estimate dispersions
159
init.dge <- estimateDisp(init.dge, design=init.model)
160
init.linear.fit <- glmQLFit(init.dge, init.model, robust=TRUE)
161
init.res <- as.data.frame(topTags(glmQLFTest(init.linear.fit, coef=4), sort.by='none', n=Inf))
162
init.res$CellType <- rownames(init.res)
163
164
init.res$Sig <- as.numeric(init.res$FDR < 0.1)
165
init.res$Diff <- sign(init.res$logFC)
166
init.res$Diff[init.res$FDR > 0.1] <- 0
167
table(init.res$Diff)
168
```
169
170
171
172
```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
173
mx.lfc <- max(abs(init.res$logFC))
174
eps <- mx.lfc * 0.05
175
176
init.resplot.labels <- init.res$CellType
177
init.resplot.labels[init.res$Sig == 0] <- ""
178
179
ggplot(init.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
180
    geom_point(shape=21, size=4) +
181
    theme_cowplot() +
182
    scale_fill_manual(values=c("black", "red")) +
183
    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
184
    guides(fill=guide_legend(title="FDR < 0.1")) +
185
    geom_text_repel(aes(label=init.resplot.labels),
186
                     fill='white') +
187
    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
188
    ggsave("~/Dropbox/COVID19/Updated_plots/All_edgeR_volcano_daysOnset-linear.pdf",
189
           height=2.95, width=4.15, useDingbats=FALSE) +
190
    NULL
191
```
192
193
194
Show a plot of just the DA clusters.
195
196
```{r, warning=FALSE, message=FALSE, fig.height=5.15, fig.width=9.95}
197
da.clusters <- unique(c(init.resplot.labels))
198
199
group.cols <- c("#2ca02c", "#1f77b4", "#9467bd", "#fed976", "#fd8d3c", "#e31a1c", "#800026", "#252525")
200
names(group.cols) <- c("Healthy", "LPS", "Non_covid", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical")
201
202
# censor outliers to the 95th quantile for each cell type.
203
plotting.df <- cell.freq.merge[cell.freq.merge$Collection_Day %in% c("D0") &
204
                                   !cell.freq.merge$Status_on_day_collection_summary %in% c("LPS", "Non_covid") &
205
                                   !is.na(cell.freq.merge$Days_from_onset) &
206
                                   cell.freq.merge$CellType %in% da.clusters, ]
207
208
q95.ct <- sapply(unique(plotting.df$CellType),
209
                 FUN=function(x) quantile(plotting.df[plotting.df$CellType %in% x, ]$Freq, c(0.05, 0.95), na.rm=TRUE)[2])
210
q95.df <- data.frame("CellType"=gsub(names(q95.ct), pattern="\\.95%", replacement=""), "Q95"=q95.ct)
211
212
plotting.df <- merge(plotting.df, q95.df, by='CellType')
213
plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Freq <- plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Q95
214
215
plotting.df$Status_on_day_collection_summary <- ordered(plotting.df$Status_on_day_collection_summary,
216
                                         levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
217
218
ggplot(plotting.df[!plotting.df$Status_on_day_collection_summary %in% c("Healthy", "Asymptomatic"), ],
219
       aes(x=Days_from_onset, y=Freq)) +
220
    #geom_boxplot(outlier.size=0.5, coef=1.5) +
221
    geom_point(aes(colour=Status_on_day_collection_summary), size=1) +
222
    stat_smooth(method=MASS::rlm) +
223
    theme_cowplot() +
224
    facet_wrap(~CellType, scales="free_y", nrow=3) +
225
    scale_colour_manual(values=group.cols) +
226
    expand_limits(y=c(0)) +
227
    theme(aspect=1/1.3,
228
          legend.key.size=unit(0.4, "cm"),
229
          strip.background=element_rect(fill='white', colour='white'),
230
          strip.text=element_text(size=12)) +
231
    labs(x="Severity", y="Proportion") +
232
    guides(colour=guide_legend(title="Severity")) +
233
    ggsave("~/Dropbox/COVID19/Updated_plots/All_proportions-DA_daysOnset-points.pdf",
234
           height=5.15, width=9.95, useDingbats=FALSE) +
235
    NULL
236
```
237
238
Many of these changes seem to be driven by critically ill patients.
239
240
241
```{r, warning=FALSE, message=FALSE}
242
write.table(init.res,
243
            file="~/Dropbox/COVID19/Data/Updated/SymptomOnset_resTable.txt",
244
            sep="\t", quote=FALSE, row.names=FALSE)
245
246
# print P-values to html table
247
clon.lm.pvalues <- data.frame("CellType"=init.res$CellType, "logFC"=init.res$logFC,
248
                              "Pvalue"=init.res$PValue,
249
                              "FDR"=init.res$FDR)
250
251
kbl(clon.lm.pvalues) %>% kable_paper(full_width=FALSE) %>% 
252
    save_kable("~/Dropbox/COVID19/Updated_plots/DA_symptomOnset_trend_pvals.html",
253
               self_contained=TRUE)
254
255
kable(clon.lm.pvalues)
256
```
257
258
259
## Cluster-based DA analysis: days since symptom onset - excluding critical patients
260
261
```{r}
262
# set up testing model
263
rownames(covid.meta) <- covid.meta$sample_id
264
sanscrit.meta <- covid.meta[!covid.meta$Status_on_day_collection_summary %in% c("Non_covid", "LPS", "Healthy", "Critical"), ]
265
sanscrit.meta$Days_from_onset[sanscrit.meta$Days_from_onset %in% c("Not_known")] <- NA
266
sanscrit.meta$Days_from_onset <- as.numeric(sanscrit.meta$Days_from_onset)
267
sanscrit.meta <- sanscrit.meta[!is.na(sanscrit.meta$Days_from_onset), ]
268
269
sanscrit.model <- model.matrix(~ Sex + Age + Site + Days_from_onset, 
270
                            data=sanscrit.meta[sanscrit.meta$Collection_Day %in% c("D0"), ])
271
272
# count cells
273
cell.freq.tab <- t(table(all.meta$sample_id[all.meta$Collection_Day %in% c("D0") &
274
                                                !is.na(all.meta$Days_from_onset) &
275
                                                !all.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Healthy", "Asymptomatic", "Critical")],
276
                         all.meta$initial_clustering[all.meta$Collection_Day %in% c("D0") &
277
                                                         !is.na(all.meta$Days_from_onset) &
278
                                                         !all.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Healthy", "Asymptomatic", "Critical")]))
279
280
cell.freq.tab <- cell.freq.tab[!rownames(cell.freq.tab) %in% c("Doublet", "Doublets", "Doublets:Bcell", "Doublets:Platelet"), ]
281
test.samps <- intersect(colnames(cell.freq.tab), names(n.cell.vecc))
282
cell.freq.tab <- cell.freq.tab[, test.samps]
283
sanscrit.model <- sanscrit.model[colnames(cell.freq.tab), ]
284
285
sanscrit.dge <- DGEList(cell.freq.tab, lib.size=log(n.cell.vecc[test.samps]))
286
287
#estimate dispersions
288
sanscrit.dge <- estimateDisp(sanscrit.dge, design=sanscrit.model)
289
sanscrit.linear.fit <- glmQLFit(sanscrit.dge, sanscrit.model, robust=TRUE)
290
sanscrit.res <- as.data.frame(topTags(glmQLFTest(sanscrit.linear.fit, coef=4), sort.by='none', n=Inf))
291
sanscrit.res$CellType <- rownames(sanscrit.res)
292
293
sanscrit.res$Sig <- as.numeric(sanscrit.res$FDR < 0.1)
294
sanscrit.res$Diff <- sign(sanscrit.res$logFC)
295
sanscrit.res$Diff[sanscrit.res$FDR > 0.1] <- 0
296
table(sanscrit.res$Diff)
297
```
298
299
300
301
```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
302
mx.lfc <- max(abs(sanscrit.res$logFC))
303
eps <- mx.lfc * 0.05
304
305
sanscrit.resplot.labels <- sanscrit.res$CellType
306
sanscrit.resplot.labels[sanscrit.res$Sig == 0] <- ""
307
308
ggplot(sanscrit.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
309
    geom_point(shape=21, size=4) +
310
    theme_cowplot() +
311
    scale_fill_manual(values=c("black", "red")) +
312
    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
313
    guides(fill=guide_legend(title="FDR < 0.1")) +
314
    geom_text_repel(aes(label=sanscrit.resplot.labels),
315
                     fill='white') +
316
    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
317
    ggsave("~/Dropbox/COVID19/Updated_plots/All_edgeR_volcano_daysOnset-noCritical-linear.pdf",
318
           height=2.95, width=4.15, useDingbats=FALSE) +
319
    NULL
320
```
321
322
323
Show a plot of just the DA clusters.
324
325
```{r, warning=FALSE, message=FALSE, fig.height=5.15, fig.width=9.95}
326
da.clusters <- unique(c(sanscrit.resplot.labels))
327
328
group.cols <- c("#2ca02c", "#1f77b4", "#9467bd", "#fed976", "#fd8d3c", "#e31a1c", "#800026", "#252525")
329
names(group.cols) <- c("Healthy", "LPS", "Non_covid", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical")
330
331
# censor outliers to the 95th quantile for each cell type.
332
plotting.df <- cell.freq.merge[cell.freq.merge$Collection_Day %in% c("D0") &
333
                                   !cell.freq.merge$Status_on_day_collection_summary %in% c("LPS", "Non_covid") &
334
                                   !is.na(cell.freq.merge$Days_from_onset) &
335
                                   cell.freq.merge$CellType %in% da.clusters, ]
336
337
q95.ct <- sapply(unique(plotting.df$CellType),
338
                 FUN=function(x) quantile(plotting.df[plotting.df$CellType %in% x, ]$Freq, c(0.05, 0.95), na.rm=TRUE)[2])
339
q95.df <- data.frame("CellType"=gsub(names(q95.ct), pattern="\\.95%", replacement=""), "Q95"=q95.ct)
340
341
plotting.df <- merge(plotting.df, q95.df, by='CellType')
342
plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Freq <- plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Q95
343
344
plotting.df$Status_on_day_collection_summary <- ordered(plotting.df$Status_on_day_collection_summary,
345
                                         levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe"))
346
347
ggplot(plotting.df[!plotting.df$Status_on_day_collection_summary %in% c("Healthy", "Asymptomatic", "Critical"), ],
348
       aes(x=Days_from_onset, y=Freq)) +
349
    #geom_boxplot(outlier.size=0.5, coef=1.5) +
350
    geom_point(aes(colour=Status_on_day_collection_summary), size=1) +
351
    stat_smooth(method=MASS::rlm) +
352
    theme_cowplot() +
353
    facet_wrap(~CellType, scales="free_y", nrow=3) +
354
    scale_colour_manual(values=group.cols) +
355
    expand_limits(y=c(0)) +
356
    theme(aspect=1/1.3,
357
          legend.key.size=unit(0.4, "cm"),
358
          strip.background=element_rect(fill='white', colour='white'),
359
          strip.text=element_text(size=12)) +
360
    labs(x="Severity", y="Proportion") +
361
    guides(colour=guide_legend(title="Symptom duration")) +
362
    ggsave("~/Dropbox/COVID19/Updated_plots/All_proportions-DA_daysOnset-noCritical-points.pdf",
363
           height=5.15, width=9.95, useDingbats=FALSE) +
364
    NULL
365
```
366
367
Many of these changes seem to be driven by critically ill patients.
368
369
370
```{r, warning=FALSE, message=FALSE}
371
write.table(sanscrit.res,
372
            file="~/Dropbox/COVID19/Data/Updated/SymptomOnset-noCritical_resTable.txt",
373
            sep="\t", quote=FALSE, row.names=FALSE)
374
375
# print P-values to html table
376
clon.lm.pvalues <- data.frame("CellType"=sanscrit.res$CellType, "logFC"=sanscrit.res$logFC,
377
                              "Pvalue"=sanscrit.res$PValue,
378
                              "FDR"=sanscrit.res$FDR)
379
380
kbl(clon.lm.pvalues) %>% kable_paper(full_width=FALSE) %>% 
381
    save_kable("~/Dropbox/COVID19/Updated_plots/DA_symptomOnset-noCritical_trend_pvals.html",
382
               self_contained=TRUE)
383
384
kable(clon.lm.pvalues)
385
```
386
387
388
Compare the log fold changes with and without the critical patients.
389
390
```{r}
391
colnames(init.res) <- c(paste0("Full.", colnames(init.res)[c(1:5)]), "CellType", "Full.Sig", "Full.Diff")
392
colnames(sanscrit.res) <- c(paste0("woCritical.", colnames(sanscrit.res)[c(1:5)]), "CellType", "woCritical.Sig", "woCritical.Diff")
393
```
394
395
396
```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=2.95}
397
compare.model <- merge(init.res, sanscrit.res, by='CellType')
398
399
compare.model$Sigs <- 0
400
compare.model$Sigs[compare.model$Full.Sig == 1 & compare.model$woCritical.Sig == 1] <- 1
401
402
cell.labels <- compare.model$CellType
403
cell.labels[compare.model$Sigs == 0] <- ""
404
405
ggplot(compare.model, aes(x=Full.logFC, y=woCritical.logFC)) +
406
    geom_hline(yintercept=0, lty=2) +
407
    geom_vline(xintercept=0, lty=2) +
408
    geom_point() +
409
    theme_cowplot() +
410
    labs(x="log Fold Change: Full model",
411
         y="log Fold Change: w/o Critical") +
412
    geom_text_repel(aes(label=cell.labels),
413
                    force=50) +
414
    ggsave("~/Dropbox/COVID19/Updated_plots/SymptomOnset_compare-points.pdf",
415
           height=2.95, width=2.95, useDingbats=FALSE) +
416
    NULL
417
```
418
419
420
## Sensitivity analysis
421
422
Remove the outliers w.r.t. time since symptom onset to test the sensitivity.
423
424
## Cluster-based DA analysis: days since symptom onset - excluding > 24 days onset
425
426
```{r, fig.height=2.95, fig.width=3.95,}
427
428
ggplot(covid.meta[covid.meta$Collection_Day %in% c("D0") &
429
                      covid.meta$Days_from_onset <= 24 &
430
                      !covid.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Asymptomatic", "Healthy") &
431
                      !is.na(covid.meta$Days_from_onset), ],
432
       aes(x=OrderedSeverity, y=Days_from_onset)) +
433
    geom_boxplot() +
434
    labs(x="Disease severity", y="Days from\nsymptom onset") +
435
    theme_cowplot()  +
436
    theme(aspect=1,
437
          axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
438
    ggsave("~/Dropbox/COVID19/Updated_plots/SymptomOnset_severity-sensitivityAnalysis-boxplot.pdf",
439
           height=2.95, width=3.95, useDingbats=FALSE) +
440
    NULL
441
```
442
443
This shows the distribution of symptom duration w.r.t. disease severity.
444
445
```{r}
446
# set up testing model
447
rownames(covid.meta) <- covid.meta$sample_id
448
symoutlier.meta <- covid.meta[!covid.meta$Status_on_day_collection_summary %in% c("Non_covid", "LPS", "Healthy"), ]
449
symoutlier.meta$Days_from_onset[symoutlier.meta$Days_from_onset %in% c("Not_known")] <- NA
450
symoutlier.meta$Days_from_onset <- as.numeric(symoutlier.meta$Days_from_onset)
451
symoutlier.meta <- symoutlier.meta[!is.na(symoutlier.meta$Days_from_onset), ]
452
symoutlier.meta <- symoutlier.meta[symoutlier.meta$Days_from_onset <= 24, ]
453
454
symoutlier.model <- model.matrix(~ Sex + Age + Site + Days_from_onset, 
455
                            data=symoutlier.meta[symoutlier.meta$Collection_Day %in% c("D0"), ])
456
457
# count cells
458
cell.freq.tab <- t(table(all.meta$sample_id[all.meta$Collection_Day %in% c("D0") &
459
                                                !is.na(all.meta$Days_from_onset) &
460
                                                all.meta$Days_from_onset <= 24 &
461
                                                !all.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Healthy")],
462
                         all.meta$initial_clustering[all.meta$Collection_Day %in% c("D0") &
463
                                                         !is.na(all.meta$Days_from_onset) &
464
                                                         all.meta$Days_from_onset <= 24 &
465
                                                         !all.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Healthy")]))
466
467
cell.freq.tab <- cell.freq.tab[!rownames(cell.freq.tab) %in% c("Doublet", "Doublets", "Doublets:Bcell", "Doublets:Platelet"), ]
468
test.samps <- intersect(colnames(cell.freq.tab), names(n.cell.vecc))
469
cell.freq.tab <- cell.freq.tab[, test.samps]
470
symoutlier.model <- symoutlier.model[colnames(cell.freq.tab), ]
471
472
symoutlier.dge <- DGEList(cell.freq.tab, lib.size=log(n.cell.vecc[test.samps]))
473
474
#estimate dispersions
475
symoutlier.dge <- estimateDisp(symoutlier.dge, design=symoutlier.model)
476
symoutlier.linear.fit <- glmQLFit(symoutlier.dge, symoutlier.model, robust=TRUE)
477
symoutlier.res <- as.data.frame(topTags(glmQLFTest(symoutlier.linear.fit, coef=4), sort.by='none', n=Inf))
478
symoutlier.res$CellType <- rownames(symoutlier.res)
479
480
symoutlier.res$Sig <- as.numeric(symoutlier.res$FDR < 0.1)
481
symoutlier.res$Diff <- sign(symoutlier.res$logFC)
482
symoutlier.res$Diff[symoutlier.res$FDR > 0.1] <- 0
483
table(symoutlier.res$Diff)
484
```
485
486
487
488
```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
489
mx.lfc <- max(abs(symoutlier.res$logFC))
490
eps <- mx.lfc * 0.05
491
492
symoutlier.resplot.labels <- symoutlier.res$CellType
493
symoutlier.resplot.labels[symoutlier.res$Sig == 0] <- ""
494
495
ggplot(symoutlier.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
496
    geom_point(shape=21, size=4) +
497
    theme_cowplot() +
498
    scale_fill_manual(values=c("black", "red")) +
499
    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
500
    guides(fill=guide_legend(title="FDR < 0.1")) +
501
    geom_text_repel(aes(label=symoutlier.resplot.labels),
502
                     fill='white') +
503
    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
504
    ggsave("~/Dropbox/COVID19/Updated_plots/All_edgeR_volcano_daysOnset-SensitivityAnalysis-linear.pdf",
505
           height=2.95, width=4.15, useDingbats=FALSE) +
506
    NULL
507
```
508
509
510
Show a plot of just the DA clusters.
511
512
```{r, warning=FALSE, message=FALSE, fig.height=5.15, fig.width=9.95}
513
da.clusters <- unique(c(symoutlier.resplot.labels))
514
515
group.cols <- c("#2ca02c", "#1f77b4", "#9467bd", "#fed976", "#fd8d3c", "#e31a1c", "#800026", "#252525")
516
names(group.cols) <- c("Healthy", "LPS", "Non_covid", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical")
517
518
# censor outliers to the 95th quantile for each cell type.
519
plotting.df <- cell.freq.merge[cell.freq.merge$Collection_Day %in% c("D0") &
520
                                   !cell.freq.merge$Status_on_day_collection_summary %in% c("LPS", "Non_covid") &
521
                                   !is.na(cell.freq.merge$Days_from_onset) &
522
                                   cell.freq.merge$CellType %in% da.clusters, ]
523
524
q95.ct <- sapply(unique(plotting.df$CellType),
525
                 FUN=function(x) quantile(plotting.df[plotting.df$CellType %in% x, ]$Freq, c(0.05, 0.95), na.rm=TRUE)[2])
526
q95.df <- data.frame("CellType"=gsub(names(q95.ct), pattern="\\.95%", replacement=""), "Q95"=q95.ct)
527
528
plotting.df <- merge(plotting.df, q95.df, by='CellType')
529
plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Freq <- plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Q95
530
531
plotting.df$Status_on_day_collection_summary <- ordered(plotting.df$Status_on_day_collection_summary,
532
                                         levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe"))
533
534
ggplot(plotting.df[!plotting.df$Status_on_day_collection_summary %in% c("Healthy") &
535
                       plotting.df$Days_from_onset <= 24, ],
536
       aes(x=Days_from_onset, y=Freq)) +
537
    #geom_boxplot(outlier.size=0.5, coef=1.5) +
538
    geom_point(aes(colour=Status_on_day_collection_summary), size=1) +
539
    stat_smooth(method=MASS::rlm) +
540
    theme_cowplot() +
541
    facet_wrap(~CellType, scales="free_y", nrow=3) +
542
    scale_colour_manual(values=group.cols) +
543
    expand_limits(y=c(0)) +
544
    theme(aspect=1/1.3,
545
          legend.key.size=unit(0.4, "cm"),
546
          strip.background=element_rect(fill='white', colour='white'),
547
          strip.text=element_text(size=12)) +
548
    labs(x="Symptom duration", y="Proportion") +
549
    guides(colour=guide_legend(title="Symptom duration")) +
550
    ggsave("~/Dropbox/COVID19/Updated_plots/All_proportions-DA_daysOnset-sensitivityAnalysis-points.pdf",
551
           height=5.15, width=9.95, useDingbats=FALSE) +
552
    NULL
553
```
554
555
This shows the relationship between symptom duration and cell type abundance, excluding the outlier patients.
556
557
558
```{r, warning=FALSE, message=FALSE}
559
write.table(symoutlier.res,
560
            file="~/Dropbox/COVID19/Data/Updated/SymptomOnset-sensitivityAnalysis_resTable.txt",
561
            sep="\t", quote=FALSE, row.names=FALSE)
562
563
# print P-values to html table
564
clon.lm.pvalues <- data.frame("CellType"=symoutlier.res$CellType, "logFC"=symoutlier.res$logFC,
565
                              "Pvalue"=symoutlier.res$PValue,
566
                              "FDR"=symoutlier.res$FDR)
567
568
kbl(clon.lm.pvalues) %>% kable_paper(full_width=FALSE) %>% 
569
    save_kable("~/Dropbox/COVID19/Updated_plots/DA_symptomOnset-sensitivityAnalysis_trend_pvals.html",
570
               self_contained=TRUE)
571
572
kable(clon.lm.pvalues)
573
```
574
575
576
Compare the log fold changes with and without the critical patients.
577
578
```{r}
579
# colnames(init.res) <- c(paste0("Full.", colnames(init.res)[c(1:5)]), "CellType", "Full.Sig", "Full.Diff")
580
colnames(symoutlier.res) <- c(paste0("Ltday24.", colnames(symoutlier.res)[c(1:5)]), "CellType", "woCritical.Sig", "woCritical.Diff")
581
```
582
583
584
```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=3.35}
585
compare.model <- merge(init.res, symoutlier.res, by='CellType')
586
587
compare.model$Sigs <- 0
588
compare.model$Sigs[compare.model$Full.Sig == 1 & compare.model$woCritical.Sig == 1] <- 1
589
590
cell.labels <- compare.model$CellType
591
cell.labels[compare.model$Sigs == 0] <- ""
592
593
ggplot(compare.model, aes(x=Full.logFC, y=Ltday24.logFC)) +
594
    geom_hline(yintercept=0, lty=2) +
595
    geom_vline(xintercept=0, lty=2) +
596
    geom_point() +
597
    theme_cowplot() +
598
    labs(x="log Fold Change: Full model",
599
         y="log Fold Change: Symptoms\n<24 days") +
600
    geom_text_repel(aes(label=cell.labels),
601
                    force=50) +
602
    ggsave("~/Dropbox/COVID19/Updated_plots/SymptomOnset_compare-sensitivityAnalysis-points.pdf",
603
           height=2.95, width=3.35, useDingbats=FALSE) +
604
    NULL
605
```
606
607
608