--- a
+++ b/Notebooks/DA_analysis-symptom_onset - Updated.Rmd
@@ -0,0 +1,608 @@
+---
+title: "COVID19: Differential abundance based on symptom onset - Updated"
+output: html_notebook
+---
+
+Performing differential abundance testing using a negative binomial GLM. Cell counts per donor sample are used as input, and the total number of cells 
+captured (after QC) are used to normalize the model counts. This notebook is concerned with using the initial clustering results and their differential 
+abundance according to symptom onset in hospitalised patients.
+
+
+```{r, warning=FALSE, message=FALSE}
+library(ggplot2)
+library(ggsci)
+library(cowplot)
+library(RColorBrewer)
+library(reshape2)
+library(colorspace)
+library(ggthemes)
+library(edgeR)
+library(scales)
+library(ggrepel)
+library(dplyr)
+library(kableExtra)
+```
+
+
+```{r}
+covid.meta <- read.csv("~/Dropbox/COVID19/Data/basic_COVID19_PBMC_metadata_160212.csv",
+                        header=TRUE, stringsAsFactors=FALSE)
+old.meta <- read.csv("~/Dropbox/COVID19/Data/Metadata FINAL 10122020.csv",
+                     header=TRUE, stringsAsFactors=FALSE)
+old.meta$sample_id[old.meta$sample_id %in% c("BGCV13_CV0201")] <- "BGCV06_CV0201"
+old.meta$sample_id[old.meta$sample_id %in% c("BGCV06_CV0326")] <- "BGCV13_CV0326"
+
+covid.meta$AgeRange <- covid.meta$Age
+covid.meta$Status_on_day_collection_summary[covid.meta$Status_on_day_collection_summary %in% c("LPS_10hours")] <- "LPS"
+covid.meta$Status_on_day_collection_summary[covid.meta$Status_on_day_collection_summary %in% c("LPS_90mins")] <- "LPS"
+
+covid.meta <- merge(covid.meta[, !colnames(covid.meta) %in% c("Age")], old.meta[, c("sample_id", "patient_id", "Age")],
+                    by='sample_id')
+
+covid.meta$Days_from_onset[covid.meta$Days_from_onset %in% c("Not_known", "Healthy")] <- NA
+covid.meta$Days_from_onset <- as.numeric(covid.meta$Days_from_onset)
+```
+
+Plot days since symptom onset by disease severity and Site.
+
+```{r, fig.height=2.95, fig.width=3.95,}
+covid.meta$OrderedSeverity <- ordered(covid.meta$Status_on_day_collection_summary,
+                                      levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
+
+ggplot(covid.meta[covid.meta$Collection_Day %in% c("D0") &
+                      !covid.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Asymptomatic", "Healthy") &
+                      !is.na(covid.meta$Days_from_onset), ],
+       aes(x=OrderedSeverity, y=Days_from_onset)) +
+    geom_boxplot() +
+    labs(x="Disease severity", y="Days from\nsymptom onset") +
+    theme_cowplot()  +
+    theme(aspect=1,
+          axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
+    ggsave("~/Dropbox/COVID19/Updated_plots/SymptomOnset_severity-boxplot.pdf",
+           height=2.95, width=3.95, useDingbats=FALSE) +
+    NULL
+```
+
+
+```{r, fig.height=2.95, fig.width=3.95,}
+covid.meta$OrderedSeverity <- ordered(covid.meta$Status_on_day_collection_summary,
+                                      levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
+
+ggplot(covid.meta[covid.meta$Collection_Day %in% c("D0") &
+                      !covid.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Asymptomatic", "Healthy") &
+                      !is.na(covid.meta$Days_from_onset), ],
+       aes(x=Site, y=Days_from_onset)) +
+    geom_boxplot() +
+    labs(x="Disease severity", y="Days from\nsymptom onset") +
+    theme_cowplot()  +
+    theme(aspect=1,
+          axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
+    ggsave("~/Dropbox/COVID19/Updated_plots/SymptomOnset_Site-boxplot.pdf",
+           height=2.95, width=3.95, useDingbats=FALSE) +
+    NULL
+```
+
+
+
+```{r, warning=FALSE, message=FALSE}
+all.meta <- read.table("~/Dropbox/COVID19/Data/Updated/COVID19_scMeta-data.tsv",
+                       sep="\t", header=TRUE, stringsAsFactors=FALSE)
+rownames(all.meta) <- all.meta$CellID
+
+doublet.remove <- read.table("~/Dropbox/COVID19/Data/Updated/All_doublets.tsv",
+                             sep="\t", header=FALSE, stringsAsFactors=FALSE)
+
+# remove BGCV01_CV0209 and CV0198
+all.meta <- all.meta[!all.meta$sample_id %in% c("BGCV01_CV0902"), ]
+all.meta <- all.meta[!all.meta$patient_id %in% c("CV0198"), ]
+all.meta$Days_from_onset[all.meta$Days_from_onset %in% c("Not_known", "Healthy")] <- NA
+all.meta$Days_from_onset <- as.numeric(all.meta$Days_from_onset)
+
+n.cell.vecc <- table(all.meta$sample_id)
+
+all.meta <- all.meta[!all.meta$CellID %in% doublet.remove$V1, ]
+```
+
+
+```{r}
+cell.xtab <- as.data.frame(xtabs( ~ sample_id +  initial_clustering, data=all.meta))
+cell.cast <- dcast(cell.xtab, sample_id ~ initial_clustering, value.var='Freq')
+rownames(cell.cast) <- cell.cast$sample_id
+cell.cast <- cell.cast[, -1]
+cell.freq <- as.data.frame(t(sapply(rownames(cell.cast), FUN=function(X) as.numeric(cell.cast[X, ]/n.cell.vecc[X]),
+                                    simplify=TRUE)))
+
+rownames(cell.freq) <- rownames(cell.cast)
+colnames(cell.freq) <- colnames(cell.cast)
+cell.freq$sample_id <- rownames(cell.cast)
+cell.melt <- melt(cell.freq, id.vars=c('sample_id'))
+colnames(cell.melt) <- c("sample_id", "CellType", "Freq")
+cell.melt$CellType <- as.character(cell.melt$CellType)
+
+cell.freq.merge <- merge(cell.melt, covid.meta, by='sample_id')
+```
+
+
+## Cluster-based DA analysis: days since symptom onset
+
+```{r}
+# set up testing model
+rownames(covid.meta) <- covid.meta$sample_id
+init.meta <- covid.meta[!covid.meta$Status_on_day_collection_summary %in% c("Non_covid", "LPS", "Healthy"), ]
+init.meta$OrderedSeverity <- ordered(init.meta$Status_on_day_collection_summary,
+                                      levels=c("Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
+init.meta <- init.meta[init.meta$Status_on_day_collection_summary %in% c("Mild", "Moderate", "Severe", "Critical"), ]
+
+init.meta$Days_from_onset[init.meta$Days_from_onset %in% c("Not_known")] <- NA
+init.meta$Days_from_onset <- as.numeric(init.meta$Days_from_onset)
+init.meta <- init.meta[!is.na(init.meta$Days_from_onset), ]
+
+init.model <- model.matrix(~ Sex + Age + Site + Days_from_onset, 
+                            data=init.meta[init.meta$Collection_Day %in% c("D0"), ])
+
+# count cells
+cell.freq.tab <- t(table(all.meta$sample_id[all.meta$Collection_Day %in% c("D0") &
+                                                !is.na(all.meta$Days_from_onset) &
+                                                !all.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Healthy", "Asymptomatic")],
+                         all.meta$initial_clustering[all.meta$Collection_Day %in% c("D0") &
+                                                         !is.na(all.meta$Days_from_onset) &
+                                                         !all.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Healthy", "Asymptomatic")]))
+
+cell.freq.tab <- cell.freq.tab[!rownames(cell.freq.tab) %in% c("Doublet", "Doublets", "Doublets:Bcell", "Doublets:Platelet"), ]
+test.samps <- intersect(colnames(cell.freq.tab), names(n.cell.vecc))
+cell.freq.tab <- cell.freq.tab[, test.samps]
+init.model <- init.model[colnames(cell.freq.tab), ]
+
+init.dge <- DGEList(cell.freq.tab, lib.size=log(n.cell.vecc[test.samps]))
+
+#estimate dispersions
+init.dge <- estimateDisp(init.dge, design=init.model)
+init.linear.fit <- glmQLFit(init.dge, init.model, robust=TRUE)
+init.res <- as.data.frame(topTags(glmQLFTest(init.linear.fit, coef=4), sort.by='none', n=Inf))
+init.res$CellType <- rownames(init.res)
+
+init.res$Sig <- as.numeric(init.res$FDR < 0.1)
+init.res$Diff <- sign(init.res$logFC)
+init.res$Diff[init.res$FDR > 0.1] <- 0
+table(init.res$Diff)
+```
+
+
+
+```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
+mx.lfc <- max(abs(init.res$logFC))
+eps <- mx.lfc * 0.05
+
+init.resplot.labels <- init.res$CellType
+init.resplot.labels[init.res$Sig == 0] <- ""
+
+ggplot(init.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
+    geom_point(shape=21, size=4) +
+    theme_cowplot() +
+    scale_fill_manual(values=c("black", "red")) +
+    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
+    guides(fill=guide_legend(title="FDR < 0.1")) +
+    geom_text_repel(aes(label=init.resplot.labels),
+                     fill='white') +
+    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
+    ggsave("~/Dropbox/COVID19/Updated_plots/All_edgeR_volcano_daysOnset-linear.pdf",
+           height=2.95, width=4.15, useDingbats=FALSE) +
+    NULL
+```
+
+
+Show a plot of just the DA clusters.
+
+```{r, warning=FALSE, message=FALSE, fig.height=5.15, fig.width=9.95}
+da.clusters <- unique(c(init.resplot.labels))
+
+group.cols <- c("#2ca02c", "#1f77b4", "#9467bd", "#fed976", "#fd8d3c", "#e31a1c", "#800026", "#252525")
+names(group.cols) <- c("Healthy", "LPS", "Non_covid", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical")
+
+# censor outliers to the 95th quantile for each cell type.
+plotting.df <- cell.freq.merge[cell.freq.merge$Collection_Day %in% c("D0") &
+                                   !cell.freq.merge$Status_on_day_collection_summary %in% c("LPS", "Non_covid") &
+                                   !is.na(cell.freq.merge$Days_from_onset) &
+                                   cell.freq.merge$CellType %in% da.clusters, ]
+
+q95.ct <- sapply(unique(plotting.df$CellType),
+                 FUN=function(x) quantile(plotting.df[plotting.df$CellType %in% x, ]$Freq, c(0.05, 0.95), na.rm=TRUE)[2])
+q95.df <- data.frame("CellType"=gsub(names(q95.ct), pattern="\\.95%", replacement=""), "Q95"=q95.ct)
+
+plotting.df <- merge(plotting.df, q95.df, by='CellType')
+plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Freq <- plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Q95
+
+plotting.df$Status_on_day_collection_summary <- ordered(plotting.df$Status_on_day_collection_summary,
+                                         levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
+
+ggplot(plotting.df[!plotting.df$Status_on_day_collection_summary %in% c("Healthy", "Asymptomatic"), ],
+       aes(x=Days_from_onset, y=Freq)) +
+    #geom_boxplot(outlier.size=0.5, coef=1.5) +
+    geom_point(aes(colour=Status_on_day_collection_summary), size=1) +
+    stat_smooth(method=MASS::rlm) +
+    theme_cowplot() +
+    facet_wrap(~CellType, scales="free_y", nrow=3) +
+    scale_colour_manual(values=group.cols) +
+    expand_limits(y=c(0)) +
+    theme(aspect=1/1.3,
+          legend.key.size=unit(0.4, "cm"),
+          strip.background=element_rect(fill='white', colour='white'),
+          strip.text=element_text(size=12)) +
+    labs(x="Severity", y="Proportion") +
+    guides(colour=guide_legend(title="Severity")) +
+    ggsave("~/Dropbox/COVID19/Updated_plots/All_proportions-DA_daysOnset-points.pdf",
+           height=5.15, width=9.95, useDingbats=FALSE) +
+    NULL
+```
+
+Many of these changes seem to be driven by critically ill patients.
+
+
+```{r, warning=FALSE, message=FALSE}
+write.table(init.res,
+            file="~/Dropbox/COVID19/Data/Updated/SymptomOnset_resTable.txt",
+            sep="\t", quote=FALSE, row.names=FALSE)
+
+# print P-values to html table
+clon.lm.pvalues <- data.frame("CellType"=init.res$CellType, "logFC"=init.res$logFC,
+                              "Pvalue"=init.res$PValue,
+                              "FDR"=init.res$FDR)
+
+kbl(clon.lm.pvalues) %>% kable_paper(full_width=FALSE) %>% 
+    save_kable("~/Dropbox/COVID19/Updated_plots/DA_symptomOnset_trend_pvals.html",
+               self_contained=TRUE)
+
+kable(clon.lm.pvalues)
+```
+
+
+## Cluster-based DA analysis: days since symptom onset - excluding critical patients
+
+```{r}
+# set up testing model
+rownames(covid.meta) <- covid.meta$sample_id
+sanscrit.meta <- covid.meta[!covid.meta$Status_on_day_collection_summary %in% c("Non_covid", "LPS", "Healthy", "Critical"), ]
+sanscrit.meta$Days_from_onset[sanscrit.meta$Days_from_onset %in% c("Not_known")] <- NA
+sanscrit.meta$Days_from_onset <- as.numeric(sanscrit.meta$Days_from_onset)
+sanscrit.meta <- sanscrit.meta[!is.na(sanscrit.meta$Days_from_onset), ]
+
+sanscrit.model <- model.matrix(~ Sex + Age + Site + Days_from_onset, 
+                            data=sanscrit.meta[sanscrit.meta$Collection_Day %in% c("D0"), ])
+
+# count cells
+cell.freq.tab <- t(table(all.meta$sample_id[all.meta$Collection_Day %in% c("D0") &
+                                                !is.na(all.meta$Days_from_onset) &
+                                                !all.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Healthy", "Asymptomatic", "Critical")],
+                         all.meta$initial_clustering[all.meta$Collection_Day %in% c("D0") &
+                                                         !is.na(all.meta$Days_from_onset) &
+                                                         !all.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Healthy", "Asymptomatic", "Critical")]))
+
+cell.freq.tab <- cell.freq.tab[!rownames(cell.freq.tab) %in% c("Doublet", "Doublets", "Doublets:Bcell", "Doublets:Platelet"), ]
+test.samps <- intersect(colnames(cell.freq.tab), names(n.cell.vecc))
+cell.freq.tab <- cell.freq.tab[, test.samps]
+sanscrit.model <- sanscrit.model[colnames(cell.freq.tab), ]
+
+sanscrit.dge <- DGEList(cell.freq.tab, lib.size=log(n.cell.vecc[test.samps]))
+
+#estimate dispersions
+sanscrit.dge <- estimateDisp(sanscrit.dge, design=sanscrit.model)
+sanscrit.linear.fit <- glmQLFit(sanscrit.dge, sanscrit.model, robust=TRUE)
+sanscrit.res <- as.data.frame(topTags(glmQLFTest(sanscrit.linear.fit, coef=4), sort.by='none', n=Inf))
+sanscrit.res$CellType <- rownames(sanscrit.res)
+
+sanscrit.res$Sig <- as.numeric(sanscrit.res$FDR < 0.1)
+sanscrit.res$Diff <- sign(sanscrit.res$logFC)
+sanscrit.res$Diff[sanscrit.res$FDR > 0.1] <- 0
+table(sanscrit.res$Diff)
+```
+
+
+
+```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
+mx.lfc <- max(abs(sanscrit.res$logFC))
+eps <- mx.lfc * 0.05
+
+sanscrit.resplot.labels <- sanscrit.res$CellType
+sanscrit.resplot.labels[sanscrit.res$Sig == 0] <- ""
+
+ggplot(sanscrit.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
+    geom_point(shape=21, size=4) +
+    theme_cowplot() +
+    scale_fill_manual(values=c("black", "red")) +
+    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
+    guides(fill=guide_legend(title="FDR < 0.1")) +
+    geom_text_repel(aes(label=sanscrit.resplot.labels),
+                     fill='white') +
+    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
+    ggsave("~/Dropbox/COVID19/Updated_plots/All_edgeR_volcano_daysOnset-noCritical-linear.pdf",
+           height=2.95, width=4.15, useDingbats=FALSE) +
+    NULL
+```
+
+
+Show a plot of just the DA clusters.
+
+```{r, warning=FALSE, message=FALSE, fig.height=5.15, fig.width=9.95}
+da.clusters <- unique(c(sanscrit.resplot.labels))
+
+group.cols <- c("#2ca02c", "#1f77b4", "#9467bd", "#fed976", "#fd8d3c", "#e31a1c", "#800026", "#252525")
+names(group.cols) <- c("Healthy", "LPS", "Non_covid", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical")
+
+# censor outliers to the 95th quantile for each cell type.
+plotting.df <- cell.freq.merge[cell.freq.merge$Collection_Day %in% c("D0") &
+                                   !cell.freq.merge$Status_on_day_collection_summary %in% c("LPS", "Non_covid") &
+                                   !is.na(cell.freq.merge$Days_from_onset) &
+                                   cell.freq.merge$CellType %in% da.clusters, ]
+
+q95.ct <- sapply(unique(plotting.df$CellType),
+                 FUN=function(x) quantile(plotting.df[plotting.df$CellType %in% x, ]$Freq, c(0.05, 0.95), na.rm=TRUE)[2])
+q95.df <- data.frame("CellType"=gsub(names(q95.ct), pattern="\\.95%", replacement=""), "Q95"=q95.ct)
+
+plotting.df <- merge(plotting.df, q95.df, by='CellType')
+plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Freq <- plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Q95
+
+plotting.df$Status_on_day_collection_summary <- ordered(plotting.df$Status_on_day_collection_summary,
+                                         levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe"))
+
+ggplot(plotting.df[!plotting.df$Status_on_day_collection_summary %in% c("Healthy", "Asymptomatic", "Critical"), ],
+       aes(x=Days_from_onset, y=Freq)) +
+    #geom_boxplot(outlier.size=0.5, coef=1.5) +
+    geom_point(aes(colour=Status_on_day_collection_summary), size=1) +
+    stat_smooth(method=MASS::rlm) +
+    theme_cowplot() +
+    facet_wrap(~CellType, scales="free_y", nrow=3) +
+    scale_colour_manual(values=group.cols) +
+    expand_limits(y=c(0)) +
+    theme(aspect=1/1.3,
+          legend.key.size=unit(0.4, "cm"),
+          strip.background=element_rect(fill='white', colour='white'),
+          strip.text=element_text(size=12)) +
+    labs(x="Severity", y="Proportion") +
+    guides(colour=guide_legend(title="Symptom duration")) +
+    ggsave("~/Dropbox/COVID19/Updated_plots/All_proportions-DA_daysOnset-noCritical-points.pdf",
+           height=5.15, width=9.95, useDingbats=FALSE) +
+    NULL
+```
+
+Many of these changes seem to be driven by critically ill patients.
+
+
+```{r, warning=FALSE, message=FALSE}
+write.table(sanscrit.res,
+            file="~/Dropbox/COVID19/Data/Updated/SymptomOnset-noCritical_resTable.txt",
+            sep="\t", quote=FALSE, row.names=FALSE)
+
+# print P-values to html table
+clon.lm.pvalues <- data.frame("CellType"=sanscrit.res$CellType, "logFC"=sanscrit.res$logFC,
+                              "Pvalue"=sanscrit.res$PValue,
+                              "FDR"=sanscrit.res$FDR)
+
+kbl(clon.lm.pvalues) %>% kable_paper(full_width=FALSE) %>% 
+    save_kable("~/Dropbox/COVID19/Updated_plots/DA_symptomOnset-noCritical_trend_pvals.html",
+               self_contained=TRUE)
+
+kable(clon.lm.pvalues)
+```
+
+
+Compare the log fold changes with and without the critical patients.
+
+```{r}
+colnames(init.res) <- c(paste0("Full.", colnames(init.res)[c(1:5)]), "CellType", "Full.Sig", "Full.Diff")
+colnames(sanscrit.res) <- c(paste0("woCritical.", colnames(sanscrit.res)[c(1:5)]), "CellType", "woCritical.Sig", "woCritical.Diff")
+```
+
+
+```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=2.95}
+compare.model <- merge(init.res, sanscrit.res, by='CellType')
+
+compare.model$Sigs <- 0
+compare.model$Sigs[compare.model$Full.Sig == 1 & compare.model$woCritical.Sig == 1] <- 1
+
+cell.labels <- compare.model$CellType
+cell.labels[compare.model$Sigs == 0] <- ""
+
+ggplot(compare.model, aes(x=Full.logFC, y=woCritical.logFC)) +
+    geom_hline(yintercept=0, lty=2) +
+    geom_vline(xintercept=0, lty=2) +
+    geom_point() +
+    theme_cowplot() +
+    labs(x="log Fold Change: Full model",
+         y="log Fold Change: w/o Critical") +
+    geom_text_repel(aes(label=cell.labels),
+                    force=50) +
+    ggsave("~/Dropbox/COVID19/Updated_plots/SymptomOnset_compare-points.pdf",
+           height=2.95, width=2.95, useDingbats=FALSE) +
+    NULL
+```
+
+
+## Sensitivity analysis
+
+Remove the outliers w.r.t. time since symptom onset to test the sensitivity.
+
+## Cluster-based DA analysis: days since symptom onset - excluding > 24 days onset
+
+```{r, fig.height=2.95, fig.width=3.95,}
+
+ggplot(covid.meta[covid.meta$Collection_Day %in% c("D0") &
+                      covid.meta$Days_from_onset <= 24 &
+                      !covid.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Asymptomatic", "Healthy") &
+                      !is.na(covid.meta$Days_from_onset), ],
+       aes(x=OrderedSeverity, y=Days_from_onset)) +
+    geom_boxplot() +
+    labs(x="Disease severity", y="Days from\nsymptom onset") +
+    theme_cowplot()  +
+    theme(aspect=1,
+          axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
+    ggsave("~/Dropbox/COVID19/Updated_plots/SymptomOnset_severity-sensitivityAnalysis-boxplot.pdf",
+           height=2.95, width=3.95, useDingbats=FALSE) +
+    NULL
+```
+
+This shows the distribution of symptom duration w.r.t. disease severity.
+
+```{r}
+# set up testing model
+rownames(covid.meta) <- covid.meta$sample_id
+symoutlier.meta <- covid.meta[!covid.meta$Status_on_day_collection_summary %in% c("Non_covid", "LPS", "Healthy"), ]
+symoutlier.meta$Days_from_onset[symoutlier.meta$Days_from_onset %in% c("Not_known")] <- NA
+symoutlier.meta$Days_from_onset <- as.numeric(symoutlier.meta$Days_from_onset)
+symoutlier.meta <- symoutlier.meta[!is.na(symoutlier.meta$Days_from_onset), ]
+symoutlier.meta <- symoutlier.meta[symoutlier.meta$Days_from_onset <= 24, ]
+
+symoutlier.model <- model.matrix(~ Sex + Age + Site + Days_from_onset, 
+                            data=symoutlier.meta[symoutlier.meta$Collection_Day %in% c("D0"), ])
+
+# count cells
+cell.freq.tab <- t(table(all.meta$sample_id[all.meta$Collection_Day %in% c("D0") &
+                                                !is.na(all.meta$Days_from_onset) &
+                                                all.meta$Days_from_onset <= 24 &
+                                                !all.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Healthy")],
+                         all.meta$initial_clustering[all.meta$Collection_Day %in% c("D0") &
+                                                         !is.na(all.meta$Days_from_onset) &
+                                                         all.meta$Days_from_onset <= 24 &
+                                                         !all.meta$Status_on_day_collection_summary %in% c("LPS", "Non_covid", "Healthy")]))
+
+cell.freq.tab <- cell.freq.tab[!rownames(cell.freq.tab) %in% c("Doublet", "Doublets", "Doublets:Bcell", "Doublets:Platelet"), ]
+test.samps <- intersect(colnames(cell.freq.tab), names(n.cell.vecc))
+cell.freq.tab <- cell.freq.tab[, test.samps]
+symoutlier.model <- symoutlier.model[colnames(cell.freq.tab), ]
+
+symoutlier.dge <- DGEList(cell.freq.tab, lib.size=log(n.cell.vecc[test.samps]))
+
+#estimate dispersions
+symoutlier.dge <- estimateDisp(symoutlier.dge, design=symoutlier.model)
+symoutlier.linear.fit <- glmQLFit(symoutlier.dge, symoutlier.model, robust=TRUE)
+symoutlier.res <- as.data.frame(topTags(glmQLFTest(symoutlier.linear.fit, coef=4), sort.by='none', n=Inf))
+symoutlier.res$CellType <- rownames(symoutlier.res)
+
+symoutlier.res$Sig <- as.numeric(symoutlier.res$FDR < 0.1)
+symoutlier.res$Diff <- sign(symoutlier.res$logFC)
+symoutlier.res$Diff[symoutlier.res$FDR > 0.1] <- 0
+table(symoutlier.res$Diff)
+```
+
+
+
+```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
+mx.lfc <- max(abs(symoutlier.res$logFC))
+eps <- mx.lfc * 0.05
+
+symoutlier.resplot.labels <- symoutlier.res$CellType
+symoutlier.resplot.labels[symoutlier.res$Sig == 0] <- ""
+
+ggplot(symoutlier.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
+    geom_point(shape=21, size=4) +
+    theme_cowplot() +
+    scale_fill_manual(values=c("black", "red")) +
+    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
+    guides(fill=guide_legend(title="FDR < 0.1")) +
+    geom_text_repel(aes(label=symoutlier.resplot.labels),
+                     fill='white') +
+    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
+    ggsave("~/Dropbox/COVID19/Updated_plots/All_edgeR_volcano_daysOnset-SensitivityAnalysis-linear.pdf",
+           height=2.95, width=4.15, useDingbats=FALSE) +
+    NULL
+```
+
+
+Show a plot of just the DA clusters.
+
+```{r, warning=FALSE, message=FALSE, fig.height=5.15, fig.width=9.95}
+da.clusters <- unique(c(symoutlier.resplot.labels))
+
+group.cols <- c("#2ca02c", "#1f77b4", "#9467bd", "#fed976", "#fd8d3c", "#e31a1c", "#800026", "#252525")
+names(group.cols) <- c("Healthy", "LPS", "Non_covid", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical")
+
+# censor outliers to the 95th quantile for each cell type.
+plotting.df <- cell.freq.merge[cell.freq.merge$Collection_Day %in% c("D0") &
+                                   !cell.freq.merge$Status_on_day_collection_summary %in% c("LPS", "Non_covid") &
+                                   !is.na(cell.freq.merge$Days_from_onset) &
+                                   cell.freq.merge$CellType %in% da.clusters, ]
+
+q95.ct <- sapply(unique(plotting.df$CellType),
+                 FUN=function(x) quantile(plotting.df[plotting.df$CellType %in% x, ]$Freq, c(0.05, 0.95), na.rm=TRUE)[2])
+q95.df <- data.frame("CellType"=gsub(names(q95.ct), pattern="\\.95%", replacement=""), "Q95"=q95.ct)
+
+plotting.df <- merge(plotting.df, q95.df, by='CellType')
+plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Freq <- plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Q95
+
+plotting.df$Status_on_day_collection_summary <- ordered(plotting.df$Status_on_day_collection_summary,
+                                         levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe"))
+
+ggplot(plotting.df[!plotting.df$Status_on_day_collection_summary %in% c("Healthy") &
+                       plotting.df$Days_from_onset <= 24, ],
+       aes(x=Days_from_onset, y=Freq)) +
+    #geom_boxplot(outlier.size=0.5, coef=1.5) +
+    geom_point(aes(colour=Status_on_day_collection_summary), size=1) +
+    stat_smooth(method=MASS::rlm) +
+    theme_cowplot() +
+    facet_wrap(~CellType, scales="free_y", nrow=3) +
+    scale_colour_manual(values=group.cols) +
+    expand_limits(y=c(0)) +
+    theme(aspect=1/1.3,
+          legend.key.size=unit(0.4, "cm"),
+          strip.background=element_rect(fill='white', colour='white'),
+          strip.text=element_text(size=12)) +
+    labs(x="Symptom duration", y="Proportion") +
+    guides(colour=guide_legend(title="Symptom duration")) +
+    ggsave("~/Dropbox/COVID19/Updated_plots/All_proportions-DA_daysOnset-sensitivityAnalysis-points.pdf",
+           height=5.15, width=9.95, useDingbats=FALSE) +
+    NULL
+```
+
+This shows the relationship between symptom duration and cell type abundance, excluding the outlier patients.
+
+
+```{r, warning=FALSE, message=FALSE}
+write.table(symoutlier.res,
+            file="~/Dropbox/COVID19/Data/Updated/SymptomOnset-sensitivityAnalysis_resTable.txt",
+            sep="\t", quote=FALSE, row.names=FALSE)
+
+# print P-values to html table
+clon.lm.pvalues <- data.frame("CellType"=symoutlier.res$CellType, "logFC"=symoutlier.res$logFC,
+                              "Pvalue"=symoutlier.res$PValue,
+                              "FDR"=symoutlier.res$FDR)
+
+kbl(clon.lm.pvalues) %>% kable_paper(full_width=FALSE) %>% 
+    save_kable("~/Dropbox/COVID19/Updated_plots/DA_symptomOnset-sensitivityAnalysis_trend_pvals.html",
+               self_contained=TRUE)
+
+kable(clon.lm.pvalues)
+```
+
+
+Compare the log fold changes with and without the critical patients.
+
+```{r}
+# colnames(init.res) <- c(paste0("Full.", colnames(init.res)[c(1:5)]), "CellType", "Full.Sig", "Full.Diff")
+colnames(symoutlier.res) <- c(paste0("Ltday24.", colnames(symoutlier.res)[c(1:5)]), "CellType", "woCritical.Sig", "woCritical.Diff")
+```
+
+
+```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=3.35}
+compare.model <- merge(init.res, symoutlier.res, by='CellType')
+
+compare.model$Sigs <- 0
+compare.model$Sigs[compare.model$Full.Sig == 1 & compare.model$woCritical.Sig == 1] <- 1
+
+cell.labels <- compare.model$CellType
+cell.labels[compare.model$Sigs == 0] <- ""
+
+ggplot(compare.model, aes(x=Full.logFC, y=Ltday24.logFC)) +
+    geom_hline(yintercept=0, lty=2) +
+    geom_vline(xintercept=0, lty=2) +
+    geom_point() +
+    theme_cowplot() +
+    labs(x="log Fold Change: Full model",
+         y="log Fold Change: Symptoms\n<24 days") +
+    geom_text_repel(aes(label=cell.labels),
+                    force=50) +
+    ggsave("~/Dropbox/COVID19/Updated_plots/SymptomOnset_compare-sensitivityAnalysis-points.pdf",
+           height=2.95, width=3.35, useDingbats=FALSE) +
+    NULL
+```
+
+
+