b/TcellAnalysis/notebooks/COVID19_Tcell_DA.Rmd
+---
+title: "COVID19: T cell differential abundance"
+output: html_notebook
+---
+Performing differential abundance testing using a negative binomial GLM. Cell counts per donor sample are used as input, and the total number of cells
+captured (after QC) are used to normalize the model counts.
+```{r, warning=FALSE, message=FALSE}
+library(ggplot2)
+library(ggsci)
+library(cowplot)
+library(RColorBrewer)
+library(reshape2)
+library(colorspace)
+library(ggthemes)
+library(edgeR)
+library(scales)
+library(ggrepel)
+library(dplyr)
+```
+```{r}
+covid.meta <- read.csv("~/Dropbox/COVID19/Data/Metadata FINAL 10122020.csv",
+                        header=TRUE, stringsAsFactors=FALSE)
+covid.meta$Days_from_onset[covid.meta$Days_from_onset %in% c("Not_known", "Healthy")] <- NA
+covid.meta$Days_from_onset <- as.numeric(covid.meta$Days_from_onset)
+cell.freq.merge <- read.table("~/Dropbox/COVID19/Data/Tcell_proportions.tsv",
+                              sep="\t", header=TRUE, stringsAsFactors=FALSE)
+cell.freq.merge <- cell.freq.merge[!cell.freq.merge$D0_status_summary %in% c("Non_covid"), ]
+cell.freq.merge <- cell.freq.merge[!cell.freq.merge$patient_id %in% c("CV0198"), ]
+# remove doublets, etc
+cell.freq.merge <- cell.freq.merge[!cell.freq.merge$CellType %in% c("Doublets:Bcell", "Doublets:Platelet", "Doublets", "NK", "ILCs"), ]
+```
+```{r, warning=FALSE, message=FALSE}
+all.meta <- read.table("~/Dropbox/COVID19/Data/COVID19_scMeta-data.tsv",
+                       sep="\t", header=TRUE, stringsAsFactors=FALSE)
+rownames(all.meta) <- all.meta$CellID
+# remove BGCV01_CV0209 and CV0198
+all.meta <- all.meta[!all.meta$sample_id %in% c("BGCV01_CV0902"), ]
+all.meta <- all.meta[!all.meta$patient_id %in% c("CV0198"), ]
+all.meta$Days_from_onset[all.meta$Days_from_onset %in% c("Not_known", "Healthy")] <- NA
+all.meta$Days_from_onset <- as.numeric(all.meta$Days_from_onset)
+n.cell.vecc <- table(all.meta$sample_id)
+```
+```{r}
+tcell.annots <- read.table("~/Dropbox/COVID19/Data/Tcell_annotations_ext.tsv",
+                           sep="\t", header=TRUE, stringsAsFactors=FALSE)
+tcell.df.merge <- merge(tcell.annots, all.meta, by='CellID')
+```
+## Cluster-baed DA analysis: Infected vs. controls
+```{r}
+# set up testing model
+rownames(covid.meta) <- covid.meta$sample_id
+tcell.meta <- covid.meta[!covid.meta$D0_status_summary %in% c("Non_covid", "LPS"), ]
+tcell.meta$OrderedSeverity <- ordered(tcell.meta$D0_status_summary,
+                                      levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
+tcell.meta$Infected <- ordered(tcell.meta$Status,
+                               levels=c("Healthy", "Covid"))
+tcell.model <- model.matrix(~ Sex + Age + Infected, data=tcell.meta[tcell.meta$Collection_Day %in% c("D0"), ])
+# count cells
+cell.freq.tab <- t(table(tcell.df.merge$sample_id[tcell.df.merge$Collection_Day %in% c("D0") &
+                                                        !tcell.df.merge$D0_status_summary %in% c("LPS", "Non_covid")],
+                         tcell.df.merge$Sub.Annotation[tcell.df.merge$Collection_Day %in% c("D0") &
+                                                        !tcell.df.merge$D0_status_summary %in% c("LPS", "Non_covid")]))
+cell.freq.tab <- cell.freq.tab[!rownames(cell.freq.tab) %in% c("NK", "ILCs", "Doublets", "Doublets:Bcell", "Doublets:Platelet"), ]
+test.samps <- intersect(colnames(cell.freq.tab), names(n.cell.vecc))
+cell.freq.tab <- cell.freq.tab[, test.samps]
+tcell.model <- tcell.model[colnames(cell.freq.tab), ]
+tcell.dge <- DGEList(cell.freq.tab, lib.size=log(n.cell.vecc[test.samps]))
+#estimate dispersions
+tcell.dge <- estimateDisp(tcell.dge, design=tcell.model)
+tcell.linear.fit <- glmQLFit(tcell.dge, tcell.model, robust=TRUE)
+tcell.res <- as.data.frame(topTags(glmQLFTest(tcell.linear.fit, coef=4), sort.by='none', n=Inf))
+tcell.res$CellType <- rownames(tcell.res)
+tcell.res$Sig <- as.numeric(tcell.res$FDR < 0.1)
+tcell.res$Diff <- sign(tcell.res$logFC)
+tcell.res$Diff[tcell.res$FDR > 0.1] <- 0
+table(tcell.res$Diff)
+```
+```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
+mx.lfc <- max(abs(tcell.res$logFC))
+eps <- mx.lfc * 0.05
+infect.resplot.labels <- tcell.res$CellType
+infect.resplot.labels[tcell.res$Sig == 0] <- ""
+ggplot(tcell.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
+    geom_point(shape=21, size=4) +
+    theme_cowplot() +
+    scale_fill_manual(values=c("black", "red")) +
+    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
+    guides(fill=guide_legend(title="FDR < 0.1")) +
+    geom_text_repel(aes(label=infect.resplot.labels),
+                     fill='white') +
+    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
+    ggsave("~/Dropbox/COVID19/plot.dir/Tcell_edgeR_volcano_caseVscontrol-linear.pdf",
+           height=2.95, width=4.15, useDingbats=FALSE) +
+    NULL
+```
+## Cluster-baed DA analysis: COVID19 severity
+```{r}
+# set up testing model
+rownames(covid.meta) <- covid.meta$sample_id
+tcell.meta <- covid.meta[!covid.meta$D0_status_summary %in% c("Non_covid", "LPS", "Healthy"), ]
+tcell.meta$OrderedSeverity <- ordered(tcell.meta$D0_status_summary,
+                                      levels=c("Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
+tcell.model <- model.matrix(~ Sex + Age + OrderedSeverity, data=tcell.meta[tcell.meta$Collection_Day %in% c("D0"), ])
+# count cells
+cell.freq.tab <- t(table(tcell.df.merge$sample_id[tcell.df.merge$Collection_Day %in% c("D0") &
+                                                        !tcell.df.merge$D0_status_summary %in% c("LPS", "Non_covid", "Healthy")],
+                         tcell.df.merge$Sub.Annotation[tcell.df.merge$Collection_Day %in% c("D0") &
+                                                        !tcell.df.merge$D0_status_summary %in% c("LPS", "Non_covid", "Healthy")]))
+cell.freq.tab <- cell.freq.tab[!rownames(cell.freq.tab) %in% c("NK", "ILCs", "Doublets", "Doublets:Bcell", "Doublets:Platelet"), ]
+test.samps <- intersect(colnames(cell.freq.tab), names(n.cell.vecc))
+cell.freq.tab <- cell.freq.tab[, test.samps]
+tcell.model <- tcell.model[colnames(cell.freq.tab), ]
+tcell.dge <- DGEList(cell.freq.tab, lib.size=log(n.cell.vecc[test.samps]))
+#estimate dispersions
+tcell.dge <- estimateDisp(tcell.dge, design=tcell.model)
+tcell.linear.fit <- glmQLFit(tcell.dge, tcell.model, robust=TRUE)
+tcell.res <- as.data.frame(topTags(glmQLFTest(tcell.linear.fit, coef=4), sort.by='none', n=Inf))
+tcell.res$CellType <- rownames(tcell.res)
+tcell.res$Sig <- as.numeric(tcell.res$FDR < 0.1)
+tcell.res$Diff <- sign(tcell.res$logFC)
+tcell.res$Diff[tcell.res$FDR > 0.1] <- 0
+table(tcell.res$Diff)
+```
+```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
+mx.lfc <- max(abs(tcell.res$logFC))
+eps <- mx.lfc * 0.05
+tcell.resplot.labels <- tcell.res$CellType
+tcell.resplot.labels[tcell.res$Sig == 0] <- ""
+ggplot(tcell.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
+    geom_point(shape=21, size=4) +
+    theme_cowplot() +
+    scale_fill_manual(values=c("black", "red")) +
+    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
+    guides(fill=guide_legend(title="FDR < 0.1")) +
+    geom_text_repel(aes(label=tcell.resplot.labels),
+                     fill='white') +
+    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
+    ggsave("~/Dropbox/COVID19/plot.dir/Tcell_edgeR_volcano_caseOnly-linear.pdf",
+           height=2.95, width=4.15, useDingbats=FALSE) +
+    NULL
+```
+Quadratic changes.
+```{r}
+tcell.quad.res <- as.data.frame(topTags(glmQLFTest(tcell.linear.fit, coef=5), sort.by='none', n=Inf))
+tcell.quad.res$CellType <- rownames(tcell.quad.res)
+tcell.quad.res$Sig <- as.numeric(tcell.quad.res$FDR < 0.1)
+tcell.quad.res$Diff <- sign(tcell.quad.res$logFC)
+tcell.quad.res$Diff[tcell.quad.res$FDR > 0.1] <- 0
+table(tcell.quad.res$Diff)
+```
+```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
+mx.lfc <- max(abs(tcell.quad.res$logFC))
+eps <- mx.lfc * 0.05
+tcell.quad.resplot.labels <- tcell.quad.res$CellType
+tcell.quad.resplot.labels[tcell.quad.res$Sig == 0] <- ""
+ggplot(tcell.quad.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
+    geom_point(shape=21, size=4) +
+    theme_cowplot() +
+    scale_fill_manual(values=c("black", "red")) +
+    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
+    guides(fill=guide_legend(title="FDR < 0.1")) +
+    geom_text_repel(aes(label=tcell.quad.resplot.labels),
+                    force=10,
+                    fill='white') +
+    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
+    ggsave("~/Dropbox/COVID19/plot.dir/Tcell_edgeR_volcano_caseOnly-quadratic.pdf",
+           height=2.95, width=4.15, useDingbats=FALSE) +
+    NULL
+```
+Show a plot of just the DA clusters.
+```{r, warning=FALSE, message=FALSE, fig.height=4.15, fig.width=9.95}
+da.clusters <- setdiff(unique(c(tcell.resplot.labels, tcell.quad.resplot.labels)), "CD4.Th17")
+group.cols <- c("#2ca02c", "#1f77b4", "#9467bd", "#fed976", "#fd8d3c", "#e31a1c", "#800026", "#252525")
+names(group.cols) <- c("Healthy", "LPS", "Non_covid", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical")
+# censor outliers to the 95th quantile for each cell type.
+plotting.df <- cell.freq.merge[cell.freq.merge$Collection_Day %in% c("D0") &
+                           !cell.freq.merge$D0_status_summary %in% c("LPS") &
+                           cell.freq.merge$CellType %in% da.clusters, ]
+q95.ct <- sapply(unique(plotting.df$CellType),
+                 FUN=function(x) quantile(plotting.df[plotting.df$CellType %in% x, ]$Freq, c(0.05, 0.95), na.rm=TRUE)[2])
+q95.df <- data.frame("CellType"=gsub(names(q95.ct), pattern="\\.95%", replacement=""), "Q95"=q95.ct)
+plotting.df <- merge(plotting.df, q95.df, by='CellType')
+plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Freq <- plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Q95
+plotting.df$D0_status_summary <- ordered(plotting.df$D0_status_summary,
+                                         levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
+ggplot(plotting.df[!plotting.df$D0_status_summary %in% c("Healthy"), ],
+       aes(x=D0_status_summary, y=Freq, fill=D0_status_summary)) +
+    geom_boxplot(outlier.size=0.5, coef=1.5) +
+    theme_cowplot() +
+    facet_wrap(~CellType, scales="free_y", nrow=3) +
+    scale_fill_manual(values=group.cols) +
+    expand_limits(y=c(0)) +
+    theme(aspect=1/1.3,
+          axis.text.x=element_blank(),
+          axis.ticks.x=element_blank(),
+          legend.key.size=unit(0.4, "cm"),
+          strip.background=element_rect(fill='white', colour='white'),
+          strip.text=element_text(size=12)) +
+    labs(x="Severity", y="Proportion") +
+    guides(fill=guide_legend(title="Severity")) +
+    ggsave("~/Dropbox/COVID19/plot.dir/Tcells_proportions-DA_CasesOnly-boxplot.pdf",
+           height=4.15, width=9.95, useDingbats=FALSE) +
+    NULL
+```
+Also show the equivalent plot of healthy vs. infected.
+```{r, warning=FALSE, message=FALSE, fig.height=4.15, fig.width=9.95}
+infect.da.clusters <- setdiff(unique(c(infect.resplot.labels)), "CD4.Th17")
+group.cols <- c("#2ca02c", "#252525")
+names(group.cols) <- c("Healthy", "COVID-19")
+# censor outliers to the 95th quantile for each cell type.
+plotting.df <- cell.freq.merge[cell.freq.merge$Collection_Day %in% c("D0") &
+                           !cell.freq.merge$D0_status_summary %in% c("LPS") &
+                           cell.freq.merge$CellType %in% infect.da.clusters, ]
+q95.ct <- sapply(unique(plotting.df$CellType),
+                 FUN=function(x) quantile(plotting.df[plotting.df$CellType %in% x, ]$Freq, c(0.05, 0.95), na.rm=TRUE)[2])
+q95.df <- data.frame("CellType"=gsub(names(q95.ct), pattern="\\.95%", replacement=""), "Q95"=q95.ct)
+plotting.df <- merge(plotting.df, q95.df, by='CellType')
+plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Freq <- plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Q95
+plotting.df$Status <- factor(plotting.df$Status,
+                             levels=c("Healthy", "Covid"),
+                             labels=c("Healthy", "COVID-19"))
+ggplot(plotting.df,
+       aes(x=Status, y=Freq, fill=Status)) +
+    geom_boxplot(outlier.size=0.5, coef=1.5) +
+    theme_cowplot() +
+    facet_wrap(~CellType, scales="free_y", nrow=3) +
+    scale_fill_manual(values=group.cols) +
+    expand_limits(y=c(0)) +
+    theme(aspect=1/1.3,
+          axis.text.x=element_blank(),
+          axis.ticks.x=element_blank(),
+          legend.key.size=unit(0.4, "cm"),
+          strip.background=element_rect(fill='white', colour='white'),
+          strip.text=element_text(size=12)) +
+    labs(x="", y="Proportion") +
+    guides(fill=guide_legend(title="Status")) +
+    ggsave("~/Dropbox/COVID19/plot.dir/Tcells_proportions-DA_CaseVsControls-boxplot.pdf",
+           height=4.15, width=9.95, useDingbats=FALSE) +
+    NULL
+```
+## Cluster-based DA analysis: days since symptom onset
+```{r}
+# set up testing model
+rownames(covid.meta) <- covid.meta$sample_id
+tcell.meta <- covid.meta[!covid.meta$D0_status_summary %in% c("Non_covid", "LPS", "Healthy"), ]
+tcell.meta$OrderedSeverity <- ordered(tcell.meta$D0_status_summary,
+                                      levels=c("Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
+tcell.meta <- tcell.meta[tcell.meta$D0_status_summary %in% c("Mild", "Moderate", "Severe", "Critical"), ]
+tcell.meta$Days_from_onset[tcell.meta$Days_from_onset %in% c("Not_known")] <- NA
+tcell.meta$Days_from_onset <- as.numeric(tcell.meta$Days_from_onset)
+tcell.meta <- tcell.meta[!is.na(tcell.meta$Days_from_onset), ]
+tcell.model <- model.matrix(~ Sex + Age + Days_from_onset,
+                            data=tcell.meta[tcell.meta$Collection_Day %in% c("D0"), ])
+# count cells
+cell.freq.tab <- t(table(tcell.df.merge$sample_id[tcell.df.merge$Collection_Day %in% c("D0") &
+                                                      !is.na(tcell.df.merge$Days_from_onset) &
+                                                        !tcell.df.merge$D0_status_summary %in% c("LPS", "Non_covid", "Healthy", "Asymptomatic")],
+                         tcell.df.merge$Sub.Annotation[tcell.df.merge$Collection_Day %in% c("D0") &
+                                                           !is.na(tcell.df.merge$Days_from_onset) &
+                                                        !tcell.df.merge$D0_status_summary %in% c("LPS", "Non_covid", "Healthy", "Asymptomatic")]))
+cell.freq.tab <- cell.freq.tab[!rownames(cell.freq.tab) %in% c("NK", "ILCs", "Doublets", "Doublets:Bcell", "Doublets:Platelet"), ]
+test.samps <- intersect(colnames(cell.freq.tab), names(n.cell.vecc))
+cell.freq.tab <- cell.freq.tab[, test.samps]
+tcell.model <- tcell.model[colnames(cell.freq.tab), ]
+tcell.dge <- DGEList(cell.freq.tab, lib.size=log(n.cell.vecc[test.samps]))
+#estimate dispersions
+tcell.dge <- estimateDisp(tcell.dge, design=tcell.model)
+tcell.linear.fit <- glmQLFit(tcell.dge, tcell.model, robust=TRUE)
+tcell.res <- as.data.frame(topTags(glmQLFTest(tcell.linear.fit, coef=4), sort.by='none', n=Inf))
+tcell.res$CellType <- rownames(tcell.res)
+tcell.res$Sig <- as.numeric(tcell.res$FDR < 0.1)
+tcell.res$Diff <- sign(tcell.res$logFC)
+tcell.res$Diff[tcell.res$FDR > 0.1] <- 0
+table(tcell.res$Diff)
+```
+```{r, warning=FALSE, message=FALSE, fig.height=2.95, fig.width=4.15}
+mx.lfc <- max(abs(tcell.res$logFC))
+eps <- mx.lfc * 0.05
+tcell.resplot.labels <- tcell.res$CellType
+tcell.resplot.labels[tcell.res$Sig == 0] <- ""
+ggplot(tcell.res, aes(x=logFC, y=-log10(FDR), fill=as.character(Sig))) +
+    geom_point(shape=21, size=4) +
+    theme_cowplot() +
+    scale_fill_manual(values=c("black", "red")) +
+    scale_x_continuous(limits=c(-mx.lfc - eps, mx.lfc + eps), oob=squish) +
+    guides(fill=guide_legend(title="FDR < 0.1")) +
+    geom_text_repel(aes(label=tcell.resplot.labels),
+                     fill='white') +
+    labs(x="log fold change", y=expression(paste("-log"[10], " FDR"))) +
+    ggsave("~/Dropbox/COVID19/plot.dir/Tcell_edgeR_volcano_daysOnset-linear.pdf",
+           height=2.95, width=4.15, useDingbats=FALSE) +
+    NULL
+```
+Show a plot of just the DA clusters.
+```{r, warning=FALSE, message=FALSE, fig.height=4.15, fig.width=9.95}
+da.clusters <- setdiff(unique(c(tcell.resplot.labels, tcell.quad.resplot.labels)), "CD4.Th17")
+group.cols <- c("#2ca02c", "#1f77b4", "#9467bd", "#fed976", "#fd8d3c", "#e31a1c", "#800026", "#252525")
+names(group.cols) <- c("Healthy", "LPS", "Non_covid", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical")
+# censor outliers to the 95th quantile for each cell type.
+plotting.df <- cell.freq.merge[cell.freq.merge$Collection_Day %in% c("D0") &
+                                   !cell.freq.merge$D0_status_summary %in% c("LPS") &
+                                   !is.na(cell.freq.merge$Days_from_onset) &
+                                   cell.freq.merge$CellType %in% da.clusters, ]
+q95.ct <- sapply(unique(plotting.df$CellType),
+                 FUN=function(x) quantile(plotting.df[plotting.df$CellType %in% x, ]$Freq, c(0.05, 0.95), na.rm=TRUE)[2])
+q95.df <- data.frame("CellType"=gsub(names(q95.ct), pattern="\\.95%", replacement=""), "Q95"=q95.ct)
+plotting.df <- merge(plotting.df, q95.df, by='CellType')
+plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Freq <- plotting.df[plotting.df$Freq >= plotting.df$Q95, ]$Q95
+plotting.df$D0_status_summary <- ordered(plotting.df$D0_status_summary,
+                                         levels=c("Healthy", "Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
+ggplot(plotting.df[!plotting.df$D0_status_summary %in% c("Healthy", "Asymptomatic"), ],
+       aes(x=Days_from_onset, y=Freq)) +
+    #geom_boxplot(outlier.size=0.5, coef=1.5) +
+    #geom_point(size=1) +
+    stat_smooth() +
+    theme_cowplot() +
+    facet_wrap(~CellType, scales="free_y", nrow=3) +
+    #scale_colour_manual(values=group.cols) +
+    expand_limits(y=c(0)) +
+    theme(aspect=1/1.3,
+          axis.text.x=element_blank(),
+          axis.ticks.x=element_blank(),
+          legend.key.size=unit(0.4, "cm"),
+          strip.background=element_rect(fill='white', colour='white'),
+          strip.text=element_text(size=12)) +
+    labs(x="Severity", y="Proportion") +
+    guides(fill=guide_legend(title="Severity")) +
+    ggsave("~/Dropbox/COVID19/plot.dir/Tcells_proportions-DA_daysOnset-points.pdf",
+           height=4.15, width=9.95, useDingbats=FALSE) +
+    NULL
+```