[c3d134]: / rna / processing / 2_QC.R

Download this file

139 lines (109 with data), 5.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
here::i_am("rna/processing/2_QC.R")
source(here::here("settings.R"))
#####################
## Define arguments ##
#####################
p <- ArgumentParser(description='')
p$add_argument('--metadata', type="character", help='Metadata')
p$add_argument('--outputdir', type="character", help='Output directory')
p$add_argument('--min_nFeature_RNA', type="integer", help='Minimum number of expressed genes')
p$add_argument('--max_nFeature_RNA', type="integer", help='Maximum number of expressed genes')
p$add_argument('--mitochondrial_percent_RNA', type="integer", help='Maximum percentage of mitochondrial reads')
p$add_argument('--ribosomal_percent_RNA', type="integer", help='Maximum percentage of ribosomal reads')
p$add_argument('--samples', type="character", nargs="+", help='Samples')
args <- p$parse_args(commandArgs(TRUE))
#####################
## Define settings ##
#####################
## START TEST ##
# args <- list()
# args$samples <- opts$samples
# # args$metadata <- paste0(io$basedir,"/processed_new/rna_new/metadata.txt.gz")
# args$metadata <- io$metadata
# args$min_nFeature_RNA <- 2000
# args$max_nFeature_RNA <- 10000
# args$mitochondrial_percent_RNA <- 40
# args$ribosomal_percent_RNA <- 20
# args$outputdir <- paste0(io$basedir,"/results_new/rna/qc")
## END TEST ##
# Sanity checks
stopifnot(args$samples%in%opts$samples)
###############
## Load data ##
###############
metadata <- fread(args$metadata) %>%
.[sample%in%args$samples] %>%
# .[,pass_rnaQC:=nFeature_RNA>args$min_nFeature_RNA & nCount_RNA>2**args$log_nCount_RNA & mitochondrial_percent_RNA<args$mitochondrial_percent_RNA]
.[,pass_rnaQC:=nFeature_RNA<=args$max_nFeature_RNA & nFeature_RNA>=args$min_nFeature_RNA & mitochondrial_percent_RNA<args$mitochondrial_percent_RNA & ribosomal_percent_RNA<args$ribosomal_percent_RNA]
# temporary
metadata[stage=="E8.55",stage:="E8.5"]
table(metadata$pass_rnaQC)
# Ad hoc editing because of outlier cluster of high rRNA in the E7.75 sample
metadata[sample=="E7.75_rep1" & ribosomal_percent_RNA>=8,pass_rnaQC:=FALSE]
#####################
## Plot QC metrics ##
#####################
to.plot <- metadata %>% .[pass_rnaQC==TRUE] %>%
.[nFeature_RNA<=8000 & mitochondrial_percent_RNA<=60 & ribosomal_percent_RNA<=16] %>% # remove massive outliers for plotting
# .[,log_nFeature_RNA:=log10(nFeature_RNA)] %>%
melt(id.vars=c("sample","cell","stage"), measure.vars=c("nFeature_RNA","mitochondrial_percent_RNA","ribosomal_percent_RNA"))
# melt(id.vars=c("sample","cell"), measure.vars=c("nFeature_RNA"))
facet.labels <- c("nFeature_RNA" = "Num. of genes", "mitochondrial_percent_RNA" = "Mitochondrial %", "ribosomal_percent_RNA" = "Ribosomal %")
## Box plot
p <- ggplot(to.plot, aes_string(x="sample", y="value", fill="stage")) +
geom_boxplot(outlier.shape=NA, coef=1) +
facet_wrap(~variable, scales="free_y", nrow=1, labeller = as_labeller(facet.labels)) +
scale_fill_manual(values=opts$stage.colors) +
theme_classic() +
theme(
axis.text.y = element_text(colour="black",size=rel(1)),
axis.text.x = element_text(colour="black",size=rel(0.65), angle=20, hjust=1, vjust=1),
axis.title.x = element_blank()
)
pdf(sprintf("%s/qc_metrics_boxplot.pdf",args$outputdir), width=9, height=5)
# pdf(sprintf("%s/qc_metrics_boxplot.pdf",args$outputdir))
print(p)
dev.off()
## histogram
tmp <- data.table(
variable = c("nFeature_RNA", "mitochondrial_percent_RNA", "ribosomal_percent_RNA"),
value = c(args$min_nFeature_RNA, args$mitochondrial_percent_RNA, args$ribosomal_percent_RNA)
)
# tmp <- data.table(
# variable = c("nFeature_RNA"),
# value = c(args$min_nFeature_RNA)
# )
p <- gghistogram(to.plot, x="value", fill="sample", bins=50) +
# p <- ggdensity(to.plot, x="value", fill="sample") +
geom_vline(aes(xintercept=value), linetype="dashed", data=tmp) +
facet_wrap(~variable, scales="free", nrow=1) +
theme(
axis.text = element_text(size=rel(0.8)),
axis.title.x = element_blank(),
legend.position = "right",
legend.text = element_text(size=rel(0.75))
)
pdf(sprintf("%s/qc_metrics_histogram.pdf",args$outputdir), width=13, height=6)
print(p)
dev.off()
#########################################################
## Plot fraction of cells that pass QC for each sample ##
#########################################################
to.plot <- metadata %>%
.[,mean(pass_rnaQC,na.rm=T),by=c("sample","stage")]
p <- ggbarplot(to.plot, x="sample", y="V1", fill="stage") +
scale_fill_manual(values=opts$stage.colors) +
labs(x="", y="Fraction of cells that pass QC (RNA)") +
# facet_wrap(~stage)
theme(
legend.position = "none",
axis.text.y = element_text(colour="black",size=rel(0.8)),
axis.text.x = element_text(colour="black",size=rel(0.65), angle=20, hjust=1, vjust=1),
)
pdf(sprintf("%s/qc_metrics_barplot.pdf",args$outputdir), width=6, height=5)
print(p)
dev.off()
##########
## Save ##
##########
fwrite(metadata, paste0(args$outputdir,"/sample_metadata_after_qc.txt.gz"), quote=F, na="NA", sep="\t")