--- a +++ b/exseek/scripts/matrix-process.R @@ -0,0 +1,895 @@ +#!/usr/bin/env Rscript +suppressPackageStartupMessages(library("argparse")) + +# create parser object +parser <- ArgumentParser() +parser$add_argument("-s", "--step", required=TRUE, help="which step to run") +parser$add_argument("-i", "--input", required=TRUE, help="input expression matrix file") +parser$add_argument("-o", "--output", required=TRUE, help="output expression matrix file") +parser$add_argument("--temp-dir", required=FALSE, default=".", help="temporary directory") +parser$add_argument("-c", "--class", required=FALSE, help="input class info file") +parser$add_argument("-b", "--batch", required=FALSE, help="input batch info file") +parser$add_argument("-m", "--method", required=FALSE, help="name of the method") + +parser$add_argument("--filtercount", type="integer", default=10, help="count threshold for filtering") +parser$add_argument("--filtercpm", type="double", default=10, help="CPM threshold for filtering") +parser$add_argument("--filterrpkm", type="double", help="RPKM threshold for filtering") + +parser$add_argument( "--filtermethod", type="character", default="filtercount", + metavar="STRING", + help="the filter algorithm to use [default = %(default)s], and return count matrix") +parser$add_argument("--filterexpv", type="double", default=5, + help="filter by expression value of a gene [default = %(default)s]", + metavar="NUMBER") + +parser$add_argument("--filtersample", type="double", default=0.2, + help="filter by counts of sample above certain counts of a gene [default = %(default)s]", + metavar="NUMBER") +parser$add_argument("--imputecluster", type="integer", default=5, + help="cluster number in scImpute [default = %(default)s]", + metavar="NUMBER") +parser$add_argument("--imputevipernum", type="integer", default=5000, + help="number in viper [default = %(default)s]", + metavar="NUMBER") +parser$add_argument( "--imputecutoff", type="double", default=0.5, + metavar="NUMBER", + help="cutoff in viper [default = %(default)s]") +parser$add_argument( "--imputealpha", type="double", default=0.1, + metavar="NUMBER", + help="alpha in viper [default = %(default)s]") + +parser$add_argument( "--normtopk", type="integer", default=20, + metavar="NUMBER", + help="top K feature as scale factor [default = %(default)s]") +parser$add_argument( "--cvthreshold", type="double", default=0.5, + metavar="NUMBER", + help="coefficient variance threshold of reference gene, filter ref gene with CV bigger than [default = %(default)s]") +parser$add_argument( "--remove-gene-types", type="character", default="miRNA,piRNA", + metavar="STRING", + help="remove some time of RNA for normalization scale factor calculation [default = %(default)s]") +parser$add_argument( "--ref-gene-file", type="character", #default="miRNA,piRNA", + metavar="STRING", + help="reference gene file path [default = %(default)s]") +#they are feature name for full length feature, most are miRNA, for domain feature, they have the same feature name + +parser$add_argument("--batch-index", type="integer", default=1, + metavar="INT", + help="batch index to select which batch to use [default = %(default)s]") +parser$add_argument("--ruv-k", type="integer", default=1, metavar="INT", + help="parameter k for RUVs (The number of factors of unwanted variation to be estimated from the data)") + +parser$add_argument("-p", "--processors", type="integer", default=1, + help="Number of processors to use. This option is useful on multicore *nix or Mac machine only, when performing multiple runs (nrun > 1) [default = %(default)s]", + metavar="NUMBER") + +# get command line options, if help option encountered print help and exit, +# otherwise if options not found on command line then set defaults, +args <- parser$parse_args() + +splitname <- unlist(strsplit(args$input,split = '/',fixed = TRUE)) +splitname <- splitname[length(splitname)] + +#' @title read counts matrix +#' +#' @param path string. +#' @param ... other arguments passsed on to [readr::read_tsv()] +#' +#' @return integer matrix +#' +#' @details In any case, first part (separated by `|`) of row names must be +#' Ensembl transcript id +#' +#' @export + + +#' @title sample classinfo +#' +#' @param path string. +#' +#' @return string matrix +#' +#' @details column 1 represents sample name, column 2 represents classinfo +#' +#' @export + +# path = 'scirep_classes.txt' +read_classinfo <- function(path, ...) { + read.table(path, sep='\t', header=TRUE, check.names=FALSE, row.names=1, stringsAsFactors=FALSE) +} + +read_matrix <- function(filename){ + read.table(filename, sep='\t', header=TRUE, check.names=FALSE, row.names=1, stringsAsFactors=FALSE) +} + +write_matrix <- function(mat, filename){ + write.table(mat, filename, sep='\t',quote=FALSE, row.names=TRUE, col.names=TRUE) +} + +################################################################################ +#################################imputation##################################### +################################################################################ + +#' @title filter genes with low expression values +#' +#' @param mat integer matrix. +#' @param min_count, min_sample_per_gene integer scalar. For each gene, it must +#' contain at least `min_count` reads in at least `min_sample_per_gene` +#' samples. Otherwise, it would be dropped. +#' +#' @return integer matrix. +#' +#' @examples +#' filter_low(sim_mat) +#' +#' @export +#filter_low <- function(mat, min_count = 5, min_sample_per_gene = 10) { +# print(paste('start filtering lowly expressed gene:','count threshold',min_count,'sample threshold',min_sample_per_gene,sep=' ')) +# low_per_row <- rowSums(mat > min_count) +# keeped_row <- low_per_row > min_sample_per_gene +# mat[keeped_row, ] +#} + +filter_low <- function(mat, min_count = 5, min_sample_per_gene = 0.5) { + print(paste('start filtering lowly expressed gene:','count threshold',min_count,'sample threshold',min_sample_per_gene,sep=' ')) + min_sample_per_gene <- ceiling(dim(mat)[2]*min_sample_per_gene) + low_per_row <- rowSums(mat > min_count) + keeped_row <- low_per_row >= min_sample_per_gene + mat[keeped_row, ] +} + +filter_low_cpm <- function(mat, min_cpm = 5, min_pct_sample_per_gene = 0.5) { + print(paste('start filtering lowly expressed gene:','CPM threshold',min_cpm,'sample threshold',min_pct_sample_per_gene,sep=' ')) + row_all <- nrow(mat) %>% seq_len() + mat_cpm <- t(t(mat*1e6) / colSums(mat[row_all, , drop = F], na.rm = T)) + min_sample_per_gene <- ceiling(dim(mat_cpm)[2]*min_pct_sample_per_gene) + low_per_row <- rowSums(mat_cpm > min_cpm) + keeped_row <- low_per_row >= min_sample_per_gene + mat[keeped_row, ] +} + +filter_low_rpkm <- function(mat, min_rpkm = 5, min_pct_sample_per_gene = 0.5) { + print(paste('start filtering lowly expressed gene:','RPKM threshold',min_rpkm,'sample threshold',min_pct_sample_per_gene,sep=' ')) + row_all <- nrow(mat) %>% seq_len() + mat_cpm <- t(t(mat*1e6) / colSums(mat[row_all, , drop = F], na.rm = T)) + + gene_length <- c() + for(i in seq_len(length(rownames(mat_cpm)))){ + gene_length[i] <- as.integer(unlist(strsplit(rownames(mat_cpm)[i],"|",fixed=T))[7]) + -as.integer(unlist(strsplit(rownames(mat_cpm)[i],"|",fixed=T))[6]) + } + mat_rpkm <- mat_cpm*1000/gene_length + + min_sample_per_gene <- ceiling(dim(mat_rpkm)[2]*min_pct_sample_per_gene) + low_per_row <- rowSums(mat_rpkm > min_rpkm) + keeped_row <- low_per_row >= min_sample_per_gene + mat[keeped_row, ] +} + +#' @imputation +#' +#' @param mat integer matrix. +#' @param tmp_path where tmp files stores, "data/expression_matrix/" for example. +#' @param out_path where outputs stores, "data/matrix_processing/imputation/" for example. +#' @param K imputation Kcluster +#' @param N imputation ncores +#' @return integer matrix named "scimpute_count.txt" stored in out_path. +#' +#' @examples imputation(mat, "data/expression_matrix/", "data/matrix_processing/imputation/",5,3) +#' +#' @export +scimpute_count <- function(mat, temp_dir, K = 5, N = 3) { + suppressMessages(library("scImpute")) + print('start imputation using scImpute') + mat_correct <- names(mat) + names(mat) <- paste('C_',seq_len(length(names(mat)))) + write.csv(mat, paste(temp_path, "input.csv",sep=""), sep=',') + dir.create(splitname) + scimpute(count_path = paste(temp_dir, "input.csv",sep=""), infile = "csv", outfile = "txt", out_dir = temp_dir, Kcluster = K, ncores = N) + mat <- read.table(paste(temp_dir, "scimpute_count.txt", sep="/"),sep=' ',header=TRUE, check.names=FALSE, row.names=1, stringsAsFactors=FALSE) + unlink(temp_dir, recursive=TRUE) + names(mat) <-mat_correct + mat +} + +viper_count <- function(mat, num = 5000, percentage.cutoff = 0.1, alpha= 0.5, temp_dir=".") { + suppressWarnings(library(VIPER)) + mat_correct <- names(mat) + names(mat) <- paste('C_',seq_len(length(names(mat)))) + print (num, percentage.cutoff, alpha) + VIPER(mat, num = num, percentage.cutoff = percentage.cutoff, minbool = FALSE, alpha = alpha, report = TRUE, outdir = temp_dir) + mat <- read.table(paste(temp_dir, 'imputed_counts.csv', sep='/'),sep=' ',header=TRUE, check.names=FALSE, row.names=1, stringsAsFactors=FALSE) + unlink(temp_dir, recursive=TRUE) + names(mat) <-mat_correct + mat +} + + +################################################################################ +###############################normalization#################################### +################################################################################ +#suppressPackageStartupMessages(library(clusterSim)) +suppressPackageStartupMessages(library(scRNA.seq.funcs)) +suppressPackageStartupMessages(library(scater)) +suppressPackageStartupMessages(library(scran)) +suppressPackageStartupMessages(library(SingleCellExperiment)) +suppressPackageStartupMessages(library(kBET)) +suppressPackageStartupMessages(library(sva)) +suppressPackageStartupMessages(library(edgeR)) +suppressPackageStartupMessages(library(ggpubr)) +options(stringsAsFactors = FALSE) +library(magrittr) + +#' @title martix normalization +#' @examples +#' \donotrun{ +#' norm_mat( +#' '/path/to/matrix' +#' ) +#' } +normalize <- function( + mat, + method, + top_n = 20, + rm_gene_types = NULL, + ref_genes=NULL +) { + if (method == 'SCnorm') mat <- norm_SCnorm(mat) + else if (method == 'TMM') mat <- norm_tmm(mat) + else if (method == 'RLE') mat <- norm_rle(mat) + else if (method == 'CPM') mat <- norm_cpm_total(mat) + else if (method == 'UQ') mat <- norm_uq(mat) + else if (method == 'CPM_top') mat <- norm_cpm_top(mat, top_n) + else if (method == 'CPM_rm') { + if(is.null(rm_gene_types)){ + stop('argument rm_gene_types is required for normalization method: CPM_top') + } + mat <- norm_cpm_rm(mat, rm_gene_types) + } + else if (method == 'CPM_refer') mat <- norm_cpm_refer(mat, refer_gene_id_path) + else if (method == 'null') mat <- mat + else stop('unknown normalization method: ', method) + mat +} + + +#' @title SCnorm normalization +#' +#' @param mat integer matrix. counts +#' @param ... other arguments passed on to [SCnorm::SCnorm()] +#' +#' @examples +#' norm_SCnorm(sim_mat*10) +#' +#' @family matrix normalization +#' +#' @export +norm_SCnorm <- function(mat, ...) { + print('start normalization using SCnorm') + Conditions = rep(1, ncol(mat)); + sce <- suppressMessages(SCnorm::SCnorm(mat, Conditions, NCores=4)); + SCnorm::results(sce) +} + +# norm_scater ------------------ + +#' @title TMM/RLE normalization by scater package +#' +#' @param mat integer matrix. counts +#' +#' @family matrix normalization +#' +#' @name norm_scater + +as_SingleCellExperiment <- function(mat, col_data = NULL) { + assays = list(counts = as.matrix(mat)) + if (is.null(col_data)) + SingleCellExperiment::SingleCellExperiment(assays = assays) + else + SingleCellExperiment::SingleCellExperiment(assays = assays, colData = col_data) +} + +#' @rdname norm_scater +#' +#' @details `norm_uq()` performs upper-quartile normalization +#' +#' @examples +#' norm_uq(sim_mat) +#' +#' @export +norm_uq <- function(mat) { + print('start normalization using UQ') + #mat %>% as_SingleCellExperiment() %>% + #{suppressWarnings(scater::normaliseExprs(., "TMM"))} %>% + #scater::normalise() %>% SingleCellExperiment::normcounts() + dl <- edgeR::DGEList(counts=mat) + dl <- edgeR::calcNormFactors(dl, method='upperquartile') + edgeR::cpm(dl) +} + +#' @rdname norm_scater +#' +#' @details `norm_tmm()` performs TMM normalization +#' +#' @examples +#' norm_tmm(sim_mat) +#' +#' @export +#norm_tmm <- function(mat) { +# print('start normalization using TMM') +# mat %>% as_SingleCellExperiment() %>% +# {suppressWarnings(scater::normaliseExprs(., "TMM"))} %>% +# scater::normalise() %>% SingleCellExperiment::normcounts() +#} + +norm_tmm <- function(mat) { + print('start normalization using TMM') + dl <- edgeR::DGEList(counts=mat) + dl <- edgeR::calcNormFactors(dl, method='TMM') + return(edgeR::cpm(dl)) +} + + +norm_UQ <- function(mat) { + print('start normalization using upperquartile') + dl <- edgeR::DGEList(counts=mat) + dl <- edgeR::calcNormFactors(dl, method='upperquartile') + return(edgeR::cpm(dl)) +} +#' @rdname norm_scater +#' +#' @details `norm_rle()` performs RLE normalization +#' +#' @examples +#' norm_rle(sim_mat) +#' +#' @export +#norm_rle <- function(mat) { +# print('start normalization using RLE') +# mat %>% as_SingleCellExperiment() %>% +# {suppressWarnings(scater::normaliseExprs(., "RLE"))} %>% +# scater::normalise() %>% SingleCellExperiment::normcounts() +#} +norm_rle <- function(mat) { + print('start normalization using RLE') + dl <- edgeR::DGEList(counts=mat) + dl <- edgeR::calcNormFactors(dl, method='RLE') + edgeR::cpm(dl) +} + +# norm_cpm ------------------ + +#' @title CPM normalization by some genes +#' +#' @param mat integer matrix. counts +#' @param row integer or logical. Use which rows (genes) as normalization factor +norm_cpm_impl <- function(mat, row) { + t(t(mat*1e6) / colSums(mat[row, , drop = F], na.rm = T)) + #edgeR::cpm(mat) +} + + +#' @title CPM normalization +#' +#' @description CPM normalization using counts sum of _certain_ genes as scaling factor +#' +#' @param mat integer matrix. counts. +#' +#' @details some functions may throw errors +#' +#' @family matrix normalization +#' +#' @name norm_cpm + + + + +#' @rdname norm_cpm +#' +#' @details `norm_cpm_total()` uses total genes +#' +#' @examples +#' norm_cpm_total(sim_mat) +#' +#' @export +norm_cpm_total <- function(mat) { + print('start normalization using CPM') + row_all <- nrow(mat) %>% seq_len() + + norm_cpm_impl(mat, row_all) +} + +#' @rdname norm_cpm +#' +#' @param top_n integer scalar. see `norm_cpm_top()` below +#' +#' @details `norm_cpm_top()` uses top 20 genes sorted by counts (assuming `top_n = 20L`) +#' +#' @examples +#' norm_cpm_top(sim_mat, 20L) +#' +#' @export +#norm_cpm_top <- function(mat, top_n) { +# print(paste('start normalization using top',top_n,'genes as scale factor',sep=' ')) +# if (nrow(mat) < top_n) +# stop('too few feature for CPM top n normalization') +# +# row_top <- mat %>% rowSums() %>% sort(decreasing = T, index.return = T) %>% +# {.$ix[seq_len(top_n)]} +# +# norm_cpm_impl(mat, -row_top) +#} +norm_cpm_top <- function(mat, top_n) { + print(paste('start normalization using top',top_n,'genes as scale factor',sep=' ')) + if (nrow(mat) < top_n) + stop('too few feature for CPM top n normalization') + + row_top <- mat %>% rowSums() %>% sort(decreasing = T, index.return = T) %>% + {.$ix[seq_len(top_n)]} + + top = t(t(mat[row_top,]*1e6) / colSums(mat[row_top, , drop = F], na.rm = T)) + top_down= t(t(mat[setdiff(seq_len(dim(mat)[1]),row_top),]*1e6) / colSums(mat[setdiff(seq_len(dim(mat)[1]),row_top), , drop = F], na.rm = T)) + mat_top <- rbind(top,top_down) + mat_top[rownames(mat),] +} + +#' @rdname norm_cpm +#' +#' @param gene_type character. see `norm_cpm_rm()` below +#' +#' @details `norm_cpm_rm()` uses non-piRNA genes (assuming `gene_type = 'piRNA'`) +#' +#' @examples +#' norm_cpm_rm(sim_mat, c('miRNA', 'piRNA')) +#' +#' @export +norm_cpm_rm <- function(mat, rm_gene_type) { + print(paste('start normalization by removed some kind of RNA type ',args$removetype,sep=' ')) + row_rm <- mat %>% rownames() %>% strsplit(split='|',fixed=TRUE) %>% data.frame() %>% {.[2,]} %>% {. %in% rm_gene_type} + return(norm_cpm_impl(mat, !row_rm)) +} + + + +#' @rdname norm_cpm +#' +#' @param refer_gene_id character. Ensembl transcript id, see `norm_cpm_refer()` below +#' +#' @details `norm_cpm_refer()` uses given reference genes +#' +#' @examples +#' norm_cpm_refer(sim_mat, suggest_refer$id) +#' +#' @export + +# mat = sim_mat +# refer_gene_id = suggest_refer$id +cv_fun <- function(x) { + sd(x, na.rm = T) / mean(x, na.rm = T) +} + + +norm_cpm_refer <- function(mat, ref_genes, cv_threshold=0.5) { + message('start normalization by reference gene with CV threshold', cv_threshold) + keeped_ref <- mat[ref_genes, , drop = F] %>% apply(1, cv_fun) < cv_threshold + if(!any(kept_ref)){ + stop('no reference genes left after filtering by CV') + } + norm_cpm_impl(mat, ref_genes[keeped_ref]) +} + +################################################################################ +#################################batch removal################################## +################################################################################ + + +remove_batch <- function( mat, method, class_info=NULL, batch_info=NULL, ruv_k=1){ + # only remove batch for samples with batch information + if(!is.null(batch_info)){ + samples_with_batch <- names(batch_info)[!is.na(batch_info)] + }else{ + samples_with_batch <- colnames(mat) + } + if (method == 'RUV') mat <- ruvs(mat, class_info=class_info, k=ruv_k) + else if(method == 'RUVn') { + class_info <- rep(1, ncol(mat)) + names(class_info) <- colnames(mat) + mat <- ruvs(mat, class_info=class_info, k=ruv_k) + } + else if (method == 'ComBat') mat[, samples_with_batch] <- combat(mat[, samples_with_batch], class_info=class_info, batch_info=batch_info) + else if (method == 'limma') mat[, samples_with_batch] <- limma(mat[, samples_with_batch], class_info=class_info, batch_info=batch_info) + else if (method == 'null') mat <- mat + else stop("unknown batch effect removal method: ", method) + mat +} + +ruv <- function( + mat, + classinfo_path, + label_column = 2, + k = 10 +){ + suppressMessages(library(RUVSeq)) + + print('start batch removal using RUVs') + cIdx <- rownames(mat) + + sample_info <- read.table(classinfo_path,sep='\t',header=TRUE, check.names=FALSE, stringsAsFactors=FALSE) + ##rank by mat + if(unique(is.na(sample_info$sample_id))) + stop("sample_id not in file") + rownames(sample_info) = sample_info$sample_id + sample_info=sample_info[names(mat),] + rownames(sample_info) <- c() + + names(sample_info)[label_column]="label" + scIdx <- matrix(-1, ncol = max(table(sample_info$label)), nrow = dim(table(sample_info$label))) + labellist <- names(table(sample_info$label)) + for(i in c(1:dim(table(sample_info$label)))) { + tmp <- which(sample_info$label == labellist[i]) + scIdx[i, 1:length(tmp)] <- tmp + } + mat <- log(mat+0.001) + ruv <- RUVs(as.matrix(mat), cIdx, k = k, scIdx = scIdx, isLog = TRUE) + exp(ruv$normalizedCounts) +} + +ruvs <- function(mat, class_info, batch_info=NULL, k = 1){ + if(is.null(class_info)) stop('class_info is needed for RUVs') + + message('start batch removal using RUVs') + suppressMessages(library(RUVSeq)) + + cIdx <- rownames(mat) + + class_sizes <- table(class_info) + scIdx <- matrix(-1, ncol = max(class_sizes), nrow = dim(class_sizes)) + for(i in c(1:dim(class_sizes))) { + tmp <- which(class_info == names(class_sizes)[i]) + scIdx[i, 1:length(tmp)] <- tmp + } + mat <- log(mat + 0.25) + seq_ruvs <- RUVs(as.matrix(mat), cIdx, k = k, scIdx = scIdx, isLog = TRUE) + exp(seq_ruvs$normalizedCounts) +} + + +combat <- function(mat,class_info=NULL, batch_info=NULL){ + if(is.null(batch_info)) stop('batch_info is needed for ComBat') + + message('start batch removal using combat') + suppressMessages(library(sva)) + #batch_info <-read.table(batchinfo_path,sep='\t',row.names=1,header=T,check.names = FALSE) + #if (!(dim(mat)[2]==dim(batch_info)[1])) + # stop('sample numbers in batch info and expression matrix should be same') + #batchname <-toString(names(batch_info)[batch_column]) + #batch_info=as.data.frame(batch_info[names(mat),]) + #batch_info <- as.factor(batch_info) + mod <- model.matrix(~ 1, data = as.factor(batch_info)) + combat <- ComBat( + dat = log(as.matrix(mat) + 0.25), + batch = as.factor(batch_info), + mod = mod, + par.prior = TRUE, + prior.plots = FALSE + ) + + mat <- exp(combat) + mat +} + +limma <- function( + mat, + class_info=NULL, + batch_info=NULL +){ + if(is.null(batch_info)) stop('batch_info is needed for limma') + + print('start batch removal using limma') + suppressMessages(library(limma)) + + mat <- removeBatchEffect(log(as.matrix(mat) + 0.25), as.factor(batch_info)) + mat <- exp(mat) + mat +} + +################################################################################ +#################################plot part###################################### +################################################################################ + +#' @export +plot_highest_exprs <- function(sce, top_n = 20) { + sce %>% {suppressMessages(scater::calculateQCMetrics(.))} %>% + scater::plotHighestExprs(n = top_n) +} + + +# plot_group -------------- + +plot_group_impl <- function(sce, shape = NULL, color = NULL, plot_fun) { + plot_fun( + sce, + shape_by = shape, colour_by = color, + run_args = list(exprs_values = 'counts') + ) +} + +#' @title plot PCA, TSNE +#' +#' @param sce A SingleCellExperiment object. +#' @param shape, color string. specify a column in `col_data` of [as_SingleCellExperiment()] to shape/color by +#' +#' @name plot_group + + + +#' @rdname plot_group +#' +#' @examples +#' as_SingleCellExperiment(sim_mat) %>% plot_PCA() +#' +#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA() +#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label') +#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(color = 'label') +#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label', color = 'label') +#' +#' @export +plot_PCA <- function(sce, shape = NULL, color = NULL) { + plot_group_impl(sce, shape, color, scater::plotPCA) +} + + + +#' @rdname plot_group +#' +#' @examples +#' as_SingleCellExperiment(sim_mat) %>% plot_PCA() +#' +#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA() +#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label') +#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(color = 'label') +#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label', color = 'label') +#' +#' @export +plot_TSNE <- function(sce, shape = NULL, color = NULL) { + plot_group_impl(sce, shape, color, scater::plotTSNE) +} + +# plot_variance -------------- + + +#' get y axis range of ggplot object. +get_y_range <- function(plot) { + ggplot2::ggplot_build(plot)$layout$panel_params[[1]]$y.range +} + +#' @title generate equally spaced y coordinates, not hit bottom nor top. +#' +#' @details Image `plot`'s y axis extends from 0 to 3, `x` contains 3 values, +#' then we give `c(0.5, 1.5, 2.5)`. +#' +#' @param plot ggplot object. +#' @param x numberic vector +#' +#' @return numeric vector. The same length as `x` +#' +#' @keywords internal +seq_y <- function(plot, x) { + y_range <- get_y_range(plot) + by <- diff(y_range) / length(x) + seq(y_range[1] + by, y_range[2], by) - by/2 +} + + +#' @title plot variance of counts across samples +#' +#' @param mat integer matrix. counts +#' @param refer_gene_id character. Ensembl transcript id, add a vertical line +#' for each gene to mark the corresponding CV (on x axis). Only genes in +#' counts matrix would be shown. Usually these genes should be the reference +#' genes you want to use for normalization. +#' @param refer_gene_name character. Transcript name +#' +#' @return [ggplot2::ggplot()] object. +#' +#' @name plot_variance + + + + + + +#' @details +#' `plot_cv_density()` produces density plot of coefficient of variation +#' +#' @examples +#' # only one gene exist in the matrix +#' plot_cv_density(sim_mat, suggest_refer$id) +#' plot_cv_density(sim_mat, suggest_refer$id, suggest_refer$name) +#' +#' # the name should be the same length as id +#' plot_cv_density(sim_mat, rownames(sim_mat)[1:6], letters[1:3]) +#' # if only part of the genes have name, you can pass the id of other genes +#' plot_cv_density(sim_mat, rownames(sim_mat)[1:6], c(letters[1:3], rownames(sim_mat)[4:6])) +#' +#' @export +#' +#' @rdname plot_variance + +# mat = sim_mat +# refer_gene_id = suggest_refer$id +# refer_gene_name = suggest_refer$name +plot_cv_density <- function(mat, refer_gene_id = '', refer_gene_name = refer_gene_id) { + cv <- mat %>% apply(1, cv_fun) %>% + {tibble::tibble(id = names(.), value = .)} %>% + dplyr::mutate(id = stringr::str_extract(id, '[^|]+')) + plot <- ggplot2::ggplot(cv, ggplot2::aes(value)) + + ggplot2::geom_density(color = 'blue') + + ggplot2::labs(x = 'coefficient of variation') + + if (length(refer_gene_id) != length(refer_gene_name)) { + warning("Ignoring refer_gene_name, since it isn't the same length as refer_gene_id") + refer_gene_name = refer_gene_id + } + cv_refer <- tibble::tibble(id = refer_gene_id, name = refer_gene_name) %>% + dplyr::inner_join(cv, by = 'id') + if (nrow(cv_refer) == 0L) { + warning("None refer gene found in the count matrix") + return(plot) + } + + plot + ggplot2::geom_vline(xintercept = cv_refer$value, color = 'green') + + ggplot2::geom_point( + ggplot2::aes(x = value, y = seq_y(plot, value)), + data = cv_refer, size = 2, shape = 1 + ) + + ggrepel::geom_label_repel( + ggplot2::aes(x = value, y = seq_y(plot, value), label = name), + data = cv_refer, hjust = 0.5 + ) +} + + + +#' @details +#' `plot_cv_density()` produces density plot of coefficient of variation +#' +#' @examples +#' # only one gene exist in the matrix +#' plot_refer_violin(sim_mat, suggest_refer$id) +#' plot_refer_violin(sim_mat, suggest_refer$id, suggest_refer$name) +#' +#' # the name should be the same length as id +#' plot_refer_violin(sim_mat, rownames(sim_mat)[1:6], letters[1:3]) +#' # if only part of the genes have name, you can pass the id of other genes +#' plot_refer_violin(sim_mat, rownames(sim_mat)[1:6], c(letters[1:3], rownames(sim_mat)[4:6])) +#' +#' @export +#' +#' @rdname plot_variance + +# mat = sim_mat +# refer_gene_id = rownames(mat)[1:6] +# refer_gene_name = paste0('gene_', letters[1:6]) +plot_refer_violin <- function(mat, refer_gene_id, refer_gene_name = refer_gene_id) { + if (length(refer_gene_id) != length(refer_gene_name)) { + warning("Ignoring refer_gene_name, since it isn't the same length as refer_gene_id") + refer_gene_name = refer_gene_id + } + + refer_gene <- tibble::tibble(id = refer_gene_id, name = refer_gene_name) + refer_count <- mat %>% tibble::as_tibble(rownames = 'id') %>% + dplyr::mutate(id = stringr::str_extract(id, '[^|]+')) %>% + dplyr::inner_join(refer_gene, ., by = 'id') %>% dplyr::select(-id) + if (nrow(refer_count) == 0L) { + warning('None refer gene found in the count matrix') + return(ggplot2::ggplot()) + } + + refer_count_long <- refer_count %>% tidyr::gather('sample', 'count', -1) %>% + dplyr::mutate_at('name', as.factor) + g_violin <- refer_count_long %>% + ggplot2::ggplot(ggplot2::aes(name, log2(count + 0.001))) + + ggplot2::geom_violin() + + ggplot2::labs(x = 'reference transcripts', y = quote(log[2](count))) + + # max y coordinate of each violin + y_max <- ggplot2::ggplot_build(g_violin)$data[[1]] %>% tibble::as_tibble() %>% + dplyr::group_by(x) %>% dplyr::arrange(dplyr::desc(y)) %>% dplyr::slice(1) %>% + dplyr::ungroup() %>% dplyr::arrange(x) %>% dplyr::select(x, y) + + cv_df <- refer_count_long %>% + dplyr::group_by(name) %>% dplyr::summarise(cv = cv_fun(count)) %>% + dplyr::arrange(name) %>% dplyr::mutate(x = seq_along(name)) %>% + dplyr::inner_join(y_max, by = 'x') %>% + dplyr::mutate(y = y + diff(get_y_range(g_violin)) / 20) %>% + dplyr::mutate(cv = formatC(cv, digits = 3, format = 'f')) + + g_violin + ggplot2::geom_text(ggplot2::aes(x, y, label = cv), cv_df, color = 'blue') +} + +################################################################################ +#########################process pipeline####################################### +################################################################################ + +#args$input: matrix_path = 'output/scirep/count_matrix/transcript.txt' +#args$class: classinfo_path = 'data/labels/scirep_classes.txt' +#args$batch: 'data/other_annotations/scirep_batch.txt' +#args$imputeout: impute_path = "output/matrix_processing/imputation/" + +#dummy_io<- function(filename){} + + +message('read expression matrix: ', args$input) +mat <- read_matrix(args$input) +sample_ids <- colnames(mat) + +if((args$step != 'filter') && (is.null(args$method))){ + stop('--method is required for step: ', args$step) +} +# filter +if (args$step =='filter'){ + if(!is.null(args$filtercount)){ + #message(sprintf('Filter features with count <= %d in %f %% samples', args$filtercount, args$filtersample*100)) + mat <- filter_low(mat, args$filtercount, args$filtersample) + } + else if(!is.null(args$filtercpm)){ + #message(sprintf('Filter features with CPM <= %d in %f %% samples', args$filtercpm, args$filtersample*100)) + mat <- filter_low_cpm(mat, args$filtercpm, args$filtersample) + } + else if(!is.null(args$filterrpkm)){ + #message(sprintf('Filter features with RPKM <= %d in %f %% samples', args$filterrpkm, args$filtersample*100)) + mat <- filter_low_rpkm(mat, args$filterrpkm, args$filtersample) + } +} else if(args$step =='imputation'){ + # imputation + library(readr) + mat <-read.table(args$input,sep='\t',header=TRUE, check.names=FALSE, row.names=1, stringsAsFactors=FALSE) + if (args$method == 'scimpute_count'){ + mat <- scimpute_count(mat, impute_path= args$imputeout, K = args$imputecluster, N = args$processors) + } else if(args$method == 'viper_count'){ + mat <- viper_count(mat, impute_path= args$imputeout,num = args$imputevipernum,percentage.cutoff = args$imputecutoff, alpha= args$imputealpha) + } else if(args$method == 'null'){ + + } else { + stop('unknown imputation method: ', args$method) + } +} else if(args$step =='normalization'){ + # normalization + if(!is.null(args$ref_gene_file)){ + ref_genes <- read.table(args$ref_gene_file)[,1] + }else{ + ref_genes <- NULL + } + mat <- normalize(mat, method=args$method,top_n = args$normtopk, + rm_gene_types = args$remove_gene_types, ref_genes=ref_genes) +} else if(args$step =='batch_removal'){ + class_info <- NULL + batch <- NULL + if(args$method == 'RUV'){ + if(!is.null(args$class)){ + message('read class information: ', args$class) + class_info <- read.table(args$class, sep='\t', header=TRUE, check.names=FALSE, row.names=1, stringsAsFactors=FALSE) + class_info <- class_info[sample_ids, 1] + }else{ + stop('argument --class is required for RUV') + } + }else if(args$method %in% c('ComBat', 'limma')){ + if(!is.null(args$batch)){ + message('read batch information: ', args$batch) + batch <- read.table(args$batch, sep='\t', header=TRUE, check.names=FALSE, row.names=1, stringsAsFactors=FALSE) + batch <- batch[sample_ids, args$batch_index] + names(batch) <- sample_ids + }else{ + stop('argument --batch is required for: ', args$method) + } + } + # batch removal + mat <- remove_batch(mat, method=args$method, class_info=class_info, batch_info=batch, ruv_k=args$ruv_k) + +} else{ + stop('unknown step: ', args$step) +} + +# write output matrix +message('write expression matrix: ', args$output) +write_matrix(mat, args$output)