Switch to side-by-side view

--- a
+++ b/exseek/scripts/matrix-process.R
@@ -0,0 +1,895 @@
+#!/usr/bin/env Rscript
+suppressPackageStartupMessages(library("argparse"))
+
+# create parser object
+parser <- ArgumentParser()
+parser$add_argument("-s", "--step", required=TRUE, help="which step to run")
+parser$add_argument("-i", "--input", required=TRUE, help="input expression matrix file")
+parser$add_argument("-o", "--output", required=TRUE, help="output expression matrix file")
+parser$add_argument("--temp-dir", required=FALSE, default=".", help="temporary directory")
+parser$add_argument("-c", "--class", required=FALSE, help="input class info file")
+parser$add_argument("-b", "--batch", required=FALSE, help="input batch info file")
+parser$add_argument("-m", "--method", required=FALSE, help="name of the method")
+
+parser$add_argument("--filtercount", type="integer", default=10, help="count threshold for filtering")
+parser$add_argument("--filtercpm", type="double", default=10, help="CPM threshold for filtering")
+parser$add_argument("--filterrpkm", type="double", help="RPKM threshold for filtering")
+
+parser$add_argument( "--filtermethod", type="character", default="filtercount",
+                    metavar="STRING",
+                    help="the filter algorithm to use [default = %(default)s], and return count matrix")
+parser$add_argument("--filterexpv", type="double", default=5,
+    help="filter by expression value of a gene [default = %(default)s]",
+    metavar="NUMBER")
+
+parser$add_argument("--filtersample", type="double", default=0.2,
+    help="filter by counts of sample above certain counts of a gene [default = %(default)s]",
+    metavar="NUMBER")
+parser$add_argument("--imputecluster", type="integer", default=5,
+    help="cluster number in scImpute [default = %(default)s]",
+    metavar="NUMBER")
+parser$add_argument("--imputevipernum", type="integer", default=5000,
+    help="number in viper [default = %(default)s]",
+    metavar="NUMBER")
+parser$add_argument( "--imputecutoff", type="double", default=0.5,
+                    metavar="NUMBER",
+                    help="cutoff in viper [default = %(default)s]")
+parser$add_argument( "--imputealpha", type="double", default=0.1,
+                    metavar="NUMBER",
+                    help="alpha in viper [default = %(default)s]")
+
+parser$add_argument( "--normtopk", type="integer", default=20,
+                    metavar="NUMBER",
+                    help="top K feature as scale factor [default = %(default)s]")
+parser$add_argument( "--cvthreshold", type="double", default=0.5,
+                    metavar="NUMBER",
+                    help="coefficient variance threshold of reference gene, filter ref gene with CV bigger than [default = %(default)s]")
+parser$add_argument( "--remove-gene-types", type="character", default="miRNA,piRNA",
+                    metavar="STRING",
+                    help="remove some time of RNA for normalization scale factor calculation [default = %(default)s]")
+parser$add_argument( "--ref-gene-file", type="character", #default="miRNA,piRNA",
+                    metavar="STRING",
+                    help="reference gene file path [default = %(default)s]")
+#they are feature name for full length feature, most are miRNA, for domain feature, they have the same feature name
+
+parser$add_argument("--batch-index", type="integer", default=1,
+                    metavar="INT",
+                    help="batch index to select which batch to use [default = %(default)s]")
+parser$add_argument("--ruv-k", type="integer", default=1, metavar="INT", 
+    help="parameter k for RUVs (The number of factors of unwanted variation to be estimated from the data)")
+
+parser$add_argument("-p", "--processors", type="integer", default=1,
+    help="Number of processors to use. This option is useful on multicore *nix or Mac machine only, when performing multiple runs (nrun > 1) [default = %(default)s]",
+    metavar="NUMBER")
+
+# get command line options, if help option encountered print help and exit,
+# otherwise if options not found on command line then set defaults,
+args <- parser$parse_args()
+
+splitname <- unlist(strsplit(args$input,split = '/',fixed = TRUE))
+splitname <- splitname[length(splitname)]
+
+#' @title read counts matrix
+#'
+#' @param path string.
+#' @param ... other arguments passsed on to [readr::read_tsv()]
+#'
+#' @return integer matrix
+#'
+#' @details In any case, first part (separated by `|`) of row names must be
+#'   Ensembl transcript id
+#'
+#' @export
+
+
+#' @title sample classinfo
+#'
+#' @param path string.
+#'
+#' @return string matrix
+#'
+#' @details column 1 represents sample name, column 2 represents classinfo
+#'
+#' @export
+
+# path = 'scirep_classes.txt'
+read_classinfo <- function(path, ...) {
+    read.table(path, sep='\t', header=TRUE,  check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
+}
+
+read_matrix <- function(filename){
+    read.table(filename, sep='\t', header=TRUE,  check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
+}
+
+write_matrix <- function(mat, filename){
+    write.table(mat, filename, sep='\t',quote=FALSE, row.names=TRUE, col.names=TRUE)
+}
+
+################################################################################
+#################################imputation#####################################
+################################################################################
+
+#' @title filter genes with low expression values
+#'
+#' @param mat integer matrix.
+#' @param min_count, min_sample_per_gene integer scalar. For each gene, it must
+#'   contain at least `min_count` reads in at least `min_sample_per_gene`
+#'   samples. Otherwise, it would be dropped.
+#'
+#' @return integer matrix.
+#'
+#' @examples
+#' filter_low(sim_mat)
+#'
+#' @export
+#filter_low <- function(mat, min_count = 5, min_sample_per_gene = 10) {
+#    print(paste('start filtering lowly expressed gene:','count threshold',min_count,'sample threshold',min_sample_per_gene,sep=' '))
+#	low_per_row <- rowSums(mat > min_count)
+#	keeped_row <- low_per_row > min_sample_per_gene
+#	mat[keeped_row, ]
+#}
+
+filter_low <- function(mat, min_count = 5, min_sample_per_gene = 0.5) {
+    print(paste('start filtering lowly expressed gene:','count threshold',min_count,'sample threshold',min_sample_per_gene,sep=' '))
+    min_sample_per_gene <- ceiling(dim(mat)[2]*min_sample_per_gene)
+    low_per_row <- rowSums(mat > min_count)
+    keeped_row <- low_per_row >= min_sample_per_gene
+    mat[keeped_row, ]
+}
+
+filter_low_cpm <- function(mat, min_cpm = 5, min_pct_sample_per_gene = 0.5) {
+    print(paste('start filtering lowly expressed gene:','CPM threshold',min_cpm,'sample threshold',min_pct_sample_per_gene,sep=' '))
+    row_all <- nrow(mat) %>% seq_len()
+    mat_cpm <- t(t(mat*1e6) / colSums(mat[row_all, , drop = F], na.rm = T))
+    min_sample_per_gene <- ceiling(dim(mat_cpm)[2]*min_pct_sample_per_gene)
+    low_per_row <- rowSums(mat_cpm > min_cpm)
+    keeped_row <- low_per_row >= min_sample_per_gene
+    mat[keeped_row, ]
+}
+
+filter_low_rpkm <- function(mat, min_rpkm = 5, min_pct_sample_per_gene = 0.5) {
+    print(paste('start filtering lowly expressed gene:','RPKM threshold',min_rpkm,'sample threshold',min_pct_sample_per_gene,sep=' '))
+    row_all <- nrow(mat) %>% seq_len()
+    mat_cpm <- t(t(mat*1e6) / colSums(mat[row_all, , drop = F], na.rm = T))
+    
+    gene_length <- c()
+    for(i in seq_len(length(rownames(mat_cpm)))){
+            gene_length[i] <- as.integer(unlist(strsplit(rownames(mat_cpm)[i],"|",fixed=T))[7])
+                             -as.integer(unlist(strsplit(rownames(mat_cpm)[i],"|",fixed=T))[6])
+    }
+    mat_rpkm <- mat_cpm*1000/gene_length
+    
+    min_sample_per_gene <- ceiling(dim(mat_rpkm)[2]*min_pct_sample_per_gene)
+    low_per_row <- rowSums(mat_rpkm > min_rpkm)
+    keeped_row <- low_per_row >= min_sample_per_gene
+    mat[keeped_row, ]
+}
+
+#' @imputation
+#'
+#' @param mat integer matrix.
+#' @param tmp_path where tmp files stores, "data/expression_matrix/" for example.
+#' @param out_path where outputs stores, "data/matrix_processing/imputation/" for example.
+#' @param K imputation Kcluster
+#' @param N imputation ncores
+#' @return integer matrix named "scimpute_count.txt" stored in out_path.
+#'
+#' @examples imputation(mat, "data/expression_matrix/", "data/matrix_processing/imputation/",5,3)
+#'
+#' @export
+scimpute_count <- function(mat, temp_dir, K = 5, N = 3) {
+    suppressMessages(library("scImpute"))
+    print('start imputation using scImpute')
+    mat_correct <- names(mat)
+    names(mat) <-  paste('C_',seq_len(length(names(mat))))
+    write.csv(mat, paste(temp_path, "input.csv",sep=""), sep=',')
+    dir.create(splitname)
+    scimpute(count_path = paste(temp_dir, "input.csv",sep=""), infile = "csv", outfile = "txt", out_dir = temp_dir, Kcluster = K, ncores = N)
+    mat <- read.table(paste(temp_dir, "scimpute_count.txt", sep="/"),sep=' ',header=TRUE,  check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
+    unlink(temp_dir, recursive=TRUE)
+    names(mat) <-mat_correct
+    mat
+}
+
+viper_count <- function(mat, num = 5000, percentage.cutoff = 0.1, alpha= 0.5, temp_dir=".") {
+    suppressWarnings(library(VIPER))
+    mat_correct <- names(mat)
+    names(mat) <-  paste('C_',seq_len(length(names(mat))))
+    print (num, percentage.cutoff, alpha)
+    VIPER(mat, num = num, percentage.cutoff = percentage.cutoff, minbool = FALSE, alpha = alpha, report = TRUE, outdir = temp_dir)
+    mat <- read.table(paste(temp_dir, 'imputed_counts.csv', sep='/'),sep=' ',header=TRUE,  check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
+    unlink(temp_dir, recursive=TRUE)
+    names(mat) <-mat_correct
+    mat
+}
+
+
+################################################################################
+###############################normalization####################################
+################################################################################
+#suppressPackageStartupMessages(library(clusterSim))
+suppressPackageStartupMessages(library(scRNA.seq.funcs))
+suppressPackageStartupMessages(library(scater))
+suppressPackageStartupMessages(library(scran))
+suppressPackageStartupMessages(library(SingleCellExperiment))
+suppressPackageStartupMessages(library(kBET))
+suppressPackageStartupMessages(library(sva))
+suppressPackageStartupMessages(library(edgeR))
+suppressPackageStartupMessages(library(ggpubr))
+options(stringsAsFactors = FALSE)
+library(magrittr)
+
+#' @title martix normalization
+#' @examples
+#' \donotrun{
+#'     norm_mat(
+#'         '/path/to/matrix'
+#'     )
+#' }
+normalize <- function(
+    mat,
+    method,
+    top_n = 20, 
+    rm_gene_types = NULL,
+    ref_genes=NULL
+) {
+    if (method == 'SCnorm')    mat <- norm_SCnorm(mat)
+    else if (method == 'TMM')       mat <- norm_tmm(mat)
+    else if (method == 'RLE')       mat <- norm_rle(mat)
+    else if (method == 'CPM')       mat <- norm_cpm_total(mat)
+    else if (method == 'UQ')        mat <- norm_uq(mat)
+    else if (method == 'CPM_top')   mat <- norm_cpm_top(mat, top_n)
+    else if (method == 'CPM_rm')    {
+        if(is.null(rm_gene_types)){
+            stop('argument rm_gene_types is required for normalization method: CPM_top')
+        }
+        mat <- norm_cpm_rm(mat, rm_gene_types)
+    }
+    else if (method == 'CPM_refer') mat <- norm_cpm_refer(mat, refer_gene_id_path)
+    else if (method == 'null')      mat <- mat
+    else stop('unknown normalization method: ', method)
+    mat
+}
+
+
+#' @title SCnorm normalization
+#'
+#' @param mat integer matrix. counts
+#' @param ... other arguments passed on to [SCnorm::SCnorm()]
+#'
+#' @examples
+#' norm_SCnorm(sim_mat*10)
+#'
+#' @family matrix normalization
+#'
+#' @export
+norm_SCnorm <- function(mat, ...) {
+    print('start normalization using SCnorm')
+    Conditions = rep(1, ncol(mat));
+    sce <- suppressMessages(SCnorm::SCnorm(mat, Conditions, NCores=4));
+    SCnorm::results(sce)
+}
+
+# norm_scater ------------------
+
+#' @title TMM/RLE normalization by scater package
+#'
+#' @param mat integer matrix. counts
+#'
+#' @family matrix normalization
+#'
+#' @name norm_scater
+
+as_SingleCellExperiment <- function(mat, col_data = NULL) {
+    assays = list(counts = as.matrix(mat))
+    if (is.null(col_data))
+    SingleCellExperiment::SingleCellExperiment(assays = assays)
+    else
+    SingleCellExperiment::SingleCellExperiment(assays = assays, colData = col_data)
+}
+
+#' @rdname  norm_scater
+#'
+#' @details `norm_uq()` performs upper-quartile normalization
+#'
+#' @examples
+#' norm_uq(sim_mat)
+#'
+#' @export
+norm_uq <- function(mat) {
+    print('start normalization using UQ')
+    #mat %>% as_SingleCellExperiment() %>%
+    #{suppressWarnings(scater::normaliseExprs(., "TMM"))} %>%
+    #scater::normalise() %>% SingleCellExperiment::normcounts()
+    dl <- edgeR::DGEList(counts=mat)
+    dl <- edgeR::calcNormFactors(dl, method='upperquartile')
+    edgeR::cpm(dl)
+}
+
+#' @rdname  norm_scater
+#'
+#' @details `norm_tmm()` performs TMM normalization
+#'
+#' @examples
+#' norm_tmm(sim_mat)
+#'
+#' @export
+#norm_tmm <- function(mat) {
+#    print('start normalization using TMM')
+#    mat %>% as_SingleCellExperiment() %>%
+#    {suppressWarnings(scater::normaliseExprs(., "TMM"))} %>%
+#    scater::normalise() %>% SingleCellExperiment::normcounts()
+#}
+
+norm_tmm <- function(mat) {
+    print('start normalization using TMM')
+    dl <- edgeR::DGEList(counts=mat)
+    dl <- edgeR::calcNormFactors(dl, method='TMM')
+    return(edgeR::cpm(dl))
+}
+
+
+norm_UQ <- function(mat) {
+    print('start normalization using upperquartile')
+    dl <- edgeR::DGEList(counts=mat)
+    dl <- edgeR::calcNormFactors(dl, method='upperquartile')
+    return(edgeR::cpm(dl))
+}
+#' @rdname  norm_scater
+#'
+#' @details `norm_rle()` performs RLE normalization
+#'
+#' @examples
+#' norm_rle(sim_mat)
+#'
+#' @export
+#norm_rle <- function(mat) {
+#    print('start normalization using RLE')
+#    mat %>% as_SingleCellExperiment() %>%
+#    {suppressWarnings(scater::normaliseExprs(., "RLE"))} %>%
+#    scater::normalise() %>% SingleCellExperiment::normcounts()
+#}
+norm_rle <- function(mat) {
+    print('start normalization using RLE')
+    dl <- edgeR::DGEList(counts=mat)
+    dl <- edgeR::calcNormFactors(dl, method='RLE')
+    edgeR::cpm(dl)
+}
+
+# norm_cpm ------------------
+
+#' @title CPM normalization by some genes
+#'
+#' @param mat integer matrix. counts
+#' @param row integer or logical. Use which rows (genes) as normalization factor
+norm_cpm_impl <- function(mat, row) {
+    t(t(mat*1e6) / colSums(mat[row, , drop = F], na.rm = T))
+    #edgeR::cpm(mat)
+}
+
+
+#' @title CPM normalization
+#'
+#' @description CPM normalization using counts sum of _certain_ genes as scaling factor
+#'
+#' @param mat integer matrix. counts.
+#'
+#' @details some functions may throw errors
+#'
+#' @family matrix normalization
+#'
+#' @name norm_cpm
+
+
+
+
+#' @rdname norm_cpm
+#'
+#' @details `norm_cpm_total()` uses total genes
+#'
+#' @examples
+#' norm_cpm_total(sim_mat)
+#'
+#' @export
+norm_cpm_total <- function(mat) {
+    print('start normalization using CPM')
+    row_all <- nrow(mat) %>% seq_len()
+    
+    norm_cpm_impl(mat, row_all)
+}
+
+#' @rdname norm_cpm
+#'
+#' @param top_n integer scalar. see `norm_cpm_top()` below
+#'
+#' @details `norm_cpm_top()` uses top 20 genes sorted by counts (assuming `top_n = 20L`)
+#'
+#' @examples
+#' norm_cpm_top(sim_mat, 20L)
+#'
+#' @export
+#norm_cpm_top <- function(mat, top_n) {
+#   print(paste('start normalization using top',top_n,'genes as scale factor',sep=' '))
+#   if (nrow(mat) < top_n)
+#    stop('too few feature for CPM top n normalization')
+#    
+#    row_top <-  mat %>% rowSums() %>% sort(decreasing = T, index.return = T) %>%
+#    {.$ix[seq_len(top_n)]}
+#    
+#    norm_cpm_impl(mat, -row_top)
+#}
+norm_cpm_top <- function(mat, top_n) {
+    print(paste('start normalization using top',top_n,'genes as scale factor',sep=' '))
+    if (nrow(mat) < top_n)
+    stop('too few feature for CPM top n normalization')
+    
+    row_top <-  mat %>% rowSums() %>% sort(decreasing = T, index.return = T) %>%
+    {.$ix[seq_len(top_n)]}
+    
+    top = t(t(mat[row_top,]*1e6) / colSums(mat[row_top, , drop = F], na.rm = T))
+    top_down= t(t(mat[setdiff(seq_len(dim(mat)[1]),row_top),]*1e6) / colSums(mat[setdiff(seq_len(dim(mat)[1]),row_top), , drop = F], na.rm = T))
+    mat_top <- rbind(top,top_down)
+    mat_top[rownames(mat),]
+}
+
+#' @rdname norm_cpm
+#'
+#' @param gene_type character. see `norm_cpm_rm()` below
+#'
+#' @details `norm_cpm_rm()` uses non-piRNA genes (assuming `gene_type = 'piRNA'`)
+#'
+#' @examples
+#' norm_cpm_rm(sim_mat, c('miRNA', 'piRNA'))
+#'
+#' @export
+norm_cpm_rm <- function(mat, rm_gene_type) {
+        print(paste('start normalization by removed some kind of RNA type ',args$removetype,sep=' '))
+        row_rm <-  mat %>% rownames() %>% strsplit(split='|',fixed=TRUE) %>% data.frame() %>% {.[2,]} %>% {. %in% rm_gene_type}
+        return(norm_cpm_impl(mat, !row_rm))
+}
+
+
+
+#' @rdname norm_cpm
+#'
+#' @param refer_gene_id character. Ensembl transcript id, see `norm_cpm_refer()` below
+#'
+#' @details `norm_cpm_refer()` uses given reference genes
+#'
+#' @examples
+#' norm_cpm_refer(sim_mat, suggest_refer$id)
+#'
+#' @export
+
+# mat = sim_mat
+# refer_gene_id = suggest_refer$id
+cv_fun <- function(x) {
+	sd(x, na.rm = T) / mean(x, na.rm = T)
+}
+
+
+norm_cpm_refer <- function(mat, ref_genes, cv_threshold=0.5) {
+    message('start normalization by reference gene with CV threshold', cv_threshold)
+    keeped_ref <- mat[ref_genes, , drop = F] %>% apply(1, cv_fun) < cv_threshold
+    if(!any(kept_ref)){
+        stop('no reference genes left after filtering by CV')
+    }
+    norm_cpm_impl(mat, ref_genes[keeped_ref])
+}
+
+################################################################################
+#################################batch removal##################################
+################################################################################
+
+
+remove_batch <- function( mat, method, class_info=NULL, batch_info=NULL, ruv_k=1){
+    # only remove batch for samples with batch information
+    if(!is.null(batch_info)){
+        samples_with_batch <- names(batch_info)[!is.na(batch_info)]
+    }else{
+        samples_with_batch <- colnames(mat)
+    }
+    if (method == 'RUV')       mat <- ruvs(mat, class_info=class_info, k=ruv_k)
+    else if(method == 'RUVn')       {
+        class_info <- rep(1, ncol(mat))
+        names(class_info) <- colnames(mat)
+        mat <- ruvs(mat, class_info=class_info, k=ruv_k)
+    }
+    else if (method == 'ComBat')    mat[, samples_with_batch] <- combat(mat[, samples_with_batch], class_info=class_info, batch_info=batch_info)
+    else if (method == 'limma')     mat[, samples_with_batch] <- limma(mat[, samples_with_batch], class_info=class_info, batch_info=batch_info)
+    else if (method == 'null')      mat <- mat
+    else stop("unknown batch effect removal method: ", method)
+    mat
+}
+
+ruv <- function(
+    mat,
+    classinfo_path,
+    label_column = 2,
+    k = 10
+){
+    suppressMessages(library(RUVSeq))
+
+    print('start batch removal using RUVs')
+    cIdx <- rownames(mat)
+    
+    sample_info <- read.table(classinfo_path,sep='\t',header=TRUE,  check.names=FALSE,  stringsAsFactors=FALSE)
+    ##rank by mat
+    if(unique(is.na(sample_info$sample_id))) 
+    stop("sample_id not in file")
+    rownames(sample_info) = sample_info$sample_id
+    sample_info=sample_info[names(mat),]
+    rownames(sample_info) <- c()
+    
+    names(sample_info)[label_column]="label"
+    scIdx <- matrix(-1, ncol = max(table(sample_info$label)), nrow = dim(table(sample_info$label)))
+    labellist <- names(table(sample_info$label))
+    for(i in c(1:dim(table(sample_info$label)))) {
+        tmp <- which(sample_info$label == labellist[i])
+        scIdx[i, 1:length(tmp)] <- tmp
+    }
+    mat <- log(mat+0.001)
+    ruv <- RUVs(as.matrix(mat), cIdx, k = k, scIdx = scIdx, isLog = TRUE)
+    exp(ruv$normalizedCounts)
+}
+
+ruvs <- function(mat, class_info, batch_info=NULL, k = 1){
+    if(is.null(class_info)) stop('class_info is needed for RUVs')
+
+    message('start batch removal using RUVs')
+    suppressMessages(library(RUVSeq))
+
+    cIdx <- rownames(mat)
+    
+    class_sizes <- table(class_info)
+    scIdx <- matrix(-1, ncol = max(class_sizes), nrow = dim(class_sizes))
+    for(i in c(1:dim(class_sizes))) {
+        tmp <- which(class_info == names(class_sizes)[i])
+        scIdx[i, 1:length(tmp)] <- tmp
+    }
+    mat <- log(mat + 0.25)
+    seq_ruvs <- RUVs(as.matrix(mat), cIdx, k = k, scIdx = scIdx, isLog = TRUE)
+    exp(seq_ruvs$normalizedCounts)
+}
+
+
+combat <- function(mat,class_info=NULL, batch_info=NULL){
+    if(is.null(batch_info)) stop('batch_info is needed for ComBat')
+
+    message('start batch removal using combat')
+    suppressMessages(library(sva))
+    #batch_info <-read.table(batchinfo_path,sep='\t',row.names=1,header=T,check.names = FALSE)
+    #if (!(dim(mat)[2]==dim(batch_info)[1]))
+    #    stop('sample numbers in batch info and expression matrix should be same')
+    #batchname <-toString(names(batch_info)[batch_column])
+    #batch_info=as.data.frame(batch_info[names(mat),])
+    #batch_info <- as.factor(batch_info)
+    mod <- model.matrix(~ 1, data = as.factor(batch_info))
+    combat <- ComBat(
+        dat = log(as.matrix(mat) + 0.25),
+        batch = as.factor(batch_info),
+        mod = mod,
+        par.prior = TRUE,
+        prior.plots = FALSE
+    )
+    
+    mat <- exp(combat)
+    mat
+}
+
+limma <- function(
+    mat,
+    class_info=NULL,
+    batch_info=NULL
+){
+    if(is.null(batch_info)) stop('batch_info is needed for limma')
+
+    print('start batch removal using limma')
+    suppressMessages(library(limma))
+
+    mat <- removeBatchEffect(log(as.matrix(mat) + 0.25), as.factor(batch_info))
+    mat <- exp(mat)
+    mat
+}
+
+################################################################################
+#################################plot part######################################
+################################################################################
+
+#' @export
+plot_highest_exprs <- function(sce, top_n = 20) {
+	sce %>% {suppressMessages(scater::calculateQCMetrics(.))} %>%
+		scater::plotHighestExprs(n = top_n)
+}
+
+
+# plot_group --------------
+
+plot_group_impl <- function(sce, shape = NULL, color = NULL, plot_fun) {
+ 	plot_fun(
+ 		sce,
+		shape_by = shape, colour_by = color,
+    	run_args = list(exprs_values = 'counts')
+	)
+}
+
+#' @title plot PCA, TSNE
+#'
+#' @param sce A SingleCellExperiment object.
+#' @param shape, color string. specify a column in `col_data` of [as_SingleCellExperiment()] to shape/color by
+#'
+#' @name plot_group
+
+
+
+#' @rdname plot_group
+#'
+#' @examples
+#' as_SingleCellExperiment(sim_mat) %>% plot_PCA()
+#' 
+#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA()
+#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label')
+#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(color = 'label')
+#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label', color = 'label')
+#'
+#' @export
+plot_PCA <- function(sce, shape = NULL, color = NULL) {
+	plot_group_impl(sce, shape, color, scater::plotPCA)
+}
+
+
+
+#' @rdname plot_group
+#'
+#' @examples
+#' as_SingleCellExperiment(sim_mat) %>% plot_PCA()
+#' 
+#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA()
+#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label')
+#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(color = 'label')
+#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label', color = 'label')
+#'
+#' @export
+plot_TSNE <- function(sce, shape = NULL, color = NULL) {
+	plot_group_impl(sce, shape, color, scater::plotTSNE)
+}
+
+# plot_variance --------------
+
+
+#' get y axis range of ggplot object.
+get_y_range <- function(plot) {
+    ggplot2::ggplot_build(plot)$layout$panel_params[[1]]$y.range
+}
+
+#' @title generate equally spaced y coordinates, not hit bottom nor top.
+#'
+#' @details Image `plot`'s y axis extends from 0 to 3, `x` contains 3 values,
+#' then we give `c(0.5, 1.5, 2.5)`.
+#'
+#' @param plot ggplot object.
+#' @param x numberic vector
+#'
+#' @return numeric vector. The same length as `x`
+#'
+#' @keywords internal
+seq_y <- function(plot, x) {
+    y_range <- get_y_range(plot)
+    by <- diff(y_range) / length(x)
+    seq(y_range[1] + by, y_range[2], by) - by/2
+}
+
+
+#' @title plot variance of counts across samples
+#'
+#' @param mat integer matrix. counts
+#' @param refer_gene_id character. Ensembl transcript id, add a vertical line
+#'   for each gene to mark the corresponding CV (on x axis). Only genes in
+#'   counts matrix would be shown. Usually these genes should be the reference
+#'   genes you want to use for normalization.
+#' @param refer_gene_name character. Transcript name
+#'
+#' @return [ggplot2::ggplot()] object.
+#'
+#' @name plot_variance
+
+
+
+
+
+
+#' @details
+#' `plot_cv_density()` produces density plot of coefficient of variation
+#'
+#' @examples
+#' # only one gene exist in the matrix
+#' plot_cv_density(sim_mat, suggest_refer$id)
+#' plot_cv_density(sim_mat, suggest_refer$id, suggest_refer$name)
+#'
+#' # the name should be the same length as id
+#' plot_cv_density(sim_mat, rownames(sim_mat)[1:6], letters[1:3])
+#' # if only part of the genes have name, you can pass the id of other genes
+#' plot_cv_density(sim_mat, rownames(sim_mat)[1:6], c(letters[1:3], rownames(sim_mat)[4:6]))
+#'
+#' @export
+#'
+#' @rdname plot_variance
+
+# mat = sim_mat
+# refer_gene_id = suggest_refer$id
+# refer_gene_name = suggest_refer$name
+plot_cv_density <- function(mat, refer_gene_id = '', refer_gene_name = refer_gene_id) {
+    cv <- mat %>% apply(1, cv_fun) %>%
+    {tibble::tibble(id = names(.), value = .)} %>%
+    dplyr::mutate(id = stringr::str_extract(id, '[^|]+'))
+    plot <- ggplot2::ggplot(cv, ggplot2::aes(value)) +
+    ggplot2::geom_density(color = 'blue') +
+    ggplot2::labs(x = 'coefficient of variation')
+    
+    if (length(refer_gene_id) != length(refer_gene_name)) {
+        warning("Ignoring refer_gene_name, since it isn't the same length as refer_gene_id")
+        refer_gene_name = refer_gene_id
+    }
+    cv_refer <- tibble::tibble(id = refer_gene_id, name = refer_gene_name) %>%
+    dplyr::inner_join(cv, by = 'id')
+    if (nrow(cv_refer) == 0L) {
+        warning("None refer gene found in the count matrix")
+        return(plot)
+    }
+    
+    plot + ggplot2::geom_vline(xintercept = cv_refer$value, color = 'green') +
+    ggplot2::geom_point(
+    ggplot2::aes(x = value, y = seq_y(plot, value)),
+    data = cv_refer, size = 2, shape = 1
+    ) +
+    ggrepel::geom_label_repel(
+    ggplot2::aes(x = value, y = seq_y(plot, value), label = name),
+    data = cv_refer, hjust = 0.5
+    )
+}
+
+
+
+#' @details
+#' `plot_cv_density()` produces density plot of coefficient of variation
+#'
+#' @examples
+#' # only one gene exist in the matrix
+#' plot_refer_violin(sim_mat, suggest_refer$id)
+#' plot_refer_violin(sim_mat, suggest_refer$id, suggest_refer$name)
+#'
+#' # the name should be the same length as id
+#' plot_refer_violin(sim_mat, rownames(sim_mat)[1:6], letters[1:3])
+#' # if only part of the genes have name, you can pass the id of other genes
+#' plot_refer_violin(sim_mat, rownames(sim_mat)[1:6], c(letters[1:3], rownames(sim_mat)[4:6]))
+#'
+#' @export
+#'
+#' @rdname plot_variance
+
+# mat = sim_mat
+# refer_gene_id = rownames(mat)[1:6]
+# refer_gene_name = paste0('gene_', letters[1:6])
+plot_refer_violin <- function(mat, refer_gene_id, refer_gene_name = refer_gene_id) {
+    if (length(refer_gene_id) != length(refer_gene_name)) {
+        warning("Ignoring refer_gene_name, since it isn't the same length as refer_gene_id")
+        refer_gene_name = refer_gene_id
+    }
+    
+    refer_gene <- tibble::tibble(id = refer_gene_id, name = refer_gene_name)
+    refer_count <- mat %>% tibble::as_tibble(rownames = 'id') %>%
+    dplyr::mutate(id = stringr::str_extract(id, '[^|]+')) %>%
+    dplyr::inner_join(refer_gene, ., by = 'id') %>% dplyr::select(-id)
+    if (nrow(refer_count) == 0L) {
+        warning('None refer gene found in the count matrix')
+        return(ggplot2::ggplot())
+    }
+    
+    refer_count_long <- refer_count %>%    tidyr::gather('sample', 'count', -1) %>%
+    dplyr::mutate_at('name', as.factor)
+    g_violin <- refer_count_long %>%
+    ggplot2::ggplot(ggplot2::aes(name, log2(count + 0.001))) +
+    ggplot2::geom_violin() +
+    ggplot2::labs(x = 'reference transcripts', y = quote(log[2](count)))
+    
+    # max y coordinate of each violin
+    y_max <- ggplot2::ggplot_build(g_violin)$data[[1]] %>% tibble::as_tibble() %>%
+    dplyr::group_by(x) %>% dplyr::arrange(dplyr::desc(y)) %>% dplyr::slice(1) %>%
+    dplyr::ungroup() %>% dplyr::arrange(x) %>% dplyr::select(x, y)
+    
+    cv_df <- refer_count_long %>%
+    dplyr::group_by(name) %>% dplyr::summarise(cv = cv_fun(count)) %>%
+    dplyr::arrange(name) %>% dplyr::mutate(x = seq_along(name)) %>%
+    dplyr::inner_join(y_max, by = 'x') %>%
+    dplyr::mutate(y = y + diff(get_y_range(g_violin)) / 20) %>%
+    dplyr::mutate(cv = formatC(cv, digits = 3, format = 'f'))
+    
+    g_violin + ggplot2::geom_text(ggplot2::aes(x, y, label = cv), cv_df, color = 'blue')
+}
+
+################################################################################
+#########################process pipeline#######################################
+################################################################################
+
+#args$input: matrix_path = 'output/scirep/count_matrix/transcript.txt'
+#args$class: classinfo_path = 'data/labels/scirep_classes.txt'
+#args$batch: 'data/other_annotations/scirep_batch.txt'
+#args$imputeout: impute_path = "output/matrix_processing/imputation/"
+
+#dummy_io<- function(filename){}
+
+
+message('read expression matrix: ', args$input)
+mat <- read_matrix(args$input)
+sample_ids <- colnames(mat)
+
+if((args$step != 'filter') && (is.null(args$method))){
+    stop('--method is required for step: ', args$step)
+}
+# filter
+if (args$step =='filter'){
+    if(!is.null(args$filtercount)){
+        #message(sprintf('Filter features with count <= %d in %f %% samples', args$filtercount, args$filtersample*100))
+        mat <- filter_low(mat, args$filtercount, args$filtersample)
+    }
+    else if(!is.null(args$filtercpm)){
+        #message(sprintf('Filter features with CPM <= %d in %f %% samples', args$filtercpm, args$filtersample*100))
+        mat <- filter_low_cpm(mat, args$filtercpm, args$filtersample)
+    }
+    else if(!is.null(args$filterrpkm)){
+        #message(sprintf('Filter features with RPKM <= %d in %f %% samples', args$filterrpkm, args$filtersample*100))
+        mat <- filter_low_rpkm(mat, args$filterrpkm, args$filtersample)
+    }
+} else if(args$step =='imputation'){
+    # imputation
+    library(readr)
+    mat <-read.table(args$input,sep='\t',header=TRUE,  check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
+    if (args$method == 'scimpute_count'){
+        mat <- scimpute_count(mat, impute_path= args$imputeout, K = args$imputecluster, N = args$processors)
+    } else if(args$method == 'viper_count'){
+        mat <- viper_count(mat, impute_path= args$imputeout,num = args$imputevipernum,percentage.cutoff = args$imputecutoff, alpha= args$imputealpha) 
+    } else if(args$method == 'null'){
+        
+    } else {
+        stop('unknown imputation method: ', args$method)
+    }
+}  else if(args$step =='normalization'){
+    # normalization
+    if(!is.null(args$ref_gene_file)){
+        ref_genes <- read.table(args$ref_gene_file)[,1]
+    }else{
+        ref_genes <- NULL
+    }
+    mat <- normalize(mat, method=args$method,top_n = args$normtopk, 
+        rm_gene_types = args$remove_gene_types, ref_genes=ref_genes)
+}  else if(args$step =='batch_removal'){
+    class_info <- NULL
+    batch <- NULL
+    if(args$method == 'RUV'){
+        if(!is.null(args$class)){
+            message('read class information: ', args$class)
+            class_info <- read.table(args$class, sep='\t', header=TRUE, check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
+            class_info <- class_info[sample_ids, 1]
+        }else{
+            stop('argument --class is required for RUV')
+        }
+    }else if(args$method %in% c('ComBat', 'limma')){
+        if(!is.null(args$batch)){
+            message('read batch information: ', args$batch)
+            batch <- read.table(args$batch, sep='\t', header=TRUE, check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
+            batch <- batch[sample_ids, args$batch_index]
+            names(batch) <- sample_ids
+        }else{
+            stop('argument --batch is required for: ', args$method)
+        }
+    }
+    # batch removal
+    mat <- remove_batch(mat, method=args$method, class_info=class_info, batch_info=batch, ruv_k=args$ruv_k)
+    
+}  else{
+    stop('unknown step: ', args$step)
+}   
+
+# write output matrix
+message('write expression matrix: ', args$output)
+write_matrix(mat, args$output)