exSEEK / Git / [41c1e8] /exseek/scripts/matrix-process.R

Models:
DanielG/
exSEEK
Downloads: 1
[41c1e8]: / exseek / scripts / matrix-process.R
History
Download this file
896 lines (776 with data), 32.5 kB

#!/usr/bin/env Rscript
suppressPackageStartupMessages(library("argparse"))

# create parser object
parser <- ArgumentParser()
parser$add_argument("-s", "--step", required=TRUE, help="which step to run")
parser$add_argument("-i", "--input", required=TRUE, help="input expression matrix file")
parser$add_argument("-o", "--output", required=TRUE, help="output expression matrix file")
parser$add_argument("--temp-dir", required=FALSE, default=".", help="temporary directory")
parser$add_argument("-c", "--class", required=FALSE, help="input class info file")
parser$add_argument("-b", "--batch", required=FALSE, help="input batch info file")
parser$add_argument("-m", "--method", required=FALSE, help="name of the method")

parser$add_argument("--filtercount", type="integer", default=10, help="count threshold for filtering")
parser$add_argument("--filtercpm", type="double", default=10, help="CPM threshold for filtering")
parser$add_argument("--filterrpkm", type="double", help="RPKM threshold for filtering")

parser$add_argument( "--filtermethod", type="character", default="filtercount",
                    metavar="STRING",
                    help="the filter algorithm to use [default = %(default)s], and return count matrix")
parser$add_argument("--filterexpv", type="double", default=5,
    help="filter by expression value of a gene [default = %(default)s]",
    metavar="NUMBER")

parser$add_argument("--filtersample", type="double", default=0.2,
    help="filter by counts of sample above certain counts of a gene [default = %(default)s]",
    metavar="NUMBER")
parser$add_argument("--imputecluster", type="integer", default=5,
    help="cluster number in scImpute [default = %(default)s]",
    metavar="NUMBER")
parser$add_argument("--imputevipernum", type="integer", default=5000,
    help="number in viper [default = %(default)s]",
    metavar="NUMBER")
parser$add_argument( "--imputecutoff", type="double", default=0.5,
                    metavar="NUMBER",
                    help="cutoff in viper [default = %(default)s]")
parser$add_argument( "--imputealpha", type="double", default=0.1,
                    metavar="NUMBER",
                    help="alpha in viper [default = %(default)s]")

parser$add_argument( "--normtopk", type="integer", default=20,
                    metavar="NUMBER",
                    help="top K feature as scale factor [default = %(default)s]")
parser$add_argument( "--cvthreshold", type="double", default=0.5,
                    metavar="NUMBER",
                    help="coefficient variance threshold of reference gene, filter ref gene with CV bigger than [default = %(default)s]")
parser$add_argument( "--remove-gene-types", type="character", default="miRNA,piRNA",
                    metavar="STRING",
                    help="remove some time of RNA for normalization scale factor calculation [default = %(default)s]")
parser$add_argument( "--ref-gene-file", type="character", #default="miRNA,piRNA",
                    metavar="STRING",
                    help="reference gene file path [default = %(default)s]")
#they are feature name for full length feature, most are miRNA, for domain feature, they have the same feature name

parser$add_argument("--batch-index", type="integer", default=1,
                    metavar="INT",
                    help="batch index to select which batch to use [default = %(default)s]")
parser$add_argument("--ruv-k", type="integer", default=1, metavar="INT", 
    help="parameter k for RUVs (The number of factors of unwanted variation to be estimated from the data)")

parser$add_argument("-p", "--processors", type="integer", default=1,
    help="Number of processors to use. This option is useful on multicore *nix or Mac machine only, when performing multiple runs (nrun > 1) [default = %(default)s]",
    metavar="NUMBER")

# get command line options, if help option encountered print help and exit,
# otherwise if options not found on command line then set defaults,
args <- parser$parse_args()

splitname <- unlist(strsplit(args$input,split = '/',fixed = TRUE))
splitname <- splitname[length(splitname)]

#' @title read counts matrix
#'
#' @param path string.
#' @param ... other arguments passsed on to [readr::read_tsv()]
#'
#' @return integer matrix
#'
#' @details In any case, first part (separated by `|`) of row names must be
#'   Ensembl transcript id
#'
#' @export


#' @title sample classinfo
#'
#' @param path string.
#'
#' @return string matrix
#'
#' @details column 1 represents sample name, column 2 represents classinfo
#'
#' @export

# path = 'scirep_classes.txt'
read_classinfo <- function(path, ...) {
    read.table(path, sep='\t', header=TRUE,  check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
}

read_matrix <- function(filename){
    read.table(filename, sep='\t', header=TRUE,  check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
}

write_matrix <- function(mat, filename){
    write.table(mat, filename, sep='\t',quote=FALSE, row.names=TRUE, col.names=TRUE)
}

################################################################################
#################################imputation#####################################
################################################################################

#' @title filter genes with low expression values
#'
#' @param mat integer matrix.
#' @param min_count, min_sample_per_gene integer scalar. For each gene, it must
#'   contain at least `min_count` reads in at least `min_sample_per_gene`
#'   samples. Otherwise, it would be dropped.
#'
#' @return integer matrix.
#'
#' @examples
#' filter_low(sim_mat)
#'
#' @export
#filter_low <- function(mat, min_count = 5, min_sample_per_gene = 10) {
#    print(paste('start filtering lowly expressed gene:','count threshold',min_count,'sample threshold',min_sample_per_gene,sep=' '))
#	low_per_row <- rowSums(mat > min_count)
#	keeped_row <- low_per_row > min_sample_per_gene
#	mat[keeped_row, ]
#}

filter_low <- function(mat, min_count = 5, min_sample_per_gene = 0.5) {
    print(paste('start filtering lowly expressed gene:','count threshold',min_count,'sample threshold',min_sample_per_gene,sep=' '))
    min_sample_per_gene <- ceiling(dim(mat)[2]*min_sample_per_gene)
    low_per_row <- rowSums(mat > min_count)
    keeped_row <- low_per_row >= min_sample_per_gene
    mat[keeped_row, ]
}

filter_low_cpm <- function(mat, min_cpm = 5, min_pct_sample_per_gene = 0.5) {
    print(paste('start filtering lowly expressed gene:','CPM threshold',min_cpm,'sample threshold',min_pct_sample_per_gene,sep=' '))
    row_all <- nrow(mat) %>% seq_len()
    mat_cpm <- t(t(mat*1e6) / colSums(mat[row_all, , drop = F], na.rm = T))
    min_sample_per_gene <- ceiling(dim(mat_cpm)[2]*min_pct_sample_per_gene)
    low_per_row <- rowSums(mat_cpm > min_cpm)
    keeped_row <- low_per_row >= min_sample_per_gene
    mat[keeped_row, ]
}

filter_low_rpkm <- function(mat, min_rpkm = 5, min_pct_sample_per_gene = 0.5) {
    print(paste('start filtering lowly expressed gene:','RPKM threshold',min_rpkm,'sample threshold',min_pct_sample_per_gene,sep=' '))
    row_all <- nrow(mat) %>% seq_len()
    mat_cpm <- t(t(mat*1e6) / colSums(mat[row_all, , drop = F], na.rm = T))
    
    gene_length <- c()
    for(i in seq_len(length(rownames(mat_cpm)))){
            gene_length[i] <- as.integer(unlist(strsplit(rownames(mat_cpm)[i],"|",fixed=T))[7])
                             -as.integer(unlist(strsplit(rownames(mat_cpm)[i],"|",fixed=T))[6])
    }
    mat_rpkm <- mat_cpm*1000/gene_length
    
    min_sample_per_gene <- ceiling(dim(mat_rpkm)[2]*min_pct_sample_per_gene)
    low_per_row <- rowSums(mat_rpkm > min_rpkm)
    keeped_row <- low_per_row >= min_sample_per_gene
    mat[keeped_row, ]
}

#' @imputation
#'
#' @param mat integer matrix.
#' @param tmp_path where tmp files stores, "data/expression_matrix/" for example.
#' @param out_path where outputs stores, "data/matrix_processing/imputation/" for example.
#' @param K imputation Kcluster
#' @param N imputation ncores
#' @return integer matrix named "scimpute_count.txt" stored in out_path.
#'
#' @examples imputation(mat, "data/expression_matrix/", "data/matrix_processing/imputation/",5,3)
#'
#' @export
scimpute_count <- function(mat, temp_dir, K = 5, N = 3) {
    suppressMessages(library("scImpute"))
    print('start imputation using scImpute')
    mat_correct <- names(mat)
    names(mat) <-  paste('C_',seq_len(length(names(mat))))
    write.csv(mat, paste(temp_path, "input.csv",sep=""), sep=',')
    dir.create(splitname)
    scimpute(count_path = paste(temp_dir, "input.csv",sep=""), infile = "csv", outfile = "txt", out_dir = temp_dir, Kcluster = K, ncores = N)
    mat <- read.table(paste(temp_dir, "scimpute_count.txt", sep="/"),sep=' ',header=TRUE,  check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
    unlink(temp_dir, recursive=TRUE)
    names(mat) <-mat_correct
    mat
}

viper_count <- function(mat, num = 5000, percentage.cutoff = 0.1, alpha= 0.5, temp_dir=".") {
    suppressWarnings(library(VIPER))
    mat_correct <- names(mat)
    names(mat) <-  paste('C_',seq_len(length(names(mat))))
    print (num, percentage.cutoff, alpha)
    VIPER(mat, num = num, percentage.cutoff = percentage.cutoff, minbool = FALSE, alpha = alpha, report = TRUE, outdir = temp_dir)
    mat <- read.table(paste(temp_dir, 'imputed_counts.csv', sep='/'),sep=' ',header=TRUE,  check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
    unlink(temp_dir, recursive=TRUE)
    names(mat) <-mat_correct
    mat
}


################################################################################
###############################normalization####################################
################################################################################
#suppressPackageStartupMessages(library(clusterSim))
suppressPackageStartupMessages(library(scRNA.seq.funcs))
suppressPackageStartupMessages(library(scater))
suppressPackageStartupMessages(library(scran))
suppressPackageStartupMessages(library(SingleCellExperiment))
suppressPackageStartupMessages(library(kBET))
suppressPackageStartupMessages(library(sva))
suppressPackageStartupMessages(library(edgeR))
suppressPackageStartupMessages(library(ggpubr))
options(stringsAsFactors = FALSE)
library(magrittr)

#' @title martix normalization
#' @examples
#' \donotrun{
#'     norm_mat(
#'         '/path/to/matrix'
#'     )
#' }
normalize <- function(
    mat,
    method,
    top_n = 20, 
    rm_gene_types = NULL,
    ref_genes=NULL
) {
    if (method == 'SCnorm')    mat <- norm_SCnorm(mat)
    else if (method == 'TMM')       mat <- norm_tmm(mat)
    else if (method == 'RLE')       mat <- norm_rle(mat)
    else if (method == 'CPM')       mat <- norm_cpm_total(mat)
    else if (method == 'UQ')        mat <- norm_uq(mat)
    else if (method == 'CPM_top')   mat <- norm_cpm_top(mat, top_n)
    else if (method == 'CPM_rm')    {
        if(is.null(rm_gene_types)){
            stop('argument rm_gene_types is required for normalization method: CPM_top')
        }
        mat <- norm_cpm_rm(mat, rm_gene_types)
    }
    else if (method == 'CPM_refer') mat <- norm_cpm_refer(mat, refer_gene_id_path)
    else if (method == 'null')      mat <- mat
    else stop('unknown normalization method: ', method)
    mat
}


#' @title SCnorm normalization
#'
#' @param mat integer matrix. counts
#' @param ... other arguments passed on to [SCnorm::SCnorm()]
#'
#' @examples
#' norm_SCnorm(sim_mat*10)
#'
#' @family matrix normalization
#'
#' @export
norm_SCnorm <- function(mat, ...) {
    print('start normalization using SCnorm')
    Conditions = rep(1, ncol(mat));
    sce <- suppressMessages(SCnorm::SCnorm(mat, Conditions, NCores=4));
    SCnorm::results(sce)
}

# norm_scater ------------------

#' @title TMM/RLE normalization by scater package
#'
#' @param mat integer matrix. counts
#'
#' @family matrix normalization
#'
#' @name norm_scater

as_SingleCellExperiment <- function(mat, col_data = NULL) {
    assays = list(counts = as.matrix(mat))
    if (is.null(col_data))
    SingleCellExperiment::SingleCellExperiment(assays = assays)
    else
    SingleCellExperiment::SingleCellExperiment(assays = assays, colData = col_data)
}

#' @rdname  norm_scater
#'
#' @details `norm_uq()` performs upper-quartile normalization
#'
#' @examples
#' norm_uq(sim_mat)
#'
#' @export
norm_uq <- function(mat) {
    print('start normalization using UQ')
    #mat %>% as_SingleCellExperiment() %>%
    #{suppressWarnings(scater::normaliseExprs(., "TMM"))} %>%
    #scater::normalise() %>% SingleCellExperiment::normcounts()
    dl <- edgeR::DGEList(counts=mat)
    dl <- edgeR::calcNormFactors(dl, method='upperquartile')
    edgeR::cpm(dl)
}

#' @rdname  norm_scater
#'
#' @details `norm_tmm()` performs TMM normalization
#'
#' @examples
#' norm_tmm(sim_mat)
#'
#' @export
#norm_tmm <- function(mat) {
#    print('start normalization using TMM')
#    mat %>% as_SingleCellExperiment() %>%
#    {suppressWarnings(scater::normaliseExprs(., "TMM"))} %>%
#    scater::normalise() %>% SingleCellExperiment::normcounts()
#}

norm_tmm <- function(mat) {
    print('start normalization using TMM')
    dl <- edgeR::DGEList(counts=mat)
    dl <- edgeR::calcNormFactors(dl, method='TMM')
    return(edgeR::cpm(dl))
}


norm_UQ <- function(mat) {
    print('start normalization using upperquartile')
    dl <- edgeR::DGEList(counts=mat)
    dl <- edgeR::calcNormFactors(dl, method='upperquartile')
    return(edgeR::cpm(dl))
}
#' @rdname  norm_scater
#'
#' @details `norm_rle()` performs RLE normalization
#'
#' @examples
#' norm_rle(sim_mat)
#'
#' @export
#norm_rle <- function(mat) {
#    print('start normalization using RLE')
#    mat %>% as_SingleCellExperiment() %>%
#    {suppressWarnings(scater::normaliseExprs(., "RLE"))} %>%
#    scater::normalise() %>% SingleCellExperiment::normcounts()
#}
norm_rle <- function(mat) {
    print('start normalization using RLE')
    dl <- edgeR::DGEList(counts=mat)
    dl <- edgeR::calcNormFactors(dl, method='RLE')
    edgeR::cpm(dl)
}

# norm_cpm ------------------

#' @title CPM normalization by some genes
#'
#' @param mat integer matrix. counts
#' @param row integer or logical. Use which rows (genes) as normalization factor
norm_cpm_impl <- function(mat, row) {
    t(t(mat*1e6) / colSums(mat[row, , drop = F], na.rm = T))
    #edgeR::cpm(mat)
}


#' @title CPM normalization
#'
#' @description CPM normalization using counts sum of _certain_ genes as scaling factor
#'
#' @param mat integer matrix. counts.
#'
#' @details some functions may throw errors
#'
#' @family matrix normalization
#'
#' @name norm_cpm




#' @rdname norm_cpm
#'
#' @details `norm_cpm_total()` uses total genes
#'
#' @examples
#' norm_cpm_total(sim_mat)
#'
#' @export
norm_cpm_total <- function(mat) {
    print('start normalization using CPM')
    row_all <- nrow(mat) %>% seq_len()
    
    norm_cpm_impl(mat, row_all)
}

#' @rdname norm_cpm
#'
#' @param top_n integer scalar. see `norm_cpm_top()` below
#'
#' @details `norm_cpm_top()` uses top 20 genes sorted by counts (assuming `top_n = 20L`)
#'
#' @examples
#' norm_cpm_top(sim_mat, 20L)
#'
#' @export
#norm_cpm_top <- function(mat, top_n) {
#   print(paste('start normalization using top',top_n,'genes as scale factor',sep=' '))
#   if (nrow(mat) < top_n)
#    stop('too few feature for CPM top n normalization')
#    
#    row_top <-  mat %>% rowSums() %>% sort(decreasing = T, index.return = T) %>%
#    {.$ix[seq_len(top_n)]}
#    
#    norm_cpm_impl(mat, -row_top)
#}
norm_cpm_top <- function(mat, top_n) {
    print(paste('start normalization using top',top_n,'genes as scale factor',sep=' '))
    if (nrow(mat) < top_n)
    stop('too few feature for CPM top n normalization')
    
    row_top <-  mat %>% rowSums() %>% sort(decreasing = T, index.return = T) %>%
    {.$ix[seq_len(top_n)]}
    
    top = t(t(mat[row_top,]*1e6) / colSums(mat[row_top, , drop = F], na.rm = T))
    top_down= t(t(mat[setdiff(seq_len(dim(mat)[1]),row_top),]*1e6) / colSums(mat[setdiff(seq_len(dim(mat)[1]),row_top), , drop = F], na.rm = T))
    mat_top <- rbind(top,top_down)
    mat_top[rownames(mat),]
}

#' @rdname norm_cpm
#'
#' @param gene_type character. see `norm_cpm_rm()` below
#'
#' @details `norm_cpm_rm()` uses non-piRNA genes (assuming `gene_type = 'piRNA'`)
#'
#' @examples
#' norm_cpm_rm(sim_mat, c('miRNA', 'piRNA'))
#'
#' @export
norm_cpm_rm <- function(mat, rm_gene_type) {
        print(paste('start normalization by removed some kind of RNA type ',args$removetype,sep=' '))
        row_rm <-  mat %>% rownames() %>% strsplit(split='|',fixed=TRUE) %>% data.frame() %>% {.[2,]} %>% {. %in% rm_gene_type}
        return(norm_cpm_impl(mat, !row_rm))
}



#' @rdname norm_cpm
#'
#' @param refer_gene_id character. Ensembl transcript id, see `norm_cpm_refer()` below
#'
#' @details `norm_cpm_refer()` uses given reference genes
#'
#' @examples
#' norm_cpm_refer(sim_mat, suggest_refer$id)
#'
#' @export

# mat = sim_mat
# refer_gene_id = suggest_refer$id
cv_fun <- function(x) {
	sd(x, na.rm = T) / mean(x, na.rm = T)
}


norm_cpm_refer <- function(mat, ref_genes, cv_threshold=0.5) {
    message('start normalization by reference gene with CV threshold', cv_threshold)
    keeped_ref <- mat[ref_genes, , drop = F] %>% apply(1, cv_fun) < cv_threshold
    if(!any(kept_ref)){
        stop('no reference genes left after filtering by CV')
    }
    norm_cpm_impl(mat, ref_genes[keeped_ref])
}

################################################################################
#################################batch removal##################################
################################################################################


remove_batch <- function( mat, method, class_info=NULL, batch_info=NULL, ruv_k=1){
    # only remove batch for samples with batch information
    if(!is.null(batch_info)){
        samples_with_batch <- names(batch_info)[!is.na(batch_info)]
    }else{
        samples_with_batch <- colnames(mat)
    }
    if (method == 'RUV')       mat <- ruvs(mat, class_info=class_info, k=ruv_k)
    else if(method == 'RUVn')       {
        class_info <- rep(1, ncol(mat))
        names(class_info) <- colnames(mat)
        mat <- ruvs(mat, class_info=class_info, k=ruv_k)
    }
    else if (method == 'ComBat')    mat[, samples_with_batch] <- combat(mat[, samples_with_batch], class_info=class_info, batch_info=batch_info)
    else if (method == 'limma')     mat[, samples_with_batch] <- limma(mat[, samples_with_batch], class_info=class_info, batch_info=batch_info)
    else if (method == 'null')      mat <- mat
    else stop("unknown batch effect removal method: ", method)
    mat
}

ruv <- function(
    mat,
    classinfo_path,
    label_column = 2,
    k = 10
){
    suppressMessages(library(RUVSeq))

    print('start batch removal using RUVs')
    cIdx <- rownames(mat)
    
    sample_info <- read.table(classinfo_path,sep='\t',header=TRUE,  check.names=FALSE,  stringsAsFactors=FALSE)
    ##rank by mat
    if(unique(is.na(sample_info$sample_id))) 
    stop("sample_id not in file")
    rownames(sample_info) = sample_info$sample_id
    sample_info=sample_info[names(mat),]
    rownames(sample_info) <- c()
    
    names(sample_info)[label_column]="label"
    scIdx <- matrix(-1, ncol = max(table(sample_info$label)), nrow = dim(table(sample_info$label)))
    labellist <- names(table(sample_info$label))
    for(i in c(1:dim(table(sample_info$label)))) {
        tmp <- which(sample_info$label == labellist[i])
        scIdx[i, 1:length(tmp)] <- tmp
    }
    mat <- log(mat+0.001)
    ruv <- RUVs(as.matrix(mat), cIdx, k = k, scIdx = scIdx, isLog = TRUE)
    exp(ruv$normalizedCounts)
}

ruvs <- function(mat, class_info, batch_info=NULL, k = 1){
    if(is.null(class_info)) stop('class_info is needed for RUVs')

    message('start batch removal using RUVs')
    suppressMessages(library(RUVSeq))

    cIdx <- rownames(mat)
    
    class_sizes <- table(class_info)
    scIdx <- matrix(-1, ncol = max(class_sizes), nrow = dim(class_sizes))
    for(i in c(1:dim(class_sizes))) {
        tmp <- which(class_info == names(class_sizes)[i])
        scIdx[i, 1:length(tmp)] <- tmp
    }
    mat <- log(mat + 0.25)
    seq_ruvs <- RUVs(as.matrix(mat), cIdx, k = k, scIdx = scIdx, isLog = TRUE)
    exp(seq_ruvs$normalizedCounts)
}


combat <- function(mat,class_info=NULL, batch_info=NULL){
    if(is.null(batch_info)) stop('batch_info is needed for ComBat')

    message('start batch removal using combat')
    suppressMessages(library(sva))
    #batch_info <-read.table(batchinfo_path,sep='\t',row.names=1,header=T,check.names = FALSE)
    #if (!(dim(mat)[2]==dim(batch_info)[1]))
    #    stop('sample numbers in batch info and expression matrix should be same')
    #batchname <-toString(names(batch_info)[batch_column])
    #batch_info=as.data.frame(batch_info[names(mat),])
    #batch_info <- as.factor(batch_info)
    mod <- model.matrix(~ 1, data = as.factor(batch_info))
    combat <- ComBat(
        dat = log(as.matrix(mat) + 0.25),
        batch = as.factor(batch_info),
        mod = mod,
        par.prior = TRUE,
        prior.plots = FALSE
    )
    
    mat <- exp(combat)
    mat
}

limma <- function(
    mat,
    class_info=NULL,
    batch_info=NULL
){
    if(is.null(batch_info)) stop('batch_info is needed for limma')

    print('start batch removal using limma')
    suppressMessages(library(limma))

    mat <- removeBatchEffect(log(as.matrix(mat) + 0.25), as.factor(batch_info))
    mat <- exp(mat)
    mat
}

################################################################################
#################################plot part######################################
################################################################################

#' @export
plot_highest_exprs <- function(sce, top_n = 20) {
	sce %>% {suppressMessages(scater::calculateQCMetrics(.))} %>%
		scater::plotHighestExprs(n = top_n)
}


# plot_group --------------

plot_group_impl <- function(sce, shape = NULL, color = NULL, plot_fun) {
 	plot_fun(
 		sce,
		shape_by = shape, colour_by = color,
    	run_args = list(exprs_values = 'counts')
	)
}

#' @title plot PCA, TSNE
#'
#' @param sce A SingleCellExperiment object.
#' @param shape, color string. specify a column in `col_data` of [as_SingleCellExperiment()] to shape/color by
#'
#' @name plot_group



#' @rdname plot_group
#'
#' @examples
#' as_SingleCellExperiment(sim_mat) %>% plot_PCA()
#' 
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA()
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label')
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(color = 'label')
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label', color = 'label')
#'
#' @export
plot_PCA <- function(sce, shape = NULL, color = NULL) {
	plot_group_impl(sce, shape, color, scater::plotPCA)
}



#' @rdname plot_group
#'
#' @examples
#' as_SingleCellExperiment(sim_mat) %>% plot_PCA()
#' 
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA()
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label')
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(color = 'label')
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label', color = 'label')
#'
#' @export
plot_TSNE <- function(sce, shape = NULL, color = NULL) {
	plot_group_impl(sce, shape, color, scater::plotTSNE)
}

# plot_variance --------------


#' get y axis range of ggplot object.
get_y_range <- function(plot) {
    ggplot2::ggplot_build(plot)$layout$panel_params[[1]]$y.range
}

#' @title generate equally spaced y coordinates, not hit bottom nor top.
#'
#' @details Image `plot`'s y axis extends from 0 to 3, `x` contains 3 values,
#' then we give `c(0.5, 1.5, 2.5)`.
#'
#' @param plot ggplot object.
#' @param x numberic vector
#'
#' @return numeric vector. The same length as `x`
#'
#' @keywords internal
seq_y <- function(plot, x) {
    y_range <- get_y_range(plot)
    by <- diff(y_range) / length(x)
    seq(y_range[1] + by, y_range[2], by) - by/2
}


#' @title plot variance of counts across samples
#'
#' @param mat integer matrix. counts
#' @param refer_gene_id character. Ensembl transcript id, add a vertical line
#'   for each gene to mark the corresponding CV (on x axis). Only genes in
#'   counts matrix would be shown. Usually these genes should be the reference
#'   genes you want to use for normalization.
#' @param refer_gene_name character. Transcript name
#'
#' @return [ggplot2::ggplot()] object.
#'
#' @name plot_variance






#' @details
#' `plot_cv_density()` produces density plot of coefficient of variation
#'
#' @examples
#' # only one gene exist in the matrix
#' plot_cv_density(sim_mat, suggest_refer$id)
#' plot_cv_density(sim_mat, suggest_refer$id, suggest_refer$name)
#'
#' # the name should be the same length as id
#' plot_cv_density(sim_mat, rownames(sim_mat)[1:6], letters[1:3])
#' # if only part of the genes have name, you can pass the id of other genes
#' plot_cv_density(sim_mat, rownames(sim_mat)[1:6], c(letters[1:3], rownames(sim_mat)[4:6]))
#'
#' @export
#'
#' @rdname plot_variance

# mat = sim_mat
# refer_gene_id = suggest_refer$id
# refer_gene_name = suggest_refer$name
plot_cv_density <- function(mat, refer_gene_id = '', refer_gene_name = refer_gene_id) {
    cv <- mat %>% apply(1, cv_fun) %>%
    {tibble::tibble(id = names(.), value = .)} %>%
    dplyr::mutate(id = stringr::str_extract(id, '[^|]+'))
    plot <- ggplot2::ggplot(cv, ggplot2::aes(value)) +
    ggplot2::geom_density(color = 'blue') +
    ggplot2::labs(x = 'coefficient of variation')
    
    if (length(refer_gene_id) != length(refer_gene_name)) {
        warning("Ignoring refer_gene_name, since it isn't the same length as refer_gene_id")
        refer_gene_name = refer_gene_id
    }
    cv_refer <- tibble::tibble(id = refer_gene_id, name = refer_gene_name) %>%
    dplyr::inner_join(cv, by = 'id')
    if (nrow(cv_refer) == 0L) {
        warning("None refer gene found in the count matrix")
        return(plot)
    }
    
    plot + ggplot2::geom_vline(xintercept = cv_refer$value, color = 'green') +
    ggplot2::geom_point(
    ggplot2::aes(x = value, y = seq_y(plot, value)),
    data = cv_refer, size = 2, shape = 1
    ) +
    ggrepel::geom_label_repel(
    ggplot2::aes(x = value, y = seq_y(plot, value), label = name),
    data = cv_refer, hjust = 0.5
    )
}



#' @details
#' `plot_cv_density()` produces density plot of coefficient of variation
#'
#' @examples
#' # only one gene exist in the matrix
#' plot_refer_violin(sim_mat, suggest_refer$id)
#' plot_refer_violin(sim_mat, suggest_refer$id, suggest_refer$name)
#'
#' # the name should be the same length as id
#' plot_refer_violin(sim_mat, rownames(sim_mat)[1:6], letters[1:3])
#' # if only part of the genes have name, you can pass the id of other genes
#' plot_refer_violin(sim_mat, rownames(sim_mat)[1:6], c(letters[1:3], rownames(sim_mat)[4:6]))
#'
#' @export
#'
#' @rdname plot_variance

# mat = sim_mat
# refer_gene_id = rownames(mat)[1:6]
# refer_gene_name = paste0('gene_', letters[1:6])
plot_refer_violin <- function(mat, refer_gene_id, refer_gene_name = refer_gene_id) {
    if (length(refer_gene_id) != length(refer_gene_name)) {
        warning("Ignoring refer_gene_name, since it isn't the same length as refer_gene_id")
        refer_gene_name = refer_gene_id
    }
    
    refer_gene <- tibble::tibble(id = refer_gene_id, name = refer_gene_name)
    refer_count <- mat %>% tibble::as_tibble(rownames = 'id') %>%
    dplyr::mutate(id = stringr::str_extract(id, '[^|]+')) %>%
    dplyr::inner_join(refer_gene, ., by = 'id') %>% dplyr::select(-id)
    if (nrow(refer_count) == 0L) {
        warning('None refer gene found in the count matrix')
        return(ggplot2::ggplot())
    }
    
    refer_count_long <- refer_count %>%    tidyr::gather('sample', 'count', -1) %>%
    dplyr::mutate_at('name', as.factor)
    g_violin <- refer_count_long %>%
    ggplot2::ggplot(ggplot2::aes(name, log2(count + 0.001))) +
    ggplot2::geom_violin() +
    ggplot2::labs(x = 'reference transcripts', y = quote(log[2](count)))
    
    # max y coordinate of each violin
    y_max <- ggplot2::ggplot_build(g_violin)$data[[1]] %>% tibble::as_tibble() %>%
    dplyr::group_by(x) %>% dplyr::arrange(dplyr::desc(y)) %>% dplyr::slice(1) %>%
    dplyr::ungroup() %>% dplyr::arrange(x) %>% dplyr::select(x, y)
    
    cv_df <- refer_count_long %>%
    dplyr::group_by(name) %>% dplyr::summarise(cv = cv_fun(count)) %>%
    dplyr::arrange(name) %>% dplyr::mutate(x = seq_along(name)) %>%
    dplyr::inner_join(y_max, by = 'x') %>%
    dplyr::mutate(y = y + diff(get_y_range(g_violin)) / 20) %>%
    dplyr::mutate(cv = formatC(cv, digits = 3, format = 'f'))
    
    g_violin + ggplot2::geom_text(ggplot2::aes(x, y, label = cv), cv_df, color = 'blue')
}

################################################################################
#########################process pipeline#######################################
################################################################################

#args$input: matrix_path = 'output/scirep/count_matrix/transcript.txt'
#args$class: classinfo_path = 'data/labels/scirep_classes.txt'
#args$batch: 'data/other_annotations/scirep_batch.txt'
#args$imputeout: impute_path = "output/matrix_processing/imputation/"

#dummy_io<- function(filename){}


message('read expression matrix: ', args$input)
mat <- read_matrix(args$input)
sample_ids <- colnames(mat)

if((args$step != 'filter') && (is.null(args$method))){
    stop('--method is required for step: ', args$step)
}
# filter
if (args$step =='filter'){
    if(!is.null(args$filtercount)){
        #message(sprintf('Filter features with count <= %d in %f %% samples', args$filtercount, args$filtersample*100))
        mat <- filter_low(mat, args$filtercount, args$filtersample)
    }
    else if(!is.null(args$filtercpm)){
        #message(sprintf('Filter features with CPM <= %d in %f %% samples', args$filtercpm, args$filtersample*100))
        mat <- filter_low_cpm(mat, args$filtercpm, args$filtersample)
    }
    else if(!is.null(args$filterrpkm)){
        #message(sprintf('Filter features with RPKM <= %d in %f %% samples', args$filterrpkm, args$filtersample*100))
        mat <- filter_low_rpkm(mat, args$filterrpkm, args$filtersample)
    }
} else if(args$step =='imputation'){
    # imputation
    library(readr)
    mat <-read.table(args$input,sep='\t',header=TRUE,  check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
    if (args$method == 'scimpute_count'){
        mat <- scimpute_count(mat, impute_path= args$imputeout, K = args$imputecluster, N = args$processors)
    } else if(args$method == 'viper_count'){
        mat <- viper_count(mat, impute_path= args$imputeout,num = args$imputevipernum,percentage.cutoff = args$imputecutoff, alpha= args$imputealpha) 
    } else if(args$method == 'null'){
        
    } else {
        stop('unknown imputation method: ', args$method)
    }
}  else if(args$step =='normalization'){
    # normalization
    if(!is.null(args$ref_gene_file)){
        ref_genes <- read.table(args$ref_gene_file)[,1]
    }else{
        ref_genes <- NULL
    }
    mat <- normalize(mat, method=args$method,top_n = args$normtopk, 
        rm_gene_types = args$remove_gene_types, ref_genes=ref_genes)
}  else if(args$step =='batch_removal'){
    class_info <- NULL
    batch <- NULL
    if(args$method == 'RUV'){
        if(!is.null(args$class)){
            message('read class information: ', args$class)
            class_info <- read.table(args$class, sep='\t', header=TRUE, check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
            class_info <- class_info[sample_ids, 1]
        }else{
            stop('argument --class is required for RUV')
        }
    }else if(args$method %in% c('ComBat', 'limma')){
        if(!is.null(args$batch)){
            message('read batch information: ', args$batch)
            batch <- read.table(args$batch, sep='\t', header=TRUE, check.names=FALSE, row.names=1, stringsAsFactors=FALSE)
            batch <- batch[sample_ids, args$batch_index]
            names(batch) <- sample_ids
        }else{
            stop('argument --batch is required for: ', args$method)
        }
    }
    # batch removal
    mat <- remove_batch(mat, method=args$method, class_info=class_info, batch_info=batch, ruv_k=args$ruv_k)
    
}  else{
    stop('unknown step: ', args$step)
}   

# write output matrix
message('write expression matrix: ', args$output)
write_matrix(mat, args$output)