b/src/Simulation.R
+library(SymSim)
+library(rhdf5)
+library(Seurat)
+phyla <- read.tree("tree.txt")
+phyla2 <- read.tree("tree.txt")
+data(gene_len_pool)
+#This is a simulation script with adding batch effect
+#First we need to modify the DivideBatches2 function in SymSim to make the same batch partition in mRNA and ADT data.
+DivideBatches2 <- function(observed_counts_res, batchIDs, batch_effect_size=1){
+  observed_counts <- observed_counts_res[["counts"]]
+  meta_cell <- observed_counts_res[["cell_meta"]]
+  ncells <- dim(observed_counts)[2]
+  ngenes <- dim(observed_counts)[1]
+  nbatch <- unique(batchIDs)
+  meta_cell2 <- data.frame(batch = batchIDs, stringsAsFactors = F)
+  meta_cell <- cbind(meta_cell, meta_cell2)
+  mean_matrix <- matrix(0, ngenes, nbatch)
+  gene_mean <- rnorm(ngenes, 0, 1)
+  temp <- lapply(1:ngenes, function(igene) {
+    return(runif(nbatch, min = gene_mean[igene] - batch_effect_size,
+                 max = gene_mean[igene] + batch_effect_size))
+  })
+  mean_matrix <- do.call(rbind, temp)
+  batch_factor <- matrix(0, ngenes, ncells)
+  for (igene in 1:ngenes) {
+    for (icell in 1:ncells) {
+      batch_factor[igene, icell] <- rnorm(n = 1, mean = mean_matrix[igene,
+                                                                    batchIDs[icell]], sd = 0.01)
+    }
+  }
+  observed_counts <- round(2^(log2(observed_counts) + batch_factor))
+  return(list(counts = observed_counts, cell_meta = meta_cell))
+}
+for(k in 1:10){
+  ##RNA
+  ncells = 1000
+  nbatchs = 2
+  batchIDs <- sample(1:nbatchs, ncells, replace = TRUE)
+  print(k)
+  print("Simulate RNA")
+  ngenes <- 2000
+  gene_len <- sample(gene_len_pool, ngenes, replace = FALSE)
+  true_RNAcounts_res <- SimulateTrueCounts(ncells_total=ncells,
+                                           min_popsize=50,
+                                           i_minpop=1,
+                                           ngenes=ngenes,
+                                           nevf=10,
+                                           evf_type="discrete",
+                                           n_de_evf=6,
+                                           vary="s",
+                                           Sigma=0.6,
+                                           phyla=phyla,
+                                           randseed=k+1000)
+  observed_RNAcounts <- True2ObservedCounts(true_counts=true_RNAcounts_res[[1]],
+                                            meta_cell=true_RNAcounts_res[[3]],
+                                            protocol="UMI",
+                                            alpha_mean=0.00075,
+                                            alpha_sd=0.0001,
+                                            gene_len=gene_len,
+                                            depth_mean=50000,
+                                            depth_sd=3000,
+  )
+  batch_RNAcounts <- DivideBatches2(observed_RNAcounts, batchIDs, batch_effect_size = 1)
+  ## Add batch effects
+  print((sum(batch_RNAcounts$counts==0)-sum(true_RNAcounts_res$counts==0))/sum(true_RNAcounts_res$counts>0))
+  print(sum(batch_RNAcounts$counts==0)/prod(dim(batch_RNAcounts$counts)))
+  ##ADT
+  print("Simulate ADT")
+  nadts <- 100
+  gene_len <- sample(gene_len_pool, nadts, replace = FALSE)
+  #The true counts of the five populations can be simulated:
+  true_ADTcounts_res <- SimulateTrueCounts(ncells_total=ncells,
+                                           min_popsize=50,
+                                           i_minpop=1,
+                                           ngenes=nadts,
+                                           nevf=10,
+                                           evf_type="discrete",
+                                           n_de_evf=6,
+                                           vary="s",
+                                           Sigma=0.3,
+                                           phyla=phyla2,
+                                           randseed=k+1000)
+  observed_ADTcounts <- True2ObservedCounts(true_counts=true_ADTcounts_res[[1]],
+                                            meta_cell=true_ADTcounts_res[[3]],
+                                            protocol="UMI",
+                                            alpha_mean=0.045,
+                                            alpha_sd=0.01,
+                                            gene_len=gene_len,
+                                            depth_mean=50000,
+                                            depth_sd=3000,
+  )
+  ## Add batch effects
+  batch_ADTcounts <- DivideBatches2(observed_ADTcounts, batchIDs, batch_effect_size = 1)
+  print((sum(batch_ADTcounts$counts==0)-sum(true_ADTcounts_res$counts==0))/sum(true_ADTcounts_res$counts>0))
+  print(sum(batch_ADTcounts$counts==0)/prod(dim(batch_ADTcounts$counts)))
+  y1 = batch_RNAcounts$cell_meta$pop
+  y2 = batch_ADTcounts$cell_meta$pop
+  batch1 = batch_ADTcounts$cell_meta$batch
+  batch2 = batch_RNAcounts$cell_meta$batch
+  print(sum(y1==y2))
+  print(sum(batch1 == batch2))
+  counts1 <- batch_RNAcounts[[1]]
+  counts2 <- batch_ADTcounts[[1]]
+  #filter
+  rownames(counts2) <- paste("G",1:nrow(counts2),sep = "")
+  colnames(counts2) <- paste("C",1:ncol(counts2),sep = "")
+  pbmc <- CreateSeuratObject(counts = counts2, project = "P2", min.cells = 0, min.features = 0)
+  pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize")
+  pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 30)
+  counts2 <- counts2[pbmc@assays[["RNA"]]@var.features,]
+  h5file = paste("./batch/Simulation.", k, ".h5", sep="")
+  h5createFile(h5file)
+  h5write(as.matrix(counts1), h5file,"X1")
+  h5write(as.matrix(counts2), h5file,"X2")
+  h5write(y1, h5file,"Y")
+  h5write(batch1, h5file,"Batch")
+}