library(SymSim)
library(rhdf5)
library(Seurat)
phyla <- read.tree("tree.txt")
phyla2 <- read.tree("tree.txt")
data(gene_len_pool)
#This is a simulation script with adding batch effect
#First we need to modify the DivideBatches2 function in SymSim to make the same batch partition in mRNA and ADT data.
DivideBatches2 <- function(observed_counts_res, batchIDs, batch_effect_size=1){
observed_counts <- observed_counts_res[["counts"]]
meta_cell <- observed_counts_res[["cell_meta"]]
ncells <- dim(observed_counts)[2]
ngenes <- dim(observed_counts)[1]
nbatch <- unique(batchIDs)
meta_cell2 <- data.frame(batch = batchIDs, stringsAsFactors = F)
meta_cell <- cbind(meta_cell, meta_cell2)
mean_matrix <- matrix(0, ngenes, nbatch)
gene_mean <- rnorm(ngenes, 0, 1)
temp <- lapply(1:ngenes, function(igene) {
return(runif(nbatch, min = gene_mean[igene] - batch_effect_size,
max = gene_mean[igene] + batch_effect_size))
})
mean_matrix <- do.call(rbind, temp)
batch_factor <- matrix(0, ngenes, ncells)
for (igene in 1:ngenes) {
for (icell in 1:ncells) {
batch_factor[igene, icell] <- rnorm(n = 1, mean = mean_matrix[igene,
batchIDs[icell]], sd = 0.01)
}
}
observed_counts <- round(2^(log2(observed_counts) + batch_factor))
return(list(counts = observed_counts, cell_meta = meta_cell))
}
for(k in 1:10){
##RNA
ncells = 1000
nbatchs = 2
batchIDs <- sample(1:nbatchs, ncells, replace = TRUE)
print(k)
print("Simulate RNA")
ngenes <- 2000
gene_len <- sample(gene_len_pool, ngenes, replace = FALSE)
true_RNAcounts_res <- SimulateTrueCounts(ncells_total=ncells,
min_popsize=50,
i_minpop=1,
ngenes=ngenes,
nevf=10,
evf_type="discrete",
n_de_evf=6,
vary="s",
Sigma=0.6,
phyla=phyla,
randseed=k+1000)
observed_RNAcounts <- True2ObservedCounts(true_counts=true_RNAcounts_res[[1]],
meta_cell=true_RNAcounts_res[[3]],
protocol="UMI",
alpha_mean=0.00075,
alpha_sd=0.0001,
gene_len=gene_len,
depth_mean=50000,
depth_sd=3000,
)
batch_RNAcounts <- DivideBatches2(observed_RNAcounts, batchIDs, batch_effect_size = 1)
## Add batch effects
print((sum(batch_RNAcounts$counts==0)-sum(true_RNAcounts_res$counts==0))/sum(true_RNAcounts_res$counts>0))
print(sum(batch_RNAcounts$counts==0)/prod(dim(batch_RNAcounts$counts)))
##ADT
print("Simulate ADT")
nadts <- 100
gene_len <- sample(gene_len_pool, nadts, replace = FALSE)
#The true counts of the five populations can be simulated:
true_ADTcounts_res <- SimulateTrueCounts(ncells_total=ncells,
min_popsize=50,
i_minpop=1,
ngenes=nadts,
nevf=10,
evf_type="discrete",
n_de_evf=6,
vary="s",
Sigma=0.3,
phyla=phyla2,
randseed=k+1000)
observed_ADTcounts <- True2ObservedCounts(true_counts=true_ADTcounts_res[[1]],
meta_cell=true_ADTcounts_res[[3]],
protocol="UMI",
alpha_mean=0.045,
alpha_sd=0.01,
gene_len=gene_len,
depth_mean=50000,
depth_sd=3000,
)
## Add batch effects
batch_ADTcounts <- DivideBatches2(observed_ADTcounts, batchIDs, batch_effect_size = 1)
print((sum(batch_ADTcounts$counts==0)-sum(true_ADTcounts_res$counts==0))/sum(true_ADTcounts_res$counts>0))
print(sum(batch_ADTcounts$counts==0)/prod(dim(batch_ADTcounts$counts)))
y1 = batch_RNAcounts$cell_meta$pop
y2 = batch_ADTcounts$cell_meta$pop
batch1 = batch_ADTcounts$cell_meta$batch
batch2 = batch_RNAcounts$cell_meta$batch
print(sum(y1==y2))
print(sum(batch1 == batch2))
counts1 <- batch_RNAcounts[[1]]
counts2 <- batch_ADTcounts[[1]]
#filter
rownames(counts2) <- paste("G",1:nrow(counts2),sep = "")
colnames(counts2) <- paste("C",1:ncol(counts2),sep = "")
pbmc <- CreateSeuratObject(counts = counts2, project = "P2", min.cells = 0, min.features = 0)
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize")
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 30)
counts2 <- counts2[pbmc@assays[["RNA"]]@var.features,]
h5file = paste("./batch/Simulation.", k, ".h5", sep="")
h5createFile(h5file)
h5write(as.matrix(counts1), h5file,"X1")
h5write(as.matrix(counts2), h5file,"X2")
h5write(y1, h5file,"Y")
h5write(batch1, h5file,"Batch")
}