a b/src/Simulation.R
1
library(SymSim)
2
library(rhdf5)
3
library(Seurat)
4
5
phyla <- read.tree("tree.txt")
6
phyla2 <- read.tree("tree.txt")
7
data(gene_len_pool)
8
9
#This is a simulation script with adding batch effect
10
#First we need to modify the DivideBatches2 function in SymSim to make the same batch partition in mRNA and ADT data.
11
DivideBatches2 <- function(observed_counts_res, batchIDs, batch_effect_size=1){
12
  observed_counts <- observed_counts_res[["counts"]]
13
  meta_cell <- observed_counts_res[["cell_meta"]]
14
  ncells <- dim(observed_counts)[2]
15
  ngenes <- dim(observed_counts)[1]
16
  nbatch <- unique(batchIDs)
17
  meta_cell2 <- data.frame(batch = batchIDs, stringsAsFactors = F)
18
  meta_cell <- cbind(meta_cell, meta_cell2)
19
  mean_matrix <- matrix(0, ngenes, nbatch)
20
  gene_mean <- rnorm(ngenes, 0, 1)
21
  temp <- lapply(1:ngenes, function(igene) {
22
    return(runif(nbatch, min = gene_mean[igene] - batch_effect_size, 
23
                 max = gene_mean[igene] + batch_effect_size))
24
  })
25
  mean_matrix <- do.call(rbind, temp)
26
  batch_factor <- matrix(0, ngenes, ncells)
27
  for (igene in 1:ngenes) {
28
    for (icell in 1:ncells) {
29
      batch_factor[igene, icell] <- rnorm(n = 1, mean = mean_matrix[igene, 
30
                                                                    batchIDs[icell]], sd = 0.01)
31
    }
32
  }
33
  observed_counts <- round(2^(log2(observed_counts) + batch_factor))
34
  return(list(counts = observed_counts, cell_meta = meta_cell))
35
}
36
37
for(k in 1:10){
38
  ##RNA
39
  ncells = 1000
40
  nbatchs = 2
41
  batchIDs <- sample(1:nbatchs, ncells, replace = TRUE)
42
  print(k)
43
  print("Simulate RNA")
44
  ngenes <- 2000
45
  gene_len <- sample(gene_len_pool, ngenes, replace = FALSE)
46
  true_RNAcounts_res <- SimulateTrueCounts(ncells_total=ncells, 
47
                                           min_popsize=50, 
48
                                           i_minpop=1, 
49
                                           ngenes=ngenes, 
50
                                           nevf=10, 
51
                                           evf_type="discrete", 
52
                                           n_de_evf=6, 
53
                                           vary="s", 
54
                                           Sigma=0.6, 
55
                                           phyla=phyla,
56
                                           randseed=k+1000)
57
58
  observed_RNAcounts <- True2ObservedCounts(true_counts=true_RNAcounts_res[[1]], 
59
                                            meta_cell=true_RNAcounts_res[[3]], 
60
                                            protocol="UMI", 
61
                                            alpha_mean=0.00075, 
62
                                            alpha_sd=0.0001, 
63
                                            gene_len=gene_len, 
64
                                            depth_mean=50000, 
65
                                            depth_sd=3000,
66
  )
67
68
  batch_RNAcounts <- DivideBatches2(observed_RNAcounts, batchIDs, batch_effect_size = 1)
69
  
70
  ## Add batch effects 
71
  print((sum(batch_RNAcounts$counts==0)-sum(true_RNAcounts_res$counts==0))/sum(true_RNAcounts_res$counts>0))
72
  print(sum(batch_RNAcounts$counts==0)/prod(dim(batch_RNAcounts$counts)))
73
  
74
  ##ADT
75
  print("Simulate ADT")
76
  nadts <- 100
77
  gene_len <- sample(gene_len_pool, nadts, replace = FALSE)
78
  #The true counts of the five populations can be simulated:
79
  true_ADTcounts_res <- SimulateTrueCounts(ncells_total=ncells, 
80
                                           min_popsize=50, 
81
                                           i_minpop=1, 
82
                                           ngenes=nadts, 
83
                                           nevf=10, 
84
                                           evf_type="discrete", 
85
                                           n_de_evf=6, 
86
                                           vary="s", 
87
                                           Sigma=0.3, 
88
                                           phyla=phyla2,
89
                                           randseed=k+1000)
90
  
91
  observed_ADTcounts <- True2ObservedCounts(true_counts=true_ADTcounts_res[[1]], 
92
                                            meta_cell=true_ADTcounts_res[[3]], 
93
                                            protocol="UMI", 
94
                                            alpha_mean=0.045, 
95
                                            alpha_sd=0.01, 
96
                                            gene_len=gene_len, 
97
                                            depth_mean=50000, 
98
                                            depth_sd=3000,
99
  )
100
  
101
  ## Add batch effects 
102
  batch_ADTcounts <- DivideBatches2(observed_ADTcounts, batchIDs, batch_effect_size = 1)
103
  
104
  print((sum(batch_ADTcounts$counts==0)-sum(true_ADTcounts_res$counts==0))/sum(true_ADTcounts_res$counts>0))
105
  print(sum(batch_ADTcounts$counts==0)/prod(dim(batch_ADTcounts$counts)))
106
  
107
  y1 = batch_RNAcounts$cell_meta$pop
108
  y2 = batch_ADTcounts$cell_meta$pop
109
  batch1 = batch_ADTcounts$cell_meta$batch
110
  batch2 = batch_RNAcounts$cell_meta$batch
111
  print(sum(y1==y2))
112
  print(sum(batch1 == batch2))
113
  
114
  counts1 <- batch_RNAcounts[[1]]
115
  counts2 <- batch_ADTcounts[[1]]
116
  
117
  #filter
118
  rownames(counts2) <- paste("G",1:nrow(counts2),sep = "")
119
  colnames(counts2) <- paste("C",1:ncol(counts2),sep = "")
120
  
121
  pbmc <- CreateSeuratObject(counts = counts2, project = "P2", min.cells = 0, min.features = 0)
122
  pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize")
123
  pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 30)
124
  counts2 <- counts2[pbmc@assays[["RNA"]]@var.features,]
125
  
126
  h5file = paste("./batch/Simulation.", k, ".h5", sep="")
127
  h5createFile(h5file)
128
  h5write(as.matrix(counts1), h5file,"X1")
129
  h5write(as.matrix(counts2), h5file,"X2")
130
  h5write(y1, h5file,"Y")
131
  h5write(batch1, h5file,"Batch")
132
}