[dbb3ea]: / settings.R

Download this file

318 lines (284 with data), 15.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
suppressPackageStartupMessages(library(SingleCellExperiment))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(argparse))
#########
## I/O ##
#########
io <- list()
if (grepl("ricard",Sys.info()['nodename'])) {
io$basedir <- "/Users/ricard/data/gastrulation_multiome_10x"
io$atlas.basedir <- "/Users/ricard/data/gastrulation10x"
io$gene_metadata <- "/Users/ricard/data/ensembl/mouse/v87/BioMart/all_genes/Mmusculus_genes_BioMart.87.txt"
} else if (Sys.info()[['nodename']]=="BI2404M") {
io$basedir <- "/Users/argelagr/data/gastrulation_multiome_10x/test"
io$atlas.basedir <- "/Users/argelagr/data/gastrulation10x"
io$gene_metadata <- "/Users/argelagr/data/ensembl/mouse/v87/BioMart/mRNA/Mmusculus_genes_BioMart.87.txt"
io$TFs_file <- "/Users/argelagr/data/mm10_regulation/TFs/TFs.txt"
} else if (grepl("pebble|headstone", Sys.info()['nodename'])) {
if (grepl("Clark", Sys.info()['effective_user'])) {
io$basedir <- "/bi/scratch/Stephen_Clark/multiome/resilio"
io$rawdata <- "/bi/scratch/Stephen_Clark/multiome/raw"
io$gene_metadata <- "/bi/scratch/Stephen_Clark/annotations/Mmusculus_genes_BioMart.87.txt"
} else if (grepl("argelag", Sys.info()['effective_user'])) {
io$basedir <-'/bi/group/reik/ricard/data/gastrulation_multiome_10x/test'
io$archR.directory <- file.path(io$basedir,"processed/atac/archR")
io$gene_metadata <- "/bi/group/reik/ricard/data/ensembl/mouse/v87/BioMart/all_genes/Mmusculus_genes_BioMart.87.txt"
io$atlas.basedir <- "/bi/group/reik/ricard/data/pijuansala2019_gastrulation10x"
io$TFs_file <- "/bi/group/reik/ricard/data/mm10_regulation/TFs/TFs.txt"
}
} else if (grepl("bi2228m", Sys.info()['nodename'])) {
io$basedir <- "/Users/clarks/data/multiome/"
io$rawdata <- "/Users/clarks/data/multiome_raw_sub/"
} else if(grepl('LAPTOP',Sys.info()['nodename'])){
io$basedir <- 'D:/OneDrive - zju.edu.cn/research/bioin/lab/Ricard/data/gastrulation_multiome_10x'
} else if(grepl('Workstation',Sys.info()['nodename'])){
io$basedir <-'/home/lijingyu/gastrulation/data/gastrulation_multiome_10x'
io$gene_metadata <- "/home/lijingyu/gastrulation/data/gastrulation_multiome_10x/Mmusculus_genes_BioMart.87.txt"
io$atlas.basedir <- "/home/lijingyu/gastrulation/data/gastrulation10x"
} else {
stop("Computer not recognised")
}
io$metadata <- file.path(io$basedir,"sample_metadata.txt.gz")
# TFs
# io$TFs <- file.path(io$basedir,"results/TFs.txt")
# RNA
io$rna.anndata <- file.path(io$basedir,"processed/rna/anndata.h5ad")
io$rna.seurat <- file.path(io$basedir,"processed/rna/seurat.rds")
io$rna.sce <- file.path(io$basedir,"processed/rna/SingleCellExperiment.rds")
io$rna.tfs.sce <- file.path(io$basedir,"processed/rna/SingleCellExperiment_TFs.rds")
io$rna.pseudobulk.sce <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_pseudobulk.rds")
io$rna.tfs.pseudobulk.sce <- file.path(io$basedir,"results/rna/pseudobulk/celltype/SingleCellExperiment_TFs_pseudobulk.rds")
io$rna.metacells.sce <- file.path(io$basedir,"results/rna/metacells/all_cells/SingleCellExperiment_metacells.rds")
# Differential RNA expression and marker genes
# io$rna.differential <- file.path(io$basedir,"results/rna/differential")
io$rna.celltype.marker_genes.all <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype/parsed/marker_genes_all.txt.gz")
io$rna.celltype.marker_genes.filt <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype/parsed/marker_genes_filtered.txt.gz")
io$rna.celltype.marker_tfs.all <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype/parsed/marker_tfs_all.txt.gz")
io$rna.celltype.marker_tfs.filt <- file.path(io$basedir,"results/rna/differential/pseudobulk/celltype/parsed/marker_tfs_filtered.txt.gz")
# RNA atlas (Pijuan-Sala2019)
io$rna.atlas.metadata <- file.path(io$atlas.basedir,"sample_metadata.txt.gz")
io$rna.atlas.sce <- file.path(io$atlas.basedir,"processed/SingleCellExperiment.rds")
io$rna.atlas.marker_genes.up <- file.path(io$atlas.basedir,"results/differential/celltypes/genes/all_stages/marker_genes/marker_genes_upregulated_filtered.txt.gz")
io$rna.atlas.marker_genes.all <- file.path(io$atlas.basedir,"results/differential/celltypes/genes/all_stages/marker_genes/marker_genes_upregulated_all.txt.gz")
io$rna.atlas.marker_TFs.up <- file.path(io$atlas.basedir,"results/differential/celltypes/TFs/TF_markers/marker_TFs_upregulated_filtered.txt.gz")
io$rna.atlas.marker_TFs.all <- file.path(io$atlas.basedir,"results/differential/celltypes/TFs/TF_markers/marker_TFs_upregulated_all.txt.gz")
io$rna.atlas.differential <- file.path(io$atlas.basedir,"results/differential")
# io$rna.atlas.average_expression_per_celltype <- file.path(io$atlas.basedir,"results/marker_genes/all_stages/avg_expr_per_celltype_and_gene.txt.gz")
io$rna.atlas.sce.pseudobulk <- file.path(io$atlas.basedir,"results/pseudobulk/SingleCellExperiment_pseudobulk.rds")
io$rna.atlas.celltype_proportions <- file.path(io$atlas.basedir,"results/celltype_proportions/celltype_proportions.txt.gz")
# ATAC
# io$signac.directory <- file.path(io$basedir,"processed/atac/signac")
# io$signac <- file.path(io$signac.directory,"/signac.rds")
# io$cistopic.model <- file.path(io$basedir,"results/atac/cistopic/cistopic_warpLDA_bestModel.rds")
io$archR.directory <- file.path(io$basedir,"processed/atac/archR")
io$motifmatcher_scores.se <- file.path(io$archR.directory,"Annotations/CISBP-Scores.rds")
io$motifmatcher_positions.se <-file.path(io$archR.directory,"Annotations/CISBP-Positions.rds")
io$archR.projectMetadata <- file.path(io$archR.directory,"projectMetadata.rds")
io$archR.peakSet.granges <- file.path(io$archR.directory,"PeakSet.rds")
io$archR.peakSet.stats <- file.path(io$basedir,"results/atac/archR/feature_stats/PeakMatrix/PeakMatrix_celltype_stats.txt.gz")
io$archR.bgdPeaks <- file.path(io$archR.directory,"Background-Peaks.rds")
io$archR.peakSet.bed <- file.path(io$archR.directory,"PeakCalls/peaks_archR_macs2.bed.gz")
io$archR.peakMatrix.cells <- file.path(io$archR.directory,"Matrices/PeakMatrix_summarized_experiment.rds")
io$archR.peakMatrix.metacells <- file.path(io$basedir,"results/atac/archR/metacells/all_cells/PeakMatrix/PeakMatrix_summarized_experiment_metacells.rds")
io$archR.peakMatrix.pseudobulk <- file.path(io$basedir,"results/atac/archR/pseudobulk/celltype/PeakMatrix/pseudobulk_PeakMatrix_summarized_experiment.rds")
io$archR.peak.metadata <- file.path(io$archR.directory,"PeakCalls/peak_metadata.tsv.gz")
io$archR.peak.stats <- file.path(io$basedir,"results/atac/archR/feature_stats/PeakMatrix_celltype.mapped_stats.txt.gz")
io$archR.peak2gene.all <- file.path(io$basedir,"results/atac/archR/peak_calling/peaks2genes/peaks2genes_all.txt.gz")
io$archR.peak2gene.nearest <- file.path(io$basedir,"results/atac/archR/peak_calling/peaks2genes/peaks2genes_nearest.txt.gz")
io$archR.GeneScoreMatrix.cells <- file.path(io$basedir,"processed/atac/archR/Matrices/GeneScoreMatrix_TSS_summarized_experiment.rds")
io$archR.GeneScoreMatrix.pseudobulk <- file.path(io$basedir,"results/atac/archR/pseudobulk/celltype/GeneScoreMatrix_TSS/pseudobulk_GeneScoreMatrix_TSS_summarized_experiment.rds")
io$archR.chromvar.cells <- file.path(io$basedir,"results/atac/archR/chromvar/cells/chromVAR_deviations_CISBP_archr.rds")
io$archR.chromvar.pseudobulk <- file.path(io$basedir,"results/atac/archR/chromvar/pseudobulk/celltype/chromVAR_deviations_CISBP_pseudobulk_archr.rds")
io$archR.feature_stats <- file.path(io$basedir,"results/atac/archR/chromvar_chip/pseudobulk/chromVAR_chip_CISBP_archr.rds")
io$archR.motif2gene <- file.path(io$basedir,"processed/atac/archR/Annotations/CISBP_motif2gene.txt.gz")
# io$archR.chromvar_chip.cells <- file.path(io$basedir,"results/atac/archR/chromvar/cells/celltype/chromVAR_deviations_CISBP_pseudobulk_archr.rds")
# io$archR.pseudobulk.genes.se <- file.path(io$basedir,"processed/atac/archR/pseudobulk/celltype.predicted/pseudobulk_GeneScoreMatrix_summarized_experiment.rds")
# Differential ATAC and marker peaks
io$atac.markers_peaks.pseudobulk.all <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype/PeakMatrix/parsed/markers_all.txt.gz")
io$atac.markers_peaks.pseudobulk.filt <- file.path(io$basedir,"results/atac/archR/differential/pseudobulk/celltype/PeakMatrix/parsed/markers_filt.txt.gz")
# io$archR.markers_chromvar <- file.path(io$basedir,"results/atac/archR/chromvar/differential/markers/marker_TFs.txt.gz")
# ATAC public data (Pijuan-Sala2020)
io$pijuansala.basedir <- file.path(io$basedir,"public_datasets/Pijuan-Sala_2020")
io$pijuansala.archR.directory <- file.path(io$pijuansala.basedir,"data/processed/archR")
# paga
io$paga.connectivity <- file.path(io$atlas.basedir,"results/paga/paga_connectivity.csv")
io$paga.coordinates <- file.path(io$atlas.basedir,"results/paga/paga_coordinates.csv")
# Virtual ChIP-seq
# io$tf2peak_cor.se <- file.path(io$basedir,"results/rna_atac/rna_vs_acc/pseudobulk/TFexpr_vs_peakAcc/cor_TFexpr_vs_peakAcc_SummarizedExperiment.rds")
# io$tf.activities <- file.path(io$atlas.basedir,"results/rna_atac/rna_vs_chromvar/TF_activities/tf_activities.txt.gz")
io$virtual_chip.dir <- file.path(io$basedir,"results/rna_atac/virtual_chipseq")
io$virtual_chip.mtx <- file.path(io$basedir,"results/rna_atac/virtual_chipseq/virtual_chip_mtx.rds")
io$archR.chromvar_chip.pseudobulk <- file.path(io$basedir,"results/atac/archR/chromvar_chip/pseudobulk/chromVAR_chip_CISBP_archr.rds")
# Coexpression
# io$tf2tf_cor.mtx <- file.path(io$basedir,"results/rna/coexpression/correlation_matrix_tf2tf.rds")
# io$tf2gene_cor.mtx <- file.path(io$basedir,"results/rna/coexpression/correlation_matrix_tf2gene.rds")
# Dimensionality reduction
io$rna.dimred.pca <- file.path(io$basedir,"results/rna/dimensionality_reduction/sce/batch_correction_sample_remove_ExE_cells_False/pca_features2500_pcs50.txt.gz")
io$atac.dimred.lsi <- file.path(io$basedir,"results/atac/archR/dimensionality_reduction/cells/PeakMatrix/remove_ExE_cells_False/batch_correction_None/lsi_nfeatures25000_ndims50.txt.gz")
# io$rna.dimred.umap <- file.path(io$basedir,"results/rna/dimensionality_reduction/all_cells/E7.5_rep1-E7.5_rep2-E8.0_rep1-E8.0_rep2-E8.5_rep1-E8.5_rep2_umap_features2500_pcs30_neigh25_dist0.3.txt.gz")
# io$atac.dimred.umap <- file.path(io$basedir,"results/atac/archR/dimensionality_reduction/PeakMatrix/all_cells/E7.5_rep1-E7.5_rep2-E8.0_rep1-E8.0_rep2-E8.5_rep1-E8.5_rep2_umap_nfeatures50000_ndims50_neigh30_dist0.45.txt.gz")
#############
## Options ##
#############
opts <- list()
opts$stages <- c(
"E7.5",
"E7.75",
"E8.0",
"E8.5",
"E8.75"
)
opts$samples <- c(
"E7.5_rep1",
"E7.5_rep2",
"E7.75_rep1",
"E8.0_rep1",
"E8.0_rep2",
"E8.5_rep1",
"E8.5_rep2",
"E8.5_CRISPR_T_WT",
"E8.5_CRISPR_T_KO",
"E8.75_rep1",
"E8.75_rep2"
)
# opts$rename.samples <- c(
# "E8.75_rep1" = "e8_75_1_L002",
# "E8.75_rep2" = "e8_75_2_L002",
# "E8.5_rep1" = "multiome1",
# "E8.5_rep2" = "multiome2",
# "E8.0_rep1" = "E8_0_rep1_multiome",
# "E8.0_rep2" = "E8_0_rep2_multiome",
# "E7.5_rep1" = "rep1_L001_multiome",
# "E7.5_rep2" = "rep2_L002_multiome"
# # E7.75_rep1
# # E8.5_CRISPR_T_WT
# # E8.5_CRISPR_T_KO
# )
opts$sample2stage <- c(
"E7.5_rep1" = "E7.5",
"E7.5_rep2" = "E7.5",
"E7.75_rep1" = "E7.75",
"E8.0_rep1" = "E8.0",
"E8.0_rep2" = "E8.0",
"E8.5_rep1" = "E8.5",
"E8.5_rep2" = "E8.5",
"E8.75_rep1" = "E8.75",
"E8.75_rep2" = "E8.75",
"E8.5_CRISPR_T_KO" = "E8.5",
"E8.5_CRISPR_T_WT" = "E8.5"
)
opts$celltypes <- c(
"Epiblast",
"Primitive_Streak",
"Caudal_epiblast",
"PGC",
"Anterior_Primitive_Streak",
"Notochord",
"Def._endoderm",
"Gut",
"Nascent_mesoderm",
"Mixed_mesoderm",
"Intermediate_mesoderm",
"Caudal_Mesoderm",
"Paraxial_mesoderm",
"Somitic_mesoderm",
"Pharyngeal_mesoderm",
"Cardiomyocytes",
"Allantois",
"ExE_mesoderm",
"Mesenchyme",
"Haematoendothelial_progenitors",
"Endothelium",
"Blood_progenitors_1",
"Blood_progenitors_2",
"Erythroid1",
"Erythroid2",
"Erythroid3",
"NMP",
"Rostral_neurectoderm",
"Caudal_neurectoderm",
"Neural_crest",
"Forebrain_Midbrain_Hindbrain",
"Spinal_cord",
"Surface_ectoderm",
"Visceral_endoderm",
"ExE_endoderm",
"ExE_ectoderm",
"Parietal_endoderm"
)
opts$stage.colors = c(
"E7.5" = "#edf8b1",
"E7.75" = "#c7e9b4",
"E8.0" = "#7fcdbb",
"E8.25" = "#41b6c4",
"E8.5" = "#1d91c0",
"E8.75" = "#225ea8"
)
# opts$stage.colors <- viridis::viridis(n=length(opts$stages))
# names(opts$stage.colors) <- rev(opts$stages)
opts$celltype.colors = c(
"Epiblast" = "#635547",
"Primitive_Streak" = "#DABE99",
"Caudal_epiblast" = "#9e6762",
"PGC" = "#FACB12",
"Anterior_Primitive_Streak" = "#c19f70",
"Notochord" = "#0F4A9C",
"Def._endoderm" = "#F397C0",
"Gut" = "#EF5A9D",
"Nascent_mesoderm" = "#C594BF",
"Mixed_mesoderm" = "#DFCDE4",
"Intermediate_mesoderm" = "#139992",
"Caudal_Mesoderm" = "#3F84AA",
"Paraxial_mesoderm" = "#8DB5CE",
"Somitic_mesoderm" = "#005579",
"Pharyngeal_mesoderm" = "#C9EBFB",
"Cardiomyocytes" = "#B51D8D",
"Allantois" = "#532C8A",
"ExE_mesoderm" = "#8870ad",
"Mesenchyme" = "#cc7818",
"Haematoendothelial_progenitors" = "#FBBE92",
"Endothelium" = "#ff891c",
"Blood_progenitors" = "#c9a997",
"Blood_progenitors_1" = "#f9decf",
"Blood_progenitors_2" = "#c9a997",
"Erythroid" = "#EF4E22",
"Erythroid1" = "#C72228",
"Erythroid2" = "#f79083",
"Erythroid3" = "#EF4E22",
"NMP" = "#8EC792",
"Neurectoderm" = "#65A83E",
"Rostral_neurectoderm" = "#65A83E",
"Caudal_neurectoderm" = "#354E23",
"Neural_crest" = "#C3C388",
"Forebrain_Midbrain_Hindbrain" = "#647a4f",
"Spinal_cord" = "#CDE088",
"Surface_ectoderm" = "#f7f79e",
"Visceral_endoderm" = "#F6BFCB",
"ExE_endoderm" = "#7F6874",
"ExE_ectoderm" = "#989898",
"Parietal_endoderm" = "#1A1A1A"
)
opts$chr <- paste0("chr",c(1:19,"X","Y"))
###################
## Load metadata ##
###################
# sample_metadata <- fread(io$metadata)
# .[pass_QC==T] %>%
# .[batch%in%opts$batches] %>%
# .[,celltype.mapped:=stringr::str_replace_all(celltype.mapped," ","_")] %>%
# .[,celltype.mapped:=stringr::str_replace_all(celltype.mapped,"/","_")] %>%
# .[,celltype.mapped:=factor(celltype.mapped, levels=names(opts$celltype.colors))]
###################
## Edit metadata ##
###################
# io$metadata <- file.path(io$basedir,"results/atac/archR/qc/sample_metadata_after_qc.txt.gz")
# io$metadata <- file.path(io$basedir,"results/rna/mapping/sample_metadata_after_mapping.txt.gz")
# sample_metadata <- fread(io$metadata)
# sample_metadata[pass_rnaQC==FALSE & nFrags_atac<=1e4,pass_atacQC:=FALSE]
# sample_metadata[sample=="E7.75_rep1" & ribosomal_percent_RNA>=8,pass_rnaQC:=FALSE]
# sample_metadata[sample=="E7.75_rep1" & ribosomal_percent_RNA>=4.5 & nFeature_RNA<=3500,pass_rnaQC:=FALSE]
# fwrite(sample_metadata, io$metadata, sep="\t", quote=F, na="NA")