1377 lines (1268 with data), 72.3 kB
import os
from re import search
import getpass
############
## Config ##
############
host = os.uname()[1]
if search("BI2404M", host) and getpass.getuser()=="argelagr":
configfile: "config_ricard_local.yaml"
elif search("[headstone|pebble]", host) and getpass.getuser()=="argelagr":
configfile: "config_ricard_babraham.yaml"
elif search("[headstone|pebble]", host) and getpass.getuser()=="stephen":
configfile: "config_stephen_babraham.yaml"
else:
print("Computer not recognised")
exit()
# validate(config, schema="schemas/config.schema.yaml")
#############################################
## Wildcard constraints to avoid ambiguity ##
#############################################
# honestly I don't understand why do I have to do this, but otherwise I get ambiguity and strange wildcards
wildcard_constraints:
trajectory_metacells = '|'.join([re.escape(x) for x in config["run_metacells_trajectory"]["trajectories"]]),
sample_metacell = '|'.join([re.escape(x) for x in config["samples"]]),
celltypeA = '|'.join([re.escape(x) for x in config["celltypes"]]),
celltypeB = '|'.join([re.escape(x) for x in config["celltypes"]]),
group_by = '|'.join([re.escape(x) for x in config["pseudobulk_rna"]["group_by"]])
##################################
## Filter wildcard combinations ##
##################################
from itertools import product
def filter_combinator(combinator, blacklist):
def filtered_combinator(*args, **kwargs):
for wc_comb in combinator(*args, **kwargs):
if frozenset(wc_comb) not in blacklist:
yield wc_comb
return filtered_combinator
forbidden = {
frozenset({("celltypeA",i),("celltypeB",i)}) for i in config["celltypes"]
# frozenset({("celltypeA", "Epiblast"), ("celltypeB", "Epiblast")}),
# frozenset({("text", "C"), ("num", 2)})
}
filtered_product_celltypes = filter_combinator(product, forbidden)
###########
## Rules ##
###########
rule all:
input:
# Create Seurat and SingleCellExperiment objects
config["directories"]["processed_data"]+"/seurat.rds",
config["directories"]["processed_data"]+"/SingleCellExperiment.rds",
# QC
config["directories"]["results"]+"/rna/qc/sample_metadata_after_qc.txt.gz",
config["directories"]["results"]+"/rna/doublet_detection/sample_metadata_after_doublets.txt.gz",
# Plot stats
config["directories"]["results"]+"/rna/stats/ncells_per_sample.pdf",
# Mapping and cell type proportions
config["directories"]["results"]+"/rna/mapping/sample_metadata_after_mapping.txt.gz",
expand("%s/rna/mapping/mapping_mnn_{sample}.txt.gz" % config["directories"]["results"], sample=config["samples"]),
config["directories"]["results"]+"/rna/mapping/pdf/umap_mapped_allcells.pdf",
config["directories"]["results"]+"/rna/celltype_proportions/celltype_proportions_stacked_barplots.pdf",
# Pseudobulk
expand("%s/rna/pseudobulk/{group_by}/SingleCellExperiment_pseudobulk.rds" % config["directories"]["results"], group_by=config["pseudobulk_rna"]["group_by"]),
expand("%s/rna/pseudobulk/{group_by}/SingleCellExperiment_pseudobulk_with_replicates.rds" % config["directories"]["results"], group_by=config["pseudobulk_rna"]["group_by"]),
# Dimensionality reduction (all cells)
expand("%s/rna/dimensionality_reduction/sce/per_stage/{stage}/pca_features{dimred_sce_features}_pcs{dimred_sce_npcs}.txt.gz" % (config["directories"]["results"]),
dimred_sce_features = config["dimensionality_reduction_sce"]["features"],
dimred_sce_npcs = config["dimensionality_reduction_sce"]["npcs"],
stage = config["stages"]
),
expand("%s/rna/dimensionality_reduction/sce/batch_correction_{batch_variable}_remove_ExE_cells_{remove_ExE_cells}/pca_features{dimred_sce_features}_pcs{dimred_sce_npcs}.txt.gz" % (config["directories"]["results"]),
dimred_sce_features = config["dimensionality_reduction_sce"]["features"],
batch_variable = config["dimensionality_reduction_sce"]["batch_variable"],
remove_ExE_cells = config["dimensionality_reduction_sce"]["remove_ExE_cells"],
dimred_sce_npcs = config["dimensionality_reduction_sce"]["npcs"]
),
# expand("%s/rna/dimensionality_reduction/seurat/batch_correction_{batch_variable}_remove_ExE_cells_{remove_ExE_cells}/pca_features{dimred_seurat_features}_pcs{dimred_seurat_npcs}.txt.gz" % (config["directories"]["results"]),
# dimred_seurat_features = config["dimensionality_reduction_seurat"]["features"],
# batch_variable = config["dimensionality_reduction_seurat"]["batch_variable"],
# remove_ExE_cells = config["dimensionality_reduction_seurat"]["remove_ExE_cells"],
# dimred_seurat_npcs = config["dimensionality_reduction_seurat"]["npcs"]
# ),
# Dimensionality reduction (per stage)
expand("%s/rna/dimensionality_reduction/seurat/per_stage/{stage}/pca_features{dimred_seurat_features}_pcs{dimred_seurat_npcs}.txt.gz" % (config["directories"]["results"]),
dimred_seurat_features = config["dimensionality_reduction_seurat"]["features"],
dimred_seurat_npcs = config["dimensionality_reduction_seurat"]["npcs"],
stage = config["stages"]
),
# Differential expression between cell types
expand(config["directories"]["results"]+"/rna/differential/cells/celltype/{celltypeA}_vs_{celltypeB}.txt.gz", filtered_product_celltypes, celltypeA=config["celltypes"][0], celltypeB=config["celltypes"]),
expand(config["directories"]["results"]+"/rna/differential/metacells/celltype/{celltypeA}_vs_{celltypeB}.txt.gz", filtered_product_celltypes, celltypeA=config["celltypes"][0], celltypeB=config["celltypes"]),
expand(config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/{celltypeA}_vs_{celltypeB}.txt.gz", filtered_product_celltypes, celltypeA=config["celltypes"][0], celltypeB=config["celltypes"]),
# Parse differential expression results
config["directories"]["results"]+"/rna/differential/cells/celltype/parsed/diff_expr_results.txt.gz",
config["directories"]["results"]+"/rna/differential/metacells/celltype/parsed/diff_expr_results.txt.gz",
config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/diff_expr_results.txt.gz",
config["directories"]["results"]+"/rna/differential/pseudobulk/celltype_genotype/parsed/diff_expr_results.txt.gz",
# Extract TFs from differential analysis
config["directories"]["results"] + "/rna/differential/cells/celltype/parsed/diff_expr_results_tfs.txt.gz",
config["directories"]["results"] + "/rna/differential/metacells/celltype/parsed/diff_expr_results_tfs.txt.gz",
config["directories"]["results"] + "/rna/differential/pseudobulk/celltype/parsed/diff_expr_results_tfs.txt.gz",
# Differential expression between genotypes
expand(config["directories"]["results"]+"/rna/differential/pseudobulk/celltype_genotype/{diff_celltype_genotype}.txt.gz", diff_celltype_genotype=config["celltypes"]),
# Marker genes
# config["directories"]["results"]+"/rna/differential/cells/celltype/parsed/marker_genes_all.txt.gz",
# config["directories"]["results"]+"/rna/differential/metacells/celltype/parsed/marker_genes_all.txt.gz",
config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/marker_genes_all.txt.gz",
# Marker TFs
# # config["directories"]["results"]+"/rna/differential/cells/celltype/parsed/marker_tfs_all.txt.gz",
# config["directories"]["results"]+"/rna/differential/metacells/celltype/parsed/marker_tfs_all.txt.gz",
config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/marker_tfs_all.txt.gz",
# TF2gene correlations
expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_pseudobulk.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_pseudobulk.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_pseudobulk_no_ExE.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_pseudobulk_no_ExE.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_metacells.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_metacells.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
# expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_metacells_no_ExE.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
# expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_metacells_no_ExE.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
# Extract TF matrices
config["directories"]["processed_data"]+"/SingleCellExperiment_TFs.rds",
expand(config["directories"]["results"]+"/rna/pseudobulk/{group_by}/SingleCellExperiment_TFs_pseudobulk.rds", group_by=config["pseudobulk_rna"]["group_by"]),
config["directories"]["results"] + "/rna/metacells/all_cells/SingleCellExperiment_TFs_metacells.rds",
# Trajectory inference
# expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_SingleCellExperiment.rds",
# trajectory_name=config["infer_trajectories"]["trajectory_name"]),
# expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_adata.h5ad",
# trajectory_name=config["infer_trajectories"]["trajectory_name"]),
# expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_trajectory.txt.gz",
# trajectory_name=config["infer_trajectories"]["trajectory_name"]),
# expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_sample_metadata.txt.gz",
# trajectory_name=config["infer_trajectories"]["trajectory_name"]),
# expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2gene_single_cells.rds",
# trajectory_name=config["infer_trajectories"]["trajectory_name"]),
# expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2tf_single_cells.rds",
# trajectory_name=config["infer_trajectories"]["trajectory_name"])
# expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2gene_single_cells_denoised.rds",
# trajectory_name=config["infer_trajectories"]["trajectory_name"]),
# expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2tf_single_cells_denoised.rds",
# trajectory_name=config["infer_trajectories"]["trajectory_name"])
# anndata
config["directories"]["processed_data"] + "/anndata.h5ad",
# velocity
# expand("%s/{sample}/outs/cellsorted_possorted_genome_bam.bam" % config["directories"]["original_data"], sample=config["samples"][0]),
expand("%s/{sample}/velocyto/{sample}.loom" % config["directories"]["original_data"], sample=config["samples"]),
config["directories"]["processed_data"] + "/velocyto/anndata_velocyto.h5ad",
# Infer metacells
expand("%s/rna/metacells/all_cells/{sample_metacell}/cell2metacell_assignment.txt.gz" % config["directories"]["results"], sample_metacell=config["samples"]),
expand("%s/rna/metacells/all_cells/{sample_metacell}/anndata_metacells.h5ad" % config["directories"]["results"], sample_metacell=config["samples"]),
config["directories"]["results"] + "/rna/metacells/all_cells/metacells_metadata.txt.gz",
config["directories"]["results"] + "/rna/metacells/all_cells/SingleCellExperiment_metacells.rds",
config["directories"]["results"] + "/rna/metacells/all_cells/anndata_metacells.h5ad",
# Infer metacells for tajectories
expand("%s/rna/metacells/trajectories/{trajectory_metacells}/cell2metacell_assignment.txt.gz" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"]),
# expand("%s/rna/metacells/trajectories/{trajectory_metacells}/SingleCellExperiment_metacells.rds" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"]),
expand("%s/rna/metacells/trajectories/{trajectory_metacells}/metacells_metadata.txt.gz" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"]),
expand("%s/rna/metacells/trajectories/{trajectory_metacells}/anndata_metacells.h5ad" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"]),
expand("%s/rna/metacells/trajectories/{trajectory_metacells}/metacell_trajectory.txt.gz" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"])
# Trajectory-specific TF2gene coexpression using metacells
# expand("%s/rna/coexpression/trajectories/{trajectory_metacells}/correlation_matrix_tf2gene_metacells.rds" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"]),
# expand("%s/rna/coexpression/trajectories/{trajectory_metacells}/correlation_matrix_tf2tf_metacells.rds" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"])
##################################################
## Load count matrices and create Seurat object ##
##################################################
rule create_seurat:
input:
script=config["scripts"]["create_seurat"],
input_dir=config["directories"]["original_data"]
output:
seurat=config["directories"]["processed_data"]+"/seurat.rds",
metadata=config["directories"]["processed_data"]+"/metadata.txt.gz"
params:
outdir=config["directories"]["processed_data"],
samples=expand("{sample}", sample=config["samples"])
conda:
"environment.yaml"
log:
"logs/create_seurat.log"
threads:
config["slurm"]["create_seurat"]["threads"]
resources:
mem_mb = config["slurm"]["create_seurat"]["memory"]
shell:
"Rscript {input.script} --inputdir {input.input_dir} --outdir {params.outdir} --samples {params.samples} > {log}"
#####################
## Quality control ##
#####################
rule qc:
input:
metadata=rules.create_seurat.output.metadata,
script=config["scripts"]["qc"]
output:
# qc_metrics_boxplot=config["directories"]["results"]+"/rna/qc/qc_metrics_boxplot.pdf",
qc_metrics_histogram=config["directories"]["results"]+"/rna/qc/qc_metrics_histogram.pdf",
metadata=config["directories"]["results"]+"/rna/qc/sample_metadata_after_qc.txt.gz"
params:
sample = expand("{sample}", sample=config["samples"]),
min_nFeature_RNA = config["qc"]["min_nFeature_RNA"],
max_nFeature_RNA = config["qc"]["max_nFeature_RNA"],
percent_mt = config["qc"]["percent_mt"],
percent_rib = config["qc"]["percent_rib"],
outdir=config["directories"]["results"]+"/rna/qc"
conda:
"environment.yaml"
log:
"logs/qc.log"
threads:
config["slurm"]["qc"]["threads"]
resources:
mem_mb = config["slurm"]["qc"]["memory"]
shell:
"Rscript {input.script} --metadata {input.metadata} --outputdir {params.outdir} --samples {params.sample} --min_nFeature_RNA {params.min_nFeature_RNA} \
--max_nFeature_RNA {params.max_nFeature_RNA} --ribosomal_percent_RNA {params.percent_rib} \
--mitochondrial_percent_RNA {params.percent_mt} > {log}"
###################################################
## Convert Seurat object to SingleCellExperiment ##
###################################################
rule seurat_to_sce:
input:
seurat = rules.create_seurat.output.seurat,
metadata = rules.qc.output.metadata,
script = config["scripts"]["seurat_to_sce"],
output:
config["directories"]["processed_data"]+"/SingleCellExperiment.rds",
params:
sample = expand("{sample}", sample=config["samples"])
conda:
"environment.yaml"
log:
"logs/seurat_to_sce.log"
threads:
config["slurm"]["seurat_to_sce"]["threads"]
resources:
mem_mb = config["slurm"]["seurat_to_sce"]["memory"]
shell:
"Rscript {input.script} --samples {params.sample} --seurat {input.seurat} \
--metadata {input.metadata} --outfile {output} > {log}"
#######################
## Doublet detection ##
#######################
rule doublet_detection:
input:
sce = rules.seurat_to_sce.output,
metadata = rules.qc.output.metadata,
script = config["scripts"]["doublet_detection"]
output:
outfile=config["directories"]["results"]+"/rna/doublet_detection/doublets_{sample}_{doublet_score_threshold}.txt.gz"
# metadata=config["directories"]["results"]+"/rna/doublet_detection/sample_metadata_after_doublet_detection.txt.gz"
conda:
"environment.yaml"
log:
"logs/doublet_detection_{sample}_{doublet_score_threshold}.log"
threads:
config["slurm"]["doublet_detection"]["threads"]
resources:
mem_mb = config["slurm"]["doublet_detection"]["memory"]
shell:
"Rscript {input.script} --metadata {input.metadata} --sce {input.sce} --samples {wildcards.sample} \
--hybrid_score_threshold {wildcards.doublet_score_threshold} --outfile {output} > {log}"
rule parse_doublet_results:
input:
metadata = rules.qc.output.metadata,
script = config["scripts"]["parse_doublets"],
doublet_files=expand(config["directories"]["results"]+"/rna/doublet_detection/doublets_{sample}_{doublet_score_threshold}.txt.gz",
sample=config["samples"], doublet_score_threshold=config["doublet_detection"]["doublet_score_threshold"])
# doublet_files = rules.doublet_detection.output
output:
config["directories"]["results"]+"/rna/doublet_detection/sample_metadata_after_doublets.txt.gz"
conda:
"environment.yaml"
log:
"logs/parse_doublet_results.log"
threads:
config["slurm"]["parse_doublet_results"]["threads"]
resources:
mem_mb = config["slurm"]["parse_doublet_results"]["memory"]
shell:
"Rscript {input.script} --metadata {input.metadata} --doublet_files {input.doublet_files} --outfile {output} > {log}"
################
## Plot stats ##
################
rule plot_stats_per_sample:
input:
sce = rules.seurat_to_sce.output,
metadata = rules.parse_doublet_results.output,
script = config["scripts"]["plot_stats_per_sample"]
output:
config["directories"]["results"]+"/rna/stats/ncells_per_sample.pdf"
params:
outdir = config["directories"]["results"]+"/rna/stats",
samples = expand("{sample}", sample=config["samples"])
conda:
"environment.yaml"
log:
"logs/plot_stats_per_sample.log"
threads:
config["slurm"]["plot_stats_per_sample"]["threads"]
resources:
mem_mb = config["slurm"]["plot_stats_per_sample"]["memory"]
shell:
"Rscript {input.script} --metadata {input.metadata} --sce {input.sce} --samples {params.samples} \
--outdir {params.outdir} > {log}"
##########################
## Mapping to the atlas ##
##########################
rule mapping_mnn:
input:
atlas_sce = config["directories"]["atlas"]+"/processed/SingleCellExperiment.rds",
atlas_metadata = config["directories"]["atlas"]+"/sample_metadata.txt.gz",
query_sce = rules.seurat_to_sce.output,
query_metadata = rules.parse_doublet_results.output,
script = config["scripts"]["mapping_mnn"]
output:
config["directories"]["results"]+"/rna/mapping/mapping_mnn_{sample}.txt.gz"
params:
# sample = expand("{sample}", sample=config["samples"]),
atlas_stages=config["mapping_mnn"]["atlas_stages"],
npcs = config["mapping_mnn"]["npcs"],
n_neighbours = config["mapping_mnn"]["n_neighbours"]
conda:
"environment.yaml"
log:
"logs/mapping_mnn_{sample}.log"
threads:
config["slurm"]["mapping_mnn"]["threads"]
resources:
mem_mb = config["slurm"]["mapping_mnn"]["memory"]
shell:
"Rscript {input.script} --query_samples {wildcards.sample} --atlas_stages {params.atlas_stages} --query_sce {input.query_sce} \
--atlas_sce {input.atlas_sce} --atlas_metadata {input.atlas_metadata} --query_metadata {input.query_metadata} \
--npcs {params.npcs} --n_neighbours {params.n_neighbours} --outfile {output} > {log}"
rule mapping_seurat:
input:
atlas_sce = config["directories"]["atlas"]+"/processed/SingleCellExperiment.rds",
atlas_metadata = config["directories"]["atlas"]+"/sample_metadata.txt.gz",
query_sce = rules.seurat_to_sce.output,
query_metadata = rules.parse_doublet_results.output,
script = config["scripts"]["mapping_seurat"]
output:
config["directories"]["results"]+"/rna/mapping/mapping_seurat_{sample}.txt.gz"
params:
# sample = expand("{sample}", sample=config["samples"]),
atlas_stages=config["mapping_seurat"]["atlas_stages"],
npcs = config["mapping_seurat"]["npcs"],
n_neighbours = config["mapping_seurat"]["n_neighbours"]
conda:
"environment.yaml"
log:
"logs/mapping_seurat_{sample}.log"
threads:
config["slurm"]["mapping_seurat"]["threads"]
resources:
mem_mb = config["slurm"]["mapping_seurat"]["memory"]
shell:
"Rscript {input.script} --query_samples {wildcards.sample} --atlas_stages {params.atlas_stages} --query_sce {input.query_sce} \
--atlas_sce {input.atlas_sce} --atlas_metadata {input.atlas_metadata} --query_metadata {input.query_metadata} \
--npcs {params.npcs} --n_neighbours {params.n_neighbours} --outfile {output} > {log}"
rule parse_mapping_results:
input:
query_metadata = rules.parse_doublet_results.output,
# mapping_seurat = expand(config["directories"]["results"]+"/rna/mapping/mapping_seurat_{sample}.txt.gz", sample=config["samples"]),
mapping_mnn = expand(config["directories"]["results"]+"/rna/mapping/mapping_mnn_{sample}.txt.gz", sample=config["samples"]),
# mapping_seurat = rules.mapping_seurat.output,
# mapping_mnn = rules.mapping_mnn.output,
script = config["scripts"]["parse_mapping"]
output:
config["directories"]["results"]+"/rna/mapping/sample_metadata_after_mapping.txt.gz"
conda:
"environment.yaml"
log:
"logs/parse_mapping_results.log"
threads:
config["slurm"]["parse_mapping_results"]["threads"]
resources:
mem_mb = config["slurm"]["parse_mapping_results"]["memory"]
shell:
"Rscript {input.script} --metadata {input.query_metadata} --mapping_mnn {input.mapping_mnn} --outfile {output} > {log}"
# "Rscript {input.script} --metadata {input.query_metadata} --mapping_seurat {input.mapping_seurat} --mapping_mnn {input.mapping_mnn} --outfile {output} > {log}"
##############################
## Dimensionality reduction ##
##############################
rule dimensionality_reduction_sce:
input:
script = config["scripts"]["dimensionality_reduction_sce"],
sce = rules.seurat_to_sce.output,
metadata = rules.parse_mapping_results.output
output:
config["directories"]["results"]+"/rna/dimensionality_reduction/sce/batch_correction_{batch_variable}_remove_ExE_cells_{remove_ExE_cells}/pca_features{dimred_sce_features}_pcs{dimred_sce_npcs}.txt.gz"
params:
outdir = config["directories"]["results"]+"/rna/dimensionality_reduction/sce/batch_correction_{batch_variable}_remove_ExE_cells_{remove_ExE_cells}",
n_neighbors = config["dimensionality_reduction_sce"]["n_neighbors"],
min_dist = config["dimensionality_reduction_sce"]["min_dist"],
vars_to_regress = config["dimensionality_reduction_sce"]["vars_to_regress"],
colour_by = config["dimensionality_reduction_sce"]["colour_by"]
conda:
"environment.yaml"
log:
"logs/dimensionality_reduction_features{dimred_sce_features}_pcs{dimred_sce_npcs}_batch_correction_{batch_variable}_remove_ExE_cells{remove_ExE_cells}.log"
threads:
config["slurm"]["dimensionality_reduction_sce"]["threads"]
resources:
mem_mb = config["slurm"]["dimensionality_reduction_sce"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --metadata {input.metadata} --npcs {wildcards.dimred_sce_npcs} --features {wildcards.dimred_sce_features} \
--vars_to_regress {params.vars_to_regress} --remove_ExE_cells {wildcards.remove_ExE_cells} --batch_variable {wildcards.batch_variable} \
--n_neighbors {params.n_neighbors} --min_dist {params.min_dist} --colour_by {params.colour_by} --outdir {params.outdir} > {log}"
rule dimensionality_reduction_sce_per_stage:
input:
script=config["scripts"]["dimensionality_reduction_sce"],
sce=rules.seurat_to_sce.output,
metadata=rules.parse_mapping_results.output
output:
# config["directories"]["results"]+"/rna/dimensionality_reduction/umap_features{dimred_sce_features}_pcs{dimred_sce_npcs}_neigh{n_neighbors}_dist{min_dist}.txt.gz",
config["directories"]["results"]+"/rna/dimensionality_reduction/sce/per_stage/{stage}/pca_features{dimred_sce_features}_pcs{dimred_sce_npcs}.txt.gz"
params:
outdir = config["directories"]["results"]+"/rna/dimensionality_reduction/sce/per_stage/{stage}",
n_neighbors = config["dimensionality_reduction_sce"]["n_neighbors"],
min_dist = config["dimensionality_reduction_sce"]["min_dist"],
vars_to_regress = config["dimensionality_reduction_sce"]["vars_to_regress"],
colour_by = config["dimensionality_reduction_sce"]["colour_by"]
conda:
"environment.yaml"
log:
"logs/dimensionality_reduction_features{dimred_sce_features}_pcs{dimred_sce_npcs}_per_stage_{stage}.log"
threads:
config["slurm"]["dimensionality_reduction_sce"]["threads"]
resources:
mem_mb = config["slurm"]["dimensionality_reduction_sce"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --stages {wildcards.stage} --metadata {input.metadata} --npcs {wildcards.dimred_sce_npcs} --features {wildcards.dimred_sce_features} \
--vars_to_regress {params.vars_to_regress} --n_neighbors {params.n_neighbors} --min_dist {params.min_dist} --colour_by {params.colour_by} --outdir {params.outdir} > {log}"
rule dimensionality_reduction_seurat:
input:
script=config["scripts"]["dimensionality_reduction_seurat"],
seurat=rules.create_seurat.output.seurat,
metadata=rules.parse_mapping_results.output
output:
config["directories"]["results"]+"/rna/dimensionality_reduction/seurat/batch_correction_{batch_variable}_remove_ExE_cells_{remove_ExE_cells}/pca_features{dimred_seurat_features}_pcs{dimred_seurat_npcs}.txt.gz"
params:
outdir = config["directories"]["results"]+"/rna/dimensionality_reduction/seurat/batch_correction_{batch_variable}_remove_ExE_cells_{remove_ExE_cells}",
colour_by = config["dimensionality_reduction_seurat"]["colour_by"],
n_neighbors = config["dimensionality_reduction_seurat"]["n_neighbors"],
min_dist = config["dimensionality_reduction_seurat"]["min_dist"],
vars_to_regress = config["dimensionality_reduction_seurat"]["vars_to_regress"],
seed = config["dimensionality_reduction_seurat"]["seed"]
conda:
"environment.yaml"
log:
"logs/dimensionality_reduction_seurat_features{dimred_seurat_features}_pcs{dimred_seurat_npcs}_batch_correction_{batch_variable}_remove_ExE_cells{remove_ExE_cells}.log"
threads:
config["slurm"]["dimensionality_reduction_seurat"]["threads"]
resources:
mem_mb = config["slurm"]["dimensionality_reduction_seurat"]["memory"]
shell:
"Rscript {input.script} --seurat {input.seurat} --metadata {input.metadata} --npcs {wildcards.dimred_seurat_npcs} \
--features {wildcards.dimred_seurat_features} --n_neighbors {params.n_neighbors} --min_dist {params.min_dist} \
--vars_to_regress {params.vars_to_regress} --remove_ExE_cells {wildcards.remove_ExE_cells} --batch_variable {wildcards.batch_variable} \
--seed {params.seed} --colour_by {params.colour_by} --outdir {params.outdir} > {log}"
rule dimensionality_reduction_seurat_per_stage:
input:
script=config["scripts"]["dimensionality_reduction_seurat"],
seurat=rules.create_seurat.output.seurat,
metadata=rules.parse_mapping_results.output
output:
config["directories"]["results"]+"/rna/dimensionality_reduction/seurat/per_stage/{stage}/pca_features{dimred_seurat_features}_pcs{dimred_seurat_npcs}.txt.gz"
params:
outdir = config["directories"]["results"]+"/rna/dimensionality_reduction/seurat/per_stage/{stage}",
colour_by = config["dimensionality_reduction_seurat"]["colour_by"],
n_neighbors = config["dimensionality_reduction_seurat"]["n_neighbors"],
min_dist = config["dimensionality_reduction_seurat"]["min_dist"],
vars_to_regress = config["dimensionality_reduction_seurat"]["vars_to_regress"],
seed = config["dimensionality_reduction_seurat"]["seed"]
conda:
"environment.yaml"
log:
"logs/dimensionality_reduction_seurat_features{dimred_seurat_features}_pcs{dimred_seurat_npcs}_per_stage_{stage}.log"
threads:
config["slurm"]["dimensionality_reduction_seurat_per_stage"]["threads"]
resources:
mem_mb = config["slurm"]["dimensionality_reduction_seurat_per_stage"]["memory"]
shell:
"Rscript {input.script} --seurat {input.seurat} --metadata {input.metadata} --stage {wildcards.stage} --features {wildcards.dimred_seurat_features} \
--npcs {wildcards.dimred_seurat_npcs} --n_neighbors {params.n_neighbors} --min_dist {params.min_dist} \
--vars_to_regress {params.vars_to_regress} --seed {params.seed}\
--colour_by {params.colour_by} --outdir {params.outdir} > {log}"
rule plot_mapping_results:
input:
script = config["scripts"]["plot_mapping_results"],
query_metadata=rules.parse_mapping_results.output,
atlas_metadata = config["directories"]["atlas"]+"/sample_metadata.txt.gz"
output:
config["directories"]["results"]+"/rna/mapping/pdf/umap_mapped_allcells.pdf"
params:
samples = expand("{sample}", sample=config["samples"]),
outdir = config["directories"]["results"]+"/rna/mapping/pdf"
conda:
"environment.yaml"
log:
"logs/plot_mapping_results.log"
threads:
config["slurm"]["plot_mapping_results"]["threads"]
resources:
mem_mb = config["slurm"]["plot_mapping_results"]["memory"]
shell:
"Rscript {input.script} --query_metadata {input.query_metadata} --atlas_metadata {input.atlas_metadata} \
--samples {params.samples} --outdir {params.outdir} > {log}"
####################
## Create anndata ##
####################
rule convert_SingleCellExperiment_to_anndata:
input:
script = config["scripts"]["convert_SingleCellExperiment_to_anndata"],
metadata = rules.parse_mapping_results.output,
sce = rules.seurat_to_sce.output
output:
config["directories"]["processed_data"] + "/anndata.h5ad"
# params:
# python = config["resources"]["python"]
conda:
"environment.yaml"
log:
"logs/convert_SingleCellExperiment_to_anndata.log"
threads:
config["slurm"]["convert_SingleCellExperiment_to_anndata"]["threads"]
resources:
mem_mb = config["slurm"]["convert_SingleCellExperiment_to_anndata"]["memory"]
shell:
# "Rscript {input.script} --python_path {params.python} --sce {input.sce} --metadata {input.metadata} --outfile {output} > {log}"
"Rscript {input.script} --sce {input.sce} --metadata {input.metadata} --outfile {output} > {log}"
################
## Pseudobulk ##
################
rule pseudobulk_rna:
input:
sce = rules.seurat_to_sce.output,
metadata = rules.parse_mapping_results.output,
script = config["scripts"]["pseudobulk_rna"]
output:
# seurat = config["directories"]["results"]+"/rna/pseudobulk/{group_by}/Seurat_pseudobulk.rds",
sce = config["directories"]["results"]+"/rna/pseudobulk/{group_by}/SingleCellExperiment_pseudobulk.rds",
stats = config["directories"]["results"]+"/rna/pseudobulk/{group_by}/stats.txt"
params:
normalisation_method = config["pseudobulk_rna"]["normalisation_method"],
outdir = config["directories"]["results"]+"/rna/pseudobulk/{group_by}"
conda:
"environment.yaml"
log:
"logs/pseudobulk_rna_{group_by}.log"
threads:
config["slurm"]["pseudobulk_rna"]["threads"]
resources:
mem_mb = config["slurm"]["pseudobulk_rna"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --metadata {input.metadata} --group_by {wildcards.group_by} \
--normalisation_method {params.normalisation_method} --outdir {params.outdir} > {log}"
rule pseudobulk_rna_with_replicates:
input:
sce = rules.seurat_to_sce.output,
metadata = rules.parse_mapping_results.output,
script = config["scripts"]["pseudobulk_rna_with_replicates"]
output:
sce = config["directories"]["results"]+"/rna/pseudobulk/{group_by}/SingleCellExperiment_pseudobulk_with_replicates.rds",
cell2replicate = config["directories"]["results"]+"/rna/pseudobulk/{group_by}/cell2replicate.txt.gz"
params:
min_cells = config["pseudobulk_rna_with_replicates"]["min_cells"],
nrep = config["pseudobulk_rna_with_replicates"]["nrep"],
fraction_cells_per_replicate = config["pseudobulk_rna_with_replicates"]["fraction_cells_per_replicate"],
outdir = config["directories"]["results"]+"/rna/pseudobulk/{group_by}"
conda:
"environment.yaml"
log:
"logs/pseudobulk_rna_with_replicates_{group_by}.log"
threads:
config["slurm"]["pseudobulk_rna_with_replicates"]["threads"]
resources:
mem_mb = config["slurm"]["pseudobulk_rna_with_replicates"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --metadata {input.metadata} --group_by {wildcards.group_by} --nrep {params.nrep} \
--min_cells {params.min_cells} --fraction_cells_per_replicate {params.fraction_cells_per_replicate} --outdir {params.outdir} > {log}"
###############
## Metacells ##
###############
rule run_metacells:
input:
script = config["scripts"]["run_metacells"],
metadata = rules.parse_mapping_results.output,
anndata = rules.convert_SingleCellExperiment_to_anndata.output
output:
cell2metacell_assignments = config["directories"]["results"] + "/rna/metacells/all_cells/{sample_metacell}/cell2metacell_assignment.txt.gz",
anndata = config["directories"]["results"] + "/rna/metacells/all_cells/{sample_metacell}/anndata_metacells.h5ad"
params:
# python = config["resources"]["python"],
# python = "/bi/group/reik/ricard/software/miniconda3/envs/main/envs/metacells/bin/python",
percent_metacells = config["run_metacells"]["percent_metacells"],
n_pcs = config["run_metacells"]["n_pcs"],
n_features = config["run_metacells"]["n_features"],
outdir = config["directories"]["results"] + "/rna/metacells/all_cells/{sample_metacell}"
conda:
"environment.yaml"
log:
"logs/run_metacells_{sample_metacell}.log"
threads:
config["slurm"]["run_metacells"]["threads"]
resources:
mem_mb = config["slurm"]["run_metacells"]["memory"]
shell:
# "{params.python} {input.script} --anndata {input.anndata} --metadata {input.metadata} --samples {wildcards.sample_metacell} \
"python {input.script} --anndata {input.anndata} --metadata {input.metadata} --samples {wildcards.sample_metacell} \
--percent_metacells {params.percent_metacells} --n_pcs {params.n_pcs} --n_features {params.n_features} --outdir {params.outdir} > {log}"
rule run_metacells_trajectory:
input:
script = config["scripts"]["run_metacells_trajectory"],
metadata = rules.parse_mapping_results.output,
anndata = rules.convert_SingleCellExperiment_to_anndata.output
output:
cell2metacell_assignments = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}/cell2metacell_assignment.txt.gz",
trajectory = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}/metacell_trajectory.txt.gz",
anndata = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}/anndata_metacells.h5ad"
params:
# python = config["resources"]["python"],
# python = "/bi/group/reik/ricard/software/miniconda3/envs/main/envs/metacells/bin/python",
percent_metacells = config["run_metacells_trajectory"]["percent_metacells"],
n_pcs = config["run_metacells_trajectory"]["n_pcs"],
n_features = config["run_metacells_trajectory"]["n_features"],
outdir = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}"
conda:
"environment.yaml"
log:
"logs/run_metacells_trajectory_{trajectory_metacells}.log"
threads:
config["slurm"]["run_metacells_trajectory"]["threads"]
resources:
mem_mb = config["slurm"]["run_metacells_trajectory"]["memory"]
shell:
# "{params.python} {input.script} --anndata {input.anndata} --metadata {input.metadata} --samples all --trajectory {wildcards.trajectory_metacells} \
"python {input.script} --anndata {input.anndata} --metadata {input.metadata} --samples all --trajectory {wildcards.trajectory_metacells} \
--percent_metacells {params.percent_metacells} --n_pcs {params.n_pcs} --n_features {params.n_features} --outdir {params.outdir} > {log}"
rule aggregate_rna_metacells:
input:
script = config["scripts"]["aggregate_rna_metacells"],
metadata = rules.parse_mapping_results.output,
sce = rules.seurat_to_sce.output,
cell2metacell = expand(rules.run_metacells.output.cell2metacell_assignments, sample_metacell=config["samples"])
output:
metadata = config["directories"]["results"] + "/rna/metacells/all_cells/metacells_metadata.txt.gz",
sce = config["directories"]["results"] + "/rna/metacells/all_cells/SingleCellExperiment_metacells.rds",
anndata = config["directories"]["results"] + "/rna/metacells/all_cells/anndata_metacells.h5ad"
params:
outdir = config["directories"]["results"] + "/rna/metacells/all_cells",
python = config["resources"]["python"],
metacell_min_reads = config["aggregate_rna_metacells"]["metacell_min_reads"],
normalisation_method = config["aggregate_rna_metacells"]["normalisation_method"]
conda:
"environment.yaml"
log:
"logs/aggregate_rna_metacells.log"
threads:
config["slurm"]["aggregate_rna_metacells"]["threads"]
resources:
mem_mb = config["slurm"]["aggregate_rna_metacells"]["memory"]
shell:
"Rscript {input.script} --python {params.python} --sce {input.sce} --metadata {input.metadata} --cell2metacell {input.cell2metacell} \
--metacell_min_reads {params.metacell_min_reads} --normalisation_method {params.normalisation_method} \
--outdir {params.outdir} > {log}"
rule aggregate_rna_metacells_trajectory:
input:
script = config["scripts"]["aggregate_rna_metacells"],
metadata = rules.parse_mapping_results.output,
sce = rules.seurat_to_sce.output,
cell2metacell = rules.run_metacells_trajectory.output.cell2metacell_assignments
output:
metadata = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}/metacells_metadata.txt.gz",
sce = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}/SingleCellExperiment_metacells.rds"
params:
outdir = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}",
python = config["resources"]["python"],
metacell_min_reads = config["aggregate_rna_metacells"]["metacell_min_reads"],
normalisation_method = config["aggregate_rna_metacells"]["normalisation_method"]
conda:
"environment.yaml"
log:
"logs/aggregate_rna_metacells_{trajectory_metacells}.log"
threads:
config["slurm"]["aggregate_rna_metacells"]["threads"]
resources:
mem_mb = config["slurm"]["aggregate_rna_metacells"]["memory"]
shell:
"Rscript {input.script} --python {params.python} --sce {input.sce} --metadata {input.metadata} --cell2metacell {input.cell2metacell} \
--metacell_min_reads {params.metacell_min_reads} --normalisation_method {params.normalisation_method} \
--outdir {params.outdir} > {log}"
###############################
## Extract TF RNA expression ##
###############################
rule extract_TF_SingleCellExperiment_cells:
input:
sce = rules.seurat_to_sce.output,
script = config["scripts"]["extract_TF_SingleCellExperiment"],
TF_file = config["resources"]["TFs_file"]
output:
config["directories"]["processed_data"]+"/SingleCellExperiment_TFs.rds"
conda:
"environment.yaml"
log:
"logs/extract_TF_SingleCellExperiment_cells.log"
threads:
config["slurm"]["extract_TF_SingleCellExperiment_cells"]["threads"]
resources:
mem_mb = config["slurm"]["extract_TF_SingleCellExperiment_cells"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --TF_file {input.TF_file} --outfile {output} > {log}"
rule extract_TF_SingleCellExperiment_metacells:
input:
sce = rules.aggregate_rna_metacells.output.sce,
script = config["scripts"]["extract_TF_SingleCellExperiment"],
TF_file = config["resources"]["TFs_file"]
output:
config["directories"]["results"] + "/rna/metacells/all_cells/SingleCellExperiment_TFs_metacells.rds"
conda:
"environment.yaml"
log:
"logs/extract_TF_SingleCellExperiment_metacells.log"
threads:
config["slurm"]["extract_TF_SingleCellExperiment_metacells"]["threads"]
resources:
mem_mb = config["slurm"]["extract_TF_SingleCellExperiment_metacells"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --TF_file {input.TF_file} --outfile {output} > {log}"
rule extract_TF_SingleCellExperiment_pseudobulk:
input:
sce = rules.pseudobulk_rna.output.sce,
script = config["scripts"]["extract_TF_SingleCellExperiment"],
TF_file = config["resources"]["TFs_file"]
output:
config["directories"]["results"]+"/rna/pseudobulk/{group_by}/SingleCellExperiment_TFs_pseudobulk.rds"
conda:
"environment.yaml"
log:
"logs/extract_TF_SingleCellExperiment_pseudobulk_{group_by}.log"
threads:
config["slurm"]["extract_TF_SingleCellExperiment_pseudobulk"]["threads"]
resources:
mem_mb = config["slurm"]["extract_TF_SingleCellExperiment_pseudobulk"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --TF_file {input.TF_file} --outfile {output} > {log}"
rule extract_TF_SingleCellExperiment_pseudobulk_with_replicates:
input:
sce = rules.pseudobulk_rna_with_replicates.output.sce,
script = config["scripts"]["extract_TF_SingleCellExperiment"],
TF_file = config["resources"]["TFs_file"]
output:
config["directories"]["results"]+"/rna/pseudobulk/{group_by}/SingleCellExperiment_TFs_pseudobulk_with_replicates.rds"
conda:
"environment.yaml"
log:
"logs/extract_TF_SingleCellExperiment_pseudobulk_with_replicates_{group_by}.log"
threads:
config["slurm"]["extract_TF_SingleCellExperiment_pseudobulk"]["threads"]
resources:
mem_mb = config["slurm"]["extract_TF_SingleCellExperiment_pseudobulk"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --TF_file {input.TF_file} --outfile {output} > {log}"
################################
## Plot cell type proportions ##
################################
rule plot_celltype_proportions:
input:
script = config["scripts"]["plot_celltype_proportions"],
metadata=rules.parse_mapping_results.output
output:
config["directories"]["results"]+"/rna/celltype_proportions/celltype_proportions_stacked_barplots.pdf"
# expand(config["directories"]["results"]+"/rna/celltype_proportions/celltype_proportions_{stage}_horizontal_barplots.pdf", stage=config["stages"])
params:
samples = expand("{sample}", sample=config["samples"]),
celltype_label = config["plot_celltype_proportions"]["celltype_label"],
outdir = config["directories"]["results"]+"/rna/celltype_proportions"
conda:
"environment.yaml"
log:
"logs/plot_celltype_proportions.log"
threads:
config["slurm"]["plot_celltype_proportions"]["threads"]
resources:
mem_mb = config["slurm"]["plot_celltype_proportions"]["memory"]
shell:
"Rscript {input.script} --metadata {input.metadata} --celltype_label {params.celltype_label} \
--samples {params.samples} --outdir {params.outdir} > {log}"
#############################
## TF vs gene coexpression ##
#############################
# TO-DO: COMPRESS RULES
rule coexpression_TF_vs_gene_single_cells:
input:
script = config["scripts"]["coexpression_TF_vs_gene_single_cells"],
TFs_file = config["resources"]["TFs_file"],
metadata = rules.parse_mapping_results.output,
sce = rules.seurat_to_sce.output
output:
config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_single_cells.rds",
config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_single_cells.rds"
params:
outdir = config["directories"]["results"]+"/rna/coexpression"
conda:
"environment.yaml"
log:
"logs/coexpression_TF_vs_gene_{cor_test}_single_cells.log"
threads:
config["slurm"]["coexpression_TF_vs_gene_single_cells"]["threads"]
resources:
mem_mb = config["slurm"]["coexpression_TF_vs_gene_single_cells"]["memory"]
shell:
"Rscript {input.script} --metadata {input.metadata} --sce {input.sce} --cor_test {wildcards.cor_test} \
--TFs_file {input.TFs_file} --outdir {params.outdir} > {log}"
rule coexpression_TF_vs_gene_single_cells_no_ExE:
input:
script = config["scripts"]["coexpression_TF_vs_gene_single_cells"],
metadata=rules.parse_mapping_results.output,
sce=rules.seurat_to_sce.output
output:
config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_single_cells_no_ExE.rds",
config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_single_cells_no_ExE.rds"
params:
TFs_file = config["resources"]["TFs_file"],
outdir = config["directories"]["results"]+"/rna/coexpression"
conda:
"environment.yaml"
log:
"logs/coexpression_TF_vs_gene_{cor_test}_single_cells_no_ExE.log"
threads:
config["slurm"]["coexpression_TF_vs_gene_single_cells"]["threads"]
resources:
mem_mb = config["slurm"]["coexpression_TF_vs_gene_single_cells"]["memory"]
shell:
"Rscript {input.script} --metadata {input.metadata} --sce {input.sce} \
--TFs_file {params.TFs_file} --remove_ExE_cells --cor_test {wildcards.cor_test} --outdir {params.outdir} > {log}"
rule coexpression_TF_vs_gene_pseudobulk:
input:
script = config["scripts"]["coexpression_TF_vs_gene_pseudobulk"],
sce = expand(rules.pseudobulk_rna.output.sce, group_by="celltype")
output:
config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_pseudobulk.rds",
config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_pseudobulk.rds"
params:
TFs_file = config["resources"]["TFs_file"],
outdir = config["directories"]["results"]+"/rna/coexpression"
conda:
"environment.yaml"
log:
"logs/coexpression_TF_vs_gene_{cor_test}_pseudobulk.log"
threads:
config["slurm"]["coexpression_TF_vs_gene_pseudobulk"]["threads"]
resources:
mem_mb = config["slurm"]["coexpression_TF_vs_gene_pseudobulk"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --TFs_file {params.TFs_file} --cor_test {wildcards.cor_test} --outdir {params.outdir} > {log}"
rule coexpression_TF_vs_gene_pseudobulk_no_ExE:
input:
script = config["scripts"]["coexpression_TF_vs_gene_pseudobulk"],
sce = expand(rules.pseudobulk_rna.output.sce, group_by="celltype")
output:
config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_pseudobulk_no_ExE.rds",
config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_pseudobulk_no_ExE.rds"
params:
TFs_file = config["resources"]["TFs_file"],
outdir = config["directories"]["results"]+"/rna/coexpression"
conda:
"environment.yaml"
log:
"logs/coexpression_TF_vs_gene_{cor_test}_pseudobulk_no_ExE.log"
threads:
config["slurm"]["coexpression_TF_vs_gene_pseudobulk"]["threads"]
resources:
mem_mb = config["slurm"]["coexpression_TF_vs_gene_pseudobulk"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --TFs_file {params.TFs_file} --remove_ExE_cells --cor_test {wildcards.cor_test} --outdir {params.outdir} > {log}"
rule coexpression_TF_vs_gene_metacells:
input:
script = config["scripts"]["coexpression_TF_vs_gene_metacells"],
metadata = rules.aggregate_rna_metacells.output.metadata,
sce = rules.aggregate_rna_metacells.output.sce
output:
config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_metacells.rds",
config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_metacells.rds"
params:
TFs_file = config["resources"]["TFs_file"],
outdir = config["directories"]["results"]+"/rna/coexpression"
conda:
"environment.yaml"
log:
"logs/coexpression_TF_vs_gene_{cor_test}_metacells.log"
threads:
config["slurm"]["coexpression_TF_vs_gene_metacells"]["threads"]
resources:
mem_mb = config["slurm"]["coexpression_TF_vs_gene_metacells"]["memory"]
shell:
"Rscript {input.script} --metadata {input.metadata} --sce {input.sce} --TFs_file {params.TFs_file} --cor_test {wildcards.cor_test} --outdir {params.outdir} > {log}"
rule coexpression_TF_vs_gene_metacells_trajectories:
input:
script = config["scripts"]["coexpression_TF_vs_gene_metacells"],
metadata = rules.aggregate_rna_metacells_trajectory.output.metadata,
sce = rules.aggregate_rna_metacells_trajectory.output.sce
output:
config["directories"]["results"]+"/rna/coexpression/trajectories/{trajectory_metacells}/correlation_matrix_tf2gene_metacells.rds",
config["directories"]["results"]+"/rna/coexpression/trajectories/{trajectory_metacells}/correlation_matrix_tf2tf_metacells.rds"
params:
TFs_file = config["resources"]["TFs_file"],
outdir = config["directories"]["results"]+"/rna/coexpression/trajectories/{trajectory_metacells}"
conda:
"environment.yaml"
log:
"logs/coexpression_TF_vs_gene_metacells_{trajectory_metacells}.log"
threads:
config["slurm"]["coexpression_TF_vs_gene_metacells"]["threads"]
resources:
mem_mb = config["slurm"]["coexpression_TF_vs_gene_metacells"]["memory"]
shell:
"Rscript {input.script} --metadata {input.metadata} --sce {input.sce} --TFs_file {params.TFs_file} --outdir {params.outdir} > {log}"
##################
## Trajectories ##
##################
# FIX CELL_METADATA
rule infer_trajectories:
input:
script = config["scripts"]["infer_trajectories"],
# metadata = rules.parse_mapping_results.output,
metadata = config["resources"]["cell_metadata"], # this is part of the ATAC pipeline
sce = rules.seurat_to_sce.output
output:
sce = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_SingleCellExperiment.rds",
adata = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_adata.h5ad",
txt = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_trajectory.txt.gz",
metadata = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_sample_metadata.txt.gz"
params:
celltype_label = config["infer_trajectories"]["celltype_label"],
outdir = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}"
conda:
"environment.yaml"
log:
"logs/infer_trajectories_{trajectory_name}.log"
threads:
config["slurm"]["infer_trajectories"]["threads"]
resources:
mem_mb = config["slurm"]["infer_trajectories"]["memory"]
shell:
"Rscript {input.script} --metadata {input.metadata} --sce {input.sce} --trajectory_name {wildcards.trajectory_name} --celltype_label {params.celltype_label} \
--outdir {params.outdir} > {log}"
rule coexpression_TF_vs_gene_trajectories:
input:
script = config["scripts"]["coexpression_TF_vs_gene_single_cells"],
metadata = rules.infer_trajectories.output.metadata,
sce = rules.infer_trajectories.output.sce
output:
config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2gene_single_cells.rds",
config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2tf_single_cells.rds"
params:
TFs_file = config["resources"]["TFs_file"],
outdir = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}"
conda:
"environment.yaml"
log:
"logs/coexpression_TF_vs_gene_{trajectory_name}_trajectory.log"
threads:
config["slurm"]["coexpression_TF_vs_gene_single_cells"]["threads"]
resources:
mem_mb = config["slurm"]["coexpression_TF_vs_gene_single_cells"]["memory"]
shell:
"Rscript {input.script} --metadata {input.metadata} --sce {input.sce} \
--TFs_file {params.TFs_file} --outdir {params.outdir} > {log}"
rule coexpression_TF_vs_gene_trajectories_denoised:
input:
script = config["scripts"]["coexpression_TF_vs_gene_single_cells"],
metadata = rules.infer_trajectories.output.metadata,
sce = rules.infer_trajectories.output.sce
output:
config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2gene_single_cells_denoised.rds",
config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2tf_single_cells_denoised.rds"
params:
knn = 25,
TFs_file = config["resources"]["TFs_file"],
outdir = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}"
conda:
"environment.yaml"
log:
"logs/coexpression_TF_vs_gene_{trajectory_name}_trajectory.log"
threads:
config["slurm"]["coexpression_TF_vs_gene_single_cells"]["threads"]
resources:
mem_mb = config["slurm"]["coexpression_TF_vs_gene_single_cells"]["memory"]
shell:
"Rscript {input.script} --metadata {input.metadata} --sce {input.sce} \
--TFs_file {params.TFs_file} --denoise --knn {params.knn} --outdir {params.outdir} > {log}"
##############
## Velocyto ##
##############
# TO-DO: NEEDS A CONDA ENVIRONEMNT
rule run_velocyto:
input:
# rules.samtools_sort.output,
mask = config["resources"]["mm10_mask"],
genes_gtf = config["resources"]["genes_gtf"]
output:
config["directories"]["original_data"] + "/{sample}/velocyto/{sample}.loom"
params:
input_dir = config["directories"]["original_data"],
output_folder = config["directories"]["original_data"] + "/{sample}/velocyto"
conda:
"environment.yaml"
log:
"logs/run_velocyto_{sample}.log"
threads:
config["slurm"]["run_velocyto"]["threads"]
resources:
mem_mb = config["slurm"]["run_velocyto"]["memory"],
mem_mb_thread = int(config["slurm"]["run_velocyto"]["memory"] / (1.25*config["slurm"]["run_velocyto"]["threads"]))
shell:
"velocyto run -e {wildcards.sample} -@ {threads} --samtools-memory {resources.mem_mb_thread} -m {input.mask} -b {params.input_dir}/{wildcards.sample}/outs/barcodes.tsv.gz {params.input_dir}/{wildcards.sample}/outs/gex_possorted.bam {input.genes_gtf} -o {params.output_folder} "
rule create_anndata_velocyto:
input:
loom_files = expand(rules.run_velocyto.output, sample=config["samples"]),
script = config["scripts"]["create_anndata_velocyto"],
metadata = rules.parse_mapping_results.output,
anndata = rules.convert_SingleCellExperiment_to_anndata.output
output:
config["directories"]["processed_data"] + "/velocyto/anndata_velocyto.h5ad"
params:
samples = config["samples"][:1],
conda:
"environment.yaml"
log:
"logs/create_anndata_velocyto.log"
threads:
config["slurm"]["create_anndata_velocyto"]["threads"]
resources:
mem_mb = config["slurm"]["create_anndata_velocyto"]["memory"],
shell:
"python {input.script} --anndata {input.anndata} --metadata {input.metadata} --samples {params.samples} --outfile {output} > {log} "
################################################
## Differential expression between cell types ##
################################################
rule differential_celltype_cells:
input:
script = config["scripts"]["differential_celltype_cells"],
metadata = rules.parse_mapping_results.output,
sce = rules.seurat_to_sce.output
output:
config["directories"]["results"]+"/rna/differential/cells/celltype/{celltypeA}_vs_{celltypeB}.txt.gz"
params:
min_cells = config["differential_celltype_cells"]["min_cells"]
log:
"logs/differential_celltype_cells_{celltypeA}_vs_{celltypeB}.log"
threads:
config["slurm"]["differential_cells"]["threads"]
resources:
mem_mb = config["slurm"]["differential_cells"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --metadata {input.metadata} --groupA {wildcards.celltypeA} --groupB {wildcards.celltypeB} \
--group_variable celltype --min_cells {params.min_cells} --outfile {output} > {log}"
rule parse_differential_celltype_cells:
input:
expand(rules.differential_celltype_cells.output, filtered_product_celltypes, celltypeA=config["celltypes"][0], celltypeB=config["celltypes"]), # dummy wildcard assignment
script = config["scripts"]["parse_differential_celltype_cells"],
output:
results = config["directories"]["results"]+"/rna/differential/cells/celltype/parsed/diff_expr_results.txt.gz",
stats = config["directories"]["results"]+"/rna/differential/cells/celltype/parsed/diff_expr_stats.txt.gz"
params:
diff_results_dir = config["directories"]["results"]+"/rna/differential/cells/celltype",
outdir = config["directories"]["results"]+"/rna/differential/cells/celltype/parsed",
min_cells = config["differential_celltype_cells"]["min_cells"]
log:
"logs/parse_differential_celltype_cells.log"
threads:
config["slurm"]["parse_differential_celltype"]["threads"]
resources:
mem_mb = config["slurm"]["parse_differential_celltype"]["memory"]
shell:
"Rscript {input.script} --diff_results_dir {params.diff_results_dir} --min_cells {params.min_cells} --outdir {params.outdir} > {log}"
rule differential_celltype_metacells:
input:
script = config["scripts"]["differential_celltype_metacells"],
metadata = rules.aggregate_rna_metacells.output.metadata,
sce = rules.aggregate_rna_metacells.output.sce
output:
config["directories"]["results"]+"/rna/differential/metacells/celltype/{celltypeA}_vs_{celltypeB}.txt.gz"
params:
min_cells = config["differential_celltype_metacells"]["min_cells"]
log:
"logs/differential_celltype_metacells_{celltypeA}_vs_{celltypeB}.log"
threads:
config["slurm"]["differential_metacells"]["threads"]
resources:
mem_mb = config["slurm"]["differential_metacells"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --metadata {input.metadata} --groupA {wildcards.celltypeA} --groupB {wildcards.celltypeB} \
--group_variable celltype --min_cells {params.min_cells} --outfile {output} > {log}"
rule parse_differential_celltype_metacells:
input:
expand(rules.differential_celltype_metacells.output, filtered_product_celltypes, celltypeA=config["celltypes"][0], celltypeB=config["celltypes"]), # dummy wildcard assignment
script = config["scripts"]["parse_differential_celltype_metacells"]
output:
results = config["directories"]["results"]+"/rna/differential/metacells/celltype/parsed/diff_expr_results.txt.gz",
stats = config["directories"]["results"]+"/rna/differential/metacells/celltype/parsed/diff_expr_stats.txt.gz"
params:
diff_results_dir = config["directories"]["results"]+"/rna/differential/metacells/celltype",
outdir = config["directories"]["results"]+"/rna/differential/metacells/celltype/parsed",
min_cells = config["differential_celltype_metacells"]["min_cells"]
log:
"logs/parse_differential_celltype_metacells.log"
threads:
config["slurm"]["parse_differential_celltype"]["threads"]
resources:
mem_mb = config["slurm"]["parse_differential_celltype"]["memory"]
shell:
"Rscript {input.script} --diff_results_dir {params.diff_results_dir} --min_cells {params.min_cells} --outdir {params.outdir} > {log}"
rule differential_celltype_pseudobulk:
input:
script = config["scripts"]["differential_celltype_pseudobulk"],
sce = expand(rules.pseudobulk_rna_with_replicates.output.sce, group_by="celltype")
output:
config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/{celltypeA}_vs_{celltypeB}.txt.gz"
log:
"logs/differential_celltype_pseudobulk_{celltypeA}_vs_{celltypeB}.log"
threads:
config["slurm"]["differential_pseudobulk"]["threads"]
resources:
mem_mb = config["slurm"]["differential_pseudobulk"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --groupA {wildcards.celltypeA} --groupB {wildcards.celltypeB} --outfile {output} > {log}"
rule parse_differential_celltype_pseudobulk:
input:
expand(rules.differential_celltype_pseudobulk.output, filtered_product_celltypes, celltypeA=config["celltypes"][0], celltypeB=config["celltypes"]), # dummy wildcard assignment
script = config["scripts"]["parse_differential_celltype_pseudobulk"]
output:
config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/diff_expr_results.txt.gz"
params:
diff_results_dir = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype",
outdir = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed"
log:
"logs/parse_differential_celltype_pseudobulk.log"
threads:
config["slurm"]["parse_differential_celltype"]["threads"]
resources:
mem_mb = config["slurm"]["parse_differential_celltype"]["memory"]
shell:
"Rscript {input.script} --diff_results_dir {params.diff_results_dir} --outdir {params.outdir} > {log}"
###############################################
## Differential expression between genotypes ##
###############################################
rule differential_celltype_genotype_pseudobulk:
input:
script = config["scripts"]["differential_celltype_genotype_pseudobulk"],
sce = expand(rules.pseudobulk_rna_with_replicates.output.sce, group_by="celltype_genotype")
output:
config["directories"]["results"]+"/rna/differential/pseudobulk/celltype_genotype/{diff_celltype_genotype}.txt.gz"
log:
"logs/differential_celltype_genotype_pseudobulk_{diff_celltype_genotype}.log"
threads:
config["slurm"]["differential_pseudobulk"]["threads"]
resources:
mem_mb = config["slurm"]["differential_pseudobulk"]["memory"]
shell:
"Rscript {input.script} --sce {input.sce} --celltypes {wildcards.diff_celltype_genotype} --groupA WT --groupB T_KO --outfile {output} > {log}"
rule parse_differential_celltype_genotype_pseudobulk:
input:
expand(rules.differential_celltype_genotype_pseudobulk.output, diff_celltype_genotype=config["celltypes"]),
script = config["scripts"]["parse_differential_celltype_genotype_pseudobulk"]
output:
config["directories"]["results"]+"/rna/differential/pseudobulk/celltype_genotype/parsed/diff_expr_results.txt.gz"
params:
diff_results_dir = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype_genotype"
log:
"logs/parse_differential_celltype_genotype_pseudobulk.log"
threads:
config["slurm"]["parse_differential_celltype"]["threads"]
resources:
mem_mb = config["slurm"]["parse_differential_celltype"]["memory"]
shell:
"Rscript {input.script} --diff_results_dir {params.diff_results_dir} --outfile {output} > {log}"
############################################
## Extract TFs from differential analysis ##
############################################
rule differential_celltype_extract_TFs_cells:
input:
diff_results = rules.parse_differential_celltype_cells.output.results,
script = config["scripts"]["differential_extract_TFs"],
TFs = config["resources"]["TFs_file"],
output:
config["directories"]["results"] + "/rna/differential/cells/celltype/parsed/diff_expr_results_tfs.txt.gz"
log:
"logs/differential_celltype_extract_TFs_cells.log"
threads:
config["slurm"]["differential_celltype_extract_TFs"]["threads"]
resources:
mem_mb = config["slurm"]["differential_celltype_extract_TFs"]["memory"]
shell:
"Rscript {input.script} --diff_results {input.diff_results} --TFs {input.TFs} --outfile {output} > {log}"
rule differential_celltype_extract_TFs_metacells:
input:
diff_results = rules.parse_differential_celltype_metacells.output.results,
script = config["scripts"]["differential_extract_TFs"],
TFs = config["resources"]["TFs_file"],
output:
config["directories"]["results"] + "/rna/differential/metacells/celltype/parsed/diff_expr_results_tfs.txt.gz"
log:
"logs/differential_celltype_extract_TFs_metacells.log"
threads:
config["slurm"]["differential_celltype_extract_TFs"]["threads"]
resources:
mem_mb = config["slurm"]["differential_celltype_extract_TFs"]["memory"]
shell:
"Rscript {input.script} --diff_results {input.diff_results} --TFs {input.TFs} --outfile {output} > {log}"
rule differential_celltype_extract_TFs_pseudobulk:
input:
diff_results = rules.parse_differential_celltype_pseudobulk.output,
script = config["scripts"]["differential_extract_TFs"],
TFs = config["resources"]["TFs_file"],
output:
config["directories"]["results"] + "/rna/differential/pseudobulk/celltype/parsed/diff_expr_results_tfs.txt.gz"
log:
"logs/differential_celltype_extract_TFs_pseudobulk.log"
threads:
config["slurm"]["differential_celltype_extract_TFs"]["threads"]
resources:
mem_mb = config["slurm"]["differential_celltype_extract_TFs"]["memory"]
shell:
"Rscript {input.script} --diff_results {input.diff_results} --TFs {input.TFs} --outfile {output} > {log}"
##################################
## Define celltype marker genes ##
##################################
rule define_marker_genes_pseudobulk:
input:
script = config["scripts"]["define_marker_genes_pseudobulk"],
differential_results = rules.parse_differential_celltype_pseudobulk.output
output:
all_genes = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/marker_genes_all.txt.gz",
filt_genes = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/marker_genes_filtered.txt.gz"
params:
outdir = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed",
min_fold_change = config["define_marker_genes"]["min_fold_change"],
min_score = config["define_marker_genes"]["min_score"],
fdr = config["define_marker_genes"]["fdr"]
log:
"logs/define_marker_genes_pseudobulk.log"
threads:
config["slurm"]["define_marker_genes"]["threads"]
resources:
mem_mb = config["slurm"]["define_marker_genes"]["memory"]
shell:
"Rscript {input.script} --differential_results {input.differential_results} --fdr {params.fdr} --min_score {params.min_score} --min_fold_change {params.min_fold_change} --outdir {params.outdir} > {log}"
rule define_marker_tfs_pseudobulk:
input:
script = config["scripts"]["define_marker_TFs_pseudobulk"],
differential_results = rules.differential_celltype_extract_TFs_pseudobulk.output
output:
all_genes = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/marker_tfs_all.txt.gz",
filt_genes = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/marker_tfs_filtered.txt.gz"
params:
outdir = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed",
min_fold_change = config["define_marker_genes"]["min_fold_change"],
min_score = config["define_marker_genes"]["min_score"],
fdr = config["define_marker_genes"]["fdr"]
log:
"logs/define_marker_genes_pseudobulk.log"
threads:
config["slurm"]["define_marker_genes"]["threads"]
resources:
mem_mb = config["slurm"]["define_marker_genes"]["memory"]
shell:
"Rscript {input.script} --differential_results {input.differential_results} --fdr {params.fdr} --min_score {params.min_score} --min_fold_change {params.min_fold_change} --outdir {params.outdir} > {log}"