|
a |
|
b/ATAC/AnalysisPipeline/7.1.All.TF.target.R |
|
|
1 |
#' @description: identify the target genes of TF |
|
|
2 |
|
|
|
3 |
library(openxlsx) |
|
|
4 |
library(circlize) |
|
|
5 |
library(dplyr) |
|
|
6 |
library(tibble) |
|
|
7 |
library(data.table) |
|
|
8 |
library(Signac) |
|
|
9 |
library(Seurat) |
|
|
10 |
library(stringr) |
|
|
11 |
library(BSgenome.Hsapiens.UCSC.hg38) |
|
|
12 |
library(chromVARmotifs) |
|
|
13 |
data("human_pwms_v2") |
|
|
14 |
library(ggpubr) |
|
|
15 |
|
|
|
16 |
setwd("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scATAC") |
|
|
17 |
scATAC.data <- readRDS("scATAC.data.pro.rds") |
|
|
18 |
DefaultAssay(scATAC.data) <- "Peaks" |
|
|
19 |
scRNA.data <- readRDS("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA/data.merge.pro.rds") |
|
|
20 |
DefaultAssay(scRNA.data) <- "RNA" |
|
|
21 |
|
|
|
22 |
celltype <- "Tumor" |
|
|
23 |
cCREs <- readRDS(paste0("6.Co-Accessible/", celltype,"_peak_gene_correlation.rds")) # 4368 cCREs |
|
|
24 |
#gene-cCRE link --- enhancer |
|
|
25 |
cCREs <- cCREs %>% subset(distance_bp/1000>=10 & peak2_type != 'Promoter') |
|
|
26 |
cCREs$target_gene <- cCREs$peak1_nearestGene |
|
|
27 |
cCREs$peak_gene <- paste0(cCREs$Peak2, '_', cCREs$peak1_nearestGene) |
|
|
28 |
cCREs$Peak1 <- gsub("_", "-", cCREs$Peak1) |
|
|
29 |
cCREs$Peak2 <- gsub("_", "-", cCREs$Peak2) |
|
|
30 |
|
|
|
31 |
# motif information |
|
|
32 |
motif.info <- data.frame(originName = names(scATAC.data@assays$Peaks@motifs@motif.names), TF = unlist(scATAC.data@assays$Peaks@motifs@motif.names)) |
|
|
33 |
rownames(motif.info) <- NULL |
|
|
34 |
# originName TF |
|
|
35 |
# ENSG00000008196_LINE2_TFAP2B_D_N1 TFAP2B |
|
|
36 |
# ENSG00000008197_LINE6_TFAP2D_D TFAP2D |
|
|
37 |
# ENSG00000087510_LINE7_TFAP2C_D_N3 TFAP2C |
|
|
38 |
# ENSG00000116819_LINE19_TFAP2E_I TFAP2E |
|
|
39 |
# ENSG00000137203_LINE29_TFAP2A_D_N3 TFAP2A |
|
|
40 |
# ENSG00000116017_LINE44_ARID3A_D_N1 ARID3A |
|
|
41 |
|
|
|
42 |
# subset by celltype |
|
|
43 |
scATAC.sub <- subset(scATAC.data, cellType_low == celltype) |
|
|
44 |
scRNA.sub <- subset(scRNA.data, cellType_low == celltype) |
|
|
45 |
|
|
|
46 |
scATAC.sub <- FindTopFeatures(scATAC.sub, min.cutoff=ncol(scATAC.sub)*0.05) # present in what % of cells, 要求95%的细胞都包含peak |
|
|
47 |
scRNA.sub <- FindVariableFeatures(scRNA.sub, nfeatures = 3000) |
|
|
48 |
|
|
|
49 |
# use chipseeker annotate each peak in scATAC-seq |
|
|
50 |
peak.info <- readRDS("4.Peak/peak.annotation.simple.ChIPseeker.rds") |
|
|
51 |
|
|
|
52 |
# TF target genes |
|
|
53 |
# for each motif, find genes: |
|
|
54 |
use_variable_genes <- F |
|
|
55 |
motif_list <- list() |
|
|
56 |
edge_df.list <- data.frame() |
|
|
57 |
vertex_df.list <- data.frame() |
|
|
58 |
for(i in 1:nrow(motif.info)){ |
|
|
59 |
motif_name <- motif.info[i,2] |
|
|
60 |
motif_ID <- motif.info[i,1] |
|
|
61 |
cat("Analysis ", motif_ID, "\n") |
|
|
62 |
|
|
|
63 |
# get list of promoter and enhancer targets of these TFs |
|
|
64 |
DefaultAssay(scATAC.data) <- "Peaks" |
|
|
65 |
motif_accessible <- Motifs(scATAC.data)@data[,motif_ID] |
|
|
66 |
motif_accessible <- names(motif_accessible)[motif_accessible > 0] |
|
|
67 |
motif_accessible <- motif_accessible[motif_accessible %in% VariableFeatures(scATAC.sub)] |
|
|
68 |
|
|
|
69 |
# promoter locate in 1kb |
|
|
70 |
motif_accessible_promoters <- motif_accessible[motif_accessible %in% peak.info$peaks[which(peak.info$originType == "Promoter (<=1kb)")]] |
|
|
71 |
motif_target_genes <- peak.info$SYMBOL[match(motif_accessible_promoters, peak.info$peaks)] |
|
|
72 |
motif_target_genes <- as.character(motif_target_genes) |
|
|
73 |
|
|
|
74 |
# variable genes only? |
|
|
75 |
if(use_variable_genes){ |
|
|
76 |
motif_target_genes <- motif_target_genes[motif_target_genes %in% VariableFeatures(scRNA.sub)] %>% as.character %>% unique |
|
|
77 |
} else{ |
|
|
78 |
motif_target_genes<- motif_target_genes %>% as.character %>% unique |
|
|
79 |
} |
|
|
80 |
|
|
|
81 |
# enhancer target genes |
|
|
82 |
motif_accessible_enhancers <- motif_accessible[motif_accessible %in% cCREs$Peak2] |
|
|
83 |
motif_cCREs <- subset(cCREs, Peak2 %in% motif_accessible_enhancers) |
|
|
84 |
motif_enhancers_target_genes <- motif_cCREs$target_gene |
|
|
85 |
|
|
|
86 |
# variable genes only? |
|
|
87 |
if(use_variable_genes){ |
|
|
88 |
motif_enhancers_target_genes <- motif_enhancers_target_genes[motif_enhancers_target_genes %in% VariableFeatures(scRNA.sub)] %>% as.character %>% unique |
|
|
89 |
}else{ |
|
|
90 |
motif_enhancers_target_genes<- motif_enhancers_target_genes %>% as.character %>% unique |
|
|
91 |
} |
|
|
92 |
|
|
|
93 |
vertex_df <- data.frame(name = c(motif_name, as.character(unique(c(motif_enhancers_target_genes, motif_target_genes))))) |
|
|
94 |
|
|
|
95 |
# check if there are promoter targets: |
|
|
96 |
if(length(motif_target_genes) > 0){ |
|
|
97 |
promoter_edge_df <- data.frame( |
|
|
98 |
from = motif_name, |
|
|
99 |
to = as.character(motif_target_genes), |
|
|
100 |
type = 'promoter' |
|
|
101 |
) |
|
|
102 |
} else{promoter_edge_df <- data.frame()} |
|
|
103 |
|
|
|
104 |
# check if there are enhancer targets: |
|
|
105 |
if(length(motif_enhancers_target_genes) > 0){ |
|
|
106 |
enhancer_edge_df <- data.frame( |
|
|
107 |
from = motif_name, |
|
|
108 |
to = as.character(motif_enhancers_target_genes), |
|
|
109 |
type = 'enhancer' |
|
|
110 |
) |
|
|
111 |
} else{enhancer_edge_df <- data.frame()} |
|
|
112 |
|
|
|
113 |
#cur_edge_df <- rbind(cur_promoter_edge_df, cur_enhancer_edge_df, cur_repressors_edge_df) |
|
|
114 |
edge_df <- rbind(promoter_edge_df, enhancer_edge_df) |
|
|
115 |
|
|
|
116 |
edge_df.list <- rbind(edge_df.list, edge_df) |
|
|
117 |
vertex_df.list <- rbind(vertex_df.list, vertex_df) |
|
|
118 |
} |
|
|
119 |
|
|
|
120 |
vertex_df.list <- data.frame(name=na.omit(as.character(unique(vertex_df.list$name)))) |
|
|
121 |
vertex_df.list$name <- as.character(vertex_df.list$name) |
|
|
122 |
edge_df.list <- na.omit(edge_df.list) |
|
|
123 |
saveRDS(edge_df.list, file = paste0("7.TF.analysis/targets/", celltype,".All.TF.targets.rds")) |