a b/ATAC/AnalysisPipeline/7.1.All.TF.target.R
1
#' @description: identify the target genes of TF
2
3
library(openxlsx)
4
library(circlize)
5
library(dplyr)
6
library(tibble)
7
library(data.table)
8
library(Signac)
9
library(Seurat)
10
library(stringr)
11
library(BSgenome.Hsapiens.UCSC.hg38)
12
library(chromVARmotifs)
13
data("human_pwms_v2")
14
library(ggpubr)
15
16
setwd("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scATAC")
17
scATAC.data <- readRDS("scATAC.data.pro.rds")
18
DefaultAssay(scATAC.data) <- "Peaks"
19
scRNA.data <- readRDS("/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA/data.merge.pro.rds")
20
DefaultAssay(scRNA.data) <- "RNA"
21
22
celltype <- "Tumor"
23
cCREs <- readRDS(paste0("6.Co-Accessible/", celltype,"_peak_gene_correlation.rds")) # 4368 cCREs
24
#gene-cCRE link --- enhancer
25
cCREs <- cCREs %>% subset(distance_bp/1000>=10 & peak2_type != 'Promoter')
26
cCREs$target_gene <- cCREs$peak1_nearestGene
27
cCREs$peak_gene <- paste0(cCREs$Peak2, '_', cCREs$peak1_nearestGene)
28
cCREs$Peak1 <- gsub("_", "-", cCREs$Peak1)
29
cCREs$Peak2 <- gsub("_", "-", cCREs$Peak2)
30
31
# motif information
32
motif.info <- data.frame(originName = names(scATAC.data@assays$Peaks@motifs@motif.names), TF = unlist(scATAC.data@assays$Peaks@motifs@motif.names))
33
rownames(motif.info) <- NULL
34
#                           originName     TF
35
#   ENSG00000008196_LINE2_TFAP2B_D_N1 TFAP2B
36
#      ENSG00000008197_LINE6_TFAP2D_D TFAP2D
37
#   ENSG00000087510_LINE7_TFAP2C_D_N3 TFAP2C
38
#     ENSG00000116819_LINE19_TFAP2E_I TFAP2E
39
#  ENSG00000137203_LINE29_TFAP2A_D_N3 TFAP2A
40
#  ENSG00000116017_LINE44_ARID3A_D_N1 ARID3A
41
42
# subset by celltype
43
scATAC.sub <- subset(scATAC.data, cellType_low == celltype)
44
scRNA.sub <- subset(scRNA.data, cellType_low == celltype)
45
46
scATAC.sub <- FindTopFeatures(scATAC.sub, min.cutoff=ncol(scATAC.sub)*0.05) # present in what % of cells, 要求95%的细胞都包含peak
47
scRNA.sub <- FindVariableFeatures(scRNA.sub, nfeatures = 3000)
48
49
# use chipseeker annotate each peak in scATAC-seq
50
peak.info <- readRDS("4.Peak/peak.annotation.simple.ChIPseeker.rds")
51
52
# TF target genes
53
# for each motif, find genes:
54
use_variable_genes <- F
55
motif_list <- list()
56
edge_df.list <- data.frame()
57
vertex_df.list <- data.frame()
58
for(i in 1:nrow(motif.info)){
59
  motif_name <- motif.info[i,2]
60
  motif_ID <- motif.info[i,1]
61
  cat("Analysis ", motif_ID, "\n")
62
63
  # get list of promoter and enhancer targets of these TFs
64
  DefaultAssay(scATAC.data) <- "Peaks"
65
  motif_accessible <- Motifs(scATAC.data)@data[,motif_ID]
66
  motif_accessible <- names(motif_accessible)[motif_accessible > 0]
67
  motif_accessible <- motif_accessible[motif_accessible %in% VariableFeatures(scATAC.sub)]
68
69
  # promoter locate in 1kb
70
  motif_accessible_promoters <- motif_accessible[motif_accessible %in% peak.info$peaks[which(peak.info$originType == "Promoter (<=1kb)")]]
71
  motif_target_genes <- peak.info$SYMBOL[match(motif_accessible_promoters, peak.info$peaks)]
72
  motif_target_genes <- as.character(motif_target_genes)
73
74
  # variable genes only?
75
  if(use_variable_genes){
76
    motif_target_genes <- motif_target_genes[motif_target_genes %in% VariableFeatures(scRNA.sub)] %>% as.character %>% unique
77
  } else{
78
    motif_target_genes<- motif_target_genes %>% as.character %>% unique
79
  }
80
81
  # enhancer target genes
82
  motif_accessible_enhancers <- motif_accessible[motif_accessible %in% cCREs$Peak2]
83
  motif_cCREs <- subset(cCREs, Peak2 %in% motif_accessible_enhancers)
84
  motif_enhancers_target_genes <- motif_cCREs$target_gene
85
86
  # variable genes only?
87
  if(use_variable_genes){
88
    motif_enhancers_target_genes <- motif_enhancers_target_genes[motif_enhancers_target_genes %in% VariableFeatures(scRNA.sub)] %>% as.character %>% unique
89
  }else{
90
    motif_enhancers_target_genes<- motif_enhancers_target_genes %>% as.character %>% unique
91
  }
92
93
  vertex_df <- data.frame(name = c(motif_name, as.character(unique(c(motif_enhancers_target_genes, motif_target_genes)))))
94
95
  # check if there are promoter targets:
96
  if(length(motif_target_genes) > 0){
97
    promoter_edge_df <- data.frame(
98
      from = motif_name,
99
      to = as.character(motif_target_genes),
100
      type = 'promoter'
101
    )
102
  } else{promoter_edge_df <- data.frame()}
103
104
  # check if there are enhancer targets:
105
  if(length(motif_enhancers_target_genes) > 0){
106
    enhancer_edge_df <- data.frame(
107
      from = motif_name,
108
      to = as.character(motif_enhancers_target_genes),
109
      type = 'enhancer'
110
    )
111
  } else{enhancer_edge_df <- data.frame()}
112
113
  #cur_edge_df <- rbind(cur_promoter_edge_df, cur_enhancer_edge_df, cur_repressors_edge_df)
114
  edge_df <- rbind(promoter_edge_df, enhancer_edge_df)
115
116
  edge_df.list <- rbind(edge_df.list, edge_df)
117
  vertex_df.list <- rbind(vertex_df.list, vertex_df)
118
}
119
120
vertex_df.list <- data.frame(name=na.omit(as.character(unique(vertex_df.list$name))))
121
vertex_df.list$name <- as.character(vertex_df.list$name)
122
edge_df.list <- na.omit(edge_df.list)
123
saveRDS(edge_df.list, file = paste0("7.TF.analysis/targets/", celltype,".All.TF.targets.rds"))