[40a513]: / ATAC / Functions / TCGA-data.R

Download this file

74 lines (67 with data), 3.6 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
setwd("/data/active_data/lzl/RenalTumor-20200713/Data/TCGA/KIRC/Result")
source(file = "/data/active_data/lzl/RenalTumor-20200713/code/analysis.diff.survival.TCGA.R")
#### surivial analysis in TCGA
#http://firebrowse.org/?cohort=KIRC&download_dialog=true,数据下载来源
exp.data <- read.table("/data/active_data/lzl/RenalTumor-20200713/Data/TCGA/KIRC/KIRC__RSEM_normalized__data.data.txt",
header = T, stringsAsFactors = F, sep = "\t")
exp.data <- exp.data[-1,]
rownames(exp.data) <- exp.data[,1]
exp.data.process <- exp.data[,-1]
exp.data.process <- apply(exp.data.process, 2, as.numeric)
rownames(exp.data.process) <- rownames(exp.data)
exp.data.process <- log2(exp.data.process+1)
saveRDS(exp.data.process, file = "Log2.expression.rds") #20531*606
#### raw count
exp.matrix <- read.table("/data/active_data/lzl/RenalTumor-20200713/Data/TCGA/KIRC/KIRC_RSEM_genes_data.data.txt",
header = T, stringsAsFactors = F, sep = "\t")
count.index <- seq(1,1818,by=3)+1
exp.matrix.count <- exp.matrix[,count.index]
exp.matrix.count <- exp.matrix.count[-1,]
rownames(exp.matrix.count) <- exp.matrix[2:nrow(exp.matrix),1]
#处理表达谱名字
gene.id <- gsub("\\|\\d+", "", rownames(exp.matrix.count))
index <- which(gene.id == "?")
exp.data.process.pro <- exp.data.process[-index,]
exp.matrix.count.pro <- exp.matrix.count[-index,]
gene.id <- gene.id[-index]
#去除重复--- SLC35E2
gene.id <- gene.id[-16273]
exp.data.process.pro <- exp.data.process.pro[-16273,]
exp.matrix.count.pro <- exp.matrix.count.pro[-16273,]
rownames(exp.data.process.pro) <- gene.id
rownames(exp.matrix.count.pro) <- gene.id
saveRDS(exp.data.process.pro, file = "Log2.expression.pro.rds") #20501*606
saveRDS(exp.matrix.count.pro, file = "exp.matrix.count.pro.rds") #20501*606
#### process the survival data
clin.data <- read.table("/data/active_data/lzl/RenalTumor-20200713/Data/TCGA/KIRC/KIRC_clin.txt", header = T, stringsAsFactors = F, sep = "\t")
clin.data$Sample <- gsub("-", ".", clin.data$Sample)
index <- grep("01$", clin.data$Sample)
clin.data <- clin.data[index, ]
saveRDS(clin.data, file = "clin.data.rds")
##############################1.Differential expression analysis#########################################
# Distinguish between normal and cancer samples
sample.id <- colnames(exp.data.process.pro)
sample.id <- substr(sample.id, 1, 16)
sample.type <- substr(sample.id, 14, 15)
##normal
normal.index <- which(sample.type == "11") #72
cancer.index <- which(sample.type == "01") #533
####DEseq2 analysis
library(DESeq2)
exp.matrix.count.pro2 <- as.matrix(exp.matrix.count.pro)
exp.matrix.count.pro2 <- apply(exp.matrix.count.pro2, 2, as.numeric)
exp.matrix.count.pro2 <- round(exp.matrix.count.pro2)
index <- which(rowSums(exp.matrix.count.pro2)>0)
exp.matrix.count.pro2 <- exp.matrix.count.pro2[index,] #去除不表达的基因,
rownames(exp.matrix.count.pro2) <- rownames(exp.matrix.count.pro)[index] #20221*606
exp.matrix.count.pro2 <- exp.matrix.count.pro2[,c(normal.index, cancer.index)] #605个样本
condition <- rep("Tumor", ncol(exp.matrix.count.pro2))
condition[1:length(normal.index)] <- "Normal"
coldata <- data.frame(row.names = colnames(exp.matrix.count.pro2), condition = as.factor(condition))
dds <- DESeqDataSetFromMatrix(countData = exp.matrix.count.pro2, colData = coldata, design=~condition)
dds <- DESeq(dds)
DESeq2.result <- results(dds)
saveRDS(DESeq2.result, file = "DESeq2.result.rds")
#提取标化后的counts
DESeq2.normalized_counts <- counts(dds, normalized=TRUE)
saveRDS(DESeq2.normalized_counts, file = "DESeq2.normalized_counts.rds")