|
a |
|
b/R/amaretto_initialize.R |
|
|
1 |
#' CreateRegulatorData |
|
|
2 |
#' |
|
|
3 |
#' Determine potential regulator genes. |
|
|
4 |
#' @return result |
|
|
5 |
#' @keywords internal |
|
|
6 |
CreateRegulatorData <- function(MA_matrix = MA_matrix, CNV_matrix = NULL, MET_matrix = NULL, Driver_list = NULL, PvalueThreshold = 0.001, RsquareThreshold = 0.1, method = "union") { |
|
|
7 |
if (is.null(Driver_list)) |
|
|
8 |
DriversList <- NULL |
|
|
9 |
if (is.null(CNV_matrix)) |
|
|
10 |
CNV_matrix <- matrix(0, nrow = 0, ncol = 0) |
|
|
11 |
if (nrow(CNV_matrix) > 1) { |
|
|
12 |
GeneVariances = rowVars(CNV_matrix) |
|
|
13 |
CNV_matrix = CNV_matrix[GeneVariances >= 1e-04,] |
|
|
14 |
} |
|
|
15 |
if (nrow(CNV_matrix) > 0) { |
|
|
16 |
CNV_matrix = FindTranscriptionallyPredictive_CNV(MA_matrix, CNV_matrix, PvalueThreshold = PvalueThreshold, RsquareThreshold = RsquareThreshold) |
|
|
17 |
} |
|
|
18 |
cat("\tFound", length(rownames(CNV_matrix)), "CNV driver genes.\n") |
|
|
19 |
if (is.null(MET_matrix)) |
|
|
20 |
MET_matrix <- matrix(0, nrow = 0, ncol = 0) |
|
|
21 |
if (nrow(MET_matrix) > 1) { |
|
|
22 |
METcounts = apply(MET_matrix, 1, function(x) length(unique(x))) |
|
|
23 |
MET_matrix = MET_matrix[METcounts == 2, ] |
|
|
24 |
GeneVariances = rowVars(MET_matrix) |
|
|
25 |
MET_matrix = MET_matrix[GeneVariances >= 1e-04,] |
|
|
26 |
} |
|
|
27 |
MET_drivers <- intersect(rownames(MET_matrix),rownames(MA_matrix)) |
|
|
28 |
cat("\tFound", length(MET_drivers), "MethylMix driver genes.\n") |
|
|
29 |
if (!is.null(Driver_list)) { |
|
|
30 |
DriversList <- Driver_list |
|
|
31 |
DriversList <- intersect(DriversList, rownames(MA_matrix)) |
|
|
32 |
cat("\tFound", length(DriversList), "driver genes from the input list.\n") |
|
|
33 |
} |
|
|
34 |
if (is.null(Driver_list)) |
|
|
35 |
Drivers <- unique(c(rownames(CNV_matrix), MET_drivers,DriversList)) |
|
|
36 |
dataset_drivers <- unique(c(rownames(CNV_matrix),MET_drivers)) |
|
|
37 |
if (!is.null(Driver_list) & method == "union") |
|
|
38 |
Drivers <- union(dataset_drivers, DriversList) |
|
|
39 |
if (!is.null(Driver_list) & method == "intersect") |
|
|
40 |
Drivers <- intersect(dataset_drivers, DriversList) |
|
|
41 |
cat("\tFound a total of", length(Drivers), "unique drivers with your selected method.\n") |
|
|
42 |
if (length(Drivers) == 0) { |
|
|
43 |
cat("AMARETTO doesn't find any driver genes.") |
|
|
44 |
return("No driver") |
|
|
45 |
} else { |
|
|
46 |
if (length(Drivers) == 1) { |
|
|
47 |
RegulatorData_temp <- matrix(0, 1, ncol(MA_matrix)) |
|
|
48 |
colnames(RegulatorData_temp) <- colnames(MA_matrix) |
|
|
49 |
rownames(RegulatorData_temp) <- Drivers |
|
|
50 |
RegulatorData_temp[1, ] <- MA_matrix[Drivers,] |
|
|
51 |
RegulatorData <- RegulatorData_temp |
|
|
52 |
} else { |
|
|
53 |
RegulatorData = MA_matrix[Drivers, ] |
|
|
54 |
} |
|
|
55 |
RegulatorData = t(scale(t(RegulatorData))) |
|
|
56 |
MET_aberrations <- matrix(0, ncol = 3, nrow = length(MET_drivers)) |
|
|
57 |
rownames(MET_aberrations) <- MET_drivers |
|
|
58 |
colnames(MET_aberrations)<-c("Hyper-methylated","Hypo-methylated","No_change") |
|
|
59 |
if (length(MET_drivers) > 0) { |
|
|
60 |
MET_aberrations[, "Hyper-methylated"] <- rowSums(MET_matrix[MET_drivers,] > 0)/ncol(MET_matrix) |
|
|
61 |
MET_aberrations[, "Hypo-methylated"] <- rowSums(MET_matrix[MET_drivers,] < 0)/ncol(MET_matrix) |
|
|
62 |
MET_aberrations[, "No_change"] <- rowSums(MET_matrix[MET_drivers,] == 0)/ncol(MET_matrix) |
|
|
63 |
} |
|
|
64 |
CNV_alterations <- matrix(0, ncol = 3, nrow = nrow(CNV_matrix)) |
|
|
65 |
rownames(CNV_alterations) <- rownames(CNV_matrix) |
|
|
66 |
colnames(CNV_alterations)<-c("Amplification","Deletion","No_change") |
|
|
67 |
if (nrow(CNV_alterations) > 0) { |
|
|
68 |
CNV_alterations[, "Amplification"] <- rowSums(CNV_matrix > 0)/ncol(CNV_matrix) |
|
|
69 |
CNV_alterations[, "Deletion"] <- rowSums(CNV_matrix < 0)/ncol(CNV_matrix) |
|
|
70 |
CNV_alterations[, "No_change"] <- rowSums(CNV_matrix == 0)/ncol(CNV_matrix) |
|
|
71 |
} |
|
|
72 |
driverList_alterations <- matrix(1, ncol = 1, nrow = length(DriversList)) |
|
|
73 |
rownames(driverList_alterations) <- DriversList |
|
|
74 |
Alterations <- matrix(0, nrow = length(Drivers), ncol = 3) |
|
|
75 |
colnames(Alterations) <- c("CNV", "MET", "Driver List") |
|
|
76 |
rownames(Alterations) <- Drivers |
|
|
77 |
Alterations[which(Drivers %in% rownames(CNV_matrix)), 1] <- rep(1, length(which(Drivers %in% rownames(CNV_matrix)))) |
|
|
78 |
Alterations[which(Drivers %in% rownames(MET_matrix)), 2] <- rep(1, length(which(Drivers %in% rownames(MET_matrix)))) |
|
|
79 |
Alterations[which(Drivers %in% rownames(driverList_alterations)), 3] <- rep(1, length(which(Drivers %in% rownames(driverList_alterations)))) |
|
|
80 |
Alterations <- list(MET = MET_aberrations, |
|
|
81 |
CNV = CNV_alterations, |
|
|
82 |
Driver_list = driverList_alterations, |
|
|
83 |
Summary = Alterations) |
|
|
84 |
return(list(RegulatorData = RegulatorData, Alterations = Alterations)) |
|
|
85 |
} |
|
|
86 |
} |
|
|
87 |
|
|
|
88 |
#' FindTranscriptionallyPredictive_CNV |
|
|
89 |
#' |
|
|
90 |
#' Function to identify which genes CNV significantly predict expression of that gene. |
|
|
91 |
#' @return result |
|
|
92 |
#' @keywords internal |
|
|
93 |
FindTranscriptionallyPredictive_CNV <- function(MA_matrix, |
|
|
94 |
CNV_matrix, PvalueThreshold = 0.001, RsquareThreshold = 0.1) { |
|
|
95 |
OverlapGenes = Reduce(intersect, list(rownames(MA_matrix), |
|
|
96 |
rownames(CNV_matrix))) |
|
|
97 |
OverlapSamples = Reduce(intersect, list(colnames(MA_matrix), |
|
|
98 |
colnames(CNV_matrix))) |
|
|
99 |
if (length(OverlapGenes) == 1) { |
|
|
100 |
CNV_TCGA_temp = MA_matrix_temp <- matrix(0, |
|
|
101 |
length(OverlapGenes), length(OverlapSamples)) |
|
|
102 |
rownames(CNV_TCGA_temp) = rownames(MA_matrix_temp) <- OverlapGenes |
|
|
103 |
colnames(CNV_TCGA_temp) = colnames(MA_matrix_temp) <- OverlapSamples |
|
|
104 |
CNV_TCGA_temp[1, ] <- CNV_matrix[OverlapGenes, |
|
|
105 |
OverlapSamples] |
|
|
106 |
MA_matrix_temp[1, ] <- MA_matrix[OverlapGenes, |
|
|
107 |
OverlapSamples] |
|
|
108 |
CNV_matrix = CNV_TCGA_temp |
|
|
109 |
MA_matrix = MA_matrix_temp |
|
|
110 |
} else { |
|
|
111 |
CNV_matrix = CNV_matrix[OverlapGenes, OverlapSamples] |
|
|
112 |
MA_matrix = MA_matrix[OverlapGenes, OverlapSamples] |
|
|
113 |
} |
|
|
114 |
if (length(OverlapGenes) > 0 && length(OverlapSamples) > |
|
|
115 |
0) { |
|
|
116 |
CNVdrivers = c() |
|
|
117 |
for (i in 1:length(rownames(CNV_matrix))) { |
|
|
118 |
res = lm(MA_matrix[i, ] ~ CNV_matrix[i, |
|
|
119 |
]) |
|
|
120 |
res.summary = summary(res) |
|
|
121 |
if (res$coefficients[2] > 0 & res.summary$coefficients[2, |
|
|
122 |
4] < PvalueThreshold & res.summary$r.squared > |
|
|
123 |
RsquareThreshold) { |
|
|
124 |
CNVdrivers = c(CNVdrivers, rownames(CNV_matrix)[i]) |
|
|
125 |
} |
|
|
126 |
} |
|
|
127 |
if (length(CNVdrivers) == 1) { |
|
|
128 |
CNV_matrix_temp <- matrix(0, 1, ncol(CNV_matrix)) |
|
|
129 |
colnames(CNV_matrix_temp) = colnames(CNV_matrix) |
|
|
130 |
rownames(CNV_matrix_temp) = CNVdrivers |
|
|
131 |
CNV_matrix_temp[1, ] <- CNV_matrix[CNVdrivers, |
|
|
132 |
] |
|
|
133 |
CNV_matrix = CNV_matrix_temp |
|
|
134 |
} else { |
|
|
135 |
CNV_matrix = CNV_matrix[CNVdrivers, ] |
|
|
136 |
} |
|
|
137 |
} |
|
|
138 |
return(CNV_matrix = CNV_matrix) |
|
|
139 |
} |
|
|
140 |
|
|
|
141 |
#' geneFiltering |
|
|
142 |
#' |
|
|
143 |
#' Function to filter gene expression matrix |
|
|
144 |
#' @return result |
|
|
145 |
#' @keywords internal |
|
|
146 |
geneFiltering <- function(Type, MAdata, Percentage) { |
|
|
147 |
switch(Type, Variance = { |
|
|
148 |
GeneVariances = rowVars(MAdata) |
|
|
149 |
tmpResult = sort(GeneVariances, decreasing = TRUE) |
|
|
150 |
SortedGenes = names(tmpResult) |
|
|
151 |
tmpNrGenes = round(length(rownames(MAdata)) * |
|
|
152 |
Percentage/100) |
|
|
153 |
MAdata_Filtered = MAdata[SortedGenes[1:tmpNrGenes], |
|
|
154 |
] |
|
|
155 |
}, MAD = { |
|
|
156 |
GeneVariances = rowMads(MAdata) |
|
|
157 |
names(GeneVariances) = rownames(MAdata) |
|
|
158 |
tmpResult = sort(GeneVariances, decreasing = TRUE) |
|
|
159 |
SortedGenes = names(tmpResult) |
|
|
160 |
tmpNrGenes = round(length(rownames(MAdata)) * |
|
|
161 |
Percentage/100) |
|
|
162 |
MAdata_Filtered = MAdata[SortedGenes[1:tmpNrGenes], |
|
|
163 |
] |
|
|
164 |
}) |
|
|
165 |
return(MAdata_Filtered) |
|
|
166 |
} |
|
|
167 |
|
|
|
168 |
#' read_gct |
|
|
169 |
#' |
|
|
170 |
#' Function to turn a .gct data files into a matrix format |
|
|
171 |
#' |
|
|
172 |
#' @param file_address Address of the input gct file. |
|
|
173 |
#' |
|
|
174 |
#' @importFrom utils read.delim |
|
|
175 |
#' @return result |
|
|
176 |
#' @export |
|
|
177 |
#' @examples |
|
|
178 |
#' data_matrix<-read_gct(file_address="") |
|
|
179 |
read_gct <- function(file_address){ |
|
|
180 |
if(file_address==""){ |
|
|
181 |
print("No gct file address is provided.") |
|
|
182 |
return(NULL) |
|
|
183 |
} |
|
|
184 |
else{ |
|
|
185 |
data_fr <-read.delim(file_address, skip=2, sep="\t", header=TRUE, row.names=1) |
|
|
186 |
data_fr <- as.matrix(subset(data_fr, select=-c(Description))) |
|
|
187 |
return(data_fr) |
|
|
188 |
} |
|
|
189 |
} |
|
|
190 |
|
|
|
191 |
#' printf |
|
|
192 |
#' |
|
|
193 |
#' Wrapper function for C-style formatted output. |
|
|
194 |
#' @return result |
|
|
195 |
#' @keywords internal |
|
|
196 |
printf <- function(...) { |
|
|
197 |
cat(sprintf(...)) |
|
|
198 |
} |