#' @rdname cleanMetadata
#' @details
#' GSE10846:\cr
#' The cleanup of GSE10846 (LLMPP) adds two batches, one for each of
#' the CHOP and R-CHOP cohorts.
#' @export
cleanMetadata.GSE10846 <- function(meta_data) {
  # Clean the GEO sample metadata of GSE10846 (LLMPP).
  #
  # Args:
  #   meta_data: The raw metadata table. It is first passed through
  #     cleanMetadata.data.frame(); the result must expose the columns
  #     geo_accession, source_name_ch1 and characteristics_ch1 ...
  #     characteristics_ch1.13 read below.
  #
  # Returns:
  #   A table with one row per sample (row names "<GSM>.CEL") holding the
  #   parsed clinical variables, the IPI score and its groupings, the
  #   survival objects OS and OS5, the WrightClass columns, and the Batch,
  #   CEL and GSM columns. Its class attribute is copied from the generically
  #   cleaned `meta_data`.
  message("Cleaning GSE10846 (LLMPP)!")

  # Generic clean
  suppressMessages(meta_data <- cleanMetadata.data.frame(meta_data))
  stopifnot(requireNamespace("survival"))

  # Only the first 414 rows of the metadata are used.
  # NOTE(review): presumably the remaining rows are samples without clinical
  # annotation -- confirm against the GEO series.
  n_samples <- 414L

  # Helper: IPI score (presumably the International Prognostic Index) from
  # its five risk factors. Each factor contributes 1 when above its cut-off.
  # Returns
  #   ipi    - sum of the five indicators, NA if any factor is missing,
  #   na.1   - TRUE when exactly one factor is missing,
  #   ipi.na - sum of the indicators that are available.
  IPI <- function(age, ECOG, stage, No.Extra.Nodal, LDH) {
    risk <- data.frame(
      a = ifelse(age > 60, 1, 0),
      b = ifelse(ECOG > 1, 1, 0),
      c = ifelse(No.Extra.Nodal >= 2, 1, 0),
      d = ifelse(stage > 2, 1, 0),
      e = ifelse(LDH > 1, 1, 0)
    )
    list(
      ipi    = rowSums(risk),
      na.1   = rowSums(is.na(risk)) == 1,
      ipi.na = rowSums(risk, na.rm = TRUE)
    )
  }

  # Helper: drop a "Label: " prefix (used as a regular expression) from x.
  strip_label <- function(label, x) gsub(label, "", x)

  metadata <- apply(meta_data, 2, as.character)
  if (!is.matrix(metadata) || nrow(metadata) < n_samples) {
    stop("GSE10846 metadata must contain at least ", n_samples, " samples.",
         call. = FALSE)
  }
  metadata <- as.data.frame(metadata[seq_len(n_samples), ],
                            stringsAsFactors = FALSE)

  # Parse the "Label: value" strings of the GEO characteristics columns
  GEO.ID <- metadata$geo_accession
  id     <- strip_label("Individual: ", metadata$source_name_ch1)
  gender <- strip_label("Gender: ", metadata$characteristics_ch1)
  age    <- strip_label("Age: ", metadata$characteristics_ch1.1)
  tissue <- strip_label("Tissue: ", metadata$characteristics_ch1.2)
  disease.state <-
    strip_label("Disease state: ", metadata$characteristics_ch1.3)
  Submitting.diagnosis <-
    strip_label("Clinical info: Submitting diagnosis: ",
                metadata$characteristics_ch1.5)
  microarray.diagnosis <-
    strip_label("Clinical info: Final microarray diagnosis: ",
                metadata$characteristics_ch1.6)
  microarray.diagnosis <- strip_label(" DLBCL", microarray.diagnosis)
  status <- strip_label("Clinical info: Follow up status: ",
                        metadata$characteristics_ch1.7)
  FU <- strip_label("Clinical info: Follow up years: ",
                    metadata$characteristics_ch1.8)
  chemo <- strip_label("Clinical info: Chemotherapy: ",
                       metadata$characteristics_ch1.9)
  chemo <- strip_label("-Like Regimen", chemo)
  ECOG <- strip_label("Clinical info: ECOG performance status: ",
                      metadata$characteristics_ch1.10)
  stage <- strip_label("Clinical info: Stage: ",
                       metadata$characteristics_ch1.11)
  LDH <- strip_label("Clinical info: LDH ratio: ",
                     metadata$characteristics_ch1.12)
  No.Extra.Nodal <-
    strip_label("Clinical info: Number of extranodal sites: ",
                metadata$characteristics_ch1.13)

  # Assemble the table; numeric clinical variables are converted once here.
  # Values that are not numbers become NA (with a coercion warning).
  metadataLLMPP <- data.frame(
    id                   = id,
    GEO.ID               = GEO.ID,
    gender               = gender,
    age                  = as.numeric(age),
    survival.status      = status,
    FU                   = as.numeric(FU),
    chemo                = chemo,
    tissue               = tissue,
    disease.state        = disease.state,
    Submitting.diagnosis = Submitting.diagnosis,
    microarray.diagnosis = microarray.diagnosis,
    ECOG                 = as.numeric(ECOG),
    stage                = as.numeric(stage),
    LDH                  = as.numeric(LDH),
    No.Extra.Nodal       = as.numeric(No.Extra.Nodal)
  )

  # IPI score and its three-level grouping
  ipi <- IPI(metadataLLMPP$age, metadataLLMPP$ECOG,
             metadataLLMPP$stage, metadataLLMPP$No.Extra.Nodal,
             metadataLLMPP$LDH)
  metadataLLMPP$ipi <- as.factor(ipi$ipi)

  ipi.group <- as.character(metadataLLMPP$ipi)
  ipi.group[metadataLLMPP$ipi %in% c(0, 1)] <- "0-1"
  ipi.group[metadataLLMPP$ipi %in% c(2, 3)] <- "2-3"
  ipi.group[metadataLLMPP$ipi %in% c(4, 5)] <- "4-5"
  metadataLLMPP$ipi.hl <- ipi.group

  # ipi.hl2 additionally assigns a group when exactly one risk factor is
  # missing and the group is the same whether that factor scores 0 or 1
  # (partial sums 0, 2 and 4).
  ipi.group2 <- ipi.group
  ipi.group2[ipi$ipi.na == 0 & ipi$na.1] <- "0-1"
  ipi.group2[ipi$ipi.na == 2 & ipi$na.1] <- "2-3"
  ipi.group2[ipi$ipi.na == 4 & ipi$na.1] <- "4-5"
  metadataLLMPP$ipi.hl2 <- ipi.group2

  # Creating survival objects
  metadataLLMPP$OS <- survival::Surv(metadataLLMPP$FU,
                                     metadataLLMPP$survival.status == "DEAD")
  # OS5: follow-up truncated at 5; events after 5 are censored
  os5  <- pmin(metadataLLMPP$FU, 5)
  ios5 <- pmin(ifelse(metadataLLMPP$FU > 5, 0, 1), metadataLLMPP$OS[, 2])
  metadataLLMPP$OS5 <- survival::Surv(as.numeric(os5), ios5)

  metadataLLMPP$WrightClass  <- metadataLLMPP$microarray.diagnosis
  metadataLLMPP$WrightClass2 <-
    as.factor(gsub("Unclassified", "UC",
                   as.character(metadataLLMPP$WrightClass)))

  rownames(metadataLLMPP) <- paste0(metadataLLMPP$GEO.ID, ".CEL")

  # Added factor describing the batches and CEL files
  metadataLLMPP$Batch <- as.factor(metadataLLMPP$chemo)
  metadataLLMPP$CEL   <- rownames(metadataLLMPP)
  metadataLLMPP$GSM   <- as.character(metadataLLMPP$GEO.ID)

  class(metadataLLMPP) <- class(meta_data)
  return(metadataLLMPP)
}