#' @rdname cleanMetadata
#' @details
#' GSE10846:\cr
#' The cleanup of GSE10846 (LLMPP) adds two batches, one for each of the
#' CHOP and the R-CHOP cohorts.
#' @export
cleanMetadata.GSE10846 <- function(meta_data) {
  # Cleans the GEO sample metadata of GSE10846: strips the "<label>: " prefixes
  # from the characteristics columns, converts the clinical covariates to
  # numeric, derives IPI scores/groups and survival objects, and adds the
  # Batch, CEL and GSM columns. The result keeps the class of `meta_data` and
  # has row names of the form "<GEO accession>.CEL".
  message("Cleaning GSE10846 (LLMPP)!")

  # Generic clean
  meta_data <- suppressMessages(cleanMetadata.data.frame(meta_data))

  if (!requireNamespace("survival", quietly = TRUE)) {
    stop("Package 'survival' is required to clean the GSE10846 metadata.",
         call. = FALSE)
  }

  # Helper functions ----

  # Removes a leading label such as "Gender: " from every element of x.
  strip_label <- function(label, x) gsub(label, "", x)

  # IPI helper (presumably the International Prognostic Index): one point for
  # each of age > 60, ECOG > 1, >= 2 extranodal sites, stage > 2, LDH ratio > 1.
  # Returns
  #   ipi    - the total score, NA as soon as any component is missing;
  #   ipi.na - the score summed over the observed components only;
  #   na.1   - TRUE where exactly one component is missing.
  ipi_scores <- function(age, ECOG, stage, No.Extra.Nodal, LDH) {
    risk <- cbind(
      age        = ifelse(age > 60, 1, 0),
      ecog       = ifelse(ECOG > 1, 1, 0),
      extranodal = ifelse(No.Extra.Nodal >= 2, 1, 0),
      stage      = ifelse(stage > 2, 1, 0),
      ldh        = ifelse(LDH > 1, 1, 0)
    )
    list(
      ipi    = rowSums(risk),
      ipi.na = rowSums(risk, na.rm = TRUE),
      na.1   = rowSums(is.na(risk)) == 1
    )
  }

  # Extract the raw fields ----

  metadata <- apply(meta_data, 2, as.character)
  if (is.null(dim(metadata))) {
    # apply() collapses a one-row input to a named vector; restore the row.
    metadata <- t(metadata)
  }
  # Only the first 414 rows are used.
  # NOTE(review): presumably the samples carrying clinical annotation - confirm.
  # Shorter inputs are used in full instead of failing on the subscript.
  n_keep <- min(414L, nrow(metadata))
  metadata <- as.data.frame(metadata[seq_len(n_keep), , drop = FALSE],
                            stringsAsFactors = FALSE)

  id      <- strip_label("Individual: ", metadata$source_name_ch1)
  gender  <- strip_label("Gender: ", metadata$characteristics_ch1)
  age     <- strip_label("Age: ", metadata$characteristics_ch1.1)
  tissue  <- strip_label("Tissue: ", metadata$characteristics_ch1.2)

  disease.state <- strip_label("Disease state: ",
                               metadata$characteristics_ch1.3)
  Submitting.diagnosis <- strip_label("Clinical info: Submitting diagnosis: ",
                                      metadata$characteristics_ch1.5)
  microarray.diagnosis <-
    strip_label("Clinical info: Final microarray diagnosis: ",
                metadata$characteristics_ch1.6)
  microarray.diagnosis <- strip_label(" DLBCL", microarray.diagnosis)

  status <- strip_label("Clinical info: Follow up status: ",
                        metadata$characteristics_ch1.7)
  FU <- strip_label("Clinical info: Follow up years: ",
                    metadata$characteristics_ch1.8)
  chemo <- strip_label("Clinical info: Chemotherapy: ",
                       metadata$characteristics_ch1.9)
  chemo <- strip_label("-Like Regimen", chemo)

  ECOG <- strip_label("Clinical info: ECOG performance status: ",
                      metadata$characteristics_ch1.10)
  stage <- strip_label("Clinical info: Stage: ",
                       metadata$characteristics_ch1.11)
  LDH <- strip_label("Clinical info: LDH ratio: ",
                     metadata$characteristics_ch1.12)
  No.Extra.Nodal <-
    strip_label("Clinical info: Number of extranodal sites: ",
                metadata$characteristics_ch1.13)

  # Assemble the cleaned table ----

  # Numeric covariates are converted once, up front; entries that do not parse
  # as numbers become NA (with the usual coercion warning). stringsAsFactors is
  # deliberately left at the session default, as before.
  metadataLLMPP <- data.frame(
    id                   = id,
    GEO.ID               = metadata$geo_accession,
    gender               = gender,
    age                  = as.numeric(age),
    survival.status      = status,
    FU                   = as.numeric(FU),
    chemo                = chemo,
    tissue               = tissue,
    disease.state        = disease.state,
    Submitting.diagnosis = Submitting.diagnosis,
    microarray.diagnosis = microarray.diagnosis,
    ECOG                 = as.numeric(ECOG),
    stage                = as.numeric(stage),
    LDH                  = as.numeric(LDH),
    No.Extra.Nodal       = as.numeric(No.Extra.Nodal)
  )

  # IPI score and groups ----

  ipi <- ipi_scores(metadataLLMPP$age, metadataLLMPP$ECOG,
                    metadataLLMPP$stage, metadataLLMPP$No.Extra.Nodal,
                    metadataLLMPP$LDH)

  metadataLLMPP$ipi <- as.factor(ipi$ipi)

  # Three-level grouping of the complete-case score; NA where the score is NA.
  ipi_group <- rep(NA_character_, nrow(metadataLLMPP))
  ipi_group[ipi$ipi %in% c(0, 1)] <- "0-1"
  ipi_group[ipi$ipi %in% c(2, 3)] <- "2-3"
  ipi_group[ipi$ipi %in% c(4, 5)] <- "4-5"
  metadataLLMPP$ipi.hl <- ipi_group

  # With exactly one component missing the group is still determined when the
  # observed sum is 0, 2 or 4: the missing point cannot move it to another
  # group.
  metadataLLMPP$ipi.hl2 <- metadataLLMPP$ipi.hl
  metadataLLMPP$ipi.hl2[ipi$ipi.na == 0 & ipi$na.1] <- "0-1"
  metadataLLMPP$ipi.hl2[ipi$ipi.na == 2 & ipi$na.1] <- "2-3"
  metadataLLMPP$ipi.hl2[ipi$ipi.na == 4 & ipi$na.1] <- "4-5"

  # Creating survival objects ----

  metadataLLMPP$OS <- survival::Surv(metadataLLMPP$FU,
                                     metadataLLMPP$survival.status == "DEAD")

  # Overall survival truncated at 5 (same unit as FU): follow-up beyond 5 is
  # cut to 5 and its event indicator is forced to 0.
  os5 <- pmin(metadataLLMPP$FU, 5)
  ios5 <- pmin(as.numeric(metadataLLMPP$FU <= 5), metadataLLMPP$OS[, 2])
  metadataLLMPP$OS5 <- survival::Surv(os5, ios5)

  # Classification columns ----

  metadataLLMPP$WrightClass <- metadataLLMPP$microarray.diagnosis
  metadataLLMPP$WrightClass2 <-
    as.factor(gsub("Unclassified", "UC",
                   as.character(metadataLLMPP$WrightClass)))

  rownames(metadataLLMPP) <- paste0(metadataLLMPP$GEO.ID, ".CEL")

  # Added factor describing the batches and CEL files
  metadataLLMPP$Batch <- as.factor(metadataLLMPP$chemo)
  metadataLLMPP$CEL <- rownames(metadataLLMPP)
  metadataLLMPP$GSM <- as.character(metadataLLMPP$GEO.ID)

  class(metadataLLMPP) <- class(meta_data)
  metadataLLMPP
}