# R/cleanMetadata.GSE10846.R
#' @rdname cleanMetadata
#' @details
#'    GSE10846:\cr
#'    The cleanup of GSE10846 (LLMPP) adds two batches, one for each of the
#'    CHOP and the R-CHOP cohorts.
#' @export
cleanMetadata.GSE10846 <- function(meta_data) {
  message("Cleaning GSE10846 (LLMPP)!")

  # Generic clean (its own messages are silenced)
  suppressMessages(meta_data <- cleanMetadata.data.frame(meta_data))

  # survival::Surv is used below to build the OS and OS5 columns
  stopifnot(requireNamespace("survival"))

  # Helper functions

  # Remove a GEO label prefix (interpreted as a regular expression by gsub)
  strip <- function(prefix, x) gsub(prefix, "", x)

  # Compute the IPI score from its five risk indicators.
  # Returns a list with:
  #   ipi    - sum of the five indicators (NA if any indicator is NA)
  #   ipi.hl - 0 for a score of 0-2, 1 for a score of 3-5; for rows with one
  #            or two missing indicators the group is filled in when the
  #            observed indicators already determine it
  #   na.1   - logical, TRUE when exactly one indicator is missing
  #   ipi.na - sum of the non-missing indicators
  IPI <- function(age, ECOG, stage, No.Extra.Nodal, LDH) {
    risk <- data.frame(
      a = as.numeric(age            >  60),
      b = as.numeric(ECOG           >   1),
      c = as.numeric(No.Extra.Nodal >=  2),
      d = as.numeric(stage          >   2),
      e = as.numeric(LDH            >   1)
    )

    score     <- rowSums(risk)                # NA when any indicator is NA
    score2    <- rowSums(risk, na.rm = TRUE)  # missing indicators ignored
    n.missing <- rowSums(is.na(risk))
    n.NA      <- n.missing == 1
    n.NA2     <- n.missing == 2

    ipi.hl <- rep(NA, length(n.NA))
    ipi.hl[score %in% c(0, 1, 2)] <- 0
    ipi.hl[score %in% c(3, 4, 5)] <- 1

    ipi.hl2 <- rep(NA, length(n.NA))
    ipi.hl2[score2 %in% c(0, 1, 2)] <- 0
    ipi.hl2[score2 %in% c(3, 4, 5)] <- 1

    # With one missing indicator, partial scores of 0, 1, 3 or 4 cannot
    # cross the 2/3 boundary whatever the missing value is.
    one.missing <- n.NA & score2 %in% c(0, 1, 3, 4)
    ipi.hl[one.missing] <- ipi.hl2[one.missing]

    # With two missing indicators only partial scores of 0 and 3 are decisive.
    two.missing <- n.NA2 & score2 %in% c(0, 3)
    ipi.hl[two.missing] <- ipi.hl2[two.missing]

    list(ipi = score, ipi.hl = ipi.hl, na.1 = n.NA, ipi.na = score2)
  }

  metadata <- apply(meta_data, 2, as.character)
  # Only the first 414 rows are kept.
  # NOTE(review): presumably the remaining rows are not part of the LLMPP
  # cohort -- confirm against the GEO series.
  metadata <- as.data.frame(metadata[1:414, ], stringsAsFactors = FALSE)

  GEO.ID <- metadata$geo_accession
  id     <- strip("Individual: ", metadata$source_name_ch1)
  gender <- strip("Gender: ",     metadata$characteristics_ch1)
  age    <- strip("Age: ",        metadata$characteristics_ch1.1)
  tissue <- strip("Tissue: ",     metadata$characteristics_ch1.2)

  disease.state        <- strip("Disease state: ",
                                metadata$characteristics_ch1.3)
  Submitting.diagnosis <- strip("Clinical info: Submitting diagnosis: ",
                                metadata$characteristics_ch1.5)
  microarray.diagnosis <- strip("Clinical info: Final microarray diagnosis: ",
                                metadata$characteristics_ch1.6)
  microarray.diagnosis <- strip(" DLBCL", microarray.diagnosis)

  status <- strip("Clinical info: Follow up status: ",
                  metadata$characteristics_ch1.7)
  FU     <- strip("Clinical info: Follow up years: ",
                  metadata$characteristics_ch1.8)
  chemo  <- strip("Clinical info: Chemotherapy: ",
                  metadata$characteristics_ch1.9)
  chemo  <- strip("-Like Regimen", chemo)

  ECOG   <- strip("Clinical info: ECOG performance status: ",
                  metadata$characteristics_ch1.10)
  stage  <- strip("Clinical info: Stage: ",
                  metadata$characteristics_ch1.11)
  LDH    <- strip("Clinical info: LDH ratio: ",
                  metadata$characteristics_ch1.12)

  No.Extra.Nodal <- strip("Clinical info: Number of extranodal sites: ",
                          metadata$characteristics_ch1.13)

  # Numeric clinical variables are converted up front; entries that are not
  # parseable as numbers become NA.
  metadataLLMPP <- data.frame(
    id                   = id,
    GEO.ID               = GEO.ID,
    gender               = gender,
    age                  = as.numeric(age),
    survival.status      = status,
    FU                   = as.numeric(FU),
    chemo                = chemo,
    tissue               = tissue,
    disease.state        = disease.state,
    Submitting.diagnosis = Submitting.diagnosis,
    microarray.diagnosis = microarray.diagnosis,
    ECOG                 = as.numeric(ECOG),
    stage                = as.numeric(stage),
    LDH                  = as.numeric(LDH),
    No.Extra.Nodal       = as.numeric(No.Extra.Nodal)
  )

  ipi <- IPI(metadataLLMPP$age,   metadataLLMPP$ECOG,
             metadataLLMPP$stage, metadataLLMPP$No.Extra.Nodal,
             metadataLLMPP$LDH)

  metadataLLMPP$ipi <- as.factor(ipi$ipi)

  # Three-level IPI grouping; rows with an NA score stay NA.
  metadataLLMPP$ipi.hl <- as.character(metadataLLMPP$ipi)
  metadataLLMPP$ipi.hl[metadataLLMPP$ipi %in% c(0, 1)] <- "0-1"
  metadataLLMPP$ipi.hl[metadataLLMPP$ipi %in% c(2, 3)] <- "2-3"
  metadataLLMPP$ipi.hl[metadataLLMPP$ipi %in% c(4, 5)] <- "4-5"

  # Same grouping, additionally filled in for rows with exactly one missing
  # indicator when the partial score (0, 2 or 4) fixes the group regardless
  # of the missing value.
  metadataLLMPP$ipi.hl2 <- metadataLLMPP$ipi.hl
  metadataLLMPP$ipi.hl2[ipi$ipi.na == 0 & ipi$na.1] <- "0-1"
  metadataLLMPP$ipi.hl2[ipi$ipi.na == 2 & ipi$na.1] <- "2-3"
  metadataLLMPP$ipi.hl2[ipi$ipi.na == 4 & ipi$na.1] <- "4-5"

  # Creating survival objects
  metadataLLMPP$OS <- survival::Surv(metadataLLMPP$FU,
                                     metadataLLMPP$survival.status == "DEAD")

  # Follow-up truncated at 5: time is capped and later events are censored.
  os5  <- ifelse(metadataLLMPP$FU > 5, 5, metadataLLMPP$FU)
  ios5 <- pmin(ifelse(metadataLLMPP$FU > 5, 0, 1), metadataLLMPP$OS[, 2])

  metadataLLMPP$OS5 <- survival::Surv(as.numeric(os5), ios5)

  metadataLLMPP$WrightClass  <- metadataLLMPP$microarray.diagnosis
  metadataLLMPP$WrightClass2 <-
    as.factor(gsub("Unclassified", "UC",
                   as.character(metadataLLMPP$WrightClass)))

  rownames(metadataLLMPP) <- paste0(metadataLLMPP$GEO.ID, ".CEL")

  # Added factor describing the batches and CEL files
  metadataLLMPP$Batch <- as.factor(metadataLLMPP$chemo)
  metadataLLMPP$CEL   <- rownames(metadataLLMPP)
  metadataLLMPP$GSM   <- as.character(metadataLLMPP$GEO.ID)

  # Restore the class vector of the (generically cleaned) input
  class(metadataLLMPP) <- class(meta_data)
  metadataLLMPP
}