Switch to unified view

a b/overview/explore-dataset.R
1
#+ knitr_setup, include = FALSE
2
3
# Whether to cache the intensive code sections. Set to FALSE to recalculate
4
# everything afresh.
5
cacheoption <- FALSE
6
# Disable lazy caching globally, because it fails for large objects, and all the
7
# objects we wish to cache are large...
8
opts_chunk$set(cache.lazy = FALSE)
9
10
#' # Simple data investigations
11
#' 
12
#+ setup, message=FALSE
13
14
data.filename <- '../../data/cohort-sanitised.csv'
15
16
source('../lib/shared.R')
17
requirePlus('rms')
18
19
COHORT.full <- data.frame(fread(data.filename))
20
21
COHORT.use <- subset(COHORT.full, !exclude)
22
23
#' ## Missing data
24
#' 
25
#' How much is there, and where is it concentrated?
26
#' 
27
#+ missing_data_plot
28
29
interesting.vars <- 
30
  c(
31
    'age', 'gender', 'diagnosis', 'pci_6mo', 'cabg_6mo',
32
    'hx_mi', 'long_nitrate', 'smokstatus', 'hypertension', 'diabetes',
33
    'total_chol_6mo', 'hdl_6mo', 'heart_failure', 'pad', 'hx_af', 'hx_stroke',
34
    'hx_renal', 'hx_copd', 'hx_cancer', 'hx_liver', 'hx_depression',
35
    'hx_anxiety', 'pulse_6mo', 'crea_6mo', 'total_wbc_6mo','haemoglobin_6mo',
36
    'imd_score'
37
  )
38
39
missingness <- 
40
  unlist(lapply(COHORT.use[, interesting.vars], function(x){sum(is.na(x))}))
41
42
missingness <- data.frame(var = names(missingness), n.missing = missingness)
43
44
missingness$pc.missing <- missingness$n.missing / nrow(COHORT.use)
45
46
ggplot(subset(missingness, n.missing > 0), aes(x = var, y = pc.missing)) +
47
  geom_bar(stat = 'identity') +
48
  ggtitle('% missingness by variable')
49
50
#' Are any variables commonly found to be jointly missing?
51
#' 
52
#+ missing_jointly_plot
53
54
COHORT.missing <-
55
  data.frame(
56
    lapply(COHORT.use[, interesting.vars], function(x){is.na(x)})
57
  )
58
59
COHORT.missing.cor <- data.frame(
60
  t(combn(1:ncol(COHORT.missing), 2)),
61
  var1 = NA, var2 = NA, joint.missing = NA
62
)
63
64
for(i in 1:nrow(COHORT.missing.cor)) {
65
  var1 <- sort(names(COHORT.missing))[COHORT.missing.cor[i, 'X1']]
66
  var2 <- sort(names(COHORT.missing))[COHORT.missing.cor[i, 'X2']]
67
  COHORT.missing.cor[i, c('var1', 'var2')] <- c(var1, var2)
68
  if(any(COHORT.missing[, var1]) & any(COHORT.missing[, var2])) {
69
    COHORT.missing.cor[i, 'joint.missing'] <-
70
      sum(!(COHORT.missing[, var1]) & !(COHORT.missing[, var2])) / 
71
      sum(!(COHORT.missing[, var1]) | !(COHORT.missing[, var2]))
72
  }
73
}
74
75
ggplot(subset(COHORT.missing.cor, !is.na(joint.missing)), aes(x = var1, y = var2, fill = joint.missing)) +
76
  geom_tile()
77
78
#' Some tests are usually ordered together. Are they missing together?
79
#' 
80
#+ missing_venn
81
82
ggplot(
83
  data.frame(
84
    x = 1,
85
    category = factor(c('HDL', 'both', 'total cholesterol', 'neither'),
86
                      levels = c('HDL', 'both', 'total cholesterol', 'neither')),
87
    val = c(
88
      sum(!COHORT.missing$hdl_6mo) - sum(!COHORT.missing$hdl_6mo & !COHORT.missing$total_chol_6mo),
89
      sum(!COHORT.missing$hdl_6mo & !COHORT.missing$total_chol_6mo),
90
      sum(!COHORT.missing$total_chol_6mo) - sum(!COHORT.missing$hdl_6mo & !COHORT.missing$total_chol_6mo),
91
      sum(COHORT.missing$hdl_6mo & COHORT.missing$total_chol_6mo)
92
    )
93
  ),
94
  aes(x = x, y = val, fill = category)
95
) +
96
  geom_bar(stat='identity') +
97
  scale_fill_manual(values=c("#cc0000", "#990099", "#0000cc", '#cccccc'))
98
99
ggplot(
100
  data.frame(
101
    x = 1,
102
    category = factor(c('haemoglobin', 'both', 'WBC', 'neither'),
103
                      levels = c('haemoglobin', 'both', 'WBC', 'neither')),
104
    val = c(
105
      sum(!COHORT.missing$haemoglobin_6mo) - sum(!COHORT.missing$haemoglobin_6mo & !COHORT.missing$total_wbc_6mo),
106
      sum(!COHORT.missing$haemoglobin_6mo & !COHORT.missing$total_wbc_6mo),
107
      sum(!COHORT.missing$total_wbc_6mo) - sum(!COHORT.missing$haemoglobin_6mo & !COHORT.missing$total_wbc_6mo),
108
      sum(COHORT.missing$hdl_6mo & COHORT.missing$total_wbc_6mo)
109
    )
110
  ),
111
  aes(x = x, y = val, fill = category)
112
) +
113
  geom_bar(stat='identity') +
114
  scale_fill_manual(values=c("#cc0000", "#990099", "#0000cc", '#cccccc'))
115
  
116
117
#' ### Survival and missingness
118
#' 
119
#' Let's plot survival curves for a few types of data by missingness...
120
#' 
121
#+ missingness_survival
122
123
COHORT.use$surv <- with(COHORT.use, Surv(time_death, endpoint_death == 'Death'))
124
125
plotSurvSummary <- function(df, var) {
126
  df$var_summary <- factorNAfix(binByQuantile(df[, var], c(0,0.1,0.9,1)))
127
  surv.curve <- npsurv(surv ~ var_summary, data = df)
128
  print(survplot(surv.curve, ylab = var))
129
}
130
131
plotSurvSummary(COHORT.use, 'total_wbc_6mo')
132
133
surv.curve <- npsurv(surv ~ is.na(crea_6mo), data = COHORT.use)
134
survplot(surv.curve)
135
136
surv.curve <- npsurv(surv ~ is.na(haemoglobin_6mo), data = COHORT.use)
137
survplot(surv.curve)
138
139
surv.curve <- npsurv(surv ~ is.na(hdl_6mo), data = COHORT.use)
140
survplot(surv.curve)
141
142
surv.curve <- npsurv(surv ~ is.na(pulse_6mo), data = COHORT.use)
143
survplot(surv.curve)
144
145
surv.curve <- npsurv(surv ~ is.na(smokstatus), data = COHORT.use)
146
survplot(surv.curve)
147
148
surv.curve <- npsurv(surv ~ is.na(total_chol_6mo), data = COHORT.use)
149
survplot(surv.curve)
150
151
surv.curve <- npsurv(surv ~ is.na(total_wbc_6mo), data = COHORT.use)
152
survplot(surv.curve)
153
154
#' Variables where it's safer to be missing data:
155
#'  * Creatinine
156
#'  * 
157
#' 
158
#' Variables where it's safer to have data:
159
#'  * 
160
161
#' ## Data distributions
162
#' 
163
#' How are the data distributed?
164
#' 
165
#+ data_distributions
166
167
ggplot(COHORT.full) +
168
  geom_histogram(aes(x = crea_6mo, fill = crea_6mo %% 10 != 0), binwidth = 1) +
169
  xlim(0, 300)
170
171
#' Creatinine levels are fairly smoothly distributed. The highlighted bins
172
#' indicate numerical values divisible by 10, and there seems to be no
173
#' particular bias. The small cluster of extremely low values could be
174
#' misrecorded somehow.
175
176
ggplot(COHORT.full) +
177
  geom_histogram(aes(x = pulse_6mo, fill = pulse_6mo %% 4 != 0), binwidth = 1) +
178
  xlim(0, 150)
179
180
#' Heart rate data have high missingness, and those few values we do have are
181
#' very heavily biased towards multiples of 4. This is likely because heart rate
182
#' is commonly measured for 15 seconds and then multiplied up to give a result
183
#' in beats per minute! There is also a bias towards round numbers, with large
184
#' peaks at 60, 80, 100 and 120...