|
a |
|
b/overview/explore-dataset.R |
|
|
1 |
#+ knitr_setup, include = FALSE |
|
|
2 |
|
|
|
3 |
# Whether to cache the intensive code sections. Set to FALSE to recalculate |
|
|
4 |
# everything afresh. |
|
|
5 |
cacheoption <- FALSE |
|
|
6 |
# Disable lazy caching globally, because it fails for large objects, and all the |
|
|
7 |
# objects we wish to cache are large... |
|
|
8 |
opts_chunk$set(cache.lazy = FALSE) |
|
|
9 |
|
|
|
10 |
#' # Simple data investigations |
|
|
11 |
#' |
|
|
12 |
#+ setup, message=FALSE |
|
|
13 |
|
|
|
14 |
data.filename <- '../../data/cohort-sanitised.csv' |
|
|
15 |
|
|
|
16 |
source('../lib/shared.R') |
|
|
17 |
requirePlus('rms') |
|
|
18 |
|
|
|
19 |
COHORT.full <- data.frame(fread(data.filename)) |
|
|
20 |
|
|
|
21 |
COHORT.use <- subset(COHORT.full, !exclude) |
|
|
22 |
|
|
|
23 |
#' ## Missing data |
|
|
24 |
#' |
|
|
25 |
#' How much is there, and where is it concentrated? |
|
|
26 |
#' |
|
|
27 |
#+ missing_data_plot |
|
|
28 |
|
|
|
29 |
interesting.vars <- |
|
|
30 |
c( |
|
|
31 |
'age', 'gender', 'diagnosis', 'pci_6mo', 'cabg_6mo', |
|
|
32 |
'hx_mi', 'long_nitrate', 'smokstatus', 'hypertension', 'diabetes', |
|
|
33 |
'total_chol_6mo', 'hdl_6mo', 'heart_failure', 'pad', 'hx_af', 'hx_stroke', |
|
|
34 |
'hx_renal', 'hx_copd', 'hx_cancer', 'hx_liver', 'hx_depression', |
|
|
35 |
'hx_anxiety', 'pulse_6mo', 'crea_6mo', 'total_wbc_6mo','haemoglobin_6mo', |
|
|
36 |
'imd_score' |
|
|
37 |
) |
|
|
38 |
|
|
|
39 |
missingness <- |
|
|
40 |
unlist(lapply(COHORT.use[, interesting.vars], function(x){sum(is.na(x))})) |
|
|
41 |
|
|
|
42 |
missingness <- data.frame(var = names(missingness), n.missing = missingness) |
|
|
43 |
|
|
|
44 |
missingness$pc.missing <- missingness$n.missing / nrow(COHORT.use) |
|
|
45 |
|
|
|
46 |
ggplot(subset(missingness, n.missing > 0), aes(x = var, y = pc.missing)) + |
|
|
47 |
geom_bar(stat = 'identity') + |
|
|
48 |
ggtitle('% missingness by variable') |
|
|
49 |
|
|
|
50 |
#' Are any variables commonly found to be jointly missing? |
|
|
51 |
#' |
|
|
52 |
#+ missing_jointly_plot |
|
|
53 |
|
|
|
54 |
COHORT.missing <- |
|
|
55 |
data.frame( |
|
|
56 |
lapply(COHORT.use[, interesting.vars], function(x){is.na(x)}) |
|
|
57 |
) |
|
|
58 |
|
|
|
59 |
COHORT.missing.cor <- data.frame( |
|
|
60 |
t(combn(1:ncol(COHORT.missing), 2)), |
|
|
61 |
var1 = NA, var2 = NA, joint.missing = NA |
|
|
62 |
) |
|
|
63 |
|
|
|
64 |
for(i in 1:nrow(COHORT.missing.cor)) { |
|
|
65 |
var1 <- sort(names(COHORT.missing))[COHORT.missing.cor[i, 'X1']] |
|
|
66 |
var2 <- sort(names(COHORT.missing))[COHORT.missing.cor[i, 'X2']] |
|
|
67 |
COHORT.missing.cor[i, c('var1', 'var2')] <- c(var1, var2) |
|
|
68 |
if(any(COHORT.missing[, var1]) & any(COHORT.missing[, var2])) { |
|
|
69 |
COHORT.missing.cor[i, 'joint.missing'] <- |
|
|
70 |
sum(!(COHORT.missing[, var1]) & !(COHORT.missing[, var2])) / |
|
|
71 |
sum(!(COHORT.missing[, var1]) | !(COHORT.missing[, var2])) |
|
|
72 |
} |
|
|
73 |
} |
|
|
74 |
|
|
|
75 |
ggplot(subset(COHORT.missing.cor, !is.na(joint.missing)), aes(x = var1, y = var2, fill = joint.missing)) + |
|
|
76 |
geom_tile() |
|
|
77 |
|
|
|
78 |
#' Some tests are usually ordered together. Are they missing together? |
|
|
79 |
#' |
|
|
80 |
#+ missing_venn |
|
|
81 |
|
|
|
82 |
ggplot( |
|
|
83 |
data.frame( |
|
|
84 |
x = 1, |
|
|
85 |
category = factor(c('HDL', 'both', 'total cholesterol', 'neither'), |
|
|
86 |
levels = c('HDL', 'both', 'total cholesterol', 'neither')), |
|
|
87 |
val = c( |
|
|
88 |
sum(!COHORT.missing$hdl_6mo) - sum(!COHORT.missing$hdl_6mo & !COHORT.missing$total_chol_6mo), |
|
|
89 |
sum(!COHORT.missing$hdl_6mo & !COHORT.missing$total_chol_6mo), |
|
|
90 |
sum(!COHORT.missing$total_chol_6mo) - sum(!COHORT.missing$hdl_6mo & !COHORT.missing$total_chol_6mo), |
|
|
91 |
sum(COHORT.missing$hdl_6mo & COHORT.missing$total_chol_6mo) |
|
|
92 |
) |
|
|
93 |
), |
|
|
94 |
aes(x = x, y = val, fill = category) |
|
|
95 |
) + |
|
|
96 |
geom_bar(stat='identity') + |
|
|
97 |
scale_fill_manual(values=c("#cc0000", "#990099", "#0000cc", '#cccccc')) |
|
|
98 |
|
|
|
99 |
ggplot( |
|
|
100 |
data.frame( |
|
|
101 |
x = 1, |
|
|
102 |
category = factor(c('haemoglobin', 'both', 'WBC', 'neither'), |
|
|
103 |
levels = c('haemoglobin', 'both', 'WBC', 'neither')), |
|
|
104 |
val = c( |
|
|
105 |
sum(!COHORT.missing$haemoglobin_6mo) - sum(!COHORT.missing$haemoglobin_6mo & !COHORT.missing$total_wbc_6mo), |
|
|
106 |
sum(!COHORT.missing$haemoglobin_6mo & !COHORT.missing$total_wbc_6mo), |
|
|
107 |
sum(!COHORT.missing$total_wbc_6mo) - sum(!COHORT.missing$haemoglobin_6mo & !COHORT.missing$total_wbc_6mo), |
|
|
108 |
sum(COHORT.missing$hdl_6mo & COHORT.missing$total_wbc_6mo) |
|
|
109 |
) |
|
|
110 |
), |
|
|
111 |
aes(x = x, y = val, fill = category) |
|
|
112 |
) + |
|
|
113 |
geom_bar(stat='identity') + |
|
|
114 |
scale_fill_manual(values=c("#cc0000", "#990099", "#0000cc", '#cccccc')) |
|
|
115 |
|
|
|
116 |
|
|
|
117 |
#' ### Survival and missingness |
|
|
118 |
#' |
|
|
119 |
#' Let's plot survival curves for a few types of data by missingness... |
|
|
120 |
#' |
|
|
121 |
#+ missingness_survival |
|
|
122 |
|
|
|
123 |
COHORT.use$surv <- with(COHORT.use, Surv(time_death, endpoint_death == 'Death')) |
|
|
124 |
|
|
|
125 |
plotSurvSummary <- function(df, var) { |
|
|
126 |
df$var_summary <- factorNAfix(binByQuantile(df[, var], c(0,0.1,0.9,1))) |
|
|
127 |
surv.curve <- npsurv(surv ~ var_summary, data = df) |
|
|
128 |
print(survplot(surv.curve, ylab = var)) |
|
|
129 |
} |
|
|
130 |
|
|
|
131 |
plotSurvSummary(COHORT.use, 'total_wbc_6mo') |
|
|
132 |
|
|
|
133 |
surv.curve <- npsurv(surv ~ is.na(crea_6mo), data = COHORT.use) |
|
|
134 |
survplot(surv.curve) |
|
|
135 |
|
|
|
136 |
surv.curve <- npsurv(surv ~ is.na(haemoglobin_6mo), data = COHORT.use) |
|
|
137 |
survplot(surv.curve) |
|
|
138 |
|
|
|
139 |
surv.curve <- npsurv(surv ~ is.na(hdl_6mo), data = COHORT.use) |
|
|
140 |
survplot(surv.curve) |
|
|
141 |
|
|
|
142 |
surv.curve <- npsurv(surv ~ is.na(pulse_6mo), data = COHORT.use) |
|
|
143 |
survplot(surv.curve) |
|
|
144 |
|
|
|
145 |
surv.curve <- npsurv(surv ~ is.na(smokstatus), data = COHORT.use) |
|
|
146 |
survplot(surv.curve) |
|
|
147 |
|
|
|
148 |
surv.curve <- npsurv(surv ~ is.na(total_chol_6mo), data = COHORT.use) |
|
|
149 |
survplot(surv.curve) |
|
|
150 |
|
|
|
151 |
surv.curve <- npsurv(surv ~ is.na(total_wbc_6mo), data = COHORT.use) |
|
|
152 |
survplot(surv.curve) |
|
|
153 |
|
|
|
154 |
#' Variables where it's safer to be missing data: |
|
|
155 |
#' * Creatinine |
|
|
156 |
#' * |
|
|
157 |
#' |
|
|
158 |
#' Variables where it's safer to have data: |
|
|
159 |
#' * |
|
|
160 |
|
|
|
161 |
#' ## Data distributions |
|
|
162 |
#' |
|
|
163 |
#' How are the data distributed? |
|
|
164 |
#' |
|
|
165 |
#+ data_distributions |
|
|
166 |
|
|
|
167 |
ggplot(COHORT.full) + |
|
|
168 |
geom_histogram(aes(x = crea_6mo, fill = crea_6mo %% 10 != 0), binwidth = 1) + |
|
|
169 |
xlim(0, 300) |
|
|
170 |
|
|
|
171 |
#' Creatinine levels are fairly smoothly distributed. The highlighted bins |
|
|
172 |
#' indicate numerical values divisible by 10, and there seems to be no |
|
|
173 |
#' particular bias. The small cluster of extremely low values could be |
|
|
174 |
#' misrecorded somehow. |
|
|
175 |
|
|
|
176 |
ggplot(COHORT.full) + |
|
|
177 |
geom_histogram(aes(x = pulse_6mo, fill = pulse_6mo %% 4 != 0), binwidth = 1) + |
|
|
178 |
xlim(0, 150) |
|
|
179 |
|
|
|
180 |
#' Heart rate data have high missingness, and those few values we do have are |
|
|
181 |
#' very heavily biased towards multiples of 4. This is likely because heart rate |
|
|
182 |
#' is commonly measured for 15 seconds and then multiplied up to give a result |
|
|
183 |
#' in beats per minute! There is also a bias towards round numbers, with large |
|
|
184 |
#' peaks at 60, 80, 100 and 120... |