--- a +++ b/overview/explore-dataset.R @@ -0,0 +1,184 @@ +#+ knitr_setup, include = FALSE + +# Whether to cache the intensive code sections. Set to FALSE to recalculate +# everything afresh. +cacheoption <- FALSE +# Disable lazy caching globally, because it fails for large objects, and all the +# objects we wish to cache are large... +opts_chunk$set(cache.lazy = FALSE) + +#' # Simple data investigations +#' +#+ setup, message=FALSE + +data.filename <- '../../data/cohort-sanitised.csv' + +source('../lib/shared.R') +requirePlus('rms') + +COHORT.full <- data.frame(fread(data.filename)) + +COHORT.use <- subset(COHORT.full, !exclude) + +#' ## Missing data +#' +#' How much is there, and where is it concentrated? +#' +#+ missing_data_plot + +interesting.vars <- + c( + 'age', 'gender', 'diagnosis', 'pci_6mo', 'cabg_6mo', + 'hx_mi', 'long_nitrate', 'smokstatus', 'hypertension', 'diabetes', + 'total_chol_6mo', 'hdl_6mo', 'heart_failure', 'pad', 'hx_af', 'hx_stroke', + 'hx_renal', 'hx_copd', 'hx_cancer', 'hx_liver', 'hx_depression', + 'hx_anxiety', 'pulse_6mo', 'crea_6mo', 'total_wbc_6mo','haemoglobin_6mo', + 'imd_score' + ) + +missingness <- + unlist(lapply(COHORT.use[, interesting.vars], function(x){sum(is.na(x))})) + +missingness <- data.frame(var = names(missingness), n.missing = missingness) + +missingness$pc.missing <- missingness$n.missing / nrow(COHORT.use) + +ggplot(subset(missingness, n.missing > 0), aes(x = var, y = pc.missing)) + + geom_bar(stat = 'identity') + + ggtitle('% missingness by variable') + +#' Are any variables commonly found to be jointly missing? +#' +#+ missing_jointly_plot + +COHORT.missing <- + data.frame( + lapply(COHORT.use[, interesting.vars], function(x){is.na(x)}) + ) + +COHORT.missing.cor <- data.frame( + t(combn(1:ncol(COHORT.missing), 2)), + var1 = NA, var2 = NA, joint.missing = NA +) + +for(i in 1:nrow(COHORT.missing.cor)) { + var1 <- sort(names(COHORT.missing))[COHORT.missing.cor[i, 'X1']] + var2 <- sort(names(COHORT.missing))[COHORT.missing.cor[i, 'X2']] + COHORT.missing.cor[i, c('var1', 'var2')] <- c(var1, var2) + if(any(COHORT.missing[, var1]) & any(COHORT.missing[, var2])) { + COHORT.missing.cor[i, 'joint.missing'] <- + sum(!(COHORT.missing[, var1]) & !(COHORT.missing[, var2])) / + sum(!(COHORT.missing[, var1]) | !(COHORT.missing[, var2])) + } +} + +ggplot(subset(COHORT.missing.cor, !is.na(joint.missing)), aes(x = var1, y = var2, fill = joint.missing)) + + geom_tile() + +#' Some tests are usually ordered together. Are they missing together? +#' +#+ missing_venn + +ggplot( + data.frame( + x = 1, + category = factor(c('HDL', 'both', 'total cholesterol', 'neither'), + levels = c('HDL', 'both', 'total cholesterol', 'neither')), + val = c( + sum(!COHORT.missing$hdl_6mo) - sum(!COHORT.missing$hdl_6mo & !COHORT.missing$total_chol_6mo), + sum(!COHORT.missing$hdl_6mo & !COHORT.missing$total_chol_6mo), + sum(!COHORT.missing$total_chol_6mo) - sum(!COHORT.missing$hdl_6mo & !COHORT.missing$total_chol_6mo), + sum(COHORT.missing$hdl_6mo & COHORT.missing$total_chol_6mo) + ) + ), + aes(x = x, y = val, fill = category) +) + + geom_bar(stat='identity') + + scale_fill_manual(values=c("#cc0000", "#990099", "#0000cc", '#cccccc')) + +ggplot( + data.frame( + x = 1, + category = factor(c('haemoglobin', 'both', 'WBC', 'neither'), + levels = c('haemoglobin', 'both', 'WBC', 'neither')), + val = c( + sum(!COHORT.missing$haemoglobin_6mo) - sum(!COHORT.missing$haemoglobin_6mo & !COHORT.missing$total_wbc_6mo), + sum(!COHORT.missing$haemoglobin_6mo & !COHORT.missing$total_wbc_6mo), + sum(!COHORT.missing$total_wbc_6mo) - sum(!COHORT.missing$haemoglobin_6mo & !COHORT.missing$total_wbc_6mo), + sum(COHORT.missing$hdl_6mo & COHORT.missing$total_wbc_6mo) + ) + ), + aes(x = x, y = val, fill = category) +) + + geom_bar(stat='identity') + + scale_fill_manual(values=c("#cc0000", "#990099", "#0000cc", '#cccccc')) + + +#' ### Survival and missingness +#' +#' Let's plot survival curves for a few types of data by missingness... +#' +#+ missingness_survival + +COHORT.use$surv <- with(COHORT.use, Surv(time_death, endpoint_death == 'Death')) + +plotSurvSummary <- function(df, var) { + df$var_summary <- factorNAfix(binByQuantile(df[, var], c(0,0.1,0.9,1))) + surv.curve <- npsurv(surv ~ var_summary, data = df) + print(survplot(surv.curve, ylab = var)) +} + +plotSurvSummary(COHORT.use, 'total_wbc_6mo') + +surv.curve <- npsurv(surv ~ is.na(crea_6mo), data = COHORT.use) +survplot(surv.curve) + +surv.curve <- npsurv(surv ~ is.na(haemoglobin_6mo), data = COHORT.use) +survplot(surv.curve) + +surv.curve <- npsurv(surv ~ is.na(hdl_6mo), data = COHORT.use) +survplot(surv.curve) + +surv.curve <- npsurv(surv ~ is.na(pulse_6mo), data = COHORT.use) +survplot(surv.curve) + +surv.curve <- npsurv(surv ~ is.na(smokstatus), data = COHORT.use) +survplot(surv.curve) + +surv.curve <- npsurv(surv ~ is.na(total_chol_6mo), data = COHORT.use) +survplot(surv.curve) + +surv.curve <- npsurv(surv ~ is.na(total_wbc_6mo), data = COHORT.use) +survplot(surv.curve) + +#' Variables where it's safer to be missing data: +#' * Creatinine +#' * +#' +#' Variables where it's safer to have data: +#' * + +#' ## Data distributions +#' +#' How are the data distributed? +#' +#+ data_distributions + +ggplot(COHORT.full) + + geom_histogram(aes(x = crea_6mo, fill = crea_6mo %% 10 != 0), binwidth = 1) + + xlim(0, 300) + +#' Creatinine levels are fairly smoothly distributed. The highlighted bins +#' indicate numerical values divisible by 10, and there seems to be no +#' particular bias. The small cluster of extremely low values could be +#' misrecorded somehow. + +ggplot(COHORT.full) + + geom_histogram(aes(x = pulse_6mo, fill = pulse_6mo %% 4 != 0), binwidth = 1) + + xlim(0, 150) + +#' Heart rate data have high missingness, and those few values we do have are +#' very heavily biased towards multiples of 4. This is likely because heart rate +#' is commonly measured for 15 seconds and then multiplied up to give a result +#' in beats per minute! There is also a bias towards round numbers, with large +#' peaks at 60, 80, 100 and 120... \ No newline at end of file