--- a +++ b/scripts/export-bigdata.R @@ -0,0 +1,610 @@ +################################################################################ +### USER VARIABLES ############################################################# +################################################################################ + +setwd('R:/Pop_Health/Farr_Luscombe/') + + +### Relating to constructed cohort ############################################# +# the R data file containing the patient cohort +cohort.file <- '2_Cohort/COHORT_SCAD_makecohort2.rda' + +# columns to remove +kill.cols <- c('anonpatid', 'pracid','year_of_birth') + +# columns to make relative to indexdate +date.cols <- c('afterentry_acs','afterentry_mi_nos', + 'afterentry_nstemi','afterentry_stemi','afterentry_ua', + 'crd','date_entry','date_exit','date_mi_endpoint','deathdate','dob','dod', + 'dod_combined','earliest_chd','earliest_hf','earliest_mi','earliest_sa', + 'earliest_ua','frd','hes_end','hes_start','indexdate','praclcd', + 'pracuts','tod','recent_acs','recent_mi_nos','recent_nstemi', + 'recent_stemi','recent_ua','smokdate','endpoint_coronary_date', + 'endpoint_death_date' +) + +### Relating to other files #################################################### +data.path <- '1a_ExtractedData' + +hes.file <- file.path(data.path, 'hes_diag_epi.csv') +n.top.icd <- 100 + +procedures.file <- file.path(data.path, 'hes_procedures.csv') +n.top.procedures <- 100 + +tests.files <- + file.path( + data.path, + paste0('test.part.', 0:3) + ) +# Number of tests to extract +n.top.tests <- 100 +# Find tests as close as possible to the first value here, and no more than the +# second timepoint before it +test.timepoints <- c(0, 183) #days + +clinical.files <- + file.path( + data.path, + paste0('clinical.part.', 0:3) + ) +n.top.clinical.values <- 100 +n.top.clinical.history <- 100 + +therapy.files <- + file.path( + data.path, + paste0('therapy.part.', 0:7) + ) +n.top.therapy <- 100 + + +################################################################################ +### END USER VARIABLES ######################################################### +################################################################################ + + +### Load the patient cohort to act as a base ################################### + +# load the standard COHORT variable, which is a data table called COHORT +load(cohort.file) + +### Do a bunch of silly fixes on the data ###################################### +# cabg_6mo is 1s and 0s, which should be TRUE and FALSE +COHORT$cabg_6mo <- as.logical(COHORT$cabg_6mo) +# long_nitrate is 1s and NAs, which should be TRUE and FALSE... +COHORT$long_nitrate <- as.logical(COHORT$long_nitrate) +COHORT$long_nitrate[is.na(COHORT$long_nitrate)] <- FALSE +# pci_6mo is 1s and 0s, which should be TRUE and FALSE +COHORT$pci_6mo <- as.logical(COHORT$pci_6mo) + +### Append other data types #################################################### + +# We'll be needing handymedical from here on in +source('Andrew/lib/handymedical.R', chdir = TRUE) +require(CALIBERlookups) + +percentMissing <- function(x, sf = 3) { + round(sum(is.na(x))/length(x), digits = sf)*100 +} + +### Hospital episodes statistics ############################################### + +# read in the hospital data (HES) +hes.diag.epi <- readMedicalData( + hes.file, + c("anonpatid", "epistart", "epiend", "icd"), + c("integer", "date", "date", "factor") +) + +# remove non alphanumerics (trailing -s on some ICD codes) +hes.diag.epi$icd <- gsub('[^[:alnum:]]', '', hes.diag.epi$icd) + +# Now, merge with the indexdate...we only want data before then, because looking +# after it is cheating, and using all data rather than just stuff before may +# distort our choice of variables as some variables may be very common after +# entering the study, but less so before... (This actually isn't much of an +# issue...only 9 variables differ. Still!) +hes.diag.epi <- + merge( + hes.diag.epi, COHORT[, c('anonpatid', 'indexdate')], + by = 'anonpatid', all = TRUE + ) +hes.diag.epi$relativedate <- hes.diag.epi$indexdate - hes.diag.epi$epistart + +# Now, remove all the negative relative dates, because they're in the past +hes.diag.epi <- hes.diag.epi[hes.diag.epi$relativedate >= 0, ] + +# get aggregate statistics for each ICD code +hes.by.icd <- hes.diag.epi[, length(unique(anonpatid)), by = icd] +names(hes.by.icd) <- c('icd', 'n.pat') + +# Take the top n by number of patients with that code +top.icd <- + hes.by.icd$icd[order(hes.by.icd$n.pat, decreasing = TRUE)[1:n.top.icd]] + +# And now, let's put how far in the past the patient was first diagnosed with +# each of these things into the table... + +# First, discard all the rows corresponding to non-top ICDs +hes.diag.epi.top <- hes.diag.epi[hes.diag.epi$icd %in% top.icd, ] +# Then, keep only the earliest instance of each per patient, because those are +# the ones we care about +hes.diag.epi.top.earliest <- + hes.diag.epi.top[, min(relativedate), by = c('anonpatid', 'icd')] + +# Per ICD code, add a new column to the cohort and put in numbers +hes.diag.epi.top.earliest <- + dcast( + data = hes.diag.epi.top.earliest, + formula = anonpatid ~ icd, + value.var = "V1" + ) + +# Add a prefix to the names to keep track +names(hes.diag.epi.top.earliest)[2:(n.top.icd + 1)] <- + paste0('hes.icd.', names(hes.diag.epi.top.earliest)[2:(n.top.icd + 1)]) + +# Keep the names for when we need to make them relative to the indexdate when +# anonymising later +names.hes.diag.icd <- names(hes.diag.epi.top.earliest)[2:(n.top.icd + 1)] + +# Now, merge with the original cohort +COHORT <- + merge( + COHORT, + hes.diag.epi.top.earliest, + by = c('anonpatid'), + all = TRUE + ) + +hes.icd.summary <- + data.frame( + percent.missing = + sort( + sapply( + COHORT[, names(COHORT)[startsWith(names(COHORT), 'hes.icd.')], with = FALSE], + percentMissing + ) + ) + ) + +hes.icd.summary$code <- substring(rownames(hes.icd.summary), 9) + +hes.icd.summary <- merge(hes.icd.summary, CALIBER_DICT[, c('code', 'term')], by = 'code') + +print(hes.icd.summary[order(hes.icd.summary$percent.missing),]) + +### Hospital procedures data ################################################### + +# read in the hospital data (HES) +hes.procedures <- readMedicalData( + procedures.file, + c("anonpatid", "opcs", "evdate"), + c("integer", "factor", "date") +) + +# Now, merge with the indexdate...we only want data before then, because looking +# after it is cheating, and using all data rather than just stuff before may +# distort our choice of variables as some variables may be very common after +# entering the study, but less so before... (14 differ...) +hes.procedures <- + merge( + hes.procedures, COHORT[, c('anonpatid', 'indexdate')], + by = 'anonpatid', all = TRUE + ) +hes.procedures$relativedate <- hes.procedures$indexdate - hes.procedures$evdate + +# Now, remove all the negative relative dates, because they're in the past +hes.procedures <- hes.procedures[relativedate >= 0] + +# get aggregate statistics for each OPCS code +hes.by.opcs <- hes.procedures[, length(unique(anonpatid)), by = opcs] +names(hes.by.opcs) <- c('opcs', 'n.pat') + +# Take the top n by number of patients with that code +top.opcs <- + hes.by.opcs$opcs[order(hes.by.opcs$n.pat, decreasing = TRUE)[1:n.top.procedures]] + +# And now, let's put how far in the past the patient was first diagnosed with +# each of these things into the table... + +# First, discard all the rows corresponding to non-top OPCS codes +hes.procedures.top <- hes.procedures[hes.procedures$opcs %in% top.opcs, ] +# Then, keep only the earliest instance of each per patient, because those are +# the ones we care about +hes.procedures.top.earliest <- + hes.procedures.top[, min(relativedate), by = c('anonpatid', 'opcs')] + +# Per OPCS code, add a new column to the cohort and put in numbers +hes.procedures.top.earliest <- + dcast( + data = hes.procedures.top.earliest, + formula = anonpatid ~ opcs, + value.var = "V1" + ) + +# Add a prefix to the names to keep track +names(hes.procedures.top.earliest)[2:(n.top.procedures + 1)] <- + paste0('hes.opcs.', names(hes.procedures.top.earliest)[2:(n.top.procedures + 1)]) + +# Keep the names for when we need to make them relative to the indexdate when +# anonymising later +names.procedures.opcs <- names(hes.procedures.top.earliest)[2:(n.top.icd + 1)] + +# Now, merge with the original cohort +COHORT <- + merge( + COHORT, + hes.procedures.top.earliest, + by = c('anonpatid'), + all = TRUE + ) + +# Summarise by percent missing +hes.opcs.summary <- + data.frame( + percent.missing = + sort( + sapply( + COHORT[, names(COHORT)[startsWith(names(COHORT), 'hes.opcs.')], with = FALSE], + percentMissing + ) + ) + ) + +hes.opcs.summary$code <- substring(rownames(hes.opcs.summary), 10) + +hes.opcs.summary <- merge(hes.opcs.summary, CALIBER_DICT[, c('code', 'term')], by = 'code') + +print(hes.opcs.summary[order(hes.opcs.summary$percent.missing),]) + +### Test results ############################################################### + +# read in the test data +tests.data <- readMedicalData( + tests.files, + # data1 is operator (eg =, > etc), data2 is value, data 3 is unit of measure + c("anonpatid", "eventdate", "enttype", "data1", "data2", "data3"), + c("integer", "date", "integer", "integer", "numeric", "integer") +) + +# First, discard those where operator is not =, because > and < etc will +# introduce complexity, and drop data1 since it's now useless +tests.data <- tests.data[data1 == 3] +tests.data$data1 <- NULL + +# Now, let's subtract the indexdate from every test so we can choose the ones +# closest to the desired dates... +tests.data <- + merge( + tests.data, COHORT[, c('anonpatid', 'indexdate')], + by = 'anonpatid', all = TRUE + ) +tests.data$relativedate <- tests.data$indexdate - tests.data$eventdate + +# Only keep positive values; negative ones are in the future which is cheating! +tests.data <- tests.data[relativedate >= 0] +# Only 89001 have any test results from before the indexdate! + +# Discard any test results from times greater than the longest time ago to check +tests.data <- tests.data[relativedate < test.timepoints[2]] +# And this drops to 57972 in the preceding six months + +# Per patient and per test, keep the smallest relativedate value only +tests.data <- + tests.data[, .SD[which.min(relativedate)], by = list(anonpatid, enttype)] + +# aggregate by test type (enttype, which covers a range of Read codes which can +# mean the same test) and unit of measure (so we can choose the tests with the +# units with the best coverage) +# We do this after all the preprocessing (ie only looking at the most recent +# value per patient before but not too far before the indexdate) because +# otherwise a lot of the top tests change. Presumably +tests.by.test <- + tests.data[, length(unique(anonpatid)), by = c('enttype', 'data3')] +names(tests.by.test) <- c('enttype', 'data3', 'n.pat') + +top.tests <- tests.by.test$enttype[order(tests.by.test$n.pat, decreasing = TRUE)[1:n.top.tests]] +top.units <- tests.by.test$data3[order(tests.by.test$n.pat, decreasing = TRUE)[1:n.top.tests]] + +# Now, discard all the rows corresponding to non-top tests +tests.data <- + tests.data[ + # Needs to be exact match of enttype and variable combination + paste0(enttype, '!!!', data3) %in% paste0(top.tests, '!!!', top.units) + ] + +# Make a column per test +tests.wide <- + dcast( + data = tests.data, + formula = anonpatid ~ enttype + data3, + value.var = "data2" + ) + +# Add a prefix to the names to keep track +# (There may not be n.top.tests columns here as some get lost during the paring +# down processes above...) +names(tests.wide)[-1] <- + paste0('tests.enttype.data3.', names(tests.wide)[-1]) + +# Now, merge with the original cohort +COHORT <- + merge( + COHORT, + tests.wide, + by = c('anonpatid'), + all = TRUE + ) + + +### GP diagnosis data ########################################################## + +# read in the GP data (clinical) +clinical.data <- readMedicalData( + clinical.files, + c("anonpatid", "eventdate", "medcode", "enttype", "data1", "data2", "data3", "data4", "data5", "data6", "data7"), + c("integer", "date", "integer", "integer", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric") +) + +# Now, merge with the indexdate...we only want data before then, because looking +# after it is cheating, and using all data rather than just stuff before may +# distort our choice of variables as some variables may be very common after +# entering the study, but less so before... (14 differ...) +clinical.data <- + merge( + clinical.data, COHORT[, c('anonpatid', 'indexdate')], + by = 'anonpatid', all = TRUE + ) +clinical.data$relativedate <- clinical.data$indexdate - clinical.data$eventdate + +# Now, remove all the negative relative dates, because they're in the past +clinical.data <- clinical.data[relativedate >= 0] + +# Find out which enttypes have associated data values, to distinguish purely +# binary variables (like medical history, family history etc) from test results +# and so on which have data1, data2, etc values. +# From a quick scan, anything with any data at all has a data1 value, so we can +# use that as a proxy. +clinical.by.data1 <- clinical.data[, sum(!is.na(data1)), by = enttype] + +# Split into two data tables, the ones with associated data, and without +clinical.history <- + clinical.data[ + enttype %in% clinical.by.data1$enttype[clinical.by.data1$V1 == 0] + ] +clinical.values <- + clinical.data[ + enttype %in% clinical.by.data1$enttype[clinical.by.data1$V1 > 0] + ] + +### Clinical history + +# get aggregate statistics for each medcode +clinical.history.by.medcode <- + clinical.history[, length(unique(anonpatid)), by = medcode] +names(clinical.history.by.medcode) <- c('medcode', 'n.pat') + +# Print a table of the leading medcodes we've found +print( + merge( + clinical.history.by.medcode[order(n.pat, decreasing = TRUE)[1:100],], + CALIBER_DICT[, c('medcode', 'term')], by = 'medcode' + ) +) + +# Take the top n by number of patients with that code +top.history <- + clinical.history.by.medcode$medcode[ + order( + clinical.history.by.medcode$n.pat, decreasing = TRUE + )[1:n.top.clinical.history] + ] + +# And now, let's put how far in the past the patient was first diagnosed with +# each of these things into the table... + +# First, discard all the rows corresponding to non-top OPCS codes +clinical.history <- clinical.history[clinical.history$medcode %in% top.history, ] +# Then, keep only the earliest instance of each per patient, because those are +# the ones we care about +clinical.history <- + clinical.history[, min(relativedate), by = c('anonpatid', 'medcode')] + +# Per medcode, add a new column to the cohort and put in numbers +clinical.history <- + dcast( + data = clinical.history, + formula = anonpatid ~ medcode, + value.var = "V1" + ) + +# Add a prefix to the names to keep track +names(clinical.history)[2:(n.top.clinical.history + 1)] <- + paste0('clinical.history.', names(clinical.history)[2:(n.top.clinical.history + 1)]) + +# Now, merge with the original cohort +COHORT <- + merge( + COHORT, + clinical.history, + by = c('anonpatid'), + all = TRUE + ) + +### Clinical values + +# Next, let's melt the values data by data1, data2 etc. Each potentially +# contains a separate measurement (eg for entity type 13, which is weight, data1 +# is the weight, data2 is the weight centile [always blank in this dataset!] and +# data3 is BMI) +clinical.values <- + melt( + clinical.values, + id.vars = c("anonpatid", "relativedate", "medcode", "enttype"), + measure.vars = paste0("data", 1:7) + ) +# Because there are lots of data types, the value column gets coerced to double. +# This is fine for now, because it's all numeric, and whilst some values are +# categorical represented as integers, random forests don't care about that. + +# Remove all NA values +clinical.values <- clinical.values[!is.na(value)] + +# Remove all values measured too long ago +clinical.values <- clinical.values[relativedate <= 183] + +# aggregate by enttype and which data column the test came from +clinical.values.by.type <- + clinical.values[, length(unique(anonpatid)), by = c('enttype', 'variable')] +names(clinical.values.by.type) <- c('enttype', 'variable', 'n.pat') + +top.clinical.enttypes <- + clinical.values.by.type$enttype[order(clinical.values.by.type$n.pat, decreasing = TRUE)[1:n.top.clinical.values]] +top.clinical.dataN <- + clinical.values.by.type$variable[order(clinical.values.by.type$n.pat, decreasing = TRUE)[1:n.top.clinical.values]] + +# Now, discard all the rows corresponding to non-top tests +clinical.values <- + clinical.values[ + # Needs to be exact match of enttype and variable combination + paste0(enttype, '!!!', variable) %in% + paste0(top.clinical.enttypes, '!!!', top.clinical.dataN), + ] + +# Per patient and per test, keep the smallest relativedate value only +clinical.values.most.recent <- + clinical.values[, .SD[which.min(relativedate)], by = c('anonpatid', 'enttype', 'variable')] + +# Make a column per test +clinical.values.most.recent.wide <- + dcast( + data = clinical.values.most.recent, + formula = anonpatid ~ enttype + variable, + value.var = "value" + ) + +# Add a prefix to the names to keep track +# (There may not be n.top.tests columns here as some get lost during the paring +# down processes above...) +names(clinical.values.most.recent.wide)[-1] <- + paste0('clinical.values.', names(clinical.values.most.recent.wide)[-1]) + +# Now, merge with the original cohort +COHORT <- + merge( + COHORT, + clinical.values.most.recent.wide, + by = c('anonpatid'), + all = TRUE + ) + + +### Therapy #################################################################### + +# read in the therapy data +therapy.data <- readMedicalData( + therapy.files, + c("anonpatid", "eventdate", "bnfcode"), + c("integer", "date", "integer") +) +# The other option than bnfcode is prodcode, which refers to specific products +# rather than BNF categories. There are far more of these so, assuming the BNF +# classification is somewhat rational, I'm going to go with that first to +# reduce data sparsity. + +# Now, merge with the indexdate...we only want data before then, because looking +# after it is cheating, and using all data rather than just stuff before may +# distort our choice of variables as some variables may be very common after +# entering the study, but less so before... (14 differ...) +therapy.data <- + merge( + therapy.data, COHORT[, c('anonpatid', 'indexdate')], + by = 'anonpatid', all.x = TRUE + ) + +therapy.data$relativedate <- therapy.data$indexdate - therapy.data$eventdate + +# Now, remove all the negative relative dates, because they're in the past +therapy.data <- therapy.data[relativedate >= 0] +# And remove all data which is too far into the past +therapy.data <- therapy.data[relativedate < 366] + +# get aggregate statistics for each BNF code +therapy.by.bnf <- therapy.data[, length(unique(anonpatid)), by = bnfcode] +names(therapy.by.bnf) <- c('bnfcode', 'n.pat') +# Take the top n by number of patients with that code +top.bnf <- + therapy.by.bnf$bnfcode[order(therapy.by.bnf$n.pat, decreasing = TRUE)[1:n.top.therapy]] + +# Discard all the rows corresponding to non-top BNF codes +therapy.data <- therapy.data[therapy.data$bnfcode %in% top.bnf, ] + +# Aggregate by number of prescriptions per patient +therapy.data <- therapy.data[, .N, by = list(anonpatid, bnfcode)] + +# Per BNF code, add a new column to the cohort and put in numbers +therapy.wide <- + dcast( + data = therapy.data, + formula = anonpatid ~ bnfcode, + value.var = "N" + ) + +# Add a prefix to the names to keep track +names(therapy.wide)[2:(n.top.therapy + 1)] <- + paste0('bnf.', names(therapy.wide)[2:(n.top.therapy + 1)]) + +# And merge into the overall cohort +COHORT <- + merge( + COHORT, + therapy.wide, + by = c('anonpatid'), + all = TRUE + ) + +### Anonymisation steps ######################################################## + +# delete the columns with obvious privacy issues +COHORT[, (kill.cols) := NULL] + +# make the remaining columns relative to the indexdate +indexdate <- as.Date(COHORT$indexdate) + +# date.cols specified +for(date.col in c(date.cols)) { + COHORT[[date.col]] <- + # Do indexdate minus, so this is days in the past + as.numeric(indexdate - as.Date(COHORT[[date.col]])) + # ...and therefore negative values are in the future, hence cheating + COHORT[[date.col]][COHORT[[date.col]] < 0] <- NA +} +# make age an integer +COHORT$age <- round(COHORT$age) + +# make pracregion and ethnicity into categories +lookup_pracregion <- + sample(unique(COHORT$pracregion),length(unique(COHORT$pracregion))) +lookup_ethnicity <- + sample(unique(COHORT$hes_ethnicity),length(unique(COHORT$hes_ethnicity))) +write.csv(lookup_pracregion, 'lookup_pracregion.csv') +write.csv(lookup_ethnicity, 'lookup_ethnicity.csv') + +COHORT$pracregion <- + as.integer(factor(COHORT$pracregion, levels = lookup_pracregion)) +COHORT$hes_ethnicity <- + as.integer(factor(COHORT$hes_ethnicity, levels = lookup_ethnicity)) + +# make IMD score into deciles-ish +COHORT$imd_score <- round(COHORT$imd_score/10) + +write.csv(COHORT, 'cohort-datadriven.csv') + +fake.df <- data.frame(id = 1:10) +for (colname in names(COHORT)) { + fake.df[,colname] <- sample(COHORT[!is.na(COHORT[, colname]), colname], 10) +} + +write.csv(fake.df, 'cohort-sample.csv')