################################################################################
### USER VARIABLES #############################################################
################################################################################
setwd('R:/Pop_Health/Farr_Luscombe/')
### Relating to constructed cohort #############################################
# the R data file containing the patient cohort
cohort.file <- '2_Cohort/COHORT_SCAD_makecohort2.rda'
# columns to remove
kill.cols <- c('anonpatid', 'pracid','year_of_birth')
# columns to make relative to indexdate
date.cols <- c('afterentry_acs','afterentry_mi_nos',
'afterentry_nstemi','afterentry_stemi','afterentry_ua',
'crd','date_entry','date_exit','date_mi_endpoint','deathdate','dob','dod',
'dod_combined','earliest_chd','earliest_hf','earliest_mi','earliest_sa',
'earliest_ua','frd','hes_end','hes_start','indexdate','praclcd',
'pracuts','tod','recent_acs','recent_mi_nos','recent_nstemi',
'recent_stemi','recent_ua','smokdate','endpoint_coronary_date',
'endpoint_death_date'
)
### Relating to other files ####################################################
data.path <- '1a_ExtractedData'
hes.file <- file.path(data.path, 'hes_diag_epi.csv')
n.top.icd <- 100
procedures.file <- file.path(data.path, 'hes_procedures.csv')
n.top.procedures <- 100
tests.files <-
file.path(
data.path,
paste0('test.part.', 0:3)
)
# Number of tests to extract
n.top.tests <- 100
# Find tests as close as possible to the first value here, and no more than the
# second timepoint before it
test.timepoints <- c(0, 183) #days
clinical.files <-
file.path(
data.path,
paste0('clinical.part.', 0:3)
)
n.top.clinical.values <- 100
n.top.clinical.history <- 100
therapy.files <-
file.path(
data.path,
paste0('therapy.part.', 0:7)
)
n.top.therapy <- 100
################################################################################
### END USER VARIABLES #########################################################
################################################################################
### Load the patient cohort to act as a base ###################################
# load the standard COHORT variable, which is a data table called COHORT
load(cohort.file)
### Do a bunch of silly fixes on the data ######################################
# cabg_6mo is 1s and 0s, which should be TRUE and FALSE
COHORT$cabg_6mo <- as.logical(COHORT$cabg_6mo)
# long_nitrate is 1s and NAs, which should be TRUE and FALSE...
COHORT$long_nitrate <- as.logical(COHORT$long_nitrate)
COHORT$long_nitrate[is.na(COHORT$long_nitrate)] <- FALSE
# pci_6mo is 1s and 0s, which should be TRUE and FALSE
COHORT$pci_6mo <- as.logical(COHORT$pci_6mo)
### Append other data types ####################################################
# We'll be needing handymedical from here on in
source('Andrew/lib/handymedical.R', chdir = TRUE)
require(CALIBERlookups)
percentMissing <- function(x, sf = 3) {
round(sum(is.na(x))/length(x), digits = sf)*100
}
### Hospital episodes statistics ###############################################
# read in the hospital data (HES)
hes.diag.epi <- readMedicalData(
hes.file,
c("anonpatid", "epistart", "epiend", "icd"),
c("integer", "date", "date", "factor")
)
# remove non alphanumerics (trailing -s on some ICD codes)
hes.diag.epi$icd <- gsub('[^[:alnum:]]', '', hes.diag.epi$icd)
# Now, merge with the indexdate...we only want data before then, because looking
# after it is cheating, and using all data rather than just stuff before may
# distort our choice of variables as some variables may be very common after
# entering the study, but less so before... (This actually isn't much of an
# issue...only 9 variables differ. Still!)
hes.diag.epi <-
merge(
hes.diag.epi, COHORT[, c('anonpatid', 'indexdate')],
by = 'anonpatid', all = TRUE
)
hes.diag.epi$relativedate <- hes.diag.epi$indexdate - hes.diag.epi$epistart
# Now, remove all the negative relative dates, because they're in the past
hes.diag.epi <- hes.diag.epi[hes.diag.epi$relativedate >= 0, ]
# get aggregate statistics for each ICD code
hes.by.icd <- hes.diag.epi[, length(unique(anonpatid)), by = icd]
names(hes.by.icd) <- c('icd', 'n.pat')
# Take the top n by number of patients with that code
top.icd <-
hes.by.icd$icd[order(hes.by.icd$n.pat, decreasing = TRUE)[1:n.top.icd]]
# And now, let's put how far in the past the patient was first diagnosed with
# each of these things into the table...
# First, discard all the rows corresponding to non-top ICDs
hes.diag.epi.top <- hes.diag.epi[hes.diag.epi$icd %in% top.icd, ]
# Then, keep only the earliest instance of each per patient, because those are
# the ones we care about
hes.diag.epi.top.earliest <-
hes.diag.epi.top[, min(relativedate), by = c('anonpatid', 'icd')]
# Per ICD code, add a new column to the cohort and put in numbers
hes.diag.epi.top.earliest <-
dcast(
data = hes.diag.epi.top.earliest,
formula = anonpatid ~ icd,
value.var = "V1"
)
# Add a prefix to the names to keep track
names(hes.diag.epi.top.earliest)[2:(n.top.icd + 1)] <-
paste0('hes.icd.', names(hes.diag.epi.top.earliest)[2:(n.top.icd + 1)])
# Keep the names for when we need to make them relative to the indexdate when
# anonymising later
names.hes.diag.icd <- names(hes.diag.epi.top.earliest)[2:(n.top.icd + 1)]
# Now, merge with the original cohort
COHORT <-
merge(
COHORT,
hes.diag.epi.top.earliest,
by = c('anonpatid'),
all = TRUE
)
hes.icd.summary <-
data.frame(
percent.missing =
sort(
sapply(
COHORT[, names(COHORT)[startsWith(names(COHORT), 'hes.icd.')], with = FALSE],
percentMissing
)
)
)
hes.icd.summary$code <- substring(rownames(hes.icd.summary), 9)
hes.icd.summary <- merge(hes.icd.summary, CALIBER_DICT[, c('code', 'term')], by = 'code')
print(hes.icd.summary[order(hes.icd.summary$percent.missing),])
### Hospital procedures data ###################################################
# read in the hospital data (HES)
hes.procedures <- readMedicalData(
procedures.file,
c("anonpatid", "opcs", "evdate"),
c("integer", "factor", "date")
)
# Now, merge with the indexdate...we only want data before then, because looking
# after it is cheating, and using all data rather than just stuff before may
# distort our choice of variables as some variables may be very common after
# entering the study, but less so before... (14 differ...)
hes.procedures <-
merge(
hes.procedures, COHORT[, c('anonpatid', 'indexdate')],
by = 'anonpatid', all = TRUE
)
hes.procedures$relativedate <- hes.procedures$indexdate - hes.procedures$evdate
# Now, remove all the negative relative dates, because they're in the past
hes.procedures <- hes.procedures[relativedate >= 0]
# get aggregate statistics for each OPCS code
hes.by.opcs <- hes.procedures[, length(unique(anonpatid)), by = opcs]
names(hes.by.opcs) <- c('opcs', 'n.pat')
# Take the top n by number of patients with that code
top.opcs <-
hes.by.opcs$opcs[order(hes.by.opcs$n.pat, decreasing = TRUE)[1:n.top.procedures]]
# And now, let's put how far in the past the patient was first diagnosed with
# each of these things into the table...
# First, discard all the rows corresponding to non-top OPCS codes
hes.procedures.top <- hes.procedures[hes.procedures$opcs %in% top.opcs, ]
# Then, keep only the earliest instance of each per patient, because those are
# the ones we care about
hes.procedures.top.earliest <-
hes.procedures.top[, min(relativedate), by = c('anonpatid', 'opcs')]
# Per OPCS code, add a new column to the cohort and put in numbers
hes.procedures.top.earliest <-
dcast(
data = hes.procedures.top.earliest,
formula = anonpatid ~ opcs,
value.var = "V1"
)
# Add a prefix to the names to keep track
names(hes.procedures.top.earliest)[2:(n.top.procedures + 1)] <-
paste0('hes.opcs.', names(hes.procedures.top.earliest)[2:(n.top.procedures + 1)])
# Keep the names for when we need to make them relative to the indexdate when
# anonymising later
names.procedures.opcs <- names(hes.procedures.top.earliest)[2:(n.top.icd + 1)]
# Now, merge with the original cohort
COHORT <-
merge(
COHORT,
hes.procedures.top.earliest,
by = c('anonpatid'),
all = TRUE
)
# Summarise by percent missing
hes.opcs.summary <-
data.frame(
percent.missing =
sort(
sapply(
COHORT[, names(COHORT)[startsWith(names(COHORT), 'hes.opcs.')], with = FALSE],
percentMissing
)
)
)
hes.opcs.summary$code <- substring(rownames(hes.opcs.summary), 10)
hes.opcs.summary <- merge(hes.opcs.summary, CALIBER_DICT[, c('code', 'term')], by = 'code')
print(hes.opcs.summary[order(hes.opcs.summary$percent.missing),])
### Test results ###############################################################
# read in the test data
tests.data <- readMedicalData(
tests.files,
# data1 is operator (eg =, > etc), data2 is value, data 3 is unit of measure
c("anonpatid", "eventdate", "enttype", "data1", "data2", "data3"),
c("integer", "date", "integer", "integer", "numeric", "integer")
)
# First, discard those where operator is not =, because > and < etc will
# introduce complexity, and drop data1 since it's now useless
tests.data <- tests.data[data1 == 3]
tests.data$data1 <- NULL
# Now, let's subtract the indexdate from every test so we can choose the ones
# closest to the desired dates...
tests.data <-
merge(
tests.data, COHORT[, c('anonpatid', 'indexdate')],
by = 'anonpatid', all = TRUE
)
tests.data$relativedate <- tests.data$indexdate - tests.data$eventdate
# Only keep positive values; negative ones are in the future which is cheating!
tests.data <- tests.data[relativedate >= 0]
# Only 89001 have any test results from before the indexdate!
# Discard any test results from times greater than the longest time ago to check
tests.data <- tests.data[relativedate < test.timepoints[2]]
# And this drops to 57972 in the preceding six months
# Per patient and per test, keep the smallest relativedate value only
tests.data <-
tests.data[, .SD[which.min(relativedate)], by = list(anonpatid, enttype)]
# aggregate by test type (enttype, which covers a range of Read codes which can
# mean the same test) and unit of measure (so we can choose the tests with the
# units with the best coverage)
# We do this after all the preprocessing (ie only looking at the most recent
# value per patient before but not too far before the indexdate) because
# otherwise a lot of the top tests change. Presumably
tests.by.test <-
tests.data[, length(unique(anonpatid)), by = c('enttype', 'data3')]
names(tests.by.test) <- c('enttype', 'data3', 'n.pat')
top.tests <- tests.by.test$enttype[order(tests.by.test$n.pat, decreasing = TRUE)[1:n.top.tests]]
top.units <- tests.by.test$data3[order(tests.by.test$n.pat, decreasing = TRUE)[1:n.top.tests]]
# Now, discard all the rows corresponding to non-top tests
tests.data <-
tests.data[
# Needs to be exact match of enttype and variable combination
paste0(enttype, '!!!', data3) %in% paste0(top.tests, '!!!', top.units)
]
# Make a column per test
tests.wide <-
dcast(
data = tests.data,
formula = anonpatid ~ enttype + data3,
value.var = "data2"
)
# Add a prefix to the names to keep track
# (There may not be n.top.tests columns here as some get lost during the paring
# down processes above...)
names(tests.wide)[-1] <-
paste0('tests.enttype.data3.', names(tests.wide)[-1])
# Now, merge with the original cohort
COHORT <-
merge(
COHORT,
tests.wide,
by = c('anonpatid'),
all = TRUE
)
### GP diagnosis data ##########################################################
# read in the GP data (clinical)
clinical.data <- readMedicalData(
clinical.files,
c("anonpatid", "eventdate", "medcode", "enttype", "data1", "data2", "data3", "data4", "data5", "data6", "data7"),
c("integer", "date", "integer", "integer", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric")
)
# Now, merge with the indexdate...we only want data before then, because looking
# after it is cheating, and using all data rather than just stuff before may
# distort our choice of variables as some variables may be very common after
# entering the study, but less so before... (14 differ...)
clinical.data <-
merge(
clinical.data, COHORT[, c('anonpatid', 'indexdate')],
by = 'anonpatid', all = TRUE
)
clinical.data$relativedate <- clinical.data$indexdate - clinical.data$eventdate
# Now, remove all the negative relative dates, because they're in the past
clinical.data <- clinical.data[relativedate >= 0]
# Find out which enttypes have associated data values, to distinguish purely
# binary variables (like medical history, family history etc) from test results
# and so on which have data1, data2, etc values.
# From a quick scan, anything with any data at all has a data1 value, so we can
# use that as a proxy.
clinical.by.data1 <- clinical.data[, sum(!is.na(data1)), by = enttype]
# Split into two data tables, the ones with associated data, and without
clinical.history <-
clinical.data[
enttype %in% clinical.by.data1$enttype[clinical.by.data1$V1 == 0]
]
clinical.values <-
clinical.data[
enttype %in% clinical.by.data1$enttype[clinical.by.data1$V1 > 0]
]
### Clinical history
# get aggregate statistics for each medcode
clinical.history.by.medcode <-
clinical.history[, length(unique(anonpatid)), by = medcode]
names(clinical.history.by.medcode) <- c('medcode', 'n.pat')
# Print a table of the leading medcodes we've found
print(
merge(
clinical.history.by.medcode[order(n.pat, decreasing = TRUE)[1:100],],
CALIBER_DICT[, c('medcode', 'term')], by = 'medcode'
)
)
# Take the top n by number of patients with that code
top.history <-
clinical.history.by.medcode$medcode[
order(
clinical.history.by.medcode$n.pat, decreasing = TRUE
)[1:n.top.clinical.history]
]
# And now, let's put how far in the past the patient was first diagnosed with
# each of these things into the table...
# First, discard all the rows corresponding to non-top OPCS codes
clinical.history <- clinical.history[clinical.history$medcode %in% top.history, ]
# Then, keep only the earliest instance of each per patient, because those are
# the ones we care about
clinical.history <-
clinical.history[, min(relativedate), by = c('anonpatid', 'medcode')]
# Per medcode, add a new column to the cohort and put in numbers
clinical.history <-
dcast(
data = clinical.history,
formula = anonpatid ~ medcode,
value.var = "V1"
)
# Add a prefix to the names to keep track
names(clinical.history)[2:(n.top.clinical.history + 1)] <-
paste0('clinical.history.', names(clinical.history)[2:(n.top.clinical.history + 1)])
# Now, merge with the original cohort
COHORT <-
merge(
COHORT,
clinical.history,
by = c('anonpatid'),
all = TRUE
)
### Clinical values
# Next, let's melt the values data by data1, data2 etc. Each potentially
# contains a separate measurement (eg for entity type 13, which is weight, data1
# is the weight, data2 is the weight centile [always blank in this dataset!] and
# data3 is BMI)
clinical.values <-
melt(
clinical.values,
id.vars = c("anonpatid", "relativedate", "medcode", "enttype"),
measure.vars = paste0("data", 1:7)
)
# Because there are lots of data types, the value column gets coerced to double.
# This is fine for now, because it's all numeric, and whilst some values are
# categorical represented as integers, random forests don't care about that.
# Remove all NA values
clinical.values <- clinical.values[!is.na(value)]
# Remove all values measured too long ago
clinical.values <- clinical.values[relativedate <= 183]
# aggregate by enttype and which data column the test came from
clinical.values.by.type <-
clinical.values[, length(unique(anonpatid)), by = c('enttype', 'variable')]
names(clinical.values.by.type) <- c('enttype', 'variable', 'n.pat')
top.clinical.enttypes <-
clinical.values.by.type$enttype[order(clinical.values.by.type$n.pat, decreasing = TRUE)[1:n.top.clinical.values]]
top.clinical.dataN <-
clinical.values.by.type$variable[order(clinical.values.by.type$n.pat, decreasing = TRUE)[1:n.top.clinical.values]]
# Now, discard all the rows corresponding to non-top tests
clinical.values <-
clinical.values[
# Needs to be exact match of enttype and variable combination
paste0(enttype, '!!!', variable) %in%
paste0(top.clinical.enttypes, '!!!', top.clinical.dataN),
]
# Per patient and per test, keep the smallest relativedate value only
clinical.values.most.recent <-
clinical.values[, .SD[which.min(relativedate)], by = c('anonpatid', 'enttype', 'variable')]
# Make a column per test
clinical.values.most.recent.wide <-
dcast(
data = clinical.values.most.recent,
formula = anonpatid ~ enttype + variable,
value.var = "value"
)
# Add a prefix to the names to keep track
# (There may not be n.top.tests columns here as some get lost during the paring
# down processes above...)
names(clinical.values.most.recent.wide)[-1] <-
paste0('clinical.values.', names(clinical.values.most.recent.wide)[-1])
# Now, merge with the original cohort
COHORT <-
merge(
COHORT,
clinical.values.most.recent.wide,
by = c('anonpatid'),
all = TRUE
)
### Therapy ####################################################################
# read in the therapy data
therapy.data <- readMedicalData(
therapy.files,
c("anonpatid", "eventdate", "bnfcode"),
c("integer", "date", "integer")
)
# The other option than bnfcode is prodcode, which refers to specific products
# rather than BNF categories. There are far more of these so, assuming the BNF
# classification is somewhat rational, I'm going to go with that first to
# reduce data sparsity.
# Now, merge with the indexdate...we only want data before then, because looking
# after it is cheating, and using all data rather than just stuff before may
# distort our choice of variables as some variables may be very common after
# entering the study, but less so before... (14 differ...)
therapy.data <-
merge(
therapy.data, COHORT[, c('anonpatid', 'indexdate')],
by = 'anonpatid', all.x = TRUE
)
therapy.data$relativedate <- therapy.data$indexdate - therapy.data$eventdate
# Now, remove all the negative relative dates, because they're in the past
therapy.data <- therapy.data[relativedate >= 0]
# And remove all data which is too far into the past
therapy.data <- therapy.data[relativedate < 366]
# get aggregate statistics for each BNF code
therapy.by.bnf <- therapy.data[, length(unique(anonpatid)), by = bnfcode]
names(therapy.by.bnf) <- c('bnfcode', 'n.pat')
# Take the top n by number of patients with that code
top.bnf <-
therapy.by.bnf$bnfcode[order(therapy.by.bnf$n.pat, decreasing = TRUE)[1:n.top.therapy]]
# Discard all the rows corresponding to non-top BNF codes
therapy.data <- therapy.data[therapy.data$bnfcode %in% top.bnf, ]
# Aggregate by number of prescriptions per patient
therapy.data <- therapy.data[, .N, by = list(anonpatid, bnfcode)]
# Per BNF code, add a new column to the cohort and put in numbers
therapy.wide <-
dcast(
data = therapy.data,
formula = anonpatid ~ bnfcode,
value.var = "N"
)
# Add a prefix to the names to keep track
names(therapy.wide)[2:(n.top.therapy + 1)] <-
paste0('bnf.', names(therapy.wide)[2:(n.top.therapy + 1)])
# And merge into the overall cohort
COHORT <-
merge(
COHORT,
therapy.wide,
by = c('anonpatid'),
all = TRUE
)
### Anonymisation steps ########################################################
# delete the columns with obvious privacy issues
COHORT[, (kill.cols) := NULL]
# make the remaining columns relative to the indexdate
indexdate <- as.Date(COHORT$indexdate)
# date.cols specified
for(date.col in c(date.cols)) {
COHORT[[date.col]] <-
# Do indexdate minus, so this is days in the past
as.numeric(indexdate - as.Date(COHORT[[date.col]]))
# ...and therefore negative values are in the future, hence cheating
COHORT[[date.col]][COHORT[[date.col]] < 0] <- NA
}
# make age an integer
COHORT$age <- round(COHORT$age)
# make pracregion and ethnicity into categories
lookup_pracregion <-
sample(unique(COHORT$pracregion),length(unique(COHORT$pracregion)))
lookup_ethnicity <-
sample(unique(COHORT$hes_ethnicity),length(unique(COHORT$hes_ethnicity)))
write.csv(lookup_pracregion, 'lookup_pracregion.csv')
write.csv(lookup_ethnicity, 'lookup_ethnicity.csv')
COHORT$pracregion <-
as.integer(factor(COHORT$pracregion, levels = lookup_pracregion))
COHORT$hes_ethnicity <-
as.integer(factor(COHORT$hes_ethnicity, levels = lookup_ethnicity))
# make IMD score into deciles-ish
COHORT$imd_score <- round(COHORT$imd_score/10)
write.csv(COHORT, 'cohort-datadriven.csv')
fake.df <- data.frame(id = 1:10)
for (colname in names(COHORT)) {
fake.df[,colname] <- sample(COHORT[!is.na(COHORT[, colname]), colname], 10)
}
write.csv(fake.df, 'cohort-sample.csv')