Diff of /scripts/export-bigdata.R [000000] .. [0375db]

Switch to unified view

a b/scripts/export-bigdata.R
1
################################################################################
2
### USER VARIABLES #############################################################
3
################################################################################
4
5
setwd('R:/Pop_Health/Farr_Luscombe/')
6
7
8
### Relating to constructed cohort #############################################
9
# the R data file containing the patient cohort
10
cohort.file <- '2_Cohort/COHORT_SCAD_makecohort2.rda'
11
12
# columns to remove
13
kill.cols <- c('anonpatid', 'pracid','year_of_birth')
14
15
# columns to make relative to indexdate
16
date.cols <- c('afterentry_acs','afterentry_mi_nos',
17
            'afterentry_nstemi','afterentry_stemi','afterentry_ua',
18
            'crd','date_entry','date_exit','date_mi_endpoint','deathdate','dob','dod',
19
            'dod_combined','earliest_chd','earliest_hf','earliest_mi','earliest_sa',
20
            'earliest_ua','frd','hes_end','hes_start','indexdate','praclcd',
21
            'pracuts','tod','recent_acs','recent_mi_nos','recent_nstemi',
22
            'recent_stemi','recent_ua','smokdate','endpoint_coronary_date',
23
            'endpoint_death_date'
24
)
25
26
### Relating to other files ####################################################
27
data.path <- '1a_ExtractedData'
28
29
hes.file <- file.path(data.path, 'hes_diag_epi.csv')
30
n.top.icd <- 100
31
32
procedures.file <- file.path(data.path, 'hes_procedures.csv')
33
n.top.procedures <- 100
34
35
tests.files <-
36
  file.path(
37
    data.path,
38
    paste0('test.part.', 0:3)
39
  )
40
# Number of tests to extract
41
n.top.tests <- 100
42
# Find tests as close as possible to the first value here, and no more than the
43
# second timepoint before it
44
test.timepoints <- c(0, 183) #days
45
46
clinical.files <-
47
  file.path(
48
    data.path,
49
    paste0('clinical.part.', 0:3)
50
  )
51
n.top.clinical.values <- 100
52
n.top.clinical.history <- 100
53
54
therapy.files <-
55
  file.path(
56
    data.path,
57
    paste0('therapy.part.', 0:7)
58
  )
59
n.top.therapy <- 100
60
61
62
################################################################################
63
### END USER VARIABLES #########################################################
64
################################################################################
65
66
67
### Load the patient cohort to act as a base ###################################
68
69
# load the standard COHORT variable, which is a data table called COHORT
70
load(cohort.file)
71
72
### Do a bunch of silly fixes on the data ######################################
73
# cabg_6mo is 1s and 0s, which should be TRUE and FALSE
74
COHORT$cabg_6mo <- as.logical(COHORT$cabg_6mo)
75
# long_nitrate is 1s and NAs, which should be TRUE and FALSE...
76
COHORT$long_nitrate <- as.logical(COHORT$long_nitrate)
77
COHORT$long_nitrate[is.na(COHORT$long_nitrate)] <- FALSE
78
# pci_6mo is 1s and 0s, which should be TRUE and FALSE
79
COHORT$pci_6mo <- as.logical(COHORT$pci_6mo)
80
81
### Append other data types ####################################################
82
83
# We'll be needing handymedical from here on in
84
source('Andrew/lib/handymedical.R', chdir = TRUE)
85
require(CALIBERlookups)
86
87
percentMissing <- function(x, sf = 3) {
88
  round(sum(is.na(x))/length(x), digits = sf)*100
89
}
90
91
### Hospital episodes statistics ###############################################
92
93
# read in the hospital data (HES)
94
hes.diag.epi <- readMedicalData(
95
  hes.file,
96
  c("anonpatid", "epistart", "epiend", "icd"),
97
  c("integer", "date", "date", "factor")
98
)
99
100
# remove non alphanumerics (trailing -s on some ICD codes)
101
hes.diag.epi$icd <- gsub('[^[:alnum:]]', '', hes.diag.epi$icd)
102
103
# Now, merge with the indexdate...we only want data before then, because looking
104
# after it is cheating, and using all data rather than just stuff before may
105
# distort our choice of variables as some variables may be very common after
106
# entering the study, but less so before... (This actually isn't much of an
107
# issue...only 9 variables differ. Still!)
108
hes.diag.epi <-
109
  merge(
110
    hes.diag.epi, COHORT[, c('anonpatid', 'indexdate')],
111
    by = 'anonpatid', all = TRUE
112
  )
113
hes.diag.epi$relativedate <- hes.diag.epi$indexdate - hes.diag.epi$epistart
114
115
# Now, remove all the negative relative dates, because they're in the past
116
hes.diag.epi <- hes.diag.epi[hes.diag.epi$relativedate >= 0, ]
117
118
# get aggregate statistics for each ICD code
119
hes.by.icd <- hes.diag.epi[, length(unique(anonpatid)), by = icd]
120
names(hes.by.icd) <- c('icd', 'n.pat')
121
122
# Take the top n by number of patients with that code
123
top.icd <-
124
  hes.by.icd$icd[order(hes.by.icd$n.pat, decreasing = TRUE)[1:n.top.icd]]
125
126
# And now, let's put how far in the past the patient was first diagnosed with
127
# each of these things into the table...
128
129
# First, discard all the rows corresponding to non-top ICDs
130
hes.diag.epi.top <- hes.diag.epi[hes.diag.epi$icd %in% top.icd, ]
131
# Then, keep only the earliest instance of each per patient, because those are
132
# the ones we care about
133
hes.diag.epi.top.earliest <- 
134
  hes.diag.epi.top[, min(relativedate), by = c('anonpatid', 'icd')]
135
136
# Per ICD code, add a new column to the cohort and put in numbers
137
hes.diag.epi.top.earliest <-
138
  dcast(
139
    data = hes.diag.epi.top.earliest,
140
    formula = anonpatid ~ icd,
141
    value.var = "V1"
142
  )
143
144
# Add a prefix to the names to keep track
145
names(hes.diag.epi.top.earliest)[2:(n.top.icd + 1)] <-
146
  paste0('hes.icd.', names(hes.diag.epi.top.earliest)[2:(n.top.icd + 1)])
147
148
# Keep the names for when we need to make them relative to the indexdate when
149
# anonymising later
150
names.hes.diag.icd <- names(hes.diag.epi.top.earliest)[2:(n.top.icd + 1)]
151
152
# Now, merge with the original cohort
153
COHORT <-
154
  merge(
155
    COHORT,
156
    hes.diag.epi.top.earliest,
157
    by = c('anonpatid'),
158
    all = TRUE
159
  )
160
161
hes.icd.summary <-
162
  data.frame(
163
    percent.missing = 
164
      sort(
165
        sapply(
166
          COHORT[, names(COHORT)[startsWith(names(COHORT), 'hes.icd.')], with = FALSE],
167
          percentMissing
168
        )
169
      )
170
  )
171
172
hes.icd.summary$code <- substring(rownames(hes.icd.summary), 9)
173
174
hes.icd.summary <- merge(hes.icd.summary, CALIBER_DICT[, c('code', 'term')], by = 'code')
175
176
print(hes.icd.summary[order(hes.icd.summary$percent.missing),])
177
178
### Hospital procedures data ###################################################
179
180
# read in the hospital data (HES)
181
hes.procedures <- readMedicalData(
182
  procedures.file,
183
  c("anonpatid", "opcs", "evdate"),
184
  c("integer", "factor", "date")
185
)
186
187
# Now, merge with the indexdate...we only want data before then, because looking
188
# after it is cheating, and using all data rather than just stuff before may
189
# distort our choice of variables as some variables may be very common after
190
# entering the study, but less so before... (14 differ...)
191
hes.procedures <-
192
  merge(
193
    hes.procedures, COHORT[, c('anonpatid', 'indexdate')],
194
    by = 'anonpatid', all = TRUE
195
  )
196
hes.procedures$relativedate <- hes.procedures$indexdate - hes.procedures$evdate
197
198
# Now, remove all the negative relative dates, because they're in the past
199
hes.procedures <- hes.procedures[relativedate >= 0]
200
201
# get aggregate statistics for each OPCS code
202
hes.by.opcs <- hes.procedures[, length(unique(anonpatid)), by = opcs]
203
names(hes.by.opcs) <- c('opcs', 'n.pat')
204
205
# Take the top n by number of patients with that code
206
top.opcs <-
207
  hes.by.opcs$opcs[order(hes.by.opcs$n.pat, decreasing = TRUE)[1:n.top.procedures]]
208
209
# And now, let's put how far in the past the patient was first diagnosed with
210
# each of these things into the table...
211
212
# First, discard all the rows corresponding to non-top OPCS codes
213
hes.procedures.top <- hes.procedures[hes.procedures$opcs %in% top.opcs, ]
214
# Then, keep only the earliest instance of each per patient, because those are
215
# the ones we care about
216
hes.procedures.top.earliest <-
217
  hes.procedures.top[, min(relativedate), by = c('anonpatid', 'opcs')]
218
219
# Per OPCS code, add a new column to the cohort and put in numbers
220
hes.procedures.top.earliest <-
221
  dcast(
222
    data = hes.procedures.top.earliest,
223
    formula = anonpatid ~ opcs,
224
    value.var = "V1"
225
  )
226
227
# Add a prefix to the names to keep track
228
names(hes.procedures.top.earliest)[2:(n.top.procedures + 1)] <-
229
  paste0('hes.opcs.', names(hes.procedures.top.earliest)[2:(n.top.procedures + 1)])
230
231
# Keep the names for when we need to make them relative to the indexdate when
232
# anonymising later
233
names.procedures.opcs <- names(hes.procedures.top.earliest)[2:(n.top.icd + 1)]
234
235
# Now, merge with the original cohort
236
COHORT <-
237
  merge(
238
    COHORT,
239
    hes.procedures.top.earliest,
240
    by = c('anonpatid'),
241
    all = TRUE
242
  )
243
244
# Summarise by percent missing
245
hes.opcs.summary <-
246
  data.frame(
247
    percent.missing = 
248
      sort(
249
        sapply(
250
          COHORT[, names(COHORT)[startsWith(names(COHORT), 'hes.opcs.')], with = FALSE],
251
          percentMissing
252
        )
253
      )
254
  )
255
256
hes.opcs.summary$code <- substring(rownames(hes.opcs.summary), 10)
257
258
hes.opcs.summary <- merge(hes.opcs.summary, CALIBER_DICT[, c('code', 'term')], by = 'code')
259
260
print(hes.opcs.summary[order(hes.opcs.summary$percent.missing),])
261
262
### Test results ###############################################################
263
264
# read in the test data
265
tests.data <- readMedicalData(
266
  tests.files,
267
  # data1 is operator (eg =, > etc), data2 is value, data 3 is unit of measure
268
  c("anonpatid", "eventdate", "enttype", "data1", "data2", "data3"),
269
  c("integer", "date", "integer", "integer", "numeric", "integer")
270
)
271
272
# First, discard those where operator is not =, because > and < etc will
273
# introduce complexity, and drop data1 since it's now useless
274
tests.data <- tests.data[data1 == 3]
275
tests.data$data1 <- NULL
276
277
# Now, let's subtract the indexdate from every test so we can choose the ones
278
# closest to the desired dates...
279
tests.data <-
280
  merge(
281
    tests.data, COHORT[, c('anonpatid', 'indexdate')],
282
    by = 'anonpatid', all = TRUE
283
  )
284
tests.data$relativedate <- tests.data$indexdate - tests.data$eventdate
285
286
# Only keep positive values; negative ones are in the future which is cheating!
287
tests.data <- tests.data[relativedate >= 0]
288
# Only 89001 have any test results from before the indexdate!
289
290
# Discard any test results from times greater than the longest time ago to check
291
tests.data <- tests.data[relativedate < test.timepoints[2]]
292
# And this drops to 57972 in the preceding six months
293
294
# Per patient and per test, keep the smallest relativedate value only
295
tests.data <-
296
  tests.data[, .SD[which.min(relativedate)], by = list(anonpatid, enttype)]
297
298
# aggregate by test type (enttype, which covers a range of Read codes which can
299
# mean the same test) and unit of measure (so we can choose the tests with the
300
# units with the best coverage)
301
# We do this after all the preprocessing (ie only looking at the most recent
302
# value per patient before but not too far before the indexdate) because
303
# otherwise a lot of the top tests change. Presumably 
304
tests.by.test <-
305
  tests.data[, length(unique(anonpatid)), by = c('enttype', 'data3')]
306
names(tests.by.test) <- c('enttype', 'data3', 'n.pat')
307
308
top.tests <- tests.by.test$enttype[order(tests.by.test$n.pat, decreasing = TRUE)[1:n.top.tests]]
309
top.units <- tests.by.test$data3[order(tests.by.test$n.pat, decreasing = TRUE)[1:n.top.tests]]
310
311
# Now, discard all the rows corresponding to non-top tests
312
tests.data <-
313
  tests.data[
314
    # Needs to be exact match of enttype and variable combination
315
    paste0(enttype, '!!!', data3) %in% paste0(top.tests, '!!!', top.units)
316
  ]
317
318
# Make a column per test
319
tests.wide <-
320
  dcast(
321
    data = tests.data,
322
    formula = anonpatid ~ enttype + data3,
323
    value.var = "data2"
324
  )
325
326
# Add a prefix to the names to keep track
327
# (There may not be n.top.tests columns here as some get lost during the paring
328
# down processes above...)
329
names(tests.wide)[-1] <-
330
  paste0('tests.enttype.data3.', names(tests.wide)[-1])
331
332
# Now, merge with the original cohort
333
COHORT <-
334
  merge(
335
    COHORT,
336
    tests.wide,
337
    by = c('anonpatid'),
338
    all = TRUE
339
  )
340
341
342
### GP diagnosis data ##########################################################
343
344
# read in the GP data (clinical)
345
clinical.data <- readMedicalData(
346
  clinical.files,
347
  c("anonpatid", "eventdate", "medcode", "enttype", "data1", "data2", "data3", "data4", "data5", "data6", "data7"),
348
  c("integer", "date", "integer", "integer", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric")
349
)
350
351
# Now, merge with the indexdate...we only want data before then, because looking
352
# after it is cheating, and using all data rather than just stuff before may
353
# distort our choice of variables as some variables may be very common after
354
# entering the study, but less so before... (14 differ...)
355
clinical.data <-
356
  merge(
357
    clinical.data, COHORT[, c('anonpatid', 'indexdate')],
358
    by = 'anonpatid', all = TRUE
359
  )
360
clinical.data$relativedate <- clinical.data$indexdate - clinical.data$eventdate
361
362
# Now, remove all the negative relative dates, because they're in the past
363
clinical.data <- clinical.data[relativedate >= 0]
364
365
# Find out which enttypes have associated data values, to distinguish purely
366
# binary variables (like medical history, family history etc) from test results
367
# and so on which have data1, data2, etc values.
368
# From a quick scan, anything with any data at all has a data1 value, so we can
369
# use that as a proxy.
370
clinical.by.data1 <- clinical.data[, sum(!is.na(data1)), by = enttype]
371
372
# Split into two data tables, the ones with associated data, and without
373
clinical.history <-
374
  clinical.data[
375
    enttype %in% clinical.by.data1$enttype[clinical.by.data1$V1 == 0]
376
  ]
377
clinical.values <-
378
  clinical.data[
379
    enttype %in% clinical.by.data1$enttype[clinical.by.data1$V1 > 0]
380
    ]
381
382
### Clinical history
383
384
# get aggregate statistics for each medcode
385
clinical.history.by.medcode <-
386
  clinical.history[, length(unique(anonpatid)), by = medcode]
387
names(clinical.history.by.medcode) <- c('medcode', 'n.pat')
388
389
# Print a table of the leading medcodes we've found
390
print(
391
  merge(
392
    clinical.history.by.medcode[order(n.pat, decreasing = TRUE)[1:100],],
393
    CALIBER_DICT[, c('medcode', 'term')], by = 'medcode'
394
  )
395
)
396
397
# Take the top n by number of patients with that code
398
top.history <-
399
  clinical.history.by.medcode$medcode[
400
    order(
401
      clinical.history.by.medcode$n.pat, decreasing = TRUE
402
    )[1:n.top.clinical.history]
403
  ]
404
405
# And now, let's put how far in the past the patient was first diagnosed with
406
# each of these things into the table...
407
408
# First, discard all the rows corresponding to non-top OPCS codes
409
clinical.history <- clinical.history[clinical.history$medcode %in% top.history, ]
410
# Then, keep only the earliest instance of each per patient, because those are
411
# the ones we care about
412
clinical.history <-
413
  clinical.history[, min(relativedate), by = c('anonpatid', 'medcode')]
414
415
# Per medcode, add a new column to the cohort and put in numbers
416
clinical.history <-
417
  dcast(
418
    data = clinical.history,
419
    formula = anonpatid ~ medcode,
420
    value.var = "V1"
421
  )
422
423
# Add a prefix to the names to keep track
424
names(clinical.history)[2:(n.top.clinical.history + 1)] <-
425
  paste0('clinical.history.', names(clinical.history)[2:(n.top.clinical.history + 1)])
426
427
# Now, merge with the original cohort
428
COHORT <-
429
  merge(
430
    COHORT,
431
    clinical.history,
432
    by = c('anonpatid'),
433
    all = TRUE
434
  )
435
436
### Clinical values
437
438
# Next, let's melt the values data by data1, data2 etc. Each potentially
439
# contains a separate measurement (eg for entity type 13, which is weight, data1
440
# is the weight, data2 is the weight centile [always blank in this dataset!] and
441
# data3 is BMI)
442
clinical.values <-
443
  melt(
444
    clinical.values,
445
    id.vars = c("anonpatid", "relativedate", "medcode", "enttype"),
446
    measure.vars = paste0("data", 1:7)
447
  )
448
# Because there are lots of data types, the value column gets coerced to double.
449
# This is fine for now, because it's all numeric, and whilst some values are
450
# categorical represented as integers, random forests don't care about that.
451
452
# Remove all NA values
453
clinical.values <- clinical.values[!is.na(value)]
454
455
# Remove all values measured too long ago
456
clinical.values <- clinical.values[relativedate <= 183]
457
458
# aggregate by enttype and which data column the test came from
459
clinical.values.by.type <-
460
  clinical.values[, length(unique(anonpatid)), by = c('enttype', 'variable')]
461
names(clinical.values.by.type) <- c('enttype', 'variable', 'n.pat')
462
463
top.clinical.enttypes <-
464
  clinical.values.by.type$enttype[order(clinical.values.by.type$n.pat, decreasing = TRUE)[1:n.top.clinical.values]]
465
top.clinical.dataN <-
466
  clinical.values.by.type$variable[order(clinical.values.by.type$n.pat, decreasing = TRUE)[1:n.top.clinical.values]]
467
468
# Now, discard all the rows corresponding to non-top tests
469
clinical.values <-
470
  clinical.values[
471
    # Needs to be exact match of enttype and variable combination
472
    paste0(enttype, '!!!', variable) %in%
473
      paste0(top.clinical.enttypes, '!!!', top.clinical.dataN),
474
    ]
475
476
# Per patient and per test, keep the smallest relativedate value only
477
clinical.values.most.recent <-
478
  clinical.values[, .SD[which.min(relativedate)], by = c('anonpatid', 'enttype', 'variable')]
479
480
# Make a column per test
481
clinical.values.most.recent.wide <-
482
  dcast(
483
    data = clinical.values.most.recent,
484
    formula = anonpatid ~ enttype + variable,
485
    value.var = "value"
486
  )
487
488
# Add a prefix to the names to keep track
489
# (There may not be n.top.tests columns here as some get lost during the paring
490
# down processes above...)
491
names(clinical.values.most.recent.wide)[-1] <-
492
  paste0('clinical.values.', names(clinical.values.most.recent.wide)[-1])
493
494
# Now, merge with the original cohort
495
COHORT <-
496
  merge(
497
    COHORT,
498
    clinical.values.most.recent.wide,
499
    by = c('anonpatid'),
500
    all = TRUE
501
  )
502
503
504
### Therapy ####################################################################
505
506
# read in the therapy data
507
therapy.data <- readMedicalData(
508
  therapy.files,
509
  c("anonpatid", "eventdate", "bnfcode"),
510
  c("integer", "date", "integer")
511
)
512
# The other option than bnfcode is prodcode, which refers to specific products
513
# rather than BNF categories. There are far more of these so, assuming the BNF
514
# classification is somewhat rational, I'm going to go with that first to
515
# reduce data sparsity.
516
517
# Now, merge with the indexdate...we only want data before then, because looking
518
# after it is cheating, and using all data rather than just stuff before may
519
# distort our choice of variables as some variables may be very common after
520
# entering the study, but less so before... (14 differ...)
521
therapy.data <-
522
  merge(
523
    therapy.data, COHORT[, c('anonpatid', 'indexdate')],
524
    by = 'anonpatid', all.x = TRUE
525
  )
526
527
therapy.data$relativedate <- therapy.data$indexdate - therapy.data$eventdate
528
529
# Now, remove all the negative relative dates, because they're in the past
530
therapy.data <- therapy.data[relativedate >= 0]
531
# And remove all data which is too far into the past
532
therapy.data <- therapy.data[relativedate < 366]
533
534
# get aggregate statistics for each BNF code
535
therapy.by.bnf <- therapy.data[, length(unique(anonpatid)), by = bnfcode]
536
names(therapy.by.bnf) <- c('bnfcode', 'n.pat')
537
# Take the top n by number of patients with that code
538
top.bnf <-
539
  therapy.by.bnf$bnfcode[order(therapy.by.bnf$n.pat, decreasing = TRUE)[1:n.top.therapy]]
540
541
# Discard all the rows corresponding to non-top BNF codes
542
therapy.data <- therapy.data[therapy.data$bnfcode %in% top.bnf, ]
543
544
# Aggregate by number of prescriptions per patient
545
therapy.data <- therapy.data[, .N, by = list(anonpatid, bnfcode)]
546
547
# Per BNF code, add a new column to the cohort and put in numbers
548
therapy.wide <-
549
  dcast(
550
    data = therapy.data,
551
    formula = anonpatid ~ bnfcode,
552
    value.var = "N"
553
  )
554
555
# Add a prefix to the names to keep track
556
names(therapy.wide)[2:(n.top.therapy + 1)] <-
557
  paste0('bnf.', names(therapy.wide)[2:(n.top.therapy + 1)])
558
559
# And merge into the overall cohort
560
COHORT <-
561
  merge(
562
    COHORT,
563
    therapy.wide,
564
    by = c('anonpatid'),
565
    all = TRUE
566
  )
567
568
### Anonymisation steps ########################################################
569
570
# delete the columns with obvious privacy issues
571
COHORT[, (kill.cols) := NULL]
572
573
# make the remaining columns relative to the indexdate
574
indexdate <- as.Date(COHORT$indexdate)
575
576
# date.cols specified
577
for(date.col in c(date.cols)) {
578
    COHORT[[date.col]] <-
579
      # Do indexdate minus, so this is days in the past
580
      as.numeric(indexdate - as.Date(COHORT[[date.col]]))
581
    # ...and therefore negative values are in the future, hence cheating
582
    COHORT[[date.col]][COHORT[[date.col]] < 0] <- NA
583
}
584
# make age an integer
585
COHORT$age <- round(COHORT$age)
586
587
# make pracregion and ethnicity into categories
588
lookup_pracregion <-
589
  sample(unique(COHORT$pracregion),length(unique(COHORT$pracregion)))
590
lookup_ethnicity <-
591
  sample(unique(COHORT$hes_ethnicity),length(unique(COHORT$hes_ethnicity)))
592
write.csv(lookup_pracregion, 'lookup_pracregion.csv')
593
write.csv(lookup_ethnicity, 'lookup_ethnicity.csv')
594
595
COHORT$pracregion <-
596
  as.integer(factor(COHORT$pracregion, levels = lookup_pracregion))
597
COHORT$hes_ethnicity <-
598
  as.integer(factor(COHORT$hes_ethnicity, levels = lookup_ethnicity))
599
600
# make IMD score into deciles-ish
601
COHORT$imd_score <- round(COHORT$imd_score/10)
602
603
write.csv(COHORT, 'cohort-datadriven.csv')
604
605
fake.df <- data.frame(id = 1:10)
606
for (colname in names(COHORT)) {
607
    fake.df[,colname] <- sample(COHORT[!is.na(COHORT[, colname]), colname], 10)
608
}
609
610
write.csv(fake.df, 'cohort-sample.csv')