foresight/datasets/patient_concept_stream.py

from __future__ import absolute_import, division, print_function

import pickle
import logging

import datasets
import numpy as np  # Only used by the commented-out context_representation below

_CITATION = """\
@misc{kraljevic2020multidomain,
      title={Multi-domain Clinical Natural Language Processing with MedCAT: the Medical Concept Annotation Toolkit},
      author={Zeljko Kraljevic and Thomas Searle and Anthony Shek and Lukasz Roguski and Kawsar Noor and Daniel Bean and Aurelie Mascio and Leilei Zhu and Amos A Folarin and Angus Roberts and Rebecca Bendayan and Mark P Richardson and Robert Stewart and Anoop D Shah and Wai Keong Wong and Zina Ibrahim and James T Teo and Richard JB Dobson},
      year={2020},
      eprint={2010.01165},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
"""

_DESCRIPTION = """\
Takes as input a pickled dict of pt2stream. The format should be:
{'patient_id': [(concept_cui, concept_count_for_patient, timestamp_of_first_occurrence_for_patient, token_type), ...], ...}
"""
class PatientConceptStreamConfig(datasets.BuilderConfig):
    """BuilderConfig for PatientConceptStream.

    Args:
        **kwargs: keyword arguments forwarded to super.
    """

    def __init__(self, **kwargs):
        super(PatientConceptStreamConfig, self).__init__(**kwargs)


class PatientConceptStream(datasets.GeneratorBasedBuilder):
    """PatientConceptStream: takes as input a mapping from patients to their streams of concepts.

    TODO: Move the preparation scripts out of notebooks
    """

    BUILDER_CONFIGS = [
        PatientConceptStreamConfig(
            name="pickle",
            version=datasets.Version("1.0.0", ""),
            description="Pickled output from the temporal dataset preparation scripts",
        ),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "patient_id": datasets.Value("string"),
                    "stream": [
                        {
                            'token': datasets.Value('string'),
                            'cnt': datasets.Value('int32'),
                            'time': datasets.Value('int64'),
                            'token_type': datasets.Value('string'),
                            #'context_representation': datasets.Sequence(datasets.Value('float'), length=128),
                        }
                    ],
                }
            ),
            supervised_keys=None,
            citation=_CITATION,
        )
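
    # For reference, a single yielded example has the shape below
    # (all values illustrative, not from real data):
    #     ('pt_0001', {'patient_id': 'pt_0001',
    #                  'stream': [{'token': 'C0011849', 'cnt': 3,
    #                              'time': 1577836800, 'token_type': 'cui'}]})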

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    'filepaths': self.config.data_files['train'],
                },
            ),
        ]

    def _generate_examples(self, filepaths):
        """Yields patients one by one."""
        for filepath in filepaths:
            logging.info("generating examples from = %s", filepath)
            with open(filepath, 'rb') as f:
                pt2stream = pickle.load(f)

            for pt, stream in pt2stream.items():
                out_stream = []
                # If time is None there is no temporal information; this dataset
                # requires it, so such entries are dropped.
                stream = [data for data in stream if data[2] is not None]
                # Sort the stream by time, ascending
                stream.sort(key=lambda data: data[2])

                for data in stream:
                    out_stream.append({
                        'token': data[0],  # Called a token from here on, as it does not have to be a CUI
                        'cnt': data[1],
                        'time': int(data[2]),  # Converted to int for speed
                        'token_type': data[3],  # What kind of token this is, e.g. a CUI
                        #'context_representation': np.random.rand(128), #data[4],
                    })

                pt = str(pt)
                if out_stream:  # Skip streams that have zero annotations
                    yield pt, {'patient_id': pt,
                               'stream': out_stream}
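
# Usage sketch (assumption: a `datasets` version that still supports loading
# from a local loader script; 'pt2stream.pickle' is a placeholder path):
#
#     from datasets import load_dataset
#     dataset = load_dataset(
#         'patient_concept_stream.py',
#         name='pickle',
#         data_files={'train': ['pt2stream.pickle']},
#     )
#     print(dataset['train'][0]['patient_id'])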