# tutorials/synthetic_data_generation/generate_patients.py
#
# Generates a small synthetic MEDS patient dataset (parquet + metadata +
# pruned ontology pickle) for use in the femr tutorials.
import argparse
import datetime
import json
import os
import pickle
import random
import datasets
import jsonschema
import meds
import pyarrow
import pyarrow.parquet
import femr.ontology
import femr.transforms
# CLI: expects the path to an Athena vocabulary download and a destination
# directory where the synthetic MEDS dataset will be written.
parser = argparse.ArgumentParser(prog="generate_patients", description="Create synthetic data")
for _positional in ("athena", "destination"):
    parser.add_argument(_positional, type=str)
args = parser.parse_args()

# Deterministic seed so repeated runs produce the same synthetic patients.
random.seed(4533)
def get_random_patient(patient_id):
    """Generate one synthetic patient record in MEDS format.

    The patient gets a random birth date (100-1000 days after 1990-01-01),
    gender and race codes recorded at birth, and a random sequence of
    ICD9CM / RxNorm coded events at strictly increasing timestamps.

    Args:
        patient_id: Identifier stored as the record's "patient_id" field.

    Returns:
        A dict with "patient_id" and a chronologically ordered "events"
        list; each event has a "time" and a list of {"code": ...}
        measurements.
    """
    epoch = datetime.datetime(1990, 1, 1)
    birth = epoch + datetime.timedelta(days=random.randint(100, 1000))
    current_date = birth

    gender = "Gender/" + random.choice(["F", "M"])
    race = "Race/" + random.choice(["White", "Non-White"])
    patient = {
        "patient_id": patient_id,
        "events": [
            {
                "time": birth,
                "measurements": [
                    {"code": meds.birth_code},
                    {"code": gender},
                    {"code": race},
                ],
            },
        ],
    }

    code_cats = ["ICD9CM", "RxNorm"]
    # Female patients get a larger event budget, presumably so the two
    # genders are distinguishable downstream — the loop counter itself is
    # unused (the original shadowed it with the code string, fixed here).
    for _ in range(random.randint(1, 10 + (20 if gender == "Gender/F" else 0))):
        code_cat = random.choice(code_cats)
        # Both vocabularies draw the same random numeric code (the original
        # duplicated this call in both branches); ICD9CM codes longer than
        # three digits get the conventional decimal point inserted.
        code = str(random.randint(0, 10000))
        if code_cat != "RxNorm" and len(code) > 3:
            code = code[:3] + "." + code[3:]
        current_date = current_date + datetime.timedelta(days=random.randint(1, 100))
        patient["events"].append({"time": current_date, "measurements": [{"code": code_cat + "/" + code}]})

    return patient
# Build 200 synthetic patients and write them as a MEDS parquet dataset
# under <destination>/data/.
patients = [get_random_patient(i) for i in range(200)]

patient_schema = meds.patient_schema()
patient_table = pyarrow.Table.from_pylist(patients, patient_schema)
os.makedirs(os.path.join(args.destination, "data"), exist_ok=True)
pyarrow.parquet.write_table(patient_table, os.path.join(args.destination, "data", "patients.parquet"))

# Dataset-level metadata, validated against the MEDS metadata schema before
# being written next to the data directory.
metadata = {
    "dataset_name": "femr synthetic data",  # fixed typo: was "datata"
    "dataset_version": "1",
    "etl_name": "synthetic data",
    "etl_version": "1",
    "code_metadata": {},
}
jsonschema.validate(instance=metadata, schema=meds.dataset_metadata)
with open(os.path.join(args.destination, "metadata.json"), "w") as f:
    json.dump(metadata, f)

# Load the parquet files back as a Hugging Face dataset and build an
# ontology from the Athena download, pruned to the codes that appear in it.
dataset = datasets.Dataset.from_parquet(os.path.join(args.destination, "data", "*"))
ontology = femr.ontology.Ontology(args.athena)
# BUG FIX: ("SNOMED") is just the string "SNOMED" — only the trailing comma
# makes it the one-element tuple that remove_ontologies intends.
ontology.prune_to_dataset(dataset, remove_ontologies=("SNOMED",))
with open(os.path.join(args.destination, "ontology.pkl"), "wb") as f:
    pickle.dump(ontology, f)