[dec218]: / dataset_builder / preprocess_utils / preprocess_db_utils.py

Download this file

131 lines (111 with data), 5.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import sys
import random
import numpy as np
import time
import pandas as pd
from datetime import datetime, timedelta
class Sampler:
    """Random sampling helpers backed by NumPy generators seeded with 0."""

    def __init__(self):
        # Fixed seed: every fresh Sampler produces the same sequence of draws.
        self.rng = np.random.default_rng(0)

    def condition_value_shuffler(self, table, target_cols):
        """Return a copy of ``table`` whose ``target_cols`` rows are permuted.

        The listed columns are shuffled jointly (values that shared a row
        still share a row afterwards); all other columns keep their order.
        ``table`` itself is not modified.  A new generator seeded with 0 is
        created on every call, so the permutation depends only on
        ``len(table)`` and not on the state of ``self.rng``.

        Args:
            table: pandas DataFrame to shuffle.
            target_cols: list of column labels to permute.

        Returns:
            The shuffled copy of ``table``.
        """
        table_copy = table.copy()
        original_idx = np.arange(len(table))
        rng_val = np.random.default_rng(0)  # fix the seed for shuffling
        shuffled_idx = rng_val.choice(original_idx, len(original_idx), replace=False).tolist()
        col_positions = [table_copy.columns.get_loc(col) for col in target_cols]
        table_copy.iloc[original_idx, col_positions] = table_copy[target_cols].values[shuffled_idx]
        return table_copy

    def first_admit_year_sampler(self, start_year, span_year, earliest_year=None):
        """Sample a year from ``[start_year, start_year + span_year]``.

        Years are weighted linearly: the k-th year of the range (1-based) is
        drawn with probability proportional to k, so later years are more
        likely.

        Args:
            start_year: first candidate year.
            span_year: number of years after ``start_year`` that are eligible.
            earliest_year: optional reference year.

        Returns:
            The sampled year when ``earliest_year`` is None.  Otherwise the
            distance ``sampled_year - earliest_year`` converted to minutes as
            a Python int, counting every year as 365 days.
        """
        end_year = start_year + span_year
        weights = np.array(range(1, span_year + 2))
        prob = weights / sum(weights)
        sampled_year = self.rng.choice(range(start_year, end_year + 1), p=prob)
        if earliest_year is None:
            return sampled_year
        year_adjustment = int(sampled_year - earliest_year)
        return year_adjustment * 365 * 24 * 60  # in minute

    def sample_date_given_year(self, year, num_split=1, frmt="%Y-%m-%d"):
        """Sample ``num_split`` dates inside ``year``, one per equal slice.

        The span from January 1st 00:00 to December 31st 00:00 is cut into
        ``num_split`` equal slices and one point is drawn uniformly from each
        slice, so the returned dates are in non-decreasing order.

        The computation uses naive ``datetime`` arithmetic instead of POSIX
        timestamps, so the result does not depend on the local timezone / DST
        rules of the machine or on the year range ``time.mktime`` supports.

        Args:
            year: the year to sample from (anything that formats as a year).
            num_split: number of slices, i.e. number of dates returned.
            frmt: ``strptime`` format used to parse the ``"{year}-01-01"`` and
                ``"{year}-12-31"`` boundary strings.

        Returns:
            A list of ``num_split`` strings formatted as ``"%Y-%m-%d"``.
        """
        year_start = datetime.strptime(f"{year}-01-01", frmt)
        year_end = datetime.strptime(f"{year}-12-31", frmt)
        window = year_end - year_start
        dts = []
        for split in range(num_split):
            # Fraction of the window: slice offset plus a uniform draw inside the slice.
            split_seed = split / num_split + self.rng.random() / num_split
            dt = year_start + window * split_seed
            dts.append(dt.strftime("%Y-%m-%d"))
        return dts
def adjust_time(table, time_col, patient_col, start_year=None, current_time=None, offset_dict=None):
    """Shift and filter the time values of ``table[time_col]`` row by row.

    For every row the value is processed as follows; any step may turn it
    into ``None``:

    1. Null values and empty strings become ``None``.
    2. If ``offset_dict`` is given, the row's ``patient_col`` value is looked
       up in it.  Rows whose patient is missing become ``None``.  Otherwise:
         * string time values (mimic3) are parsed as ``%Y-%m-%d %H:%M:%S``
           and shifted by ``offset_dict[patient]`` minutes;
         * non-string time values (eicu) are treated as a minute offset that
           is added to the timestamp string ``offset_dict[patient]``.
       The result is a ``%Y-%m-%d %H:%M:%S`` string.
    3. If ``current_time`` is given, values greater than it become ``None``.
    4. If ``start_year`` is given, values smaller than
       ``"{start_year}-01-01 00:00:00"`` become ``None``.

    Steps 3 and 4 use plain ``<`` comparisons, which for timestamp strings in
    the format above is chronological order.

    Args:
        table: pandas DataFrame holding the records.
        time_col: label of the column with the time values.
        patient_col: label of the patient id column; only read when
            ``offset_dict`` is given.
        start_year: optional four-digit year used as a lower bound.
        current_time: optional upper bound.
        offset_dict: optional mapping from patient id to offset (see step 2).

    Returns:
        A list with one entry (shifted value or ``None``) per row of ``table``.

    Raises:
        AssertionError: if ``start_year`` is not four characters long when it
            is first needed.
    """
    time_values = table[time_col].values
    # Fetch the id column once instead of doing an .iloc lookup for every row.
    patient_ids = table[patient_col].values if offset_dict is not None else None
    start_time = None  # lower-bound string, built once on first use

    shifted_time = []
    for idx, time_val in enumerate(time_values):
        if not (pd.notnull(time_val) and time_val != ""):
            shifted_time.append(None)
            continue

        if offset_dict is not None:
            id_ = patient_ids[idx]
            if id_ not in offset_dict:
                time_val = None
            elif isinstance(time_val, str):  # mimic3
                # isinstance (not type ==) so str subclasses such as numpy.str_ are handled too.
                base = datetime.strptime(time_val, "%Y-%m-%d %H:%M:%S")
                time_val = str(base + timedelta(minutes=int(offset_dict[id_])))
            else:  # eicu
                base = datetime.strptime(offset_dict[id_], "%Y-%m-%d %H:%M:%S")
                time_val = str(base + timedelta(minutes=int(time_val)))

        if time_val is not None and current_time is not None:
            if current_time < time_val:
                time_val = None

        # NOTE: 20230514
        # For MIMIC-IV the time-shifting strategy is based on the first studydatetime,
        # so we need to make sure time_val (specifically, admittime) is not earlier
        # than start_year.
        if time_val is not None and start_year is not None:
            if start_time is None:
                # Raised explicitly so the check survives `python -O`; the exception
                # type matches the assert statement it replaces.
                if len(str(start_year)) != 4:
                    raise AssertionError("start_year must be a four-digit year")
                start_time = str(datetime.strptime(f"{start_year}-01-01", "%Y-%m-%d"))
            if time_val < start_time:
                time_val = None

        shifted_time.append(time_val)
    return shifted_time
def read_csv(
    data_dir,
    filename,
    columns=None,
    lower=True,
    filter_dict=None,
    dtype=None,
    memory_efficient=False,
):
    """Load a CSV file into a pandas DataFrame, optionally filtered and lower-cased.

    Args:
        data_dir: directory containing the file.
        filename: name of the CSV file inside ``data_dir``.
        columns: optional list of columns to keep.
        lower: when True, every string cell is lower-cased and stripped of
            surrounding whitespace.  Column labels are left untouched.
        filter_dict: optional mapping ``column -> allowed values``; only rows
            whose value is in the allowed collection are kept, for every
            column listed.  Filtering happens before lower-casing, so the
            allowed values must match the raw file contents.
        dtype: optional dtype specification forwarded to the CSV reader.
            It is honoured on both the pandas and the dask path.
        memory_efficient: when True the file is read in blocks with dask
            (imported lazily) and materialised only after column selection
            and filtering; files whose path ends in ``gz`` are read with gzip
            compression.

    Returns:
        The resulting pandas DataFrame.
    """
    filepath = os.path.join(data_dir, filename)

    if memory_efficient:
        import dask.dataframe as dd
        from dask.diagnostics import ProgressBar

        ProgressBar().register()
        read_kwargs = {
            "blocksize": 25e6,
            "compression": "gzip" if filepath.endswith("gz") else None,
        }
        if dtype:
            read_kwargs["dtype"] = dtype
        df = dd.read_csv(filepath, **read_kwargs)
        if columns is not None:
            df = df[columns]
    else:
        # dtype used to be dropped on this path; forward it like the dask path does.
        df = pd.read_csv(filepath, usecols=columns, dtype=dtype or None)

    if filter_dict is not None:
        for key, allowed in filter_dict.items():
            df = df[df[key].isin(allowed)]

    if memory_efficient:
        # Materialise the lazy dask frame only after selection and filtering.
        df = df.compute()

    if lower:
        # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
        # look the method up on the class so a column named "map" cannot shadow it.
        elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
        df = elementwise(lambda x: x.lower().strip() if isinstance(x, str) else x)
    return df
def generate_random_date(year: int) -> datetime:
    """Return a random date within ``year`` as a ``datetime`` at midnight.

    Every day from January 1st to December 31st (inclusive) is equally
    likely.  The draw uses the module-level ``random`` generator.

    Args:
        year: the calendar year to sample from.

    Returns:
        A ``datetime`` whose date lies in ``year`` and whose time is 00:00.
    """
    # Set the start and end dates for the range
    start_date = datetime(year, 1, 1)
    end_date = datetime(year, 12, 31)
    # Calculate the number of days in the range
    num_days = (end_date - start_date).days + 1
    # random.randint includes both endpoints, so the largest valid offset is
    # num_days - 1; using num_days would yield January 1st of the next year.
    random_offset = timedelta(days=random.randint(0, num_days - 1))
    # Calculate the random date by adding the offset to the start date
    return start_date + random_offset