[6b97c3]: / src / preprocessing.py

Download this file

62 lines (51 with data), 2.0 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pandas as pd
import math
import utils
def clean_nonnumeric(vector, with_value=0):
    """Replace every non-numeric entry of *vector* with *with_value*.

    An entry counts as numeric when ``float(entry)`` succeeds and the result
    is not NaN. Numeric entries are passed through unchanged: a numeric
    string such as ``"3.5"`` stays a string and is NOT converted to a float,
    so callers that need a float column must cast afterwards.

    Args:
        vector: Iterable of values to check.
        with_value: Replacement used for entries that are not numeric.

    Returns:
        A new list with the same length and order as *vector*.
    """

    def is_number(value):
        """Return True if *value* converts to a float that is not NaN."""
        try:
            return not math.isnan(float(value))
        except (TypeError, ValueError):
            # ValueError: a string that does not parse as a number.
            # TypeError: None or another object float() cannot accept.
            return False

    return [item if is_number(item) else with_value for item in vector]
def preprocess(studies):
    """Preprocess every study dataframe in place and normalize tumor volume.

    For each dataframe in *studies* this:
      * sorts rows by ``PatientID`` and ``TreatmentDay`` and drops rows that
        repeat a (``PatientID``, ``TreatmentDay``) pair, keeping the first;
      * replaces ``StudyArm`` with the integer columns ``StudyNr`` (character
        at index 6 of ``StudyArm``) and ``Arm`` (last character);
      * sets non-numeric ``TargetLesionLongDiam_mm`` values to 0 and casts the
        column to float;
      * adds ``TumorVolume_mm3`` = 0.5 * diameter ** 3;
      * adds ``TumorVolumeNorm``, the volume min-max scaled with the minimum
        and maximum taken over ALL studies together.

    Args:
        studies: List of pandas dataframes, one per study. The dataframes are
            modified in place.

    Returns:
        The same *studies* list that was passed in (the dataframes are not
        combined into one).
    """
    # Nothing to do, and min()/max() below would fail on an empty sequence.
    if not studies:
        return studies

    for study in studies:
        # Sort records by time per patient and drop duplicate measurements.
        # The index is renumbered by the sort only, so dropped rows leave
        # gaps in it.
        study.sort_values(
            by=['PatientID', 'TreatmentDay'], inplace=True, ignore_index=True
        )
        study.drop_duplicates(subset=['PatientID', 'TreatmentDay'], inplace=True)

        # Extract study and arm number from fixed character positions.
        # NOTE(review): presumably StudyArm looks like "Study_N_Arm_M" with
        # single-digit N and M - confirm against the data.
        study['StudyNr'] = \
            study['StudyArm'].apply(lambda sa_txt: int(sa_txt[6]))
        study['Arm'] = \
            study['StudyArm'].apply(lambda sa_txt: int(sa_txt[-1]))
        study.drop('StudyArm', axis=1, inplace=True)

        # Set non-numeric values to 0, then cast to float. astype() returns a
        # new Series, so the result must be assigned back; otherwise numeric
        # strings stay strings and the volume formula below fails on them.
        study['TargetLesionLongDiam_mm'] = \
            clean_nonnumeric(study['TargetLesionLongDiam_mm'], with_value=0)
        study['TargetLesionLongDiam_mm'] = \
            study['TargetLesionLongDiam_mm'].astype('float')

        # Calculate tumor volume from the longest diameter: 0.5 * d^3.
        study['TumorVolume_mm3'] = study['TargetLesionLongDiam_mm'] ** 3 * 0.5

    # Normalize tumor volume to the range [0, 1] using the global extremes
    # across all studies, so values are comparable between studies.
    min_tv = min(study['TumorVolume_mm3'].min() for study in studies)
    max_tv = max(study['TumorVolume_mm3'].max() for study in studies)
    tv_range = max_tv - min_tv

    for study in studies:
        if tv_range == 0:
            # Every volume is identical; avoid 0/0 producing NaN.
            study['TumorVolumeNorm'] = 0.0
        else:
            study['TumorVolumeNorm'] = \
                (study['TumorVolume_mm3'] - min_tv) / tv_range

    return studies