In [1]:
import pandas as pd
from pathlib import Path
from collections import defaultdict
import pickle

In [2]:
# Dataset stage prefix: every data/ path read or written in this notebook is built from it.
stage = "stage_2"

# Train dataset

In [3]:
# Training metadata CSV; per the preview below it holds DICOM header fields plus
# `fid` and the six diagnosis label columns.
# NOTE: the name `df` is rebound to the test frame later in the notebook.
df = pd.read_csv(f"data/{stage}_train_dicom_diags.csv")

In [4]:
# Preview the first rows. The leading `Unnamed: 0.1` / `Unnamed: 0` columns are
# presumably index columns left over from earlier CSV round-trips - TODO confirm.
df.head()

Unnamed: 0.1,Unnamed: 0,BitsAllocated,BitsStored,Columns,HighBit,ImageOrientationPatient_0,ImageOrientationPatient_1,ImageOrientationPatient_2,ImageOrientationPatient_3,ImageOrientationPatient_4,...,WindowWidth,WindowWidth_0,WindowWidth_1,fid,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,0,16,16,512,15,1.0,0.0,0.0,0.0,0.927184,...,80.0,,,ID_000012eaf,0,0,0,0,0,0
1,1,16,16,512,15,1.0,0.0,0.0,0.0,0.968148,...,80.0,,,ID_000039fa0,0,0,0,0,0,0
2,2,16,16,512,15,1.0,0.0,0.0,0.0,1.0,...,100.0,,,ID_00005679d,0,0,0,0,0,0
3,3,16,12,512,11,1.0,0.0,0.0,0.0,0.994522,...,,80.0,80.0,ID_00008ce3c,0,0,0,0,0,0
4,4,16,16,512,15,1.0,0.0,0.0,0.0,1.0,...,135.0,,,ID_0000950d7,0,0,0,0,0,0


In [5]:
# Order every slice by its z position first, then split by series:
# groupby keeps the incoming row order inside each group.
gs = (
    df
    .sort_values(by='ImagePositionPatient_2')
    .groupby(by='SeriesInstanceUID')
)
# Number of distinct series.
len(gs)

21744

In [6]:
# Sanity check on one series: ImagePositionPatient_2 should be ascending
# within the group if the sort-then-group step worked.
gs.get_group('ID_fa19cd5ea9')[['ImagePositionPatient_2', 'fid']].head()

Unnamed: 0,ImagePositionPatient_2,fid
577964,193.542489,ID_c45659d3d
229790,198.214051,ID_4e0bdd2ba
22395,202.885613,ID_079945c27
746126,207.557174,ID_fdbfb2c17
253266,212.228736,ID_55f7bbbf2


In [7]:
# Keep the example series around for the inspection cells that follow.
g = gs.get_group('ID_fa19cd5ea9')

In [8]:
# Narrow the example series to its identifiers and the six label columns.
# NOTE: `subg` is also used as the loop variable in the indexing loops further
# down, which rebind this name.
subg = g[['SeriesInstanceUID', 'fid', 'any', 'epidural', 
          'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']]

In [9]:
# Show the per-slice labels of the example series in z order.
subg

Unnamed: 0,SeriesInstanceUID,fid,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
577964,ID_fa19cd5ea9,ID_c45659d3d,0,0,0,0,0,0
229790,ID_fa19cd5ea9,ID_4e0bdd2ba,0,0,0,0,0,0
22395,ID_fa19cd5ea9,ID_079945c27,1,0,0,0,0,1
746126,ID_fa19cd5ea9,ID_fdbfb2c17,1,0,0,0,0,1
253266,ID_fa19cd5ea9,ID_55f7bbbf2,1,0,0,0,0,1
549211,ID_fa19cd5ea9,ID_ba7080372,1,0,0,0,0,1
592856,ID_fa19cd5ea9,ID_c964e4096,1,0,0,0,0,1
183149,ID_fa19cd5ea9,ID_3e31d57d0,1,0,0,0,0,1
306771,ID_fa19cd5ea9,ID_680b2194c,1,0,0,0,0,1
540358,ID_fa19cd5ea9,ID_b76b13444,1,0,0,0,0,1


In [10]:
# Destructive step: files are renamed in place, so take a filesystem snapshot
# (e.g. btrfs) of the image directory before running it.
def rename_train_group(subg):
    """Rename the training DICOM files of one series so the name encodes its metadata.

    Each file ``<fid>.dcm`` under ``data/unzip_renamed/{stage}_train_images/`` becomes
    ``<series>_<ix>_<total>_<any>_<epidural>_<intraparenchymal>_<intraventricular>_<subarachnoid>_<subdural>_<fid>.dcm``
    where ``ix`` is the zero-based row position within ``subg`` and ``total`` is the
    number of rows, both zero-padded to three digits.

    Args:
        subg: DataFrame for a single series with ``fid``, ``SeriesInstanceUID`` and
            the six label columns; rows are numbered in the order given.

    Reads the notebook-level ``stage`` variable to locate the image directory.
    """
    total = len(subg)
    for ix, (_, row) in enumerate(subg.iterrows()):
        cur_fn = row['fid']
        new_fn = f"{row['SeriesInstanceUID']}_{ix:03}_{total:03}_{row['any']}_{row['epidural']}_{row['intraparenchymal']}_{row['intraventricular']}_{row['subarachnoid']}_{row['subdural']}_{cur_fn}"
        old_path = Path(f'data/unzip_renamed/{stage}_train_images/{cur_fn}.dcm')
        old_path.rename(f'data/unzip_renamed/{stage}_train_images/{new_fn}.dcm')

In [11]:
def index_group(subg, study_ix_to_fn, fn_to_study_ix):
    """Record each slice's position within its series, in both directions.

    Rows are processed in the order they appear in ``subg``, so the caller
    decides the ordering (the notebook sorts by ``ImagePositionPatient_2``
    before grouping).

    Args:
        subg: DataFrame with ``SOPInstanceUID`` and ``SeriesInstanceUID`` columns.
        study_ix_to_fn: mapping of series id -> ordered list of slice ids,
            updated in place. A plain ``dict`` and a ``defaultdict(list)``
            both work.
        fn_to_study_ix: mapping of slice id -> ``(series id, index)``,
            updated in place.

    The stored index is the slice's actual position in
    ``study_ix_to_fn[series]``, so ``study_ix_to_fn[s][i] == fn`` holds for
    every ``fn_to_study_ix[fn] == (s, i)`` even when ``subg`` contains several
    series or the list already has entries from an earlier call.
    """
    for _, row in subg.iterrows():
        fn = row['SOPInstanceUID']
        study = row['SeriesInstanceUID']
        # setdefault avoids relying on the caller passing a defaultdict.
        series_fns = study_ix_to_fn.setdefault(study, [])
        # Index = position the slice is about to take in its series list.
        fn_to_study_ix[fn] = (study, len(series_fns))
        series_fns.append(fn)

In [12]:
# Diagnosis label columns, in the order they are reported per slice.
labels = [
    'any',
    'epidural',
    'intraparenchymal',
    'intraventricular',
    'subarachnoid',
    'subdural',
]

def label_group(subg, fn_to_labels):
    """Store, for every row of ``subg``, the names of the labels that equal 1.

    Args:
        subg: DataFrame with a ``SOPInstanceUID`` column and one column per
            entry of the module-level ``labels`` list.
        fn_to_labels: mapping of slice id -> list of positive label names,
            updated in place. Slices with no positive label get an empty list.
    """
    for _, row in subg.iterrows():
        fn = row['SOPInstanceUID']
        positives = []
        for label in labels:
            if row[label] == 1:
                positives.append(label)
        fn_to_labels[fn] = positives

In [13]:
# Accumulators filled across all training series:
#   train_study_ix_to_fn: series id -> slice ids, in the row order of each group
#   train_fn_to_study_ix: slice id  -> (series id, position within the series)
#   train_fn_to_labels:   slice id  -> names of the labels that are set
train_study_ix_to_fn = defaultdict(list)
train_fn_to_study_ix = {}
train_fn_to_labels = {}

# NOTE: this loop rebinds the notebook-level names `name` and `subg`.
for name, subg in gs:
    # Renaming moves files on disk; kept commented out, enable only when the
    # files should actually be renamed.
    #rename_train_group(subg)
    index_group(subg, train_study_ix_to_fn, train_fn_to_study_ix)
    label_group(subg, train_fn_to_labels)

In [14]:
# Persist the train-only indexes. The merged train+test indexes are pickled
# separately at the end of the notebook.
# NOTE: train_study_ix_to_fn is a defaultdict(list) and unpickles as one, so
# looking up a missing series id after loading silently creates an empty entry.
# Context managers make sure each file is flushed and closed once written.
with open(f"data/{stage}_train_study_ix_to_fn.pickle", "wb") as pickle_file:
    pickle.dump(train_study_ix_to_fn, pickle_file)
with open(f"data/{stage}_train_fn_to_study_ix.pickle", "wb") as pickle_file:
    pickle.dump(train_fn_to_study_ix, pickle_file)

In [15]:
# Spot check against the example series shown earlier, where this slice has
# any=1 and subdural=1.
train_fn_to_labels['ID_079945c27']

['any', 'subdural']

In [16]:
# Persist slice id -> positive label names; the context manager closes the
# file handle instead of leaving it to the garbage collector.
with open(f"data/{stage}_train_fn_to_labels.pickle", 'wb') as pickle_file:
    pickle.dump(train_fn_to_labels, pickle_file)

# Test dataset

In [17]:
# Test metadata CSV. NOTE: this rebinds `df`, replacing the training frame
# loaded earlier; cells above must not be re-run after this one without
# reloading the training CSV first.
df = pd.read_csv(f"data/{stage}_test_dicom.csv")

In [18]:
# Preview the test frame; unlike the training preview, the displayed columns
# end at WindowWidth_1 (no `fid` or label columns are shown).
df.head()

Unnamed: 0.1,Unnamed: 0,BitsAllocated,BitsStored,Columns,HighBit,ImageOrientationPatient_0,ImageOrientationPatient_1,ImageOrientationPatient_2,ImageOrientationPatient_3,ImageOrientationPatient_4,...,SamplesPerPixel,SeriesInstanceUID,StudyID,StudyInstanceUID,WindowCenter,WindowCenter_0,WindowCenter_1,WindowWidth,WindowWidth_0,WindowWidth_1
0,0,16,12,512,11,1.0,0.0,0.0,0.0,0.981627,...,1,ID_4d28912ba6,,ID_1f6d1e8aeb,,40.0,40.0,,80.0,80.0
1,1,16,16,512,15,1.0,0.0,0.0,0.0,0.987688,...,1,ID_acabdeee86,,ID_4a8d7ec19f,30.0,,,80.0,,
2,2,16,16,512,15,1.0,0.0,0.0,0.0,0.927184,...,1,ID_d00cee7f0c,,ID_a6ca244172,30.0,,,80.0,,
3,3,16,16,512,15,1.0,0.0,0.0,0.0,0.986286,...,1,ID_a52a0112d5,,ID_fa950a03af,30.0,,,80.0,,
4,4,16,12,512,11,1.0,0.0,0.0,0.0,1.0,...,1,ID_f552d3b922,,ID_965d8b3d8e,,36.0,36.0,,80.0,80.0


In [19]:
# Order every test slice by its z position first, then split by series:
# groupby keeps the incoming row order inside each group.
gs = (
    df
    .sort_values(by='ImagePositionPatient_2')
    .groupby(by='SeriesInstanceUID')
)
# Number of distinct series.
len(gs)

3518

In [20]:
def rename_test_group(subg):
    """Rename the test DICOM files of one series so the name encodes series and position.

    Each file ``<SOPInstanceUID>.dcm`` under ``data/unzip_renamed/{stage}_test_images/``
    becomes ``<series>_<ix>_<total>_<SOPInstanceUID>.dcm`` where ``ix`` is the
    zero-based row position within ``subg`` and ``total`` is the number of rows,
    both zero-padded to three digits.

    Args:
        subg: DataFrame for a single series with ``SOPInstanceUID`` and
            ``SeriesInstanceUID`` columns; rows are numbered in the order given.

    Files are renamed in place. Reads the notebook-level ``stage`` variable to
    locate the image directory.
    """
    total = len(subg)
    for ix, (_, row) in enumerate(subg.iterrows()):
        cur_fn = row['SOPInstanceUID']
        new_fn = f"{row['SeriesInstanceUID']}_{ix:03}_{total:03}_{cur_fn}"
        old_path = Path(f'data/unzip_renamed/{stage}_test_images/{cur_fn}.dcm')
        old_path.rename(f'data/unzip_renamed/{stage}_test_images/{new_fn}.dcm')


In [21]:
# Accumulators filled across all test series:
#   test_study_ix_to_fn: series id -> slice ids, in the row order of each group
#   test_fn_to_study_ix: slice id  -> (series id, position within the series)
test_study_ix_to_fn = defaultdict(list)
test_fn_to_study_ix = {}

# NOTE: this loop rebinds the notebook-level names `name` and `subg`.
for name, subg in gs:
    # Renaming moves files on disk; kept commented out, enable only when the
    # files should actually be renamed.
    #rename_test_group(subg)
    index_group(subg, test_study_ix_to_fn, test_fn_to_study_ix)


In [22]:
# Persist the test-only indexes. Context managers make sure each file is
# flushed and closed once written.
# NOTE: test_study_ix_to_fn is a defaultdict(list) and unpickles as one.
with open(f"data/{stage}_test_study_ix_to_fn.pickle", "wb") as pickle_file:
    pickle.dump(test_study_ix_to_fn, pickle_file)
with open(f"data/{stage}_test_fn_to_study_ix.pickle", "wb") as pickle_file:
    pickle.dump(test_fn_to_study_ix, pickle_file)

In [23]:
# Combine the train and test indexes into plain dicts. Train entries come
# first; if a key exists in both splits, the test entry replaces the train one.
study_ix_to_fn = dict(train_study_ix_to_fn)
study_ix_to_fn.update(test_study_ix_to_fn)

fn_to_study_ix = dict(train_fn_to_study_ix)
fn_to_study_ix.update(test_fn_to_study_ix)

In [24]:
# Persist the merged train+test indexes. Context managers make sure each file
# is flushed and closed once written.
with open(f"data/{stage}_study_ix_to_fn.pickle", "wb") as pickle_file:
    pickle.dump(study_ix_to_fn, pickle_file)
with open(f"data/{stage}_fn_to_study_ix.pickle", "wb") as pickle_file:
    pickle.dump(fn_to_study_ix, pickle_file)