In [1]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm_notebook
import pydicom
import itertools
import numpy as np
from concurrent.futures import ProcessPoolExecutor

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

# Read stage_X_train and split id/label

In [3]:
stage = "stage_2"

In [4]:
bad_dcm_fn = f'data/unzip/{stage}_train_images/ID_6431af929.dcm'

In [5]:
!rm {bad_dcm_fn}

rm: cannot remove 'data/unzip/stage_2_train_images/ID_6431af929.dcm': No such file or directory


In [6]:
df_train = pd.read_csv(f'data/unzip/{stage}_train.csv')

In [7]:
df_train.head()

Unnamed: 0,ID,Label
0,ID_12cadc6af_epidural,0
1,ID_12cadc6af_intraparenchymal,0
2,ID_12cadc6af_intraventricular,0
3,ID_12cadc6af_subarachnoid,0
4,ID_12cadc6af_subdural,0


In [8]:
df_train['fid'] = df_train.ID.apply(lambda x: '_'.join(x.split('_')[:2]) )

In [9]:
df_train.columns = ['ID', 'probability', 'fid']

In [10]:
df_train['label'] = df_train.ID.apply(lambda x: x.split('_')[-1])

In [11]:
df_train.head()

Unnamed: 0,ID,probability,fid,label
0,ID_12cadc6af_epidural,0,ID_12cadc6af,epidural
1,ID_12cadc6af_intraparenchymal,0,ID_12cadc6af,intraparenchymal
2,ID_12cadc6af_intraventricular,0,ID_12cadc6af,intraventricular
3,ID_12cadc6af_subarachnoid,0,ID_12cadc6af,subarachnoid
4,ID_12cadc6af_subdural,0,ID_12cadc6af,subdural


# Remove dupes 

In [12]:
df_train.shape

(4516842, 4)

In [13]:
df_train.drop_duplicates('ID', inplace=True)

In [14]:
df_train.shape

(4516818, 4)

# Remove corrupted image

In [15]:
df_train = df_train[df_train.fid != 'ID_6431af929'] # ID_6431af929

In [16]:
df_train.shape

(4516812, 4)

# Create pivot table with diagnostic labels as columns
Generates:
* `train_diags.csv` (previously named `train_pivot.csv`)

In [17]:
df_diags = df_train.pivot(index='fid', columns='label', values='probability')

In [18]:
df_diags.head()

label,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
fid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ID_000012eaf,0,0,0,0,0,0
ID_000039fa0,0,0,0,0,0,0
ID_00005679d,0,0,0,0,0,0
ID_00008ce3c,0,0,0,0,0,0
ID_0000950d7,0,0,0,0,0,0


In [19]:
df_diags.shape

(752802, 6)

In [20]:
df_diags.reset_index(inplace=True)

In [21]:
df_diags.head()

label,fid,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf,0,0,0,0,0,0
1,ID_000039fa0,0,0,0,0,0,0
2,ID_00005679d,0,0,0,0,0,0
3,ID_00008ce3c,0,0,0,0,0,0
4,ID_0000950d7,0,0,0,0,0,0


In [22]:
df_diags.shape

(752802, 7)

In [23]:
df_diags.to_csv(f'data/{stage}_train_diags.csv', index=False)

# Generate fastai-ready csv image file (.png) -> labels
This is needed for early experiments that worked with the .png dataset.

Generates:
* `train_labels_as_strings.csv`

In [24]:
from collections import defaultdict

d = defaultdict(list)
for fid in df_train.fid.unique(): d[fid]

for tup in df_train.itertuples():
    if tup.probability: d[tup.fid].append(tup.label)

In [25]:
ks, vs = [], []

for k, v in d.items():
    ks.append(k), vs.append(' '.join(v))

In [26]:
fastai_df = pd.DataFrame(data={'fn': ks, 'labels': vs})

In [27]:
fastai_df.shape

(752802, 2)

In [28]:
fastai_df['fn'] += '.png'

In [29]:
fastai_df.head()

Unnamed: 0,fn,labels
0,ID_12cadc6af.png,
1,ID_38fd7baa0.png,
2,ID_6c5d82413.png,
3,ID_aec8e68b3.png,subarachnoid any
4,ID_4d9209c7c.png,


In [30]:
fastai_df.to_csv(f'data/{stage}_train_labels_as_strings.csv', index=False)

# Tabulate dicom data
Generates:
* `train_dicom.csv`
* `test_dicom.csv`

In [31]:
cols_i_want = ['BitsAllocated', 'BitsStored', 'Columns', 'HighBit', 'ImageOrientationPatient', 
               'ImagePositionPatient', 'Modality', 'PatientID', 'PhotometricInterpretation', 
               'PixelRepresentation', 'PixelSpacing', 'RescaleIntercept', 'RescaleSlope', 
               'Rows', 'SOPInstanceUID', 'SamplesPerPixel', 'SeriesInstanceUID', 'StudyID', 
               'StudyInstanceUID', 'WindowCenter', 'WindowWidth']
useless_cols = [ 'PixelData' ]

In [32]:
train_dcm_list = list(Path(f'data/unzip/{stage}_train_images').glob('*.dcm'))

In [33]:
f = train_dcm_list[0]

In [34]:
dicom = pydicom.dcmread(str(f))
dicom

(0008, 0018) SOP Instance UID                    UI: ID_000012eaf
(0008, 0060) Modality                            CS: 'CT'
(0010, 0020) Patient ID                          LO: 'ID_f15c0eee'
(0020, 000d) Study Instance UID                  UI: ID_30ea2b02d4
(0020, 000e) Series Instance UID                 UI: ID_0ab5820b2a
(0020, 0010) Study ID                            SH: ''
(0020, 0032) Image Position (Patient)            DS: ['-125.000000', '-115.897980', '77.970825']
(0020, 0037) Image Orientation (Patient)         DS: ['1.000000', '0.000000', '0.000000', '0.000000', '0.927184', '-0.374607']
(0028, 0002) Samples per Pixel                   US: 1
(0028, 0004) Photometric Interpretation          CS: 'MONOCHROME2'
(0028, 0010) Rows                                US: 512
(0028, 0011) Columns                             US: 512
(0028, 0030) Pixel Spacing                       DS: ['0.488281', '0.488281']
(0028, 0100) Bits Allocated                      US: 16
(0028, 0101) Bits Stored 

In [35]:
dicom.data_element('PatientID').value

'ID_f15c0eee'

In [36]:
ipp = dicom.data_element('ImagePositionPatient').value
ipp

['-125.000000', '-115.897980', '77.970825']

In [37]:
ipp[0]

"-125.000000"

In [38]:
type(ipp)

pydicom.multival.MultiValue

In [39]:
def tabulate_dicom_data(file_list):
    l = []
    for f in file_list:
        dicom = pydicom.dcmread(str(f))
        d = {}
        for s in cols_i_want:
            v = dicom.data_element(s).value
            if isinstance(v, pydicom.multival.MultiValue):
                for i in range(len(v)):
                    d[s + '_' + str(i)] = v[i]
            else:
                d[s] = v
        l.append(d)
        
    return l


In [40]:
with ProcessPoolExecutor(max_workers=32) as e:
     l = list(itertools.chain.from_iterable(e.map(tabulate_dicom_data, np.array_split(train_dcm_list, 32))))

In [41]:
df_train_dicom = pd.DataFrame(l)

In [42]:
df_train_dicom.head()

Unnamed: 0,BitsAllocated,BitsStored,Columns,HighBit,ImageOrientationPatient_0,ImageOrientationPatient_1,ImageOrientationPatient_2,ImageOrientationPatient_3,ImageOrientationPatient_4,ImageOrientationPatient_5,...,SamplesPerPixel,SeriesInstanceUID,StudyID,StudyInstanceUID,WindowCenter,WindowCenter_0,WindowCenter_1,WindowWidth,WindowWidth_0,WindowWidth_1
0,16,16,512,15,1.0,0.0,0.0,0.0,0.927184,-0.374607,...,1,ID_0ab5820b2a,,ID_30ea2b02d4,30.0,,,80.0,,
1,16,16,512,15,1.0,0.0,0.0,0.0,0.968148,-0.25038,...,1,ID_5f8484c3e0,,ID_134d398b61,30.0,,,80.0,,
2,16,16,512,15,1.0,0.0,0.0,0.0,1.0,0.0,...,1,ID_203cd6ec46,,ID_b5c26cda09,50.0,,,100.0,,
3,16,12,512,11,1.0,0.0,0.0,0.0,0.994522,0.104528,...,1,ID_3780d48b28,,ID_974735bf79,,40.0,40.0,,80.0,80.0
4,16,16,512,15,1.0,0.0,0.0,0.0,1.0,0.0,...,1,ID_84296c3845,,ID_8881b1c4b1,35.0,,,135.0,,


In [43]:
df_train_dicom.to_csv(f'data/{stage}_train_dicom.csv')

In [44]:
test_dcm_list = list(Path(f'data/unzip/{stage}_test_images').glob('*.dcm'))
with ProcessPoolExecutor(max_workers=32) as e:
     l = list(itertools.chain.from_iterable(e.map(tabulate_dicom_data, np.array_split(test_dcm_list, 32))))

In [45]:
df_test_dicom = pd.DataFrame(l)
df_test_dicom.head()

Unnamed: 0,BitsAllocated,BitsStored,Columns,HighBit,ImageOrientationPatient_0,ImageOrientationPatient_1,ImageOrientationPatient_2,ImageOrientationPatient_3,ImageOrientationPatient_4,ImageOrientationPatient_5,...,SamplesPerPixel,SeriesInstanceUID,StudyID,StudyInstanceUID,WindowCenter,WindowCenter_0,WindowCenter_1,WindowWidth,WindowWidth_0,WindowWidth_1
0,16,12,512,11,1.0,0.0,0.0,0.0,0.981627,-0.190809,...,1,ID_4d28912ba6,,ID_1f6d1e8aeb,,40.0,40.0,,80.0,80.0
1,16,16,512,15,1.0,0.0,0.0,0.0,0.987688,-0.156434,...,1,ID_acabdeee86,,ID_4a8d7ec19f,30.0,,,80.0,,
2,16,16,512,15,1.0,0.0,0.0,0.0,0.927184,-0.374607,...,1,ID_d00cee7f0c,,ID_a6ca244172,30.0,,,80.0,,
3,16,16,512,15,1.0,0.0,0.0,0.0,0.986286,-0.165048,...,1,ID_a52a0112d5,,ID_fa950a03af,30.0,,,80.0,,
4,16,12,512,11,1.0,0.0,0.0,0.0,1.0,0.0,...,1,ID_f552d3b922,,ID_965d8b3d8e,,36.0,36.0,,80.0,80.0


In [46]:
df_test_dicom.to_csv(f'data/{stage}_test_dicom.csv')

# Add labels to the train dicom csv
Generates:
* `train_dicom_diags.csv` (previously named `train_dicom_pivot.csv`)

In [47]:
df_train_dicom_diags = pd.merge(df_train_dicom, df_diags,  how='left', left_on=['SOPInstanceUID'], right_on = ['fid'])

In [48]:
assert len(df_train_dicom) == len(df_diags) == len(df_train_dicom_diags)

In [49]:
df_train_dicom_diags.to_csv(f'data/{stage}_train_dicom_diags.csv')