a b/data/decode.py
1
# Decode segmentation masks and save them as npy files in a mask folder
2
# Run this script only once; it should take less than 5 minutes.
3
import numpy as np
4
import pandas as pd
5
from glob import glob
6
from tqdm import tqdm
7
tqdm.pandas()
8
from PIL import Image
9
import sys
10
import os
11
import cv2
12
import pdb
13
14
# from util import constants as C
15
IMAGE_SIZE = 224
16
17
# Reference: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
18
def rle_decode(mask_rle, shape):
    """Decode a run-length-encoded mask into a 2-D ``uint8`` array.

    The encoding is a whitespace-separated string of alternating
    ``start length`` tokens. Starts are 1-based positions into the
    flattened mask; each run marks ``length`` consecutive pixels as 1.

    Args:
        mask_rle: The RLE string. ``None``, NaN, and the empty string all
            mean "no segmentation" and produce an all-zero mask.
        shape: ``(height, width)`` of the mask to return.

    Returns:
        A ``numpy`` array of dtype ``uint8`` with the given shape, holding
        1 inside the encoded runs and 0 elsewhere.

    Raises:
        ValueError: If the string holds an odd number of tokens, i.e. a
            start without a matching length.
    """
    height, width = shape
    flat_mask = np.zeros(height * width, dtype=np.uint8)

    # NaN is the only value that is not equal to itself, so this detects a
    # missing segmentation without assuming a particular float type.
    if mask_rle is None or mask_rle != mask_rle:
        return flat_mask.reshape(shape)

    tokens = mask_rle.split()
    if len(tokens) % 2:
        # Without this check a lone length would be broadcast over every
        # start and silently produce a wrong mask.
        raise ValueError(
            'RLE string must contain start/length pairs, got %d tokens' % len(tokens)
        )

    starts = np.asarray(tokens[0::2], dtype=int) - 1  # RLE starts are 1-based
    lengths = np.asarray(tokens[1::2], dtype=int)
    for lo, hi in zip(starts, starts + lengths):
        flat_mask[lo:hi] = 1

    # Row-major reshape, needed to align to the RLE direction.
    return flat_mask.reshape(shape)
31
32
if __name__ == '__main__':
    # Decode every RLE segmentation in the combined csv into one
    # (IMAGE_SIZE, IMAGE_SIZE, n_classes) .npy mask per image id, then write
    # a csv that pairs each image's metadata with the path of its mask file.
    #
    # usage: python decode.py [masks_folder_path] [combined_csv_path] [final_csv_path] [downsample frac]
    if len(sys.argv) < 5:
        sys.exit('usage: python decode.py [masks_folder_path] [combined_csv_path] '
                 '[final_csv_path] [downsample frac]')
    masks_folder_path = sys.argv[1]
    combined_csv_path = sys.argv[2]
    final_csv_path = sys.argv[3]
    downsample = float(sys.argv[4])

    # Make the masks folder (and any missing parents) if it doesn't already exist
    os.makedirs(masks_folder_path, exist_ok=True)

    # Read in the combined df. keep_default_na=False keeps empty
    # segmentation cells as '' rather than converting them to NaN.
    combined_df = pd.read_csv(combined_csv_path, keep_default_na=False, index_col=0)

    classes = ['small_bowel', 'large_bowel', 'stomach']  # mask classes

    # Downsample images whose segmentation is empty for every class
    if downsample != 1:
        combined_df['empty'] = (combined_df['segmentation'] == '').astype(int)
        combined_df['empty_image'] = combined_df.groupby(['id'])['empty'].transform('sum')
        empty_df = combined_df[combined_df['empty_image'] == len(classes)]
        masks_df = combined_df[combined_df['empty_image'] < len(classes)]

        # Draw the ids to keep from one row per fully-empty image (its
        # large_bowel row), then pull back all rows of the chosen ids.
        # NOTE(review): the draw is unseeded, so reruns keep different images.
        random_select = empty_df[empty_df['class'] == 'large_bowel'].sample(frac=downsample)['id']
        empty_df = pd.merge(random_select, empty_df, how='left', on='id')

        combined_df = pd.concat([masks_df, empty_df])

    # Decode!
    case_ids = combined_df['id'].unique()
    combined_df.set_index(['id', 'class'], inplace=True)
    mask_path_by_id = {}
    for case_id in tqdm(case_ids):
        # decode one mask per class, in the order given by `classes`
        class_masks = []
        for mask_class in classes:
            # identify row in df with relevant info for the case id and class
            id_class = combined_df.loc[case_id, mask_class]

            # decode the mask at its native slice resolution
            decoded_mask = rle_decode(
                id_class['segmentation'],
                (id_class['slice_height'], id_class['slice_width']),
            )

            # NOTE(review): cv2.resize is called with its default
            # interpolation; confirm that is the intended choice for masks.
            decoded_mask = cv2.resize(decoded_mask, (IMAGE_SIZE, IMAGE_SIZE))
            class_masks.append(decoded_mask)

        # stack the per-class masks along a trailing channel axis
        case_mask = np.stack(class_masks, axis=-1)
        mask_path = os.path.join(masks_folder_path, case_id + '.npy')
        np.save(mask_path, case_mask)
        mask_path_by_id[case_id] = mask_path

    # save csv of mask paths
    combined_df.reset_index(inplace=True)

    meta_columns = ['case', 'day', 'slice_id', 'image_path', 'pic_info',
                    'slice_height', 'slice_width', 'pixel_height', 'pixel_width']
    # Keep 'id' temporarily so each row receives the path of its own mask
    # instead of relying on list position; .copy() detaches the result from
    # combined_df so the column assignment below is not applied to a slice.
    mask_path_df = combined_df[['id'] + meta_columns].drop_duplicates(subset=meta_columns).copy()
    mask_path_df['mask_path'] = mask_path_df['id'].map(mask_path_by_id)
    mask_path_df = mask_path_df.drop(columns='id')

    mask_path_df.to_csv(final_csv_path + '.csv')