--- /dev/null
+++ b/eda/bad_actors.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import os
+import numpy as np
+import pandas as pd
+import pydicom
+from multiprocessing import Pool
+from tqdm.auto import tqdm
+tqdm.pandas()
+
+
+TRAIN_CSV_PATH = '../src/training.csv'
+VALIDATE_CSV_PATH = '../src/validation.csv'
+# TEST_CSV_PATH = '../src/testing.csv'
+
+train_data = '../../data/stage_1_train_images/'
+# test_data = '../../data/stage_1_test_images/'
+
+
+def check_dicom(row, path=train_data):
+    """Return True if the DICOM named in row[0] is unusable (a 'bad actor')."""
+    try:
+        data = pydicom.dcmread(os.path.join(path, row[0]))
+    except Exception:  # InvalidDicomError, missing file, truncated header, ...
+        print('corruption on open...')
+        return True
+    try:
+        img = np.array(data.pixel_array, dtype=float)
+    except Exception:  # missing or undecodable pixel data
+        print('corruption on pixel_array...')
+        return True
+    if img.shape != (512, 512):
+        print('square peg in round hole! shape:', img.shape)
+        return True
+    if np.std(img) == 0:
+        print('Zero std dev -- blank image.')
+        return True
+
+    return False
+
+
+def parallelize_dataframe(df, func, n_cores=4):
+    """Split df into n_cores chunks, run func on each chunk in its own
+    worker process, and concatenate the results back into one frame."""
+    df_split = np.array_split(df, n_cores)
+    with Pool(n_cores) as pool:
+        df = pd.concat(pool.map(func, df_split))
+    return df
+
+
+def find_bad_actors(df):
+    """Flag every row whose DICOM fails check_dicom."""
+    df['bad_actors'] = df.progress_apply(lambda x: check_dicom(x), axis=1)
+    return df
+
+
+############### Comment or Uncomment to process VALIDATION data ###############
+df_validate = pd.read_csv(VALIDATE_CSV_PATH, header=None)
+num_rows = df_validate.shape[0]
+
+# find bad actors by reading the DICOM data, then keep only the clean rows
+df_validate = parallelize_dataframe(df_validate, find_bad_actors)
+df_validate_cleaned = df_validate.loc[~df_validate['bad_actors']]
+
+# drop the helper column and sanity-check the row counts
+df_validate_cleaned = df_validate_cleaned.drop(columns=['bad_actors'])
+assert df_validate_cleaned.shape[0] <= df_validate.shape[0]
+
+print("Rows in validate before: ", num_rows)
+print("Rows in validate after: ", df_validate_cleaned.shape[0])
+print("Verified bad actors were removed")
+
+df_validate_cleaned.columns = ["filename", "targets", "any"]
+df_validate_cleaned.to_csv('../src/validation_cleaned.csv', index=False)
+
+
+############### Comment or Uncomment to process TRAINING data ###############
+df_train = pd.read_csv(TRAIN_CSV_PATH, header=None)
+train_rows = df_train.shape[0]
+
+# find bad actors by reading the DICOM data, then keep only the clean rows
+df_train = parallelize_dataframe(df_train, find_bad_actors)
+df_train_cleaned = df_train.loc[~df_train['bad_actors']]
+
+# drop the helper column and sanity-check the row counts
+df_train_cleaned = df_train_cleaned.drop(columns=['bad_actors'])
+assert df_train_cleaned.shape[0] <= df_train.shape[0]
+
+print("Rows in train before: ", train_rows)
+print("Rows in train after: ", df_train_cleaned.shape[0])
+print("Verified bad actors were removed")
+
+df_train_cleaned.columns = ["filename", "targets", "any"]
+df_train_cleaned.to_csv('../src/training_cleaned.csv', index=False)
+
+
+############### Comment or Uncomment to process TEST data ###############
+# NOTE: check_dicom defaults to train_data; point it at test_data before running.
+# df_test = pd.read_csv(TEST_CSV_PATH, header=None)
+# print("Rows in test before: ", df_test.shape[0])
+# df_test = parallelize_dataframe(df_test, find_bad_actors)
+# df_test_cleaned = df_test.loc[~df_test['bad_actors']]
+# df_test_cleaned = df_test_cleaned.drop(columns=['bad_actors'])
+# assert df_test_cleaned.shape[0] <= df_test.shape[0]
+# print("Rows in test after: ", df_test_cleaned.shape[0])
+# os.remove(TEST_CSV_PATH)
+# print("Verified bad actors were removed and deleted old test CSV")
+# df_test_cleaned.to_csv('../src/testing.csv', index=False)
+
+print('All Done!')
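
Note: the split/map/concat pattern in parallelize_dataframe is the load-bearing piece of this patch, and the script runs it at module level, which breaks on platforms where multiprocessing starts workers with 'spawn' rather than 'fork'. A minimal, self-contained sketch of the same pattern under a __main__ guard (toy data; flag_short_names is a hypothetical stand-in for find_bad_actors):

    import numpy as np
    import pandas as pd
    from multiprocessing import Pool

    def flag_short_names(chunk):
        # Stand-in for find_bad_actors: add one boolean column per row.
        return chunk.assign(bad_actors=chunk[0].str.len() < 10)

    def parallelize_dataframe(df, func, n_cores=4):
        # Split the frame into n_cores pieces, map func over them in
        # worker processes, then stitch the results back together.
        with Pool(n_cores) as pool:
            return pd.concat(pool.map(func, np.array_split(df, n_cores)))

    if __name__ == '__main__':
        # Toy stand-in for the filename column of a CSV read with header=None.
        df = pd.DataFrame({0: ['ID_abc123.dcm', 'a.dcm', 'ID_def456.dcm', 'b.dcm']})
        df = parallelize_dataframe(df, flag_short_names, n_cores=2)
        print(df.loc[~df['bad_actors']])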
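
And a tiny smoke test for the per-file check itself, worth running before committing to the full parallel pass (the filename is a placeholder; assumes the bulk-processing sections are commented out, since the module executes them at import time, and that it is run from eda/):

    import pandas as pd
    from bad_actors import check_dicom, train_data

    # check_dicom reads the filename from position 0, mirroring one row of a
    # CSV loaded with header=None.
    row = pd.Series(['ID_0000aee4b.dcm'])  # placeholder filename
    print('bad actor?', check_dicom(row, path=train_data))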