Diff of /eda/bad_actors.py [000000] .. [fb2ce2]

Switch to side-by-side view

--- a
+++ b/eda/bad_actors.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import sys
+import os
+import csv
+import numpy as np
+import pandas as pd
+import pydicom
+from multiprocessing import  Pool
+from tqdm.auto import tqdm
+tqdm.pandas()
+
+
# CSV manifests listing the images to screen; the first column (row[0]) is
# treated as the DICOM filename by check_dicom below.
TRAIN_CSV_PATH = '../src/training.csv'
VALIDATE_CSV_PATH = '../src/validation.csv'
# TEST_CSV_PATH = '../src/testing.csv'

# Directory holding the stage-1 training DICOM files; used as the default
# image directory in check_dicom (validation images are read from here too —
# presumably both splits live in the same folder; TODO confirm).
train_data = '../../data/stage_1_train_images/'
# test_data = '../../data/stage_1_test_images/'
+
+
def check_dicom(row, path=train_data):
    """Return True when the DICOM file named by row[0] is a "bad actor".

    A file is flagged as bad when it cannot be opened, its pixel data cannot
    be decoded, its image is not 512x512, or the image has zero variance
    (constant pixels).

    Args:
        row: sequence whose first element is the DICOM filename.
        path: directory containing the image files (defaults to train_data).

    Returns:
        bool: True if the file should be dropped, False if it is usable.
    """
    filename = os.path.join(path, row[0])
    try:
        data = pydicom.dcmread(filename)
    except Exception as exc:
        # dcmread raises more than ValueError (FileNotFoundError,
        # pydicom.errors.InvalidDicomError, ...). Any failure to open means
        # "bad actor"; catching narrowly here would crash the whole pool run.
        print('corruption on open...', filename, exc)
        return True
    try:
        img = np.array(data.pixel_array, dtype=float)
    except Exception as exc:
        # pixel_array can raise AttributeError/TypeError/NotImplementedError
        # on missing or unsupported pixel data, not only ValueError.
        print('corruption on pixel_array...', filename, exc)
        return True
    if img.shape != (512, 512):
        # Expected image geometry for this dataset is 512x512.
        print('square peg in round hole!')
        return True
    if np.std(img) == 0:
        # Constant image carries no signal.
        print('Zero std dev.')
        return True

    return False
+
+
def parallelize_dataframe(df, func, n_cores=4):
    """Apply ``func`` to ``df`` in parallel across ``n_cores`` processes.

    The frame is split into ``n_cores`` chunks, each chunk is transformed by
    ``func`` in a worker process, and the results are concatenated back in
    order.

    Args:
        df: DataFrame to process.
        func: picklable callable taking and returning a DataFrame.
        n_cores: number of worker processes and chunks (default 4).

    Returns:
        pd.DataFrame: concatenation of the per-chunk results.
    """
    chunks = np.array_split(df, n_cores)
    # Context manager guarantees the pool is torn down even when a worker
    # raises; the original close()/join() pair leaked processes on error.
    with Pool(n_cores) as pool:
        result = pd.concat(pool.map(func, chunks))
    return result
+
+
def find_bad_actors(df):
    """Flag unreadable or degenerate DICOM rows in a ``bad_actors`` column.

    Runs ``check_dicom`` on every row (with a tqdm progress bar via
    ``progress_apply``) and stores the boolean verdict in a new
    ``bad_actors`` column on ``df``.
    """
    verdicts = df.progress_apply(check_dicom, axis=1)
    df['bad_actors'] = verdicts
    return df
+
+
############### Comment or Uncomment to process VALIDATION data ###############
df_validate = pd.read_csv(VALIDATE_CSV_PATH, header=None)
num_rows = df_validate.shape[0]

# Flag bad actors by reading every DICOM file in parallel.
df_validate = parallelize_dataframe(df_validate, find_bad_actors)
# Keep only rows whose file passed all checks (boolean mask instead of the
# un-idiomatic `== False` comparison).
df_validate_cleaned = df_validate.loc[~df_validate['bad_actors']]

# Drop the helper column and sanity-check that rows were only ever removed.
# Use an explicit raise rather than `assert`, which is stripped under -O.
df_validate_cleaned = df_validate_cleaned.drop(columns=['bad_actors'])
if df_validate_cleaned.shape[0] > df_validate.shape[0]:
    raise RuntimeError('cleaned validation set has more rows than the input')

print("Rows in validate before: ", num_rows)
print("Rows in validate after: ", df_validate_cleaned.shape[0])
print("Verified bad actors were removed")

df_validate_cleaned.columns = ["filename", "targets", "any"]
df_validate_cleaned.to_csv('../src/validation_cleaned.csv', index=False)
+
+
+
############### Comment or Uncomment to process TRAINING data ###############
df_train = pd.read_csv(TRAIN_CSV_PATH, header=None)
train_rows = df_train.shape[0]

# Flag bad actors by reading every DICOM file in parallel.
df_train = parallelize_dataframe(df_train, find_bad_actors)
# Keep only rows whose file passed all checks (boolean mask instead of the
# un-idiomatic `== False` comparison).
df_train_cleaned = df_train.loc[~df_train['bad_actors']]

# Drop the helper column and sanity-check that rows were only ever removed.
# Use an explicit raise rather than `assert`, which is stripped under -O.
df_train_cleaned = df_train_cleaned.drop(columns=['bad_actors'])
if df_train_cleaned.shape[0] > df_train.shape[0]:
    raise RuntimeError('cleaned training set has more rows than the input')

print("Rows in train before: ", train_rows)
print("Rows in train after: ", df_train_cleaned.shape[0])
print("Verified bad actors were removed")

df_train_cleaned.columns = ["filename", "targets", "any"]
df_train_cleaned.to_csv('../src/training_cleaned.csv', index=False)
+
+
+
############### Comment or Uncomment to process TEST data ###############
# NOTE(review): this commented-out template still reads the VALIDATION csv,
# deletes VALIDATE_CSV_PATH, and writes the result over '../src/training.csv'
# — it looks like a stale copy/paste from an earlier run, not real TEST
# handling. Update the paths (e.g. TEST_CSV_PATH / test_data) before
# uncommenting; as written it would destroy the validation and training CSVs.
# df_validate = pd.read_csv(VALIDATE_CSV_PATH, header=None)
# print("Rows in train before: ", df_validate.shape[0])
# df_validate = parallelize_dataframe(df_validate, find_bad_actors)
# df_validate_cleaned = df_validate.loc[df_validate['bad_actors'] == False]
# df_validate_cleaned = df_validate_cleaned.drop(columns=['bad_actors'])
# assert df_validate_cleaned.shape[0] <= df_validate.shape[0]
# print("Rows in train after: ", df_validate_cleaned.shape[0])
# os.remove(VALIDATE_CSV_PATH)
# print("Verified bad actors were removed and deleted old train CSV")
# df_validate_cleaned.to_csv('../src/training.csv', index=False)

print('All Done!')