{"cells":[{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load in \n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"!pip install torch torchvision feather-format kornia pyarrow --upgrade > /dev/null\n!pip install git+https://github.com/fastai/fastai2 > /dev/null","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"from fastai2.torch_basics import *\nfrom fastai2.test import *\nfrom fastai2.layers import *\nfrom fastai2.data.all import *\nfrom fastai2.optimizer import *\nfrom fastai2.learner import *\nfrom fastai2.metrics import *\nfrom fastai2.vision.all import *\nfrom fastai2.vision.learner import *\nfrom fastai2.vision.models import *\nfrom fastai2.callback.all import *\nfrom fastai2.basics import *\nfrom fastai2.vision.all import *\nfrom fastai2.medical.imaging import *\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"np.set_printoptions(linewidth=120)\nmatplotlib.rcParams['image.cmap'] = 'bone'\nset_seed(42)\nset_num_threads(1)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"\npath = Path('../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/')\npath_train = path/'stage_2_train'\npath_test = path/'stage_2_test'\n\n\npath_dest = Path()\npath_dest.mkdir(exist_ok=True)\n\npath_inp = Path('../input')\npath_df = path_inp/'rsna-stage2-meta'\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"path_df.ls()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_lbls = pd.read_feather(path_df/'labels_stage_2.fth')\ndf_tst = pd.read_feather(path_df/'fns_test_2.fth')\ndf_trn = pd.read_feather(path_df/'fns_train_2.fth').dropna(subset=['img_pct_window'])\ncomb = df_trn.join(df_lbls.set_index('ID'), 'SOPInstanceUID')","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"** Fix incorrect RescaleIntercept ** \n\n"},{"metadata":{"trusted":true},"cell_type":"code","source":"repr_flds = ['BitsStored','PixelRepresentation']\ndf1 = comb.query('(BitsStored==12) & (PixelRepresentation==0)')\ndf2 = comb.query('(BitsStored==12) & (PixelRepresentation==1)')\ndf3 = comb.query('BitsStored==16')\ndfs = L(df1,df2,df3)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"dfs[0]","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"dfs[1]","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"dfs[2]","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def df2dcm(df): \n return L(Path(o).dcmread() for o in df.fname.values)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_iffy = df1[df1.RescaleIntercept>-1000]\ndcms = 
df2dcm(df_iffy)\n\n_,axs = subplots(4,4, imsize=3)\nfor i,ax in enumerate(axs.flat): dcms[i].show(ax=ax)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"dcm = dcms[2]\nd = dcm.pixel_array\nplt.hist(d.flatten());","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"len(dcms)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"d1 = df2dcm(df1.iloc[[0]])[0].pixel_array\nplt.hist(d1.flatten());\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"scipy.stats.mode(d.flatten()).mode[0]","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"d += 1000\n\npx_mode = scipy.stats.mode(d.flatten()).mode[0]\nd[d>=px_mode] = d[d>=px_mode] - px_mode\ndcm.PixelData = d.tobytes()\ndcm.RescaleIntercept = -1000\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"plt.hist(dcm.pixel_array.flatten());","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"_,axs = subplots(1,2)\ndcm.show(ax=axs[0]); \ndcm.show(dicom_windows.brain, ax=axs[1])\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def fix_pxrepr(dcm):\n if dcm.PixelRepresentation != 0 or dcm.RescaleIntercept<-1000: return\n x = dcm.pixel_array + 1000\n px_mode = 4096\n x[x>=px_mode] = x[x>=px_mode] - px_mode\n dcm.PixelData = x.tobytes()\n dcm.RescaleIntercept = -1000","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"dcms = df2dcm(df_iffy)\ndcms.map(fix_pxrepr)\n\n_,axs = subplots(2,5, imsize=3)\nfor i,ax in enumerate(axs.flat): dcms[i].show(ax=ax)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Remove useless images\n* Our goal here is to create a small, fast, convenient dataset for rapid prototyping. So let's get rid of images that don't provide much useful information, such as those with very little actual brain tissue in them. Brain tissue lies roughly in the (0,80) Hounsfield unit range (the standard brain window). Let's find out how many pixels in this range are in each image. When we created the metadata data frame, we got an img_pct_window column included, which has the % of pixels in the brain window."},{"metadata":{"trusted":true},"cell_type":"code","source":"df_iffy.img_pct_window[:10].values","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"plt.hist(comb.img_pct_window,40);","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"There are a lot of images with nearly no brain tissue in them - presumably they're the slices above and below the brain. Let's see if they have any labels:"},{"metadata":{"trusted":true},"cell_type":"code","source":"comb = comb.assign(pct_cut = pd.cut(comb.img_pct_window, [0,0.02,0.05,0.1,0.2,0.3,1]))\ncomb.pivot_table(values='any', index='pct_cut', aggfunc=['sum','count']).T","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"We can see that, as expected, the images with little brain tissue (<2% of pixels) have almost no labels. So let's remove them. (Interestingly, we can also see a strong relationship between img_pct_window and the proportion of labelled images.)
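Before dropping them, it is worth a quick count of how many rows fall below the 2% threshold. The snippet below is a small sanity-check sketch, not part of the original pipeline; it only assumes the comb frame built above:\n\n```python\n# Rough sanity check (sketch): how many rows does the 2% cut remove?\nn_total = len(comb)\nn_drop = (comb.img_pct_window < 0.02).sum()\nprint(f'dropping {n_drop} of {n_total} rows ({n_drop/n_total:.1%})')\n```\n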
"},{"metadata":{"trusted":true},"cell_type":"code","source":"comb.drop(comb.query('img_pct_window<0.02').index, inplace=True)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Resample to a 2/3 labelled split\n- keep every row with a label\n- keep half that number of images without a label, which should keep the resulting size under Kaggle's 20GB dataset limit\n"},{"metadata":{"trusted":true},"cell_type":"code","source":"df_lbl = comb.query('any==True')\nn_lbl = len(df_lbl)\nn_lbl","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df_nonlbl = comb.query('any==False').sample(n_lbl//2)\nlen(df_nonlbl)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"comb = pd.concat([df_lbl,df_nonlbl])\nlen(comb)\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"comb.head()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"comb_ = comb.reset_index()\ncomb_.index","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"comb_ = comb_.drop(['pct_cut'], axis=1)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"comb_.to_feather('comb_stage_2.fth')\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from IPython.display import FileLink, FileLinks\nFileLink('comb_stage_2.fth')","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Crop to just brain area\n- To create a smaller and faster dataset, we'll need smaller images. So let's make sure they contain the important information by cropping out the non-brain area. To do so, we start with an image like this:"},{"metadata":{"trusted":true},"cell_type":"code","source":"dcm = Path(dcms[3].filename).dcmread()\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"fix_pxrepr(dcm)\npx = dcm.windowed(*dicom_windows.brain)\nshow_image(px);","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"blurred = gauss_blur2d(px, 100)\nshow_image(blurred);","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"show_image(blurred>0.3)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"dcm.show(dicom_windows.brain)\nshow_image(dcm.mask_from_blur(dicom_windows.brain), cmap=plt.cm.Reds, alpha=0.6)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"We can use fastai's mask_from_blur method to do this for us.
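The cells above already showed the idea by hand: window to the brain range, blur heavily, then threshold. A rough sketch of that same idea, built only from the helpers already used above (the 0.3 threshold is an assumption, and this is not necessarily fastai's exact implementation):\n\n```python\n# Sketch: approximate brain mask via blur-and-threshold (threshold value assumed)\ndef brain_mask_sketch(dcm, thresh=0.3):\n    px = dcm.windowed(*dicom_windows.brain)  # scale pixels into the brain window\n    return gauss_blur2d(px, 100) > thresh    # heavy blur, then binarise\n```\n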
We'll overlay the results on a few images to see if it looks OK:"},{"metadata":{"trusted":true},"cell_type":"code","source":"_,axs = subplots(1,4, imsize=3)\nfor i,ax in enumerate(axs.flat):\n dcms[i].show(dicom_windows.brain, ax=ax)\n show_image(dcms[i].mask_from_blur(dicom_windows.brain), cmap=plt.cm.Reds, alpha=0.6, ax=ax)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def pad_square(x):\n # pad the shorter axis so the cropped array ends up square\n r,c = x.shape\n d = (c-r)/2\n pl,pr,pt,pb = 0,0,0,0\n if d>0: pt,pb = int(math.floor( d)),int(math.ceil( d))\n else: pl,pr = int(math.floor(-d)),int(math.ceil(-d))\n return np.pad(x, ((pt,pb),(pl,pr)), 'minimum')\n\ndef crop_mask(x):\n # crop to the bounding box of the brain mask, then re-pad to a square\n mask = x.mask_from_blur(dicom_windows.brain)\n bb = mask2bbox(mask)\n if bb is None: return\n lo,hi = bb\n cropped = x.pixel_array[lo[0]:hi[0],lo[1]:hi[1]]\n x.pixel_array = pad_square(cropped)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"_,axs = subplots(1,2)\ndcm.show(ax=axs[0])\ncrop_mask(dcm)\ndcm.show(ax=axs[1]);","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"htypes = 'any','epidural','intraparenchymal','intraventricular','subarachnoid','subdural'\n\ndef get_samples(df):\n recs = [df.query(f'{c}==1').sample() for c in htypes]\n recs.append(df.query('any==0').sample())\n return pd.concat(recs).fname.values\n\nsample_fns = concat(*dfs.map(get_samples))\nsample_dcms = tuple(Path(o).dcmread().scaled_px for o in sample_fns)\nsamples = torch.stack(sample_dcms)\nbins = samples.freqhist_bins()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"(path_dest/'bins_2.pkl').save(bins)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from IPython.display import FileLink, FileLinks\nFileLink('bins_2.pkl')","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat":4,"nbformat_minor":1}