--- a +++ b/1-Clean_Data.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<h1><center> Clean and Organize data for the I-SPY1 Clinical Trial</center></h1>" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style>\n", + " .dataframe thead tr:only-child th {\n", + " text-align: right;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: left;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>age</th>\n", + " <th>White</th>\n", + " <th>ER+</th>\n", + " <th>PR+</th>\n", + " <th>HR+</th>\n", + " <th>Bilateral</th>\n", + " <th>Right_Breast</th>\n", + " <th>MRI_LD_Baseline</th>\n", + " <th>MRI_LD_1_3dAC</th>\n", + " <th>MRI_LD_Int_Reg</th>\n", + " <th>MRI_LD_PreSurg</th>\n", + " <th>Alive</th>\n", + " <th>Survival_length</th>\n", + " <th>RFS</th>\n", + " <th>RFS_code</th>\n", + " <th>PCR</th>\n", + " <th>RCB</th>\n", + " </tr>\n", + " <tr>\n", + " <th>SUBJECTID</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1001</th>\n", + " <td>38.73</td>\n", + " <td>Yes</td>\n", + " <td>Yes</td>\n", + " <td>No</td>\n", + " <td>Yes</td>\n", + " <td>No</td>\n", + " <td>No</td>\n", + " <td>88.0</td>\n", + " <td>78.0</td>\n", + " <td>30.0</td>\n", + " <td>14.0</td>\n", + " <td>No</td>\n", + " <td>1264</td>\n", + " <td>751</td>\n", + " <td>1</td>\n", + " <td>No</td>\n", + " <td>2.0</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>1002</th>\n", + " <td>37.79</td>\n", + " <td>Yes</td>\n", + " <td>Yes</td>\n", + " <td>Yes</td>\n", + " <td>Yes</td>\n", + " <td>No</td>\n", + " <td>Yes</td>\n", + " <td>29.0</td>\n", + " <td>26.0</td>\n", + " <td>66.0</td>\n", + " <td>16.0</td>\n", + " <td>No</td>\n", + " <td>1155</td>\n", + " <td>1043</td>\n", + " <td>1</td>\n", + " <td>No</td>\n", + " <td>3.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " age White ER+ PR+ HR+ Bilateral Right_Breast MRI_LD_Baseline \\\n", + "SUBJECTID \n", + "1001 38.73 Yes Yes No Yes No No 88.0 \n", + "1002 37.79 Yes Yes Yes Yes No Yes 29.0 \n", + "\n", + " MRI_LD_1_3dAC MRI_LD_Int_Reg MRI_LD_PreSurg Alive \\\n", + "SUBJECTID \n", + "1001 78.0 30.0 14.0 No \n", + "1002 26.0 66.0 16.0 No \n", + "\n", + " Survival_length RFS RFS_code PCR RCB \n", + "SUBJECTID \n", + "1001 1264 751 1 No 2.0 \n", + "1002 1155 1043 1 No 3.0 " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# load the project-local cleaning module (ispy1.clean_data, by Julio) and pandas\n", + "from ispy1 import clean_data\n", + "import pandas as pd\n", + "\n", + "file = './data/I-SPY_1_All_Patient_Clinical_and_Outcome_Data.xlsx'\n", + "df = clean_data.clean_my_data(file)\n", + "\n", + "# save clean data in new csv file (to_csv writes the SUBJECTID index as the first column by default)\n", + "df.to_csv('./data/I-SPY_1_clean_data.csv')\n", + "\n", + "# preview the first two rows; only the last expression of a cell is displayed\n", + "df.head(2)" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [conda root]", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}