a b/1-Clean_Data.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "markdown",
5
   "metadata": {},
6
   "source": [
7
    "<h1><center> Clean and Organize data for the I-SPY1 Clinical Trial</center></h1>"
8
   ]
9
  },
10
  {
11
   "cell_type": "code",
12
   "execution_count": 1,
13
   "metadata": {},
14
   "outputs": [
15
    {
16
     "data": {
17
      "text/html": [
18
       "<div>\n",
19
       "<style>\n",
20
       "    .dataframe thead tr:only-child th {\n",
21
       "        text-align: right;\n",
22
       "    }\n",
23
       "\n",
24
       "    .dataframe thead th {\n",
25
       "        text-align: left;\n",
26
       "    }\n",
27
       "\n",
28
       "    .dataframe tbody tr th {\n",
29
       "        vertical-align: top;\n",
30
       "    }\n",
31
       "</style>\n",
32
       "<table border=\"1\" class=\"dataframe\">\n",
33
       "  <thead>\n",
34
       "    <tr style=\"text-align: right;\">\n",
35
       "      <th></th>\n",
36
       "      <th>age</th>\n",
37
       "      <th>White</th>\n",
38
       "      <th>ER+</th>\n",
39
       "      <th>PR+</th>\n",
40
       "      <th>HR+</th>\n",
41
       "      <th>Bilateral</th>\n",
42
       "      <th>Right_Breast</th>\n",
43
       "      <th>MRI_LD_Baseline</th>\n",
44
       "      <th>MRI_LD_1_3dAC</th>\n",
45
       "      <th>MRI_LD_Int_Reg</th>\n",
46
       "      <th>MRI_LD_PreSurg</th>\n",
47
       "      <th>Alive</th>\n",
48
       "      <th>Survival_length</th>\n",
49
       "      <th>RFS</th>\n",
50
       "      <th>RFS_code</th>\n",
51
       "      <th>PCR</th>\n",
52
       "      <th>RCB</th>\n",
53
       "    </tr>\n",
54
       "    <tr>\n",
55
       "      <th>SUBJECTID</th>\n",
56
       "      <th></th>\n",
57
       "      <th></th>\n",
58
       "      <th></th>\n",
59
       "      <th></th>\n",
60
       "      <th></th>\n",
61
       "      <th></th>\n",
62
       "      <th></th>\n",
63
       "      <th></th>\n",
64
       "      <th></th>\n",
65
       "      <th></th>\n",
66
       "      <th></th>\n",
67
       "      <th></th>\n",
68
       "      <th></th>\n",
69
       "      <th></th>\n",
70
       "      <th></th>\n",
71
       "      <th></th>\n",
72
       "      <th></th>\n",
73
       "    </tr>\n",
74
       "  </thead>\n",
75
       "  <tbody>\n",
76
       "    <tr>\n",
77
       "      <th>1001</th>\n",
78
       "      <td>38.73</td>\n",
79
       "      <td>Yes</td>\n",
80
       "      <td>Yes</td>\n",
81
       "      <td>No</td>\n",
82
       "      <td>Yes</td>\n",
83
       "      <td>No</td>\n",
84
       "      <td>No</td>\n",
85
       "      <td>88.0</td>\n",
86
       "      <td>78.0</td>\n",
87
       "      <td>30.0</td>\n",
88
       "      <td>14.0</td>\n",
89
       "      <td>No</td>\n",
90
       "      <td>1264</td>\n",
91
       "      <td>751</td>\n",
92
       "      <td>1</td>\n",
93
       "      <td>No</td>\n",
94
       "      <td>2.0</td>\n",
95
       "    </tr>\n",
96
       "    <tr>\n",
97
       "      <th>1002</th>\n",
98
       "      <td>37.79</td>\n",
99
       "      <td>Yes</td>\n",
100
       "      <td>Yes</td>\n",
101
       "      <td>Yes</td>\n",
102
       "      <td>Yes</td>\n",
103
       "      <td>No</td>\n",
104
       "      <td>Yes</td>\n",
105
       "      <td>29.0</td>\n",
106
       "      <td>26.0</td>\n",
107
       "      <td>66.0</td>\n",
108
       "      <td>16.0</td>\n",
109
       "      <td>No</td>\n",
110
       "      <td>1155</td>\n",
111
       "      <td>1043</td>\n",
112
       "      <td>1</td>\n",
113
       "      <td>No</td>\n",
114
       "      <td>3.0</td>\n",
115
       "    </tr>\n",
116
       "  </tbody>\n",
117
       "</table>\n",
118
       "</div>"
119
      ],
120
      "text/plain": [
121
       "             age White  ER+  PR+  HR+ Bilateral Right_Breast  MRI_LD_Baseline  \\\n",
122
       "SUBJECTID                                                                       \n",
123
       "1001       38.73   Yes  Yes   No  Yes        No           No             88.0   \n",
124
       "1002       37.79   Yes  Yes  Yes  Yes        No          Yes             29.0   \n",
125
       "\n",
126
       "           MRI_LD_1_3dAC  MRI_LD_Int_Reg  MRI_LD_PreSurg Alive  \\\n",
127
       "SUBJECTID                                                        \n",
128
       "1001                78.0            30.0            14.0    No   \n",
129
       "1002                26.0            66.0            16.0    No   \n",
130
       "\n",
131
       "           Survival_length   RFS  RFS_code PCR  RCB  \n",
132
       "SUBJECTID                                            \n",
133
       "1001                  1264   751         1  No  2.0  \n",
134
       "1002                  1155  1043         1  No  3.0  "
135
      ]
136
     },
137
     "execution_count": 1,
138
     "metadata": {},
139
     "output_type": "execute_result"
140
    }
141
   ],
142
   "source": [
143
    "# Imports: pandas first (third-party), then the project's clean_data module (by Julio).\n",
    "import pandas as pd\n",
    "\n",
    "from ispy1 import clean_data\n",
    "\n",
    "# Pass the raw I-SPY1 clinical/outcome spreadsheet to the project's cleaning routine.\n",
    "file = './data/I-SPY_1_All_Patient_Clinical_and_Outcome_Data.xlsx'\n",
    "df = clean_data.clean_my_data(file)\n",
    "\n",
    "# Save the cleaned data to a new CSV file.\n",
    "df.to_csv('./data/I-SPY_1_clean_data.csv')\n",
    "\n",
    "# Preview the first two rows; as the last expression of the cell it is what gets displayed.\n",
    "df.head(2)"
155
   ]
156
  }
157
 ],
158
 "metadata": {
159
  "anaconda-cloud": {},
160
  "kernelspec": {
161
   "display_name": "Python [conda root]",
162
   "language": "python",
163
   "name": "conda-root-py"
164
  },
165
  "language_info": {
166
   "codemirror_mode": {
167
    "name": "ipython",
168
    "version": 3
169
   },
170
   "file_extension": ".py",
171
   "mimetype": "text/x-python",
172
   "name": "python",
173
   "nbconvert_exporter": "python",
174
   "pygments_lexer": "ipython3",
175
   "version": "3.5.2"
176
  }
177
 },
178
 "nbformat": 4,
179
 "nbformat_minor": 2
180
}