[4d9a06]: / cancer_clinical_trials.ipynb

Download this file

508 lines (507 with data), 16.4 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0a8fc263",
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import sys, os, re, csv, math, codecs, numpy as np, pandas as pd\n",
    "\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.layers import BatchNormalization\n",
    "from keras.layers.advanced_activations import ELU\n",
    "from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, SpatialDropout1D\n",
    "from keras.layers import MaxPool1D, Flatten, Conv1D, GRU, GlobalAveragePooling1D, GlobalMaxPooling1D\n",
    "from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping\n",
    "from keras.layers import concatenate\n",
    "from keras.layers import Bidirectional, GlobalMaxPool1D\n",
    "from keras.models import Model\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "from keras import initializers, regularizers, constraints, optimizers, layers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "eb108433",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>study</th>\n",
       "      <th>condition</th>\n",
       "      <th>qualification</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>study interventions are recombinant CD40-ligand</td>\n",
       "      <td>melanoma skin diagnosis and no active cns met...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>study interventions are Liposomal doxorubicin</td>\n",
       "      <td>colorectal cancer diagnosis and cardiovascular</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>study interventions are BI 836909</td>\n",
       "      <td>multiple myeloma diagnosis and indwelling cen...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>study interventions are Immunoglobulins</td>\n",
       "      <td>recurrent fallopian tube carcinoma diagnosis ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>study interventions are Paclitaxel</td>\n",
       "      <td>stage ovarian cancer diagnosis and patients m...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999995</th>\n",
       "      <td>study interventions are Pazopanib</td>\n",
       "      <td>carcinoma renal cell diagnosis and pregnant o...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999996</th>\n",
       "      <td>study interventions are Dexamethasone 21-phosp...</td>\n",
       "      <td>uveal melanoma diagnosis and presence of any ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999997</th>\n",
       "      <td>study interventions are Camptothecin</td>\n",
       "      <td>rectal cancer diagnosis and creatinine cleara...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999998</th>\n",
       "      <td>study interventions are Cyclophosphamide</td>\n",
       "      <td>stage iii non hodgkin lymphoma diagnosis and ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999999</th>\n",
       "      <td>study interventions are Cyclophosphamide</td>\n",
       "      <td>lymphoma diagnosis and patients with active h...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000000 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    study  \\\n",
       "0        study interventions are recombinant CD40-ligand    \n",
       "1          study interventions are Liposomal doxorubicin    \n",
       "2                      study interventions are BI 836909    \n",
       "3                study interventions are Immunoglobulins    \n",
       "4                     study interventions are Paclitaxel    \n",
       "...                                                   ...   \n",
       "999995                 study interventions are Pazopanib    \n",
       "999996  study interventions are Dexamethasone 21-phosp...   \n",
       "999997              study interventions are Camptothecin    \n",
       "999998          study interventions are Cyclophosphamide    \n",
       "999999          study interventions are Cyclophosphamide    \n",
       "\n",
       "                                                condition  qualification  \n",
       "0        melanoma skin diagnosis and no active cns met...              0  \n",
       "1          colorectal cancer diagnosis and cardiovascular              0  \n",
       "2        multiple myeloma diagnosis and indwelling cen...              0  \n",
       "3        recurrent fallopian tube carcinoma diagnosis ...              0  \n",
       "4        stage ovarian cancer diagnosis and patients m...              0  \n",
       "...                                                   ...            ...  \n",
       "999995   carcinoma renal cell diagnosis and pregnant o...              1  \n",
       "999996   uveal melanoma diagnosis and presence of any ...              1  \n",
       "999997   rectal cancer diagnosis and creatinine cleara...              1  \n",
       "999998   stage iii non hodgkin lymphoma diagnosis and ...              1  \n",
       "999999   lymphoma diagnosis and patients with active h...              1  \n",
       "\n",
       "[1000000 rows x 3 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the trial dataset from CSV; the bare name on the last line renders it as the cell output.\n",
    "df = pd.read_csv('cancer_clinical_trials.csv')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d21147b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "embed_size = 50        # embedding width; matches the 50-d GloVe file read later\n",
    "max_features = 20_000  # vocabulary cap handed to the Tokenizer as num_words\n",
    "maxlen = 100           # sequences are padded/truncated to this many tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b46157fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target: selecting with a list keeps `y` two-dimensional, one column per entry in list_classes.\n",
    "list_classes = [\"qualification\"]\n",
    "y = df[list_classes].values\n",
    "\n",
    "# Model input: the `condition` text column, with missing entries replaced by a placeholder.\n",
    "list_sentences_train = df[\"condition\"].fillna(\"_na_\").values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a02b4b51",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the vocabulary on the training text only; num_words limits the indices that\n",
    "# texts_to_sequences will emit.\n",
    "tokenizer = Tokenizer(num_words=max_features)\n",
    "tokenizer.fit_on_texts(list(list_sentences_train))\n",
    "\n",
    "# Each sentence becomes a list of integer word indices.\n",
    "list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ded2bcb8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 97 157   2   1  36  62 251 146  54 301 331   3 391]]\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: show the first tokenised sentence as a single row of word indices.\n",
    "first_sequence = np.asarray(list_tokenized_train[0])\n",
    "print(first_sequence.reshape(1, -1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "07ce9fb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bring every training sequence to exactly `maxlen` tokens so they stack into one 2-D array.\n",
    "X_t = pad_sequences(\n",
    "    list_tokenized_train,\n",
    "    maxlen=maxlen,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "bf9032c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_coefs(word, *arr):\n",
    "    \"\"\"Pair a token with its embedding, converting the remaining fields to a float32 array.\"\"\"\n",
    "    return word, np.asarray(arr, dtype='float32')\n",
    "\n",
    "\n",
    "# Context manager so the GloVe file handle is closed as soon as the index is built.\n",
    "with open('glove.6B.50d.txt', encoding=\"utf8\") as glove_file:\n",
    "    embeddings_index = dict(get_coefs(*line.strip().split()) for line in glove_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "374c4566",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\AVANISH SINGH\\AppData\\Roaming\\Python\\Python38\\site-packages\\IPython\\core\\interactiveshell.py:3377: FutureWarning: arrays to stack must be passed as a \"sequence\" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.\n",
      "  if (await self.run_code(code, result,  async_=asy)):\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(0.020940226, 0.64410394)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# np.stack needs a real sequence: a dict view is deprecated input and errors on newer NumPy.\n",
    "all_embs = np.stack(list(embeddings_index.values()))\n",
    "\n",
    "# Global mean/std over every GloVe coefficient, used to initialise rows for unknown words.\n",
    "emb_mean, emb_std = all_embs.mean(), all_embs.std()\n",
    "emb_mean, emb_std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "86d7c350",
   "metadata": {},
   "outputs": [],
   "source": [
    "word_index = tokenizer.word_index\n",
    "\n",
    "# Tokenizer indices start at 1 (0 is left for padding), so a vocabulary of V words needs\n",
    "# V + 1 rows; without the +1 the last word would index past the end of the matrix.\n",
    "nb_words = min(max_features, len(word_index) + 1)\n",
    "\n",
    "# Random initialisation drawn from the GloVe statistics; rows for words that have a\n",
    "# pre-trained vector are overwritten in the following cell.\n",
    "embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5b8dfd1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bound the loop by the matrix's real row count: it can be smaller than max_features when\n",
    "# the fitted vocabulary is small, and indexing past it would raise IndexError.\n",
    "n_rows = embedding_matrix.shape[0]\n",
    "\n",
    "for word, i in word_index.items():\n",
    "    if i >= n_rows:\n",
    "        continue\n",
    "    embedding_vector = embeddings_index.get(word)\n",
    "    # Words absent from GloVe keep their random initial row.\n",
    "    if embedding_vector is not None:\n",
    "        embedding_matrix[i] = embedding_vector"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6bda8b85",
   "metadata": {},
   "source": [
    "# Binary cross-entropy measures the cross-entropy loss between the labels and the predictions. I use it because it works well when there are only two label classes (0 and 1)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "29628b45",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\AVANISH SINGH\\AppData\\Roaming\\Python\\Python38\\site-packages\\keras\\optimizer_v2\\adam.py:105: UserWarning: The `lr` argument is deprecated, use `learning_rate` instead.\n",
      "  super(Adam, self).__init__(name, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "# Architecture: pre-trained embeddings -> bidirectional GRU -> Conv1D -> avg + max pooling -> sigmoid.\n",
    "inp = Input(shape=(maxlen,))\n",
    "\n",
    "# Size the layer from the matrix itself so the supplied weights always match its shape,\n",
    "# including when the fitted vocabulary is smaller than max_features.\n",
    "x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)\n",
    "\n",
    "x = SpatialDropout1D(0.2)(x)\n",
    "x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)\n",
    "x = Conv1D(64, kernel_size=3, padding=\"valid\", kernel_initializer=\"glorot_uniform\")(x)\n",
    "\n",
    "# Summarise the convolved sequence two ways and hand both summaries to the classifier.\n",
    "avg_pool = GlobalAveragePooling1D()(x)\n",
    "max_pool = GlobalMaxPooling1D()(x)\n",
    "x = concatenate([avg_pool, max_pool])\n",
    "\n",
    "# Single sigmoid unit: one score in [0, 1] per input sequence.\n",
    "preds = Dense(1, activation=\"sigmoid\")(x)\n",
    "\n",
    "model = Model(inp, preds)\n",
    "\n",
    "# `lr` is a deprecated alias in tf.keras optimizers; `learning_rate` is the supported keyword.\n",
    "model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-4), metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "89a5af43",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/2\n",
      "14063/14063 [==============================] - ETA: 0s - loss: 0.3600 - accuracy: 0.8476WARNING:tensorflow:Learning rate reduction is conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy,lr\n",
      "14063/14063 [==============================] - 4049s 288ms/step - loss: 0.3600 - accuracy: 0.8476 - val_loss: 0.2868 - val_accuracy: 0.8814 - lr: 1.0000e-04\n",
      "Epoch 2/2\n",
      "14063/14063 [==============================] - ETA: 0s - loss: 0.2879 - accuracy: 0.8827WARNING:tensorflow:Learning rate reduction is conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy,lr\n",
      "14063/14063 [==============================] - 3962s 282ms/step - loss: 0.2879 - accuracy: 0.8827 - val_loss: 0.2140 - val_accuracy: 0.9168 - lr: 1.0000e-04\n"
     ]
    }
   ],
   "source": [
    "# `monitor` must name a metric Keras actually logs. With metrics=['accuracy'] the validation\n",
    "# metric is `val_accuracy`; an unknown name only produces a warning and the callback never fires.\n",
    "learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy',\n",
    "                                            patience=2,\n",
    "                                            verbose=1,\n",
    "                                            factor=0.5,\n",
    "                                            min_lr=0.00000001)\n",
    "\n",
    "# Halve the LR on an accuracy plateau, stop when val_loss stalls, and keep the best checkpoint.\n",
    "callbacks = [learning_rate_reduction,\n",
    "             EarlyStopping('val_loss', patience=3),\n",
    "             ModelCheckpoint('ic_model.h5', save_best_only=True)]\n",
    "\n",
    "# NOTE(review): validation_split holds out the *last* 10% of rows, taken before any shuffling.\n",
    "# The dataframe preview shows label 0 at the head and label 1 at the tail, so confirm the CSV\n",
    "# is not ordered by label (shuffle X_t / y together before fitting if it is).\n",
    "history = model.fit(X_t, y, batch_size=64, epochs=2, validation_split=0.1, callbacks=callbacks);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "3cdabfef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the held-out file and apply the same missing-value placeholder used for training.\n",
    "test = pd.read_csv('assignment_test.csv')\n",
    "list_sentences_test = test[\"condition\"].fillna(\"_na_\").values\n",
    "\n",
    "# Reuse the tokenizer fitted on the training text so word indices line up with the model.\n",
    "list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f5b11562",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same fixed length as the training matrix, so the model input shape matches.\n",
    "X_te = pad_sequences(\n",
    "    list_tokenized_test,\n",
    "    maxlen=maxlen,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "6b97b572",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[   0,    0,    0, ...,   82,   79,   22],\n",
       "       [   0,    0,    0, ...,   42, 1519, 5397],\n",
       "       [   0,    0,    0, ..., 1431,   30,  202],\n",
       "       ...,\n",
       "       [   0,    0,    0, ...,   12,  142,  146],\n",
       "       [   0,    0,    0, ...,    3,  521,  529],\n",
       "       [   0,    0,    0, ...,    2,    1,  144]])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the padded test matrix (NumPy abbreviates the repr for large arrays).\n",
    "X_te"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "42bf4889",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20/20 [==============================] - 7s 352ms/step\n"
     ]
    }
   ],
   "source": [
    "# Run inference on the padded test sequences in batches of 1024.\n",
    "y_pred = model.predict(\n",
    "    [X_te],\n",
    "    batch_size=1024,\n",
    "    verbose=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "d3f76370",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.9954495 ],\n",
       "       [0.3494191 ],\n",
       "       [0.9680229 ],\n",
       "       ...,\n",
       "       [0.00700411],\n",
       "       [0.25054577],\n",
       "       [0.67711437]], dtype=float32)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Raw sigmoid outputs, one score per test row; threshold them to obtain 0/1 labels.\n",
    "y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6909f6c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}