--- a
+++ b/finalProject/240 final project_newest.ipynb
@@ -0,0 +1,3266 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "719a655c-4a20-493c-9475-582ae6c39e32",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "# Load raw data\n",
+    "df = pd.read_csv('overview-of-recordings.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "3915fae5-3aa7-4895-a07e-99003db61a2e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 6661 entries, 0 to 6660\n",
+      "Data columns (total 13 columns):\n",
+      " #   Column                               Non-Null Count  Dtype  \n",
+      "---  ------                               --------------  -----  \n",
+      " 0   audio_clipping                       6661 non-null   object \n",
+      " 1   audio_clipping:confidence            6661 non-null   float64\n",
+      " 2   background_noise_audible             6661 non-null   object \n",
+      " 3   background_noise_audible:confidence  6661 non-null   float64\n",
+      " 4   overall_quality_of_the_audio         6661 non-null   float64\n",
+      " 5   quiet_speaker                        6661 non-null   object \n",
+      " 6   quiet_speaker:confidence             6661 non-null   float64\n",
+      " 7   speaker_id                           6661 non-null   int64  \n",
+      " 8   file_download                        6661 non-null   object \n",
+      " 9   file_name                            6661 non-null   object \n",
+      " 10  phrase                               6661 non-null   object \n",
+      " 11  prompt                               6661 non-null   object \n",
+      " 12  writer_id                            6661 non-null   int64  \n",
+      "dtypes: float64(4), int64(2), object(7)\n",
+      "memory usage: 676.6+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "b2c4477c-b934-4da8-8333-5b9e336bb0e0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "There are 0 duplicate .\n"
+     ]
+    }
+   ],
+   "source": [
+    "#start cleansing\n",
+    "#count duplicate\n",
+    "duplicate=df.duplicated().sum()\n",
+    "print (f'There are', duplicate ,'duplicate .')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "45abc48b-790b-4f8d-98aa-2c29e1b8cf92",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>phrase</th>\n",
+       "      <th>prompt</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>When I remember her I feel down</td>\n",
+       "      <td>Emotional pain</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>When I carry heavy things I feel like breaking...</td>\n",
+       "      <td>Hair falling out</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>there is too much pain when i move my arm</td>\n",
+       "      <td>Heart hurts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>My son had his lip pierced and it is swollen a...</td>\n",
+       "      <td>Infected wound</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>My muscles in my lower back are aching</td>\n",
+       "      <td>Infected wound</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6656</th>\n",
+       "      <td>I feel a burning sensation in my guts about 2 ...</td>\n",
+       "      <td>Stomach ache</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6657</th>\n",
+       "      <td>I have a split on my thumb that will not heal.</td>\n",
+       "      <td>Open wound</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6658</th>\n",
+       "      <td>I feel a lot of pain in the joints.</td>\n",
+       "      <td>Joint pain</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6659</th>\n",
+       "      <td>The area around my heart doesn't feel good.</td>\n",
+       "      <td>Heart hurts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6660</th>\n",
+       "      <td>I complain alot with skin allergy</td>\n",
+       "      <td>Skin issue</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>6661 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                 phrase            prompt\n",
+       "0                       When I remember her I feel down    Emotional pain\n",
+       "1     When I carry heavy things I feel like breaking...  Hair falling out\n",
+       "2             there is too much pain when i move my arm       Heart hurts\n",
+       "3     My son had his lip pierced and it is swollen a...    Infected wound\n",
+       "4                My muscles in my lower back are aching    Infected wound\n",
+       "...                                                 ...               ...\n",
+       "6656  I feel a burning sensation in my guts about 2 ...      Stomach ache\n",
+       "6657     I have a split on my thumb that will not heal.        Open wound\n",
+       "6658                I feel a lot of pain in the joints.        Joint pain\n",
+       "6659        The area around my heart doesn't feel good.       Heart hurts\n",
+       "6660                  I complain alot with skin allergy        Skin issue\n",
+       "\n",
+       "[6661 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# collection the texts needed\n",
+    "Text = df[['phrase', 'prompt']]\n",
+    "Text\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "73ba1336",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'my', 'just', \"you've\", 'you', 'o', 'couldn', 'isn', 'do', 's', 're', \"wouldn't\", 'she', 'this', 'against', 'been', 'ain', \"isn't\", 'which', 'he', 'as', 'him', 'on', 'shan', 'did', 'further', 'am', 'is', 'out', \"couldn't\", 'then', \"weren't\", 'have', 'now', 'because', 'ourselves', 'll', 'under', 'can', 'what', 've', 'up', 'they', 'mightn', 'won', \"that'll\", 'we', 'where', \"didn't\", \"mightn't\", 'theirs', 'who', \"shouldn't\", 'these', \"you'd\", 'herself', 'by', 'down', 'when', \"aren't\", 'of', \"mustn't\", 'it', 'here', 'all', 'didn', 'his', 'but', 'y', 'our', 'those', 'some', \"should've\", 'ma', 'into', 'other', 'mustn', 'ours', 'myself', 'a', \"you're\", 'below', 'yourselves', 'me', 'more', 'i', 'yours', 'for', 'aren', \"hadn't\", 'their', 'are', 'there', 'doing', 'hasn', \"doesn't\", 'himself', 'few', 'same', 'between', 'having', \"you'll\", 'not', 'should', 'hers', 'themselves', 'has', 'if', 'haven', 'about', 'during', 'off', \"needn't\", 'own', 'was', 'does', 'only', \"wasn't\", 'were', 'nor', 'm', 'had', 'such', 'than', 'from', 'at', \"she's\", 'to', 'in', \"don't\", \"hasn't\", 'be', 'any', 'that', 'her', 'the', \"it's\", 'd', 'your', \"haven't\", \"won't\", 'them', 'doesn', 'how', 'don', 'weren', 'why', 'an', 'yourself', 'before', 'hadn', 'with', 'wasn', 'wouldn', \"shan't\", 'while', 'both', 'each', 'most', 'or', 'its', 'and', 'itself', 'through', 'after', 'will', 'very', 'again', 'once', 'no', 'being', 'until', 'shouldn', 'needn', 'too', 'whom', 'above', 't', 'so', 'over'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "#nltk.download() \n",
+    "from nltk.corpus import stopwords\n",
+    "# save English stopwords\n",
+    "stopwords_list = set(stopwords.words(\"english\"))\n",
+    "print(stopwords_list)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e6bd0c9a",
+   "metadata": {},
+   "source": [
+    "## Clean Text Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "39b72f50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Clean text data\n",
+    "from nltk.corpus import wordnet as wn \n",
+    "from nltk.stem import WordNetLemmatizer \n",
+    "from nltk.tokenize import word_tokenize\n",
+    "import string\n",
+    "import re\n",
+    "\n",
+    "def phrase_cleanse(phrase):\n",
+    "    #Tokenize and divide phrase into separate words\n",
+    "    token_words = word_tokenize(phrase)\n",
+    "    \n",
+    "    # Convert all texts to lower cases\n",
+    "    words_step1 = []\n",
+    "    for word_1 in token_words:\n",
+    "        words_step1.append(word_1.lower())\n",
+    "    \n",
+    "    #Clear all punctuation\n",
+    "    words_step2 = [] \n",
+    "    for word_2 in words_step1:\n",
+    "        word_cleaned = re.sub(r'[^\\w\\s]','',word_2)\n",
+    "        words_step2.append(word_cleaned)\n",
+    "    \n",
+    "    #Clean the text list\n",
+    "    words_step3 = []\n",
+    "    for word_3 in words_step2:\n",
+    "        # check if every characters are alphbets\n",
+    "        if word_3.isalpha():\n",
+    "            # get rid of stop words\n",
+    "            if word_3 not in list(stopwords_list):\n",
+    "                words_step3.append(word_3)\n",
+    "            else:\n",
+    "                continue\n",
+    "    \n",
+    "    #Lemmatization - group different forms of same word which has more than 2 characters into one word\n",
+    "    lem = nltk.stem.WordNetLemmatizer()\n",
+    "    lem_list = []\n",
+    "    for word_4 in words_step3:\n",
+    "        if(len(word_4) > 2):\n",
+    "            lem_list.append(lem.lemmatize(word_4))\n",
+    "    \n",
+    "    join_text = \" \".join(lem_list)\n",
+    "    \n",
+    "    return join_text\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "c6435d05",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>phrase</th>\n",
+       "      <th>prompt</th>\n",
+       "      <th>new_text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>When I remember her I feel down</td>\n",
+       "      <td>Emotional pain</td>\n",
+       "      <td>remember feel</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>When I carry heavy things I feel like breaking...</td>\n",
+       "      <td>Hair falling out</td>\n",
+       "      <td>carry heavy thing feel like breaking back</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>there is too much pain when i move my arm</td>\n",
+       "      <td>Heart hurts</td>\n",
+       "      <td>much pain move arm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>My son had his lip pierced and it is swollen a...</td>\n",
+       "      <td>Infected wound</td>\n",
+       "      <td>son lip pierced swollen skin inside lip grey l...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>My muscles in my lower back are aching</td>\n",
+       "      <td>Infected wound</td>\n",
+       "      <td>muscle lower back aching</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6656</th>\n",
+       "      <td>I feel a burning sensation in my guts about 2 ...</td>\n",
+       "      <td>Stomach ache</td>\n",
+       "      <td>feel burning sensation gut hour meal</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6657</th>\n",
+       "      <td>I have a split on my thumb that will not heal.</td>\n",
+       "      <td>Open wound</td>\n",
+       "      <td>split thumb heal</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6658</th>\n",
+       "      <td>I feel a lot of pain in the joints.</td>\n",
+       "      <td>Joint pain</td>\n",
+       "      <td>feel lot pain joint</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6659</th>\n",
+       "      <td>The area around my heart doesn't feel good.</td>\n",
+       "      <td>Heart hurts</td>\n",
+       "      <td>area around heart feel good</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6660</th>\n",
+       "      <td>I complain alot with skin allergy</td>\n",
+       "      <td>Skin issue</td>\n",
+       "      <td>complain alot skin allergy</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>6661 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                 phrase            prompt  \\\n",
+       "0                       When I remember her I feel down    Emotional pain   \n",
+       "1     When I carry heavy things I feel like breaking...  Hair falling out   \n",
+       "2             there is too much pain when i move my arm       Heart hurts   \n",
+       "3     My son had his lip pierced and it is swollen a...    Infected wound   \n",
+       "4                My muscles in my lower back are aching    Infected wound   \n",
+       "...                                                 ...               ...   \n",
+       "6656  I feel a burning sensation in my guts about 2 ...      Stomach ache   \n",
+       "6657     I have a split on my thumb that will not heal.        Open wound   \n",
+       "6658                I feel a lot of pain in the joints.        Joint pain   \n",
+       "6659        The area around my heart doesn't feel good.       Heart hurts   \n",
+       "6660                  I complain alot with skin allergy        Skin issue   \n",
+       "\n",
+       "                                               new_text  \n",
+       "0                                         remember feel  \n",
+       "1             carry heavy thing feel like breaking back  \n",
+       "2                                    much pain move arm  \n",
+       "3     son lip pierced swollen skin inside lip grey l...  \n",
+       "4                              muscle lower back aching  \n",
+       "...                                                 ...  \n",
+       "6656               feel burning sensation gut hour meal  \n",
+       "6657                                   split thumb heal  \n",
+       "6658                                feel lot pain joint  \n",
+       "6659                        area around heart feel good  \n",
+       "6660                         complain alot skin allergy  \n",
+       "\n",
+       "[6661 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Cleaned Data Result\n",
+    "import numpy as np\n",
+    "#Text[\"new_text\"] = Text[\"phrase\"].apply(text_clean)\n",
+    "#Text \n",
+    "text = np.array(Text.loc[:,'phrase'])\n",
+    "new_text = []\n",
+    "for i in text:\n",
+    "    new_text.append(phrase_cleanse(i))\n",
+    "Text.insert(2,'new_text',new_text)\n",
+    "Text"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "49e0d8f0",
+   "metadata": {},
+   "source": [
+    "## TF-IDF (Term Frequency-Inverse Document Frequency)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "071e44fc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>prompt</th>\n",
+       "      <th>abdomen</th>\n",
+       "      <th>abdominal</th>\n",
+       "      <th>able</th>\n",
+       "      <th>abronchial</th>\n",
+       "      <th>accident</th>\n",
+       "      <th>accidentally</th>\n",
+       "      <th>accompanied</th>\n",
+       "      <th>ache</th>\n",
+       "      <th>aching</th>\n",
+       "      <th>...</th>\n",
+       "      <th>wound</th>\n",
+       "      <th>wrap</th>\n",
+       "      <th>write</th>\n",
+       "      <th>wrong</th>\n",
+       "      <th>yard</th>\n",
+       "      <th>year</th>\n",
+       "      <th>yellow</th>\n",
+       "      <th>yesterday</th>\n",
+       "      <th>young</th>\n",
+       "      <th>zit</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Emotional pain</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Hair falling out</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Heart hurts</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Infected wound</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Infected wound</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.593903</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6656</th>\n",
+       "      <td>Stomach ache</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6657</th>\n",
+       "      <td>Open wound</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6658</th>\n",
+       "      <td>Joint pain</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6659</th>\n",
+       "      <td>Heart hurts</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6660</th>\n",
+       "      <td>Skin issue</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>6661 rows × 954 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                prompt  abdomen  abdominal  able  abronchial  accident  \\\n",
+       "0       Emotional pain      0.0        0.0   0.0         0.0       0.0   \n",
+       "1     Hair falling out      0.0        0.0   0.0         0.0       0.0   \n",
+       "2          Heart hurts      0.0        0.0   0.0         0.0       0.0   \n",
+       "3       Infected wound      0.0        0.0   0.0         0.0       0.0   \n",
+       "4       Infected wound      0.0        0.0   0.0         0.0       0.0   \n",
+       "...                ...      ...        ...   ...         ...       ...   \n",
+       "6656      Stomach ache      0.0        0.0   0.0         0.0       0.0   \n",
+       "6657        Open wound      0.0        0.0   0.0         0.0       0.0   \n",
+       "6658        Joint pain      0.0        0.0   0.0         0.0       0.0   \n",
+       "6659       Heart hurts      0.0        0.0   0.0         0.0       0.0   \n",
+       "6660        Skin issue      0.0        0.0   0.0         0.0       0.0   \n",
+       "\n",
+       "      accidentally  accompanied  ache    aching  ...  wound  wrap  write  \\\n",
+       "0              0.0          0.0   0.0  0.000000  ...    0.0   0.0    0.0   \n",
+       "1              0.0          0.0   0.0  0.000000  ...    0.0   0.0    0.0   \n",
+       "2              0.0          0.0   0.0  0.000000  ...    0.0   0.0    0.0   \n",
+       "3              0.0          0.0   0.0  0.000000  ...    0.0   0.0    0.0   \n",
+       "4              0.0          0.0   0.0  0.593903  ...    0.0   0.0    0.0   \n",
+       "...            ...          ...   ...       ...  ...    ...   ...    ...   \n",
+       "6656           0.0          0.0   0.0  0.000000  ...    0.0   0.0    0.0   \n",
+       "6657           0.0          0.0   0.0  0.000000  ...    0.0   0.0    0.0   \n",
+       "6658           0.0          0.0   0.0  0.000000  ...    0.0   0.0    0.0   \n",
+       "6659           0.0          0.0   0.0  0.000000  ...    0.0   0.0    0.0   \n",
+       "6660           0.0          0.0   0.0  0.000000  ...    0.0   0.0    0.0   \n",
+       "\n",
+       "      wrong  yard  year  yellow  yesterday  young  zit  \n",
+       "0       0.0   0.0   0.0     0.0        0.0    0.0  0.0  \n",
+       "1       0.0   0.0   0.0     0.0        0.0    0.0  0.0  \n",
+       "2       0.0   0.0   0.0     0.0        0.0    0.0  0.0  \n",
+       "3       0.0   0.0   0.0     0.0        0.0    0.0  0.0  \n",
+       "4       0.0   0.0   0.0     0.0        0.0    0.0  0.0  \n",
+       "...     ...   ...   ...     ...        ...    ...  ...  \n",
+       "6656    0.0   0.0   0.0     0.0        0.0    0.0  0.0  \n",
+       "6657    0.0   0.0   0.0     0.0        0.0    0.0  0.0  \n",
+       "6658    0.0   0.0   0.0     0.0        0.0    0.0  0.0  \n",
+       "6659    0.0   0.0   0.0     0.0        0.0    0.0  0.0  \n",
+       "6660    0.0   0.0   0.0     0.0        0.0    0.0  0.0  \n",
+       "\n",
+       "[6661 rows x 954 columns]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Use TF-IDF to evaluate how relevant the words are in the file\n",
+    "# tf_idf = tf(word)*idf(word)\n",
+    "\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "# Create and fit tf_idf model\n",
+    "text_vectorize = TfidfVectorizer()\n",
+    "X_tf_idf = text_vectorize.fit_transform(Text[\"new_text\"])\n",
+    "\n",
+    "dense_list = X_tf_idf.todense().tolist()\n",
+    "feature_names = text_vectorize.get_feature_names()\n",
+    "df_tf_idf = pd.DataFrame(dense_list, columns = feature_names)\n",
+    "\n",
+    "# concatenate prompt column with tf_idf matrix\n",
+    "text_tf_idf = pd.concat([Text[\"prompt\"], df_tf_idf], axis = 1)\n",
+    "text_tf_idf\n",
+    "\n",
+    "#text_tf_idf.to_csv(f\"tf_idf.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "77627c3d",
+   "metadata": {},
+   "source": [
+    "## HashingVectorizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "92cce5d7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>prompt</th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "      <th>6</th>\n",
+       "      <th>7</th>\n",
+       "      <th>8</th>\n",
+       "      <th>...</th>\n",
+       "      <th>65</th>\n",
+       "      <th>66</th>\n",
+       "      <th>67</th>\n",
+       "      <th>68</th>\n",
+       "      <th>69</th>\n",
+       "      <th>70</th>\n",
+       "      <th>71</th>\n",
+       "      <th>72</th>\n",
+       "      <th>73</th>\n",
+       "      <th>74</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Emotional pain</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>-0.707107</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Hair falling out</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>-0.377964</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>-0.377964</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Heart hurts</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Infected wound</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.534522</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>-0.534522</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Infected wound</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.500000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6656</th>\n",
+       "      <td>Stomach ache</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>-0.353553</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.353553</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>0.707107</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.353553</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6657</th>\n",
+       "      <td>Open wound</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>-0.57735</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6658</th>\n",
+       "      <td>Joint pain</td>\n",
+       "      <td>0.5</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>-0.500000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.5</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6659</th>\n",
+       "      <td>Heart hurts</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>-0.447214</td>\n",
+       "      <td>0.447214</td>\n",
+       "      <td>-0.447214</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6660</th>\n",
+       "      <td>Skin issue</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>6661 rows × 76 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                prompt    0    1    2         3         4         5    6    7  \\\n",
+       "0       Emotional pain  0.0  0.0  0.0  0.000000  0.000000 -0.707107  0.0  0.0   \n",
+       "1     Hair falling out  0.0  0.0  0.0 -0.377964  0.000000 -0.377964  0.0  0.0   \n",
+       "2          Heart hurts  0.0  0.0  0.0  0.000000  0.000000  0.000000  0.0  0.0   \n",
+       "3       Infected wound  0.0  0.0  0.0  0.000000  0.000000  0.000000  0.0  0.0   \n",
+       "4       Infected wound  0.0  0.0  0.0  0.000000  0.500000  0.000000  0.0  0.0   \n",
+       "...                ...  ...  ...  ...       ...       ...       ...  ...  ...   \n",
+       "6656      Stomach ache  0.0  0.0  0.0  0.000000  0.000000 -0.353553  0.0  0.0   \n",
+       "6657        Open wound  0.0  0.0  0.0  0.000000  0.000000  0.000000  0.0  0.0   \n",
+       "6658        Joint pain  0.5  0.0  0.0  0.000000  0.000000 -0.500000  0.0  0.0   \n",
+       "6659       Heart hurts  0.0  0.0  0.0 -0.447214  0.447214 -0.447214  0.0  0.0   \n",
+       "6660        Skin issue  0.0  0.0  0.0  0.000000  0.000000  0.000000  0.0  0.0   \n",
+       "\n",
+       "             8  ...        65   66       67        68   69   70   71  \\\n",
+       "0     0.000000  ...  0.000000  0.0  0.00000  0.000000  0.0  0.0  0.0   \n",
+       "1     0.000000  ...  0.000000  0.0  0.00000  0.000000  0.0  0.0  0.0   \n",
+       "2     0.000000  ...  0.000000  0.0  0.00000  0.000000  0.0  0.0  0.0   \n",
+       "3     0.000000  ...  0.534522  0.0  0.00000  0.000000  0.0  0.0  0.0   \n",
+       "4     0.000000  ...  0.000000  0.0  0.00000  0.000000  0.0  0.0  0.0   \n",
+       "...        ...  ...       ...  ...      ...       ...  ...  ...  ...   \n",
+       "6656  0.353553  ...  0.000000  0.0  0.00000  0.707107  0.0  0.0  0.0   \n",
+       "6657  0.000000  ...  0.000000  0.0 -0.57735  0.000000  0.0  0.0  0.0   \n",
+       "6658  0.000000  ...  0.000000  0.5  0.00000  0.000000  0.0  0.0  0.0   \n",
+       "6659  0.000000  ...  0.000000  0.0  0.00000  0.000000  0.0  0.0  0.0   \n",
+       "6660  0.000000  ...  0.000000  0.0  0.00000  0.000000  0.0  0.0  0.0   \n",
+       "\n",
+       "            72   73   74  \n",
+       "0     0.000000  0.0  0.0  \n",
+       "1     0.000000  0.0  0.0  \n",
+       "2     0.000000  0.0  0.0  \n",
+       "3    -0.534522  0.0  0.0  \n",
+       "4     0.000000  0.0  0.0  \n",
+       "...        ...  ...  ...  \n",
+       "6656  0.353553  0.0  0.0  \n",
+       "6657  0.000000  0.0  0.0  \n",
+       "6658  0.000000  0.0  0.0  \n",
+       "6659  0.000000  0.0  0.0  \n",
+       "6660  0.000000  0.0  0.0  \n",
+       "\n",
+       "[6661 rows x 76 columns]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Use hashing vectorizer to consider the importance of counts and frequencies of words\n",
+    "# carefully initiate the pre-defined matrix size. \n",
+    "#Small numbers of features are likely to cause hash collisions\n",
+    "#large numbers will cause larger coefficient dimensions in linear learners\n",
+    "# good for large dataset since it don't store the vocabulary and easily been loaded when needed\n",
+    "\n",
+    "from sklearn.feature_extraction.text import HashingVectorizer\n",
+    "\n",
+    "# determine the size of matrix and generate hash vectorizer\n",
+    "n = Text['prompt'].nunique()\n",
+    "text_hashvectorize = HashingVectorizer(n_features = n*3)\n",
+    "X_hash = text_hashvectorize.fit_transform(Text[\"new_text\"])\n",
+    "\n",
+    "df_hash_vectorize = pd.DataFrame(X_hash.toarray())\n",
+    "\n",
+    "# concatenate prompt column with hash vectorized matrix\n",
+    "text_hash_vectorize = pd.concat([Text[\"prompt\"], df_hash_vectorize], axis = 1)\n",
+    "text_hash_vectorize\n",
+    "\n",
+    "#text_hash_vectorize.to_csv(f\"hash_vectorize.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "81396cbc",
+   "metadata": {},
+   "source": [
+    "## CountVectorizer (Bag of Words)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "bcf420d8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>prompt</th>\n",
+       "      <th>abdomen</th>\n",
+       "      <th>abdominal</th>\n",
+       "      <th>able</th>\n",
+       "      <th>abronchial</th>\n",
+       "      <th>accident</th>\n",
+       "      <th>accidentally</th>\n",
+       "      <th>accompanied</th>\n",
+       "      <th>ache</th>\n",
+       "      <th>aching</th>\n",
+       "      <th>...</th>\n",
+       "      <th>wound</th>\n",
+       "      <th>wrap</th>\n",
+       "      <th>write</th>\n",
+       "      <th>wrong</th>\n",
+       "      <th>yard</th>\n",
+       "      <th>year</th>\n",
+       "      <th>yellow</th>\n",
+       "      <th>yesterday</th>\n",
+       "      <th>young</th>\n",
+       "      <th>zit</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Emotional pain</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Hair falling out</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Heart hurts</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Infected wound</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Infected wound</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6656</th>\n",
+       "      <td>Stomach ache</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6657</th>\n",
+       "      <td>Open wound</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6658</th>\n",
+       "      <td>Joint pain</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6659</th>\n",
+       "      <td>Heart hurts</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6660</th>\n",
+       "      <td>Skin issue</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>6661 rows × 954 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                prompt  abdomen  abdominal  able  abronchial  accident  \\\n",
+       "0       Emotional pain        0          0     0           0         0   \n",
+       "1     Hair falling out        0          0     0           0         0   \n",
+       "2          Heart hurts        0          0     0           0         0   \n",
+       "3       Infected wound        0          0     0           0         0   \n",
+       "4       Infected wound        0          0     0           0         0   \n",
+       "...                ...      ...        ...   ...         ...       ...   \n",
+       "6656      Stomach ache        0          0     0           0         0   \n",
+       "6657        Open wound        0          0     0           0         0   \n",
+       "6658        Joint pain        0          0     0           0         0   \n",
+       "6659       Heart hurts        0          0     0           0         0   \n",
+       "6660        Skin issue        0          0     0           0         0   \n",
+       "\n",
+       "      accidentally  accompanied  ache  aching  ...  wound  wrap  write  wrong  \\\n",
+       "0                0            0     0       0  ...      0     0      0      0   \n",
+       "1                0            0     0       0  ...      0     0      0      0   \n",
+       "2                0            0     0       0  ...      0     0      0      0   \n",
+       "3                0            0     0       0  ...      0     0      0      0   \n",
+       "4                0            0     0       1  ...      0     0      0      0   \n",
+       "...            ...          ...   ...     ...  ...    ...   ...    ...    ...   \n",
+       "6656             0            0     0       0  ...      0     0      0      0   \n",
+       "6657             0            0     0       0  ...      0     0      0      0   \n",
+       "6658             0            0     0       0  ...      0     0      0      0   \n",
+       "6659             0            0     0       0  ...      0     0      0      0   \n",
+       "6660             0            0     0       0  ...      0     0      0      0   \n",
+       "\n",
+       "      yard  year  yellow  yesterday  young  zit  \n",
+       "0        0     0       0          0      0    0  \n",
+       "1        0     0       0          0      0    0  \n",
+       "2        0     0       0          0      0    0  \n",
+       "3        0     0       0          0      0    0  \n",
+       "4        0     0       0          0      0    0  \n",
+       "...    ...   ...     ...        ...    ...  ...  \n",
+       "6656     0     0       0          0      0    0  \n",
+       "6657     0     0       0          0      0    0  \n",
+       "6658     0     0       0          0      0    0  \n",
+       "6659     0     0       0          0      0    0  \n",
+       "6660     0     0       0          0      0    0  \n",
+       "\n",
+       "[6661 rows x 954 columns]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# bag_of_words feature extract\n",
+    "\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "\n",
+    "# extract feature using bag_of_words\n",
+    "bag_word = CountVectorizer()\n",
+    "feature_bow = bag_word.fit_transform(Text[\"new_text\"].values)\n",
+    "\n",
+    "# maping feature \n",
+    "df_bow = pd.DataFrame(feature_bow.todense().tolist(), columns = bag_word.get_feature_names())\n",
+    "\n",
+    "# concatenate prompt column with bow matrix\n",
+    "bag_word_df = pd.concat([Text['prompt'], df_bow], axis = 1)\n",
+    "bag_word_df\n",
+    "\n",
+    "#bag_word_df.to_csv('bag_word_df.csv',index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fa5c3196",
+   "metadata": {},
+   "source": [
+    "# Word2Vec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "4f1d0ab2",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>prompt</th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "      <th>6</th>\n",
+       "      <th>7</th>\n",
+       "      <th>8</th>\n",
+       "      <th>...</th>\n",
+       "      <th>90</th>\n",
+       "      <th>91</th>\n",
+       "      <th>92</th>\n",
+       "      <th>93</th>\n",
+       "      <th>94</th>\n",
+       "      <th>95</th>\n",
+       "      <th>96</th>\n",
+       "      <th>97</th>\n",
+       "      <th>98</th>\n",
+       "      <th>99</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Emotional pain</td>\n",
+       "      <td>-0.076306</td>\n",
+       "      <td>0.144870</td>\n",
+       "      <td>0.083767</td>\n",
+       "      <td>0.095612</td>\n",
+       "      <td>0.066345</td>\n",
+       "      <td>-0.226576</td>\n",
+       "      <td>0.062405</td>\n",
+       "      <td>0.405740</td>\n",
+       "      <td>-0.126576</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.176404</td>\n",
+       "      <td>0.113494</td>\n",
+       "      <td>0.057781</td>\n",
+       "      <td>0.016227</td>\n",
+       "      <td>0.233634</td>\n",
+       "      <td>0.097241</td>\n",
+       "      <td>0.095603</td>\n",
+       "      <td>-0.143557</td>\n",
+       "      <td>0.059992</td>\n",
+       "      <td>-0.057431</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Hair falling out</td>\n",
+       "      <td>-0.129883</td>\n",
+       "      <td>0.241534</td>\n",
+       "      <td>0.109110</td>\n",
+       "      <td>0.142094</td>\n",
+       "      <td>0.119210</td>\n",
+       "      <td>-0.391520</td>\n",
+       "      <td>0.095924</td>\n",
+       "      <td>0.651806</td>\n",
+       "      <td>-0.195424</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.297623</td>\n",
+       "      <td>0.181590</td>\n",
+       "      <td>0.079316</td>\n",
+       "      <td>0.022262</td>\n",
+       "      <td>0.362759</td>\n",
+       "      <td>0.156125</td>\n",
+       "      <td>0.161772</td>\n",
+       "      <td>-0.229045</td>\n",
+       "      <td>0.110560</td>\n",
+       "      <td>-0.102366</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Heart hurts</td>\n",
+       "      <td>-0.138489</td>\n",
+       "      <td>0.248988</td>\n",
+       "      <td>0.117522</td>\n",
+       "      <td>0.148499</td>\n",
+       "      <td>0.124959</td>\n",
+       "      <td>-0.413804</td>\n",
+       "      <td>0.112814</td>\n",
+       "      <td>0.685549</td>\n",
+       "      <td>-0.227354</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.313381</td>\n",
+       "      <td>0.186616</td>\n",
+       "      <td>0.080913</td>\n",
+       "      <td>0.023393</td>\n",
+       "      <td>0.373568</td>\n",
+       "      <td>0.158586</td>\n",
+       "      <td>0.175424</td>\n",
+       "      <td>-0.229141</td>\n",
+       "      <td>0.104984</td>\n",
+       "      <td>-0.111788</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Infected wound</td>\n",
+       "      <td>-0.133123</td>\n",
+       "      <td>0.237146</td>\n",
+       "      <td>0.101629</td>\n",
+       "      <td>0.131735</td>\n",
+       "      <td>0.109698</td>\n",
+       "      <td>-0.401296</td>\n",
+       "      <td>0.086977</td>\n",
+       "      <td>0.651547</td>\n",
+       "      <td>-0.183476</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.297889</td>\n",
+       "      <td>0.185091</td>\n",
+       "      <td>0.068119</td>\n",
+       "      <td>0.004698</td>\n",
+       "      <td>0.345084</td>\n",
+       "      <td>0.155465</td>\n",
+       "      <td>0.173034</td>\n",
+       "      <td>-0.211400</td>\n",
+       "      <td>0.103105</td>\n",
+       "      <td>-0.107905</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Infected wound</td>\n",
+       "      <td>-0.151671</td>\n",
+       "      <td>0.268915</td>\n",
+       "      <td>0.121587</td>\n",
+       "      <td>0.152607</td>\n",
+       "      <td>0.134066</td>\n",
+       "      <td>-0.438037</td>\n",
+       "      <td>0.122253</td>\n",
+       "      <td>0.726806</td>\n",
+       "      <td>-0.223599</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.338446</td>\n",
+       "      <td>0.204209</td>\n",
+       "      <td>0.077224</td>\n",
+       "      <td>0.024273</td>\n",
+       "      <td>0.405920</td>\n",
+       "      <td>0.180556</td>\n",
+       "      <td>0.188474</td>\n",
+       "      <td>-0.251705</td>\n",
+       "      <td>0.119200</td>\n",
+       "      <td>-0.121408</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6656</th>\n",
+       "      <td>Stomach ache</td>\n",
+       "      <td>-0.115034</td>\n",
+       "      <td>0.212170</td>\n",
+       "      <td>0.102355</td>\n",
+       "      <td>0.127260</td>\n",
+       "      <td>0.111012</td>\n",
+       "      <td>-0.356944</td>\n",
+       "      <td>0.088314</td>\n",
+       "      <td>0.597305</td>\n",
+       "      <td>-0.175593</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.266632</td>\n",
+       "      <td>0.163279</td>\n",
+       "      <td>0.060654</td>\n",
+       "      <td>0.009344</td>\n",
+       "      <td>0.324350</td>\n",
+       "      <td>0.141180</td>\n",
+       "      <td>0.152941</td>\n",
+       "      <td>-0.196083</td>\n",
+       "      <td>0.092586</td>\n",
+       "      <td>-0.085266</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6657</th>\n",
+       "      <td>Open wound</td>\n",
+       "      <td>-0.082318</td>\n",
+       "      <td>0.157860</td>\n",
+       "      <td>0.067382</td>\n",
+       "      <td>0.077496</td>\n",
+       "      <td>0.079493</td>\n",
+       "      <td>-0.261548</td>\n",
+       "      <td>0.057403</td>\n",
+       "      <td>0.424254</td>\n",
+       "      <td>-0.119038</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.188105</td>\n",
+       "      <td>0.113617</td>\n",
+       "      <td>0.041503</td>\n",
+       "      <td>0.008333</td>\n",
+       "      <td>0.222731</td>\n",
+       "      <td>0.104138</td>\n",
+       "      <td>0.115030</td>\n",
+       "      <td>-0.143077</td>\n",
+       "      <td>0.069211</td>\n",
+       "      <td>-0.070054</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6658</th>\n",
+       "      <td>Joint pain</td>\n",
+       "      <td>-0.131884</td>\n",
+       "      <td>0.256373</td>\n",
+       "      <td>0.128843</td>\n",
+       "      <td>0.154810</td>\n",
+       "      <td>0.125906</td>\n",
+       "      <td>-0.405019</td>\n",
+       "      <td>0.116038</td>\n",
+       "      <td>0.680855</td>\n",
+       "      <td>-0.219588</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.304285</td>\n",
+       "      <td>0.186892</td>\n",
+       "      <td>0.076803</td>\n",
+       "      <td>0.021421</td>\n",
+       "      <td>0.373776</td>\n",
+       "      <td>0.150687</td>\n",
+       "      <td>0.167834</td>\n",
+       "      <td>-0.229338</td>\n",
+       "      <td>0.108377</td>\n",
+       "      <td>-0.089778</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6659</th>\n",
+       "      <td>Heart hurts</td>\n",
+       "      <td>-0.110055</td>\n",
+       "      <td>0.223382</td>\n",
+       "      <td>0.098995</td>\n",
+       "      <td>0.134150</td>\n",
+       "      <td>0.111787</td>\n",
+       "      <td>-0.366419</td>\n",
+       "      <td>0.095155</td>\n",
+       "      <td>0.613407</td>\n",
+       "      <td>-0.187147</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.274841</td>\n",
+       "      <td>0.174951</td>\n",
+       "      <td>0.079736</td>\n",
+       "      <td>0.016118</td>\n",
+       "      <td>0.335088</td>\n",
+       "      <td>0.144033</td>\n",
+       "      <td>0.159089</td>\n",
+       "      <td>-0.211854</td>\n",
+       "      <td>0.105012</td>\n",
+       "      <td>-0.087460</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6660</th>\n",
+       "      <td>Skin issue</td>\n",
+       "      <td>-0.101181</td>\n",
+       "      <td>0.170834</td>\n",
+       "      <td>0.069351</td>\n",
+       "      <td>0.099832</td>\n",
+       "      <td>0.083726</td>\n",
+       "      <td>-0.287154</td>\n",
+       "      <td>0.061585</td>\n",
+       "      <td>0.473658</td>\n",
+       "      <td>-0.139816</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.218798</td>\n",
+       "      <td>0.127264</td>\n",
+       "      <td>0.041160</td>\n",
+       "      <td>0.004311</td>\n",
+       "      <td>0.256762</td>\n",
+       "      <td>0.120410</td>\n",
+       "      <td>0.131737</td>\n",
+       "      <td>-0.151181</td>\n",
+       "      <td>0.088392</td>\n",
+       "      <td>-0.077834</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>6661 rows × 101 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                prompt         0         1         2         3         4  \\\n",
+       "0       Emotional pain -0.076306  0.144870  0.083767  0.095612  0.066345   \n",
+       "1     Hair falling out -0.129883  0.241534  0.109110  0.142094  0.119210   \n",
+       "2          Heart hurts -0.138489  0.248988  0.117522  0.148499  0.124959   \n",
+       "3       Infected wound -0.133123  0.237146  0.101629  0.131735  0.109698   \n",
+       "4       Infected wound -0.151671  0.268915  0.121587  0.152607  0.134066   \n",
+       "...                ...       ...       ...       ...       ...       ...   \n",
+       "6656      Stomach ache -0.115034  0.212170  0.102355  0.127260  0.111012   \n",
+       "6657        Open wound -0.082318  0.157860  0.067382  0.077496  0.079493   \n",
+       "6658        Joint pain -0.131884  0.256373  0.128843  0.154810  0.125906   \n",
+       "6659       Heart hurts -0.110055  0.223382  0.098995  0.134150  0.111787   \n",
+       "6660        Skin issue -0.101181  0.170834  0.069351  0.099832  0.083726   \n",
+       "\n",
+       "             5         6         7         8  ...        90        91  \\\n",
+       "0    -0.226576  0.062405  0.405740 -0.126576  ...  0.176404  0.113494   \n",
+       "1    -0.391520  0.095924  0.651806 -0.195424  ...  0.297623  0.181590   \n",
+       "2    -0.413804  0.112814  0.685549 -0.227354  ...  0.313381  0.186616   \n",
+       "3    -0.401296  0.086977  0.651547 -0.183476  ...  0.297889  0.185091   \n",
+       "4    -0.438037  0.122253  0.726806 -0.223599  ...  0.338446  0.204209   \n",
+       "...        ...       ...       ...       ...  ...       ...       ...   \n",
+       "6656 -0.356944  0.088314  0.597305 -0.175593  ...  0.266632  0.163279   \n",
+       "6657 -0.261548  0.057403  0.424254 -0.119038  ...  0.188105  0.113617   \n",
+       "6658 -0.405019  0.116038  0.680855 -0.219588  ...  0.304285  0.186892   \n",
+       "6659 -0.366419  0.095155  0.613407 -0.187147  ...  0.274841  0.174951   \n",
+       "6660 -0.287154  0.061585  0.473658 -0.139816  ...  0.218798  0.127264   \n",
+       "\n",
+       "            92        93        94        95        96        97        98  \\\n",
+       "0     0.057781  0.016227  0.233634  0.097241  0.095603 -0.143557  0.059992   \n",
+       "1     0.079316  0.022262  0.362759  0.156125  0.161772 -0.229045  0.110560   \n",
+       "2     0.080913  0.023393  0.373568  0.158586  0.175424 -0.229141  0.104984   \n",
+       "3     0.068119  0.004698  0.345084  0.155465  0.173034 -0.211400  0.103105   \n",
+       "4     0.077224  0.024273  0.405920  0.180556  0.188474 -0.251705  0.119200   \n",
+       "...        ...       ...       ...       ...       ...       ...       ...   \n",
+       "6656  0.060654  0.009344  0.324350  0.141180  0.152941 -0.196083  0.092586   \n",
+       "6657  0.041503  0.008333  0.222731  0.104138  0.115030 -0.143077  0.069211   \n",
+       "6658  0.076803  0.021421  0.373776  0.150687  0.167834 -0.229338  0.108377   \n",
+       "6659  0.079736  0.016118  0.335088  0.144033  0.159089 -0.211854  0.105012   \n",
+       "6660  0.041160  0.004311  0.256762  0.120410  0.131737 -0.151181  0.088392   \n",
+       "\n",
+       "            99  \n",
+       "0    -0.057431  \n",
+       "1    -0.102366  \n",
+       "2    -0.111788  \n",
+       "3    -0.107905  \n",
+       "4    -0.121408  \n",
+       "...        ...  \n",
+       "6656 -0.085266  \n",
+       "6657 -0.070054  \n",
+       "6658 -0.089778  \n",
+       "6659 -0.087460  \n",
+       "6660 -0.077834  \n",
+       "\n",
+       "[6661 rows x 101 columns]"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# word to vec\n",
+    "# using the Word2Vec algorithm with our corpus\n",
+    "# it returns a vector representation for each word\n",
+    "# to extend these word vectors and generate vectors on document level\n",
+    "# we compute an average of all the words in the document \n",
+    "\n",
+    "from gensim.models import Word2Vec\n",
+    "\n",
+    "# Create the list of list format for gensim modeling \n",
+    "Text['new_text_clean'] = Text['new_text'].apply(lambda x: x.split(\" \"))\n",
+    "\n",
+    "# Train the word2vec model\n",
+    "w2v_model = Word2Vec(Text['new_text_clean'], min_count = 1,vector_size = 100, window = 5)\n",
+    "\n",
+    "# Take the average of the word vectors for the words contained in each sentence\n",
+    "def word_avg_vect(data, model, num_features):\n",
+    "    words = set(model.wv.index_to_key)\n",
+    "    X_vect = np.array([np.array([model.wv[i] for i in s if i in words]) for s in data])\n",
+    "    X_vect_avg = []\n",
+    "    for v in X_vect:\n",
+    "        if v.size:\n",
+    "            X_vect_avg.append(v.mean(axis = 0))\n",
+    "        else:\n",
+    "            X_vect_avg.append(np.zeros(num_features, dtype = float))\n",
+    "\n",
+    "    df_vect_avg = pd.DataFrame(X_vect_avg)\n",
+    "    return df_vect_avg\n",
+    "\n",
+    "X_w2v = word_avg_vect(Text['new_text_clean'], w2v_model, 100)\n",
+    "# concatenate prompt column with averaged w2v matrix\n",
+    "df_w2v = pd.concat([Text[\"prompt\"], X_w2v], axis = 1)\n",
+    "df_w2v\n",
+    "\n",
+    "#df_w2v.to_csv(f\"df_w2v.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2588ab9e",
+   "metadata": {},
+   "source": [
+    "## PCA "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "14c6e468",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fucntion for PCA as feature selection \n",
+    "# set cutoff value is number of components that represents 99% of variance \n",
+    "# return reduced dataset with appropriate PCA components represented 99% variance\n",
+    "from sklearn.decomposition import PCA\n",
+    "import pickle\n",
+    "def PCA_project(data, data_name=\"\", threshold = 99):\n",
+    "    max_component = data.shape[1]\n",
+    "    cutoff = threshold\n",
+    "    covar_matrix = PCA(n_components = max_component)\n",
+    "    covar_matrix.fit(data)\n",
+    "    variance = covar_matrix.explained_variance_ratio_\n",
+    "    var = np.cumsum(np.round(variance, decimals = 4)*100)\n",
+    "    index = 0\n",
+    "    for i in range(len(var)):\n",
+    "        \n",
+    "        if np.round(var[i]) < cutoff:\n",
+    "            index += 1\n",
+    "        else:\n",
+    "            break\n",
+    "    principal=PCA(n_components=index)\n",
+    "    principal.fit(data)\n",
+    "    pickle.dump(principal, open(data_name+'.pkl','wb'))\n",
+    "    print('%s reduce features from %d to %d'% (data_name,max_component, index))\n",
+    "    return principal.transform(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "275fc24d",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bag of words reduce features from 953 to 464\n",
+      "tf_idf reduce features from 953 to 507\n",
+      "hash_vectorize reduce features from 75 to 69\n",
+      "word2vec reduce features from 100 to 8\n"
+     ]
+    }
+   ],
+   "source": [
+    "#apply PCA on our 4 dataset:bag of words, tf_idf, hash, word2vec\n",
+    "bow_P= PCA_project(df_bow, 'bag of words')\n",
+    "tf_idf_P= PCA_project(df_tf_idf, 'tf_idf')\n",
+    "hash_P= PCA_project(df_hash_vectorize, 'hash_vectorize')\n",
+    "w2v_P= PCA_project(np.array(X_w2v), 'word2vec')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c1823f94",
+   "metadata": {},
+   "source": [
+    "## Split Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "d0bd1b71",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#split data\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "y = Text[\"prompt\"]\n",
+    "X_bow_train, X_bow_test, y_bow_train, y_bow_test = train_test_split(bow_P,y,test_size = 0.2, random_state =3, stratify = y)\n",
+    "X_tf_train, X_tf_test, y_tf_train, y_tf_test = train_test_split(tf_idf_P,y,test_size = 0.2, random_state =3, stratify = y)\n",
+    "X_hash_train, X_hash_test, y_hash_train, y_hash_test = train_test_split(hash_P,y,test_size = 0.2, random_state =3, stratify = y)\n",
+    "X_w2v_train, X_w2v_test, y_w2v_train, y_w2v_test = train_test_split(w2v_P,y,test_size = 0.2, random_state =3, stratify = y)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5e9ef43f",
+   "metadata": {},
+   "source": [
+    "## Modeling and Evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "e53a0d16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#define a funtion to generate a nice matric table\n",
+    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer\n",
+    "def matric_table(model_list, name_list,y_data, X_data):\n",
+    "    result = []\n",
+    "    for m,n,a,b in zip(model_list, name_list, y_data, X_data):\n",
+    "        report = []\n",
+    "        report.append(n)\n",
+    "        report.append(accuracy_score(a[0], m.predict(b[0])) * 100)\n",
+    "        report.append(accuracy_score(a[1], m.predict(b[1])) * 100)\n",
+    "        report.append(recall_score(a[1], m.predict(b[1]),average = 'weighted') * 100)\n",
+    "        report.append(precision_score(a[1], m.predict(b[1]),average = 'weighted') * 100)\n",
+    "        report.append(f1_score(a[1], m.predict(b[1]),average = 'weighted') * 100)\n",
+    "        result.append(report)\n",
+    "    df = pd.DataFrame(data = result, columns=['Model', 'Training Accuracy %', 'Testing Accuracy %','Testing precision %', 'Testing recall %', 'Testing f1_score %'])\n",
+    "    df = df.set_index('Model')\n",
+    "    return df.style.highlight_max(color = 'lightgreen', axis = 0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "b9ece7e9",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RandomForestClassifier(max_features=None, n_jobs=-1, oob_score=True,\n",
+       "                       random_state=0)"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#train model for randomforest\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "# train model with all features\n",
+    "rf_bow = RandomForestClassifier(n_estimators=100,\n",
+    "                                max_features=None,\n",
+    "                                oob_score=True,\n",
+    "                                n_jobs=-1,\n",
+    "                                random_state=0)\n",
+    "rf_tf = RandomForestClassifier(n_estimators=100,\n",
+    "                                max_features=None,\n",
+    "                                oob_score=True,\n",
+    "                                n_jobs=-1,\n",
+    "                                random_state=0)\n",
+    "rf_hash = RandomForestClassifier(n_estimators=100,\n",
+    "                                max_features=None,\n",
+    "                                oob_score=True,\n",
+    "                                n_jobs=-1,\n",
+    "                                random_state=0)\n",
+    "rf_w2v = RandomForestClassifier(n_estimators=100,\n",
+    "                                max_features=None,\n",
+    "                                oob_score=True,\n",
+    "                                n_jobs=-1,\n",
+    "                                random_state=0)\n",
+    "\n",
+    "rf_bow.fit(X_bow_train, y_bow_train)\n",
+    "rf_tf.fit(X_tf_train, y_tf_train)\n",
+    "rf_hash.fit(X_hash_train, y_hash_train)\n",
+    "rf_w2v.fit(X_w2v_train, y_w2v_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8fb3a324",
+   "metadata": {},
+   "source": [
+    "### Random Forest with word2vec extracted data is the best among all Random Forest Classifers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "c085ca5c",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "#T_b2cf1_row0_col0, #T_b2cf1_row0_col1, #T_b2cf1_row0_col2, #T_b2cf1_row1_col0, #T_b2cf1_row1_col1, #T_b2cf1_row1_col2, #T_b2cf1_row3_col0, #T_b2cf1_row3_col1, #T_b2cf1_row3_col2, #T_b2cf1_row3_col3, #T_b2cf1_row3_col4 {\n",
+       "  background-color: lightgreen;\n",
+       "}\n",
+       "</style>\n",
+       "<table id=\"T_b2cf1_\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th class=\"col_heading level0 col0\" >Training Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col1\" >Testing Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col2\" >Testing precision %</th>\n",
+       "      <th class=\"col_heading level0 col3\" >Testing recall %</th>\n",
+       "      <th class=\"col_heading level0 col4\" >Testing f1_score %</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th class=\"index_name level0\" >Model</th>\n",
+       "      <th class=\"blank col0\" >&nbsp;</th>\n",
+       "      <th class=\"blank col1\" >&nbsp;</th>\n",
+       "      <th class=\"blank col2\" >&nbsp;</th>\n",
+       "      <th class=\"blank col3\" >&nbsp;</th>\n",
+       "      <th class=\"blank col4\" >&nbsp;</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_b2cf1_level0_row0\" class=\"row_heading level0 row0\" >Random Forest with bow</th>\n",
+       "      <td id=\"T_b2cf1_row0_col0\" class=\"data row0 col0\" >99.774775</td>\n",
+       "      <td id=\"T_b2cf1_row0_col1\" class=\"data row0 col1\" >99.549887</td>\n",
+       "      <td id=\"T_b2cf1_row0_col2\" class=\"data row0 col2\" >99.549887</td>\n",
+       "      <td id=\"T_b2cf1_row0_col3\" class=\"data row0 col3\" >99.574821</td>\n",
+       "      <td id=\"T_b2cf1_row0_col4\" class=\"data row0 col4\" >99.549617</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_b2cf1_level0_row1\" class=\"row_heading level0 row1\" >Random Forest with tf_idf</th>\n",
+       "      <td id=\"T_b2cf1_row1_col0\" class=\"data row1 col0\" >99.774775</td>\n",
+       "      <td id=\"T_b2cf1_row1_col1\" class=\"data row1 col1\" >99.549887</td>\n",
+       "      <td id=\"T_b2cf1_row1_col2\" class=\"data row1 col2\" >99.549887</td>\n",
+       "      <td id=\"T_b2cf1_row1_col3\" class=\"data row1 col3\" >99.574821</td>\n",
+       "      <td id=\"T_b2cf1_row1_col4\" class=\"data row1 col4\" >99.549617</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_b2cf1_level0_row2\" class=\"row_heading level0 row2\" >Random Forest with hash</th>\n",
+       "      <td id=\"T_b2cf1_row2_col0\" class=\"data row2 col0\" >99.493243</td>\n",
+       "      <td id=\"T_b2cf1_row2_col1\" class=\"data row2 col1\" >99.174794</td>\n",
+       "      <td id=\"T_b2cf1_row2_col2\" class=\"data row2 col2\" >99.174794</td>\n",
+       "      <td id=\"T_b2cf1_row2_col3\" class=\"data row2 col3\" >99.194755</td>\n",
+       "      <td id=\"T_b2cf1_row2_col4\" class=\"data row2 col4\" >99.176006</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_b2cf1_level0_row3\" class=\"row_heading level0 row3\" >Random Forest with word2vec</th>\n",
+       "      <td id=\"T_b2cf1_row3_col0\" class=\"data row3 col0\" >99.774775</td>\n",
+       "      <td id=\"T_b2cf1_row3_col1\" class=\"data row3 col1\" >99.549887</td>\n",
+       "      <td id=\"T_b2cf1_row3_col2\" class=\"data row3 col2\" >99.549887</td>\n",
+       "      <td id=\"T_b2cf1_row3_col3\" class=\"data row3 col3\" >99.574932</td>\n",
+       "      <td id=\"T_b2cf1_row3_col4\" class=\"data row3 col4\" >99.549645</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<pandas.io.formats.style.Styler at 0x26e807addf0>"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# print result\n",
+    "model_list = [rf_bow,rf_tf,rf_hash,rf_w2v]\n",
+    "name_list = [\"Random Forest with bow\",\"Random Forest with tf_idf\", \"Random Forest with hash\",\"Random Forest with word2vec\"]\n",
+    "y_data = [[y_bow_train,y_bow_test], [y_tf_train,y_tf_test], [y_hash_train,y_hash_test],[y_w2v_train,y_w2v_test]]\n",
+    "X_data = [[X_bow_train,X_bow_test], [X_tf_train,X_tf_test], [X_hash_train,X_hash_test],[X_w2v_train,X_w2v_test]]\n",
+    "matric_table(model_list, name_list, y_data, X_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "cab665c2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "LogisticRegression()"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#train model for logistic Regression which is not inherently multiclass classifers. \n",
+    "#In this case, we use  defualt auto setting that if input is binary using OVR otherwise using multnomial\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "\n",
+    "lr_bow = LogisticRegression()\n",
+    "lr_tf = LogisticRegression()\n",
+    "lr_hash = LogisticRegression()\n",
+    "lr_w2v = LogisticRegression()\n",
+    "\n",
+    "lr_bow.fit(X_bow_train, y_bow_train)\n",
+    "lr_tf.fit(X_tf_train, y_tf_train)\n",
+    "lr_hash.fit(X_hash_train, y_hash_train)\n",
+    "lr_w2v.fit(X_w2v_train, y_w2v_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0cc61a26",
+   "metadata": {},
+   "source": [
+    "### Logistic Regression with bag-of-words extracted data is the best among all logistic Regression Classifers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "53bdebb2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "#T_aa4d1_row0_col0, #T_aa4d1_row0_col1, #T_aa4d1_row0_col2, #T_aa4d1_row0_col3, #T_aa4d1_row0_col4 {\n",
+       "  background-color: lightgreen;\n",
+       "}\n",
+       "</style>\n",
+       "<table id=\"T_aa4d1_\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th class=\"col_heading level0 col0\" >Training Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col1\" >Testing Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col2\" >Testing precision %</th>\n",
+       "      <th class=\"col_heading level0 col3\" >Testing recall %</th>\n",
+       "      <th class=\"col_heading level0 col4\" >Testing f1_score %</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th class=\"index_name level0\" >Model</th>\n",
+       "      <th class=\"blank col0\" >&nbsp;</th>\n",
+       "      <th class=\"blank col1\" >&nbsp;</th>\n",
+       "      <th class=\"blank col2\" >&nbsp;</th>\n",
+       "      <th class=\"blank col3\" >&nbsp;</th>\n",
+       "      <th class=\"blank col4\" >&nbsp;</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_aa4d1_level0_row0\" class=\"row_heading level0 row0\" >Logistic Regression with bow</th>\n",
+       "      <td id=\"T_aa4d1_row0_col0\" class=\"data row0 col0\" >99.568318</td>\n",
+       "      <td id=\"T_aa4d1_row0_col1\" class=\"data row0 col1\" >99.549887</td>\n",
+       "      <td id=\"T_aa4d1_row0_col2\" class=\"data row0 col2\" >99.549887</td>\n",
+       "      <td id=\"T_aa4d1_row0_col3\" class=\"data row0 col3\" >99.574027</td>\n",
+       "      <td id=\"T_aa4d1_row0_col4\" class=\"data row0 col4\" >99.549153</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_aa4d1_level0_row1\" class=\"row_heading level0 row1\" >Logistic Regression with tf_idf</th>\n",
+       "      <td id=\"T_aa4d1_row1_col0\" class=\"data row1 col0\" >99.399399</td>\n",
+       "      <td id=\"T_aa4d1_row1_col1\" class=\"data row1 col1\" >99.474869</td>\n",
+       "      <td id=\"T_aa4d1_row1_col2\" class=\"data row1 col2\" >99.474869</td>\n",
+       "      <td id=\"T_aa4d1_row1_col3\" class=\"data row1 col3\" >99.500348</td>\n",
+       "      <td id=\"T_aa4d1_row1_col4\" class=\"data row1 col4\" >99.474060</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_aa4d1_level0_row2\" class=\"row_heading level0 row2\" >Logistic Regression with hash</th>\n",
+       "      <td id=\"T_aa4d1_row2_col0\" class=\"data row2 col0\" >85.904655</td>\n",
+       "      <td id=\"T_aa4d1_row2_col1\" class=\"data row2 col1\" >84.696174</td>\n",
+       "      <td id=\"T_aa4d1_row2_col2\" class=\"data row2 col2\" >84.696174</td>\n",
+       "      <td id=\"T_aa4d1_row2_col3\" class=\"data row2 col3\" >85.447843</td>\n",
+       "      <td id=\"T_aa4d1_row2_col4\" class=\"data row2 col4\" >84.617119</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_aa4d1_level0_row3\" class=\"row_heading level0 row3\" >Logistic Regressiont with word2vec</th>\n",
+       "      <td id=\"T_aa4d1_row3_col0\" class=\"data row3 col0\" >27.496246</td>\n",
+       "      <td id=\"T_aa4d1_row3_col1\" class=\"data row3 col1\" >27.906977</td>\n",
+       "      <td id=\"T_aa4d1_row3_col2\" class=\"data row3 col2\" >27.906977</td>\n",
+       "      <td id=\"T_aa4d1_row3_col3\" class=\"data row3 col3\" >22.557786</td>\n",
+       "      <td id=\"T_aa4d1_row3_col4\" class=\"data row3 col4\" >20.560358</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<pandas.io.formats.style.Styler at 0x26e8a4b2c10>"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# print result, the warning indicate there are some type the classifers never predict. but since data is imbalence in that rare class so the accuracy won't be impacted\n",
+    "model_list = [lr_bow,lr_tf,lr_hash,lr_w2v]\n",
+    "name_list = [\"Logistic Regression with bow\",\"Logistic Regression with tf_idf\", \"Logistic Regression with hash\",\"Logistic Regressiont with word2vec\"]\n",
+    "y_data = [[y_bow_train,y_bow_test], [y_tf_train,y_tf_test], [y_hash_train,y_hash_test],[y_w2v_train,y_w2v_test]]\n",
+    "X_data = [[X_bow_train,X_bow_test], [X_tf_train,X_tf_test], [X_hash_train,X_hash_test],[X_w2v_train,X_w2v_test]]\n",
+    "matric_table(model_list, name_list, y_data, X_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "436bec53",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "SVC()"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#train model for linear svm, which is not inherently multiclass classifers. \n",
+    "#In this case, we use One VS Rest to save computing \n",
+    "from sklearn.svm import SVC\n",
+    "\n",
+    "svc_bow = SVC(decision_function_shape='ovr')\n",
+    "svc_tf = SVC(decision_function_shape='ovr')\n",
+    "svc_hash = SVC(decision_function_shape='ovr')\n",
+    "svc_w2v = SVC(decision_function_shape='ovr')\n",
+    "\n",
+    "svc_bow.fit(X_bow_train, y_bow_train)\n",
+    "svc_tf.fit(X_tf_train, y_tf_train)\n",
+    "svc_hash.fit(X_hash_train, y_hash_train)\n",
+    "svc_w2v.fit(X_w2v_train, y_w2v_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d8dad1fb",
+   "metadata": {},
+   "source": [
+    "### Support Vectors Machine with tf_idf extracted data is the best among all SVCs "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "97e09358",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "#T_0e4d5_row0_col1, #T_0e4d5_row0_col2, #T_0e4d5_row0_col3, #T_0e4d5_row0_col4, #T_0e4d5_row1_col0, #T_0e4d5_row1_col1, #T_0e4d5_row1_col2, #T_0e4d5_row1_col3, #T_0e4d5_row1_col4 {\n",
+       "  background-color: lightgreen;\n",
+       "}\n",
+       "</style>\n",
+       "<table id=\"T_0e4d5_\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th class=\"col_heading level0 col0\" >Training Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col1\" >Testing Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col2\" >Testing precision %</th>\n",
+       "      <th class=\"col_heading level0 col3\" >Testing recall %</th>\n",
+       "      <th class=\"col_heading level0 col4\" >Testing f1_score %</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th class=\"index_name level0\" >Model</th>\n",
+       "      <th class=\"blank col0\" >&nbsp;</th>\n",
+       "      <th class=\"blank col1\" >&nbsp;</th>\n",
+       "      <th class=\"blank col2\" >&nbsp;</th>\n",
+       "      <th class=\"blank col3\" >&nbsp;</th>\n",
+       "      <th class=\"blank col4\" >&nbsp;</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_0e4d5_level0_row0\" class=\"row_heading level0 row0\" >SVC with bow</th>\n",
+       "      <td id=\"T_0e4d5_row0_col0\" class=\"data row0 col0\" >99.587087</td>\n",
+       "      <td id=\"T_0e4d5_row0_col1\" class=\"data row0 col1\" >99.549887</td>\n",
+       "      <td id=\"T_0e4d5_row0_col2\" class=\"data row0 col2\" >99.549887</td>\n",
+       "      <td id=\"T_0e4d5_row0_col3\" class=\"data row0 col3\" >99.574027</td>\n",
+       "      <td id=\"T_0e4d5_row0_col4\" class=\"data row0 col4\" >99.549153</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_0e4d5_level0_row1\" class=\"row_heading level0 row1\" >SVC with tf_idf</th>\n",
+       "      <td id=\"T_0e4d5_row1_col0\" class=\"data row1 col0\" >99.605856</td>\n",
+       "      <td id=\"T_0e4d5_row1_col1\" class=\"data row1 col1\" >99.549887</td>\n",
+       "      <td id=\"T_0e4d5_row1_col2\" class=\"data row1 col2\" >99.549887</td>\n",
+       "      <td id=\"T_0e4d5_row1_col3\" class=\"data row1 col3\" >99.574027</td>\n",
+       "      <td id=\"T_0e4d5_row1_col4\" class=\"data row1 col4\" >99.549153</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_0e4d5_level0_row2\" class=\"row_heading level0 row2\" >SVC with hash</th>\n",
+       "      <td id=\"T_0e4d5_row2_col0\" class=\"data row2 col0\" >99.324324</td>\n",
+       "      <td id=\"T_0e4d5_row2_col1\" class=\"data row2 col1\" >99.174794</td>\n",
+       "      <td id=\"T_0e4d5_row2_col2\" class=\"data row2 col2\" >99.174794</td>\n",
+       "      <td id=\"T_0e4d5_row2_col3\" class=\"data row2 col3\" >99.249930</td>\n",
+       "      <td id=\"T_0e4d5_row2_col4\" class=\"data row2 col4\" >99.176335</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_0e4d5_level0_row3\" class=\"row_heading level0 row3\" >SVC with word2vec</th>\n",
+       "      <td id=\"T_0e4d5_row3_col0\" class=\"data row3 col0\" >46.677928</td>\n",
+       "      <td id=\"T_0e4d5_row3_col1\" class=\"data row3 col1\" >43.735934</td>\n",
+       "      <td id=\"T_0e4d5_row3_col2\" class=\"data row3 col2\" >43.735934</td>\n",
+       "      <td id=\"T_0e4d5_row3_col3\" class=\"data row3 col3\" >45.756499</td>\n",
+       "      <td id=\"T_0e4d5_row3_col4\" class=\"data row3 col4\" >42.670226</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<pandas.io.formats.style.Styler at 0x26ef57f6880>"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# print result\n",
+    "model_list = [svc_bow,svc_tf,svc_hash,svc_w2v]\n",
+    "name_list = [\"SVC with bow\",\"SVC with tf_idf\", \"SVC with hash\",\"SVC with word2vec\"]\n",
+    "y_data = [[y_bow_train,y_bow_test], [y_tf_train,y_tf_test], [y_hash_train,y_hash_test],[y_w2v_train,y_w2v_test]]\n",
+    "X_data = [[X_bow_train,X_bow_test], [X_tf_train,X_tf_test], [X_hash_train,X_hash_test],[X_w2v_train,X_w2v_test]]\n",
+    "matric_table(model_list, name_list, y_data, X_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "2b700817",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "KNeighborsClassifier(n_neighbors=3)"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#train model for KNN\n",
+    "from sklearn.neighbors import KNeighborsClassifier\n",
+    "\n",
+    "knn_bow = KNeighborsClassifier(n_neighbors=3)\n",
+    "knn_tf = KNeighborsClassifier(n_neighbors=3)\n",
+    "knn_hash = KNeighborsClassifier(n_neighbors=3)\n",
+    "knn_w2v = KNeighborsClassifier(n_neighbors=3)\n",
+    "\n",
+    "knn_bow.fit(X_bow_train, y_bow_train)\n",
+    "knn_tf.fit(X_tf_train, y_tf_train)\n",
+    "knn_hash.fit(X_hash_train, y_hash_train)\n",
+    "knn_w2v.fit(X_w2v_train, y_w2v_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fedba1e6",
+   "metadata": {},
+   "source": [
+    "### K Nearest Neighours with bag-of-words extracted data is the best among all KNNs "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "bdbf7bb1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "#T_a4341_row0_col1, #T_a4341_row0_col2, #T_a4341_row0_col3, #T_a4341_row0_col4, #T_a4341_row1_col0, #T_a4341_row1_col1, #T_a4341_row1_col2 {\n",
+       "  background-color: lightgreen;\n",
+       "}\n",
+       "</style>\n",
+       "<table id=\"T_a4341_\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th class=\"col_heading level0 col0\" >Training Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col1\" >Testing Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col2\" >Testing precision %</th>\n",
+       "      <th class=\"col_heading level0 col3\" >Testing recall %</th>\n",
+       "      <th class=\"col_heading level0 col4\" >Testing f1_score %</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th class=\"index_name level0\" >Model</th>\n",
+       "      <th class=\"blank col0\" >&nbsp;</th>\n",
+       "      <th class=\"blank col1\" >&nbsp;</th>\n",
+       "      <th class=\"blank col2\" >&nbsp;</th>\n",
+       "      <th class=\"blank col3\" >&nbsp;</th>\n",
+       "      <th class=\"blank col4\" >&nbsp;</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_a4341_level0_row0\" class=\"row_heading level0 row0\" >KNN with bow</th>\n",
+       "      <td id=\"T_a4341_row0_col0\" class=\"data row0 col0\" >99.699700</td>\n",
+       "      <td id=\"T_a4341_row0_col1\" class=\"data row0 col1\" >99.699925</td>\n",
+       "      <td id=\"T_a4341_row0_col2\" class=\"data row0 col2\" >99.699925</td>\n",
+       "      <td id=\"T_a4341_row0_col3\" class=\"data row0 col3\" >99.708187</td>\n",
+       "      <td id=\"T_a4341_row0_col4\" class=\"data row0 col4\" >99.699770</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_a4341_level0_row1\" class=\"row_heading level0 row1\" >KNN with tf_idf</th>\n",
+       "      <td id=\"T_a4341_row1_col0\" class=\"data row1 col0\" >99.831081</td>\n",
+       "      <td id=\"T_a4341_row1_col1\" class=\"data row1 col1\" >99.699925</td>\n",
+       "      <td id=\"T_a4341_row1_col2\" class=\"data row1 col2\" >99.699925</td>\n",
+       "      <td id=\"T_a4341_row1_col3\" class=\"data row1 col3\" >99.708048</td>\n",
+       "      <td id=\"T_a4341_row1_col4\" class=\"data row1 col4\" >99.699734</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_a4341_level0_row2\" class=\"row_heading level0 row2\" >KNN with hash</th>\n",
+       "      <td id=\"T_a4341_row2_col0\" class=\"data row2 col0\" >99.418168</td>\n",
+       "      <td id=\"T_a4341_row2_col1\" class=\"data row2 col1\" >99.324831</td>\n",
+       "      <td id=\"T_a4341_row2_col2\" class=\"data row2 col2\" >99.324831</td>\n",
+       "      <td id=\"T_a4341_row2_col3\" class=\"data row2 col3\" >99.370567</td>\n",
+       "      <td id=\"T_a4341_row2_col4\" class=\"data row2 col4\" >99.326532</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_a4341_level0_row3\" class=\"row_heading level0 row3\" >KNN with word2vec</th>\n",
+       "      <td id=\"T_a4341_row3_col0\" class=\"data row3 col0\" >99.643393</td>\n",
+       "      <td id=\"T_a4341_row3_col1\" class=\"data row3 col1\" >99.549887</td>\n",
+       "      <td id=\"T_a4341_row3_col2\" class=\"data row3 col2\" >99.549887</td>\n",
+       "      <td id=\"T_a4341_row3_col3\" class=\"data row3 col3\" >99.574682</td>\n",
+       "      <td id=\"T_a4341_row3_col4\" class=\"data row3 col4\" >99.549582</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<pandas.io.formats.style.Styler at 0x26e8c81c310>"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_list = [knn_bow,knn_tf,knn_hash,knn_w2v]\n",
+    "name_list = [\"KNN with bow\",\"KNN with tf_idf\", \"KNN with hash\",\"KNN with word2vec\"]\n",
+    "y_data = [[y_bow_train,y_bow_test], [y_tf_train,y_tf_test], [y_hash_train,y_hash_test],[y_w2v_train,y_w2v_test]]\n",
+    "X_data = [[X_bow_train,X_bow_test], [X_tf_train,X_tf_test], [X_hash_train,X_hash_test],[X_w2v_train,X_w2v_test]]\n",
+    "matric_table(model_list, name_list, y_data, X_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "7e7eacec",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "GaussianNB()"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#train model for Naive Bayes. \n",
+    "#Bernoulli NB can only focus on a single keyword, \n",
+    "#but will also count how many times that keyword does not occur in the document\n",
+    "from sklearn.naive_bayes import GaussianNB\n",
+    "\n",
+    "\n",
+    "gnb_bow = GaussianNB()\n",
+    "gnb_tf = GaussianNB()\n",
+    "gnb_hash = GaussianNB()\n",
+    "gnb_w2v = GaussianNB()\n",
+    "\n",
+    "gnb_bow.fit(X_bow_train, y_bow_train)\n",
+    "gnb_tf.fit(X_tf_train, y_tf_train)\n",
+    "gnb_hash.fit(X_hash_train, y_hash_train)\n",
+    "gnb_w2v.fit(X_w2v_train, y_w2v_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5f2c49b3",
+   "metadata": {},
+   "source": [
+    "### Gauissian Naive Bayes with tf_idf extracted data is the best among all BNBs "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "a9e7e44e",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "#T_cdc8c_row1_col0, #T_cdc8c_row1_col1, #T_cdc8c_row1_col2, #T_cdc8c_row1_col3, #T_cdc8c_row1_col4 {\n",
+       "  background-color: lightgreen;\n",
+       "}\n",
+       "</style>\n",
+       "<table id=\"T_cdc8c_\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th class=\"col_heading level0 col0\" >Training Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col1\" >Testing Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col2\" >Testing precision %</th>\n",
+       "      <th class=\"col_heading level0 col3\" >Testing recall %</th>\n",
+       "      <th class=\"col_heading level0 col4\" >Testing f1_score %</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th class=\"index_name level0\" >Model</th>\n",
+       "      <th class=\"blank col0\" >&nbsp;</th>\n",
+       "      <th class=\"blank col1\" >&nbsp;</th>\n",
+       "      <th class=\"blank col2\" >&nbsp;</th>\n",
+       "      <th class=\"blank col3\" >&nbsp;</th>\n",
+       "      <th class=\"blank col4\" >&nbsp;</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_cdc8c_level0_row0\" class=\"row_heading level0 row0\" >Gaussian Naive Bayes with bow</th>\n",
+       "      <td id=\"T_cdc8c_row0_col0\" class=\"data row0 col0\" >90.653153</td>\n",
+       "      <td id=\"T_cdc8c_row0_col1\" class=\"data row0 col1\" >87.546887</td>\n",
+       "      <td id=\"T_cdc8c_row0_col2\" class=\"data row0 col2\" >87.546887</td>\n",
+       "      <td id=\"T_cdc8c_row0_col3\" class=\"data row0 col3\" >89.318237</td>\n",
+       "      <td id=\"T_cdc8c_row0_col4\" class=\"data row0 col4\" >87.667300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_cdc8c_level0_row1\" class=\"row_heading level0 row1\" >Gaussian Naive Bayes with tf_idf</th>\n",
+       "      <td id=\"T_cdc8c_row1_col0\" class=\"data row1 col0\" >90.878378</td>\n",
+       "      <td id=\"T_cdc8c_row1_col1\" class=\"data row1 col1\" >87.921980</td>\n",
+       "      <td id=\"T_cdc8c_row1_col2\" class=\"data row1 col2\" >87.921980</td>\n",
+       "      <td id=\"T_cdc8c_row1_col3\" class=\"data row1 col3\" >90.570234</td>\n",
+       "      <td id=\"T_cdc8c_row1_col4\" class=\"data row1 col4\" >88.415025</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_cdc8c_level0_row2\" class=\"row_heading level0 row2\" >Gaussian Naive Bayes with hash</th>\n",
+       "      <td id=\"T_cdc8c_row2_col0\" class=\"data row2 col0\" >79.298048</td>\n",
+       "      <td id=\"T_cdc8c_row2_col1\" class=\"data row2 col1\" >75.843961</td>\n",
+       "      <td id=\"T_cdc8c_row2_col2\" class=\"data row2 col2\" >75.843961</td>\n",
+       "      <td id=\"T_cdc8c_row2_col3\" class=\"data row2 col3\" >77.614749</td>\n",
+       "      <td id=\"T_cdc8c_row2_col4\" class=\"data row2 col4\" >76.140452</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_cdc8c_level0_row3\" class=\"row_heading level0 row3\" >Gaussian Naive Bayes with word2vec</th>\n",
+       "      <td id=\"T_cdc8c_row3_col0\" class=\"data row3 col0\" >47.184685</td>\n",
+       "      <td id=\"T_cdc8c_row3_col1\" class=\"data row3 col1\" >48.012003</td>\n",
+       "      <td id=\"T_cdc8c_row3_col2\" class=\"data row3 col2\" >48.012003</td>\n",
+       "      <td id=\"T_cdc8c_row3_col3\" class=\"data row3 col3\" >50.995666</td>\n",
+       "      <td id=\"T_cdc8c_row3_col4\" class=\"data row3 col4\" >48.054367</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<pandas.io.formats.style.Styler at 0x26e8c826460>"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_list = [gnb_bow,gnb_tf,gnb_hash,gnb_w2v]\n",
+    "name_list = [\"Gaussian Naive Bayes with bow\",\"Gaussian Naive Bayes with tf_idf\", \"Gaussian Naive Bayes with hash\",\"Gaussian Naive Bayes with word2vec\"]\n",
+    "y_data = [[y_bow_train,y_bow_test], [y_tf_train,y_tf_test], [y_hash_train,y_hash_test],[y_w2v_train,y_w2v_test]]\n",
+    "X_data = [[X_bow_train,X_bow_test], [X_tf_train,X_tf_test], [X_hash_train,X_hash_test],[X_w2v_train,X_w2v_test]]\n",
+    "matric_table(model_list, name_list, y_data, X_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ee368178",
+   "metadata": {},
+   "source": [
+    "### In conclusion, KNN with bage of words dataset is the winner among all classifers with highest score on test accuracy, precision, recall and F1 scores, The Random Forest is best on the Training accuracy. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "06f5d0bc",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "#T_d67ec_row0_col0, #T_d67ec_row3_col1, #T_d67ec_row3_col2, #T_d67ec_row3_col3, #T_d67ec_row3_col4 {\n",
+       "  background-color: lightgreen;\n",
+       "}\n",
+       "</style>\n",
+       "<table id=\"T_d67ec_\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th class=\"col_heading level0 col0\" >Training Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col1\" >Testing Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col2\" >Testing precision %</th>\n",
+       "      <th class=\"col_heading level0 col3\" >Testing recall %</th>\n",
+       "      <th class=\"col_heading level0 col4\" >Testing f1_score %</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th class=\"index_name level0\" >Model</th>\n",
+       "      <th class=\"blank col0\" >&nbsp;</th>\n",
+       "      <th class=\"blank col1\" >&nbsp;</th>\n",
+       "      <th class=\"blank col2\" >&nbsp;</th>\n",
+       "      <th class=\"blank col3\" >&nbsp;</th>\n",
+       "      <th class=\"blank col4\" >&nbsp;</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_d67ec_level0_row0\" class=\"row_heading level0 row0\" >Random Forest with w2v</th>\n",
+       "      <td id=\"T_d67ec_row0_col0\" class=\"data row0 col0\" >99.774775</td>\n",
+       "      <td id=\"T_d67ec_row0_col1\" class=\"data row0 col1\" >99.549887</td>\n",
+       "      <td id=\"T_d67ec_row0_col2\" class=\"data row0 col2\" >99.549887</td>\n",
+       "      <td id=\"T_d67ec_row0_col3\" class=\"data row0 col3\" >99.574932</td>\n",
+       "      <td id=\"T_d67ec_row0_col4\" class=\"data row0 col4\" >99.549645</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_d67ec_level0_row1\" class=\"row_heading level0 row1\" >Logistic Regression with bow</th>\n",
+       "      <td id=\"T_d67ec_row1_col0\" class=\"data row1 col0\" >99.568318</td>\n",
+       "      <td id=\"T_d67ec_row1_col1\" class=\"data row1 col1\" >99.549887</td>\n",
+       "      <td id=\"T_d67ec_row1_col2\" class=\"data row1 col2\" >99.549887</td>\n",
+       "      <td id=\"T_d67ec_row1_col3\" class=\"data row1 col3\" >99.574027</td>\n",
+       "      <td id=\"T_d67ec_row1_col4\" class=\"data row1 col4\" >99.549153</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_d67ec_level0_row2\" class=\"row_heading level0 row2\" >SVC with tf</th>\n",
+       "      <td id=\"T_d67ec_row2_col0\" class=\"data row2 col0\" >99.605856</td>\n",
+       "      <td id=\"T_d67ec_row2_col1\" class=\"data row2 col1\" >99.549887</td>\n",
+       "      <td id=\"T_d67ec_row2_col2\" class=\"data row2 col2\" >99.549887</td>\n",
+       "      <td id=\"T_d67ec_row2_col3\" class=\"data row2 col3\" >99.574027</td>\n",
+       "      <td id=\"T_d67ec_row2_col4\" class=\"data row2 col4\" >99.549153</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_d67ec_level0_row3\" class=\"row_heading level0 row3\" >KNN withbow</th>\n",
+       "      <td id=\"T_d67ec_row3_col0\" class=\"data row3 col0\" >99.699700</td>\n",
+       "      <td id=\"T_d67ec_row3_col1\" class=\"data row3 col1\" >99.699925</td>\n",
+       "      <td id=\"T_d67ec_row3_col2\" class=\"data row3 col2\" >99.699925</td>\n",
+       "      <td id=\"T_d67ec_row3_col3\" class=\"data row3 col3\" >99.708187</td>\n",
+       "      <td id=\"T_d67ec_row3_col4\" class=\"data row3 col4\" >99.699770</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_d67ec_level0_row4\" class=\"row_heading level0 row4\" >gaussian Naive Bayes with tf</th>\n",
+       "      <td id=\"T_d67ec_row4_col0\" class=\"data row4 col0\" >90.878378</td>\n",
+       "      <td id=\"T_d67ec_row4_col1\" class=\"data row4 col1\" >87.921980</td>\n",
+       "      <td id=\"T_d67ec_row4_col2\" class=\"data row4 col2\" >87.921980</td>\n",
+       "      <td id=\"T_d67ec_row4_col3\" class=\"data row4 col3\" >90.570234</td>\n",
+       "      <td id=\"T_d67ec_row4_col4\" class=\"data row4 col4\" >88.415025</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<pandas.io.formats.style.Styler at 0x26ec7562610>"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "### Find the best classifer among all classifers\n",
+    "model_list = [rf_w2v,lr_bow,svc_tf,knn_bow,gnb_tf]\n",
+    "name_list = [\"Random Forest with w2v\",\"Logistic Regression with bow\", \"SVC with tf\",\"KNN withbow\",\"gaussian Naive Bayes with tf\"]\n",
+    "y_data = [[y_w2v_train,y_w2v_test], [y_bow_train,y_bow_test], [y_tf_train,y_tf_test],[y_bow_train,y_bow_test],[y_tf_train,y_tf_test]]\n",
+    "X_data = [[X_w2v_train,X_w2v_test], [X_bow_train,X_bow_test], [X_tf_train,X_tf_test],[X_bow_train,X_bow_test],[X_tf_train,X_tf_test]]\n",
+    "matric_table(model_list, name_list, y_data, X_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "f19f59d9",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(3, 'uniform', 1, 0.9969992498124531)\n"
+     ]
+    }
+   ],
+   "source": [
+    "##Further tuning the KNN  classifer with bow\n",
+    "best_models = []\n",
+    "n_neighbors = [3,5,7,9]\n",
+    "weights = ['uniform','distance']\n",
+    "ps = [1,2]\n",
+    "\n",
+    "def KNN_clf(n_neighbors, weight, p):\n",
+    "    knn = KNeighborsClassifier(n_neighbors = n_neighbors, weights = weight,p = p)\n",
+    "    knn.fit(X_bow_train, y_bow_train)\n",
+    "    y_pred = knn.predict(X_bow_test)\n",
+    "    n = accuracy_score(y_bow_test,y_pred)\n",
+    "    best_models.append((n_neighbors, weight, p ,n))\n",
+    "\n",
+    "for c in n_neighbors:\n",
+    "    for w in weights:\n",
+    "        for p in ps:\n",
+    "            KNN_clf(c, w, p)\n",
+    "\n",
+    "print(max(best_models,key=lambda item:item[3]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "2a08cf65",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "('gini', 13, 1, 50, 0.9969992498124531)\n"
+     ]
+    }
+   ],
+   "source": [
+    "##Further tuning the random forest classifer with w2v\n",
+    "best_models = []\n",
+    "crit = ['gini', 'entropy']\n",
+    "max_d = range(1,20,4)\n",
+    "min_s_leaf = range(1,20,4)\n",
+    "n_est = [50, 100, 200]\n",
+    "\n",
+    "def RF_clf(crit, max_d, min_s_leaf, n_est):\n",
+    "    forest = RandomForestClassifier(criterion=crit, max_depth=max_d, min_samples_leaf=min_s_leaf, n_estimators=n_est, random_state=1)\n",
+    "    forest.fit(X_w2v_train, y_w2v_train)\n",
+    "    y_pred = forest.predict(X_w2v_test)\n",
+    "    n = accuracy_score(y_w2v_test,y_pred)\n",
+    "    best_models.append((crit,max_d,min_s_leaf,n_est,n))\n",
+    "\n",
+    "\n",
+    "for c in crit:\n",
+    "    for md in max_d:\n",
+    "        for msl in min_s_leaf:\n",
+    "            for n_e in n_est:\n",
+    "                RF_clf(c, md, msl, n_e)\n",
+    "\n",
+    "print(max(best_models,key=lambda item:item[4]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "965d3205",
+   "metadata": {},
+   "source": [
+    "### After tuning the Random Forest and KNN, Random Forest indeed is the best performance classifer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "ad815863",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "#T_607af_row0_col1, #T_607af_row0_col2, #T_607af_row0_col3, #T_607af_row0_col4, #T_607af_row1_col0, #T_607af_row1_col1, #T_607af_row1_col2, #T_607af_row1_col3, #T_607af_row1_col4 {\n",
+       "  background-color: lightgreen;\n",
+       "}\n",
+       "</style>\n",
+       "<table id=\"T_607af_\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th class=\"col_heading level0 col0\" >Training Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col1\" >Testing Accuracy %</th>\n",
+       "      <th class=\"col_heading level0 col2\" >Testing precision %</th>\n",
+       "      <th class=\"col_heading level0 col3\" >Testing recall %</th>\n",
+       "      <th class=\"col_heading level0 col4\" >Testing f1_score %</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th class=\"index_name level0\" >Model</th>\n",
+       "      <th class=\"blank col0\" >&nbsp;</th>\n",
+       "      <th class=\"blank col1\" >&nbsp;</th>\n",
+       "      <th class=\"blank col2\" >&nbsp;</th>\n",
+       "      <th class=\"blank col3\" >&nbsp;</th>\n",
+       "      <th class=\"blank col4\" >&nbsp;</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_607af_level0_row0\" class=\"row_heading level0 row0\" >Tuned KNN</th>\n",
+       "      <td id=\"T_607af_row0_col0\" class=\"data row0 col0\" >99.699700</td>\n",
+       "      <td id=\"T_607af_row0_col1\" class=\"data row0 col1\" >99.699925</td>\n",
+       "      <td id=\"T_607af_row0_col2\" class=\"data row0 col2\" >99.699925</td>\n",
+       "      <td id=\"T_607af_row0_col3\" class=\"data row0 col3\" >99.708048</td>\n",
+       "      <td id=\"T_607af_row0_col4\" class=\"data row0 col4\" >99.699734</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_607af_level0_row1\" class=\"row_heading level0 row1\" >Tuned Randome Forest</th>\n",
+       "      <td id=\"T_607af_row1_col0\" class=\"data row1 col0\" >99.774775</td>\n",
+       "      <td id=\"T_607af_row1_col1\" class=\"data row1 col1\" >99.699925</td>\n",
+       "      <td id=\"T_607af_row1_col2\" class=\"data row1 col2\" >99.699925</td>\n",
+       "      <td id=\"T_607af_row1_col3\" class=\"data row1 col3\" >99.708048</td>\n",
+       "      <td id=\"T_607af_row1_col4\" class=\"data row1 col4\" >99.699734</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<pandas.io.formats.style.Styler at 0x26ee38be7f0>"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "Knn_best = KNeighborsClassifier(n_neighbors = 3, weights = 'uniform' ,p = 1)\n",
+    "Rf_best = RandomForestClassifier(criterion='gini', max_depth=13, min_samples_leaf=1, n_estimators=50, random_state=1)\n",
+    "Knn_best.fit(X_bow_train, y_bow_train)\n",
+    "Rf_best.fit(X_w2v_train, y_w2v_train)\n",
+    "\n",
+    "model_list = [Knn_best, Rf_best]\n",
+    "name_list = [\"Tuned KNN\", 'Tuned Randome Forest']\n",
+    "y_data = [[y_bow_train,y_bow_test], [y_w2v_train,y_w2v_test] ]\n",
+    "X_data = [[X_bow_train,X_bow_test], [X_w2v_train,X_w2v_test]]\n",
+    "matric_table(model_list, name_list, y_data, X_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "349ca9c6",
+   "metadata": {},
+   "source": [
+    "## Draw The ROC and Recall-Precision Graphs to compare"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "d0553c0d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 864x576 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# too perfect, maybe we don't need graph just use our table to explain\n",
+    "import scikitplot as skplt\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "\n",
+    "skplt.metrics.plot_roc(y_w2v_test, Rf_best.predict_proba(X_w2v_test)\n",
+    "                                          ,text_fontsize = 'small'\n",
+    "                                          ,title = ' ROC for best model'\n",
+    "                                          ,figsize = (12,8))\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "005ff0bb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 864x576 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "skplt.metrics.plot_precision_recall_curve(y_w2v_test, Rf_best.predict_proba(X_w2v_test),\n",
+    "                                          text_fontsize = 'small'\n",
+    "                                          ,title = 'PR Curve for best model'\n",
+    "                                          ,figsize = (12,8))\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cbb69977",
+   "metadata": {},
+   "source": [
+    "## Deployment: building a pipeline to automatically preprose the text and make classification"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "abb72553",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#save required model for deployment\n",
+    "pickle.dump(Rf_best, open('bestmodel.pkl','wb'))\n",
+    "pickle.dump(w2v_model, open('w2v_model.pkl','wb'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "c54f2a3f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I have cut my finger because of playing football and I have to apply pain relief cream but it does not help\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array(['Injury from sports'], dtype=object)"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_input = input()\n",
+    "\n",
+    "def input_process(data):\n",
+    "    input_clean = phrase_cleanse(data)\n",
+    "    w2v_model = pickle.load(open('w2v_model.pkl', 'rb'))\n",
+    "    input_clean = [input_clean.split(\" \")]\n",
+    "    processed_input = word_avg_vect(input_clean, w2v_model, 100)\n",
+    "    pca_model = pickle.load(open('word2vec.pkl', 'rb')) \n",
+    "    test = pca_model.transform(processed_input)\n",
+    "    return test\n",
+    "\n",
+    "def pred(data):\n",
+    "    test = input_process(data)\n",
+    "    model = pickle.load(open('bestmodel.pkl', 'rb'))\n",
+    "    prediction = model.predict(test)\n",
+    "    return prediction\n",
+    "\n",
+    "pred(raw_input)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6315d091",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}