Download this file

1895 lines (1894 with data), 227.9 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9462896c-2450-43da-9fe5-3c9f14c93cc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# IMPORT DEPENDENCIES\n",
    "# Import dependencies\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "317f1a4e-18ca-4c3c-9eb7-10af3f963822",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Rank</th>\n",
       "      <th>OrgStudyId</th>\n",
       "      <th>WhyStopped</th>\n",
       "      <th>EnrollmentCount</th>\n",
       "      <th>PrimaryOutcomeMeasure</th>\n",
       "      <th>FlowDropWithdrawType</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3000</td>\n",
       "      <td>To determine BCI test performance by evaluatin...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2300</td>\n",
       "      <td>Diagnostic potential of SEMA4C as a biomarker ...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80</td>\n",
       "      <td>Role of SORCIN in patients with breast cancer</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30</td>\n",
       "      <td>SENP1 expression</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>241391</td>\n",
       "      <td>NaN</td>\n",
       "      <td>600</td>\n",
       "      <td>Performance of the Syantra DX Breast Cancer te...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>IL-TM-B1-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>200</td>\n",
       "      <td>This study is intended to evaluate the sensiti...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>FH-Risk 2.0 Research Protocol</td>\n",
       "      <td>NaN</td>\n",
       "      <td>271</td>\n",
       "      <td>To explore how much new risk models change bre...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>ID-RPSBC-01-20201012</td>\n",
       "      <td>NaN</td>\n",
       "      <td>316</td>\n",
       "      <td>Absolute risk difference between breast cancer...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>IRST174.22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>60000</td>\n",
       "      <td>To compare the cumulative incidence of stage 2...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>ANILERGİNN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>300</td>\n",
       "      <td>breast cancer incidence after laparoscopic sle...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Rank                     OrgStudyId WhyStopped  EnrollmentCount  \\\n",
       "0     1                BTX-BCI-016-PRT        NaN             3000   \n",
       "1     2                    2018-TJ-BCD        NaN             2300   \n",
       "2     3                  Breast cancer        NaN               80   \n",
       "3     4                       BC-BOMET        NaN               30   \n",
       "4     5                         241391        NaN              600   \n",
       "5     6                    IL-TM-B1-01        NaN              200   \n",
       "6     7  FH-Risk 2.0 Research Protocol        NaN              271   \n",
       "7     8           ID-RPSBC-01-20201012        NaN              316   \n",
       "8     9                     IRST174.22        NaN            60000   \n",
       "9    10                     ANILERGİNN        NaN              300   \n",
       "\n",
       "                               PrimaryOutcomeMeasure FlowDropWithdrawType  \n",
       "0  To determine BCI test performance by evaluatin...                  NaN  \n",
       "1  Diagnostic potential of SEMA4C as a biomarker ...                  NaN  \n",
       "2      Role of SORCIN in patients with breast cancer                  NaN  \n",
       "3                                   SENP1 expression                  NaN  \n",
       "4  Performance of the Syantra DX Breast Cancer te...                  NaN  \n",
       "5  This study is intended to evaluate the sensiti...                  NaN  \n",
       "6  To explore how much new risk models change bre...                  NaN  \n",
       "7  Absolute risk difference between breast cancer...                  NaN  \n",
       "8  To compare the cumulative incidence of stage 2...                  NaN  \n",
       "9  breast cancer incidence after laparoscopic sle...                  NaN  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# READ IN FILE\n",
    "df = pd.read_csv('Tables/free_text_df.csv')\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4537506e-12ad-4d8b-bb6f-8a76d97ae37e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5013 entries, 0 to 5012\n",
      "Data columns (total 6 columns):\n",
      " #   Column                 Non-Null Count  Dtype \n",
      "---  ------                 --------------  ----- \n",
      " 0   Rank                   5013 non-null   int64 \n",
      " 1   OrgStudyId             5013 non-null   object\n",
      " 2   WhyStopped             320 non-null    object\n",
      " 3   EnrollmentCount        5013 non-null   int64 \n",
      " 4   PrimaryOutcomeMeasure  5013 non-null   object\n",
      " 5   FlowDropWithdrawType   70 non-null     object\n",
      "dtypes: int64(2), object(4)\n",
      "memory usage: 235.1+ KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d5649ff7-a0d0-4737-bd7e-7d9d0f14ae92",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>WhyStopped</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>Pandemic situation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>Technical problem with plasma blood samples ob...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>study did not start and is currently on pause</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>Principal investigator left the study institut...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>The study was stopped prematurely due to insuf...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4967</th>\n",
       "      <td>Technical problem with blood plasma samples ob...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4968</th>\n",
       "      <td>Evolving data with Ipatasertib that changes th...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4970</th>\n",
       "      <td>Study is part of PhD trajectory and currently ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4981</th>\n",
       "      <td>no funding</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4986</th>\n",
       "      <td>sponsor on campus training restrictions</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>320 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             WhyStopped\n",
       "32                                   Pandemic situation\n",
       "47    Technical problem with plasma blood samples ob...\n",
       "50        study did not start and is currently on pause\n",
       "54    Principal investigator left the study institut...\n",
       "84    The study was stopped prematurely due to insuf...\n",
       "...                                                 ...\n",
       "4967  Technical problem with blood plasma samples ob...\n",
       "4968  Evolving data with Ipatasertib that changes th...\n",
       "4970  Study is part of PhD trajectory and currently ...\n",
       "4981                                         no funding\n",
       "4986            sponsor on campus training restrictions\n",
       "\n",
       "[320 rows x 1 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Extract WhyStopped column and drop null values\n",
    "df_text = pd.DataFrame(df['WhyStopped'])\n",
    "df_text = df_text.dropna()\n",
    "df_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "08352273-ac09-41d5-9c43-7d2ecc3438d4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 320 entries, 32 to 4986\n",
      "Data columns (total 1 columns):\n",
      " #   Column      Non-Null Count  Dtype \n",
      "---  ------      --------------  ----- \n",
      " 0   WhyStopped  320 non-null    object\n",
      "dtypes: object(1)\n",
      "memory usage: 5.0+ KB\n"
     ]
    }
   ],
   "source": [
    "df_text.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "74b3827b-a463-43a1-bf77-3c2c677f4fb1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>WhyStopped</th>\n",
       "      <th>word_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>Pandemic situation</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>Technical problem with plasma blood samples ob...</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>study did not start and is currently on pause</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>Principal investigator left the study institut...</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>The study was stopped prematurely due to insuf...</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>No participants enrolled</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>PI no longer working at Indiana University;</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>Temporarily paused per study team for interim ...</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>131</th>\n",
       "      <td>Study classified as out of scope by the Ethics...</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>Enrollment into AWARE cohorts1-4 have conclude...</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            WhyStopped  word_count\n",
       "32                                  Pandemic situation           2\n",
       "47   Technical problem with plasma blood samples ob...          10\n",
       "50       study did not start and is currently on pause           9\n",
       "54   Principal investigator left the study institut...           6\n",
       "84   The study was stopped prematurely due to insuf...           9\n",
       "97                            No participants enrolled           3\n",
       "105        PI no longer working at Indiana University;           7\n",
       "112  Temporarily paused per study team for interim ...           9\n",
       "131  Study classified as out of scope by the Ethics...          16\n",
       "143  Enrollment into AWARE cohorts1-4 have conclude...          18"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_text['word_count'] = df_text['WhyStopped'].apply(lambda x: len(str(x).split(\" \")))\n",
    "df_text.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4f0097e7-54f2-4414-810a-255de0258b90",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9.0625"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# AVERAGE WORD COUNT\n",
    "df_text['word_count'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "415583c8-07e1-4a8d-ae1a-bb692524736a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# IMPORT STOPWORD LIBRARY\n",
    "from nltk.corpus import stopwords\n",
    "stop = stopwords.words('english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "dcd05c97-aef0-449f-b72c-7a8feebf5cf5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>WhyStopped</th>\n",
       "      <th>word_count</th>\n",
       "      <th>stop_words</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>Pandemic situation</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>Technical problem with plasma blood samples ob...</td>\n",
       "      <td>10</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>study did not start and is currently on pause</td>\n",
       "      <td>9</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>Principal investigator left the study institut...</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>The study was stopped prematurely due to insuf...</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>No participants enrolled</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>PI no longer working at Indiana University;</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>Temporarily paused per study team for interim ...</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>131</th>\n",
       "      <td>Study classified as out of scope by the Ethics...</td>\n",
       "      <td>16</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>Enrollment into AWARE cohorts1-4 have conclude...</td>\n",
       "      <td>18</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            WhyStopped  word_count  stop_words\n",
       "32                                  Pandemic situation           2           0\n",
       "47   Technical problem with plasma blood samples ob...          10           3\n",
       "50       study did not start and is currently on pause           9           5\n",
       "54   Principal investigator left the study institut...           6           1\n",
       "84   The study was stopped prematurely due to insuf...           9           2\n",
       "97                            No participants enrolled           3           0\n",
       "105        PI no longer working at Indiana University;           7           2\n",
       "112  Temporarily paused per study team for interim ...           9           1\n",
       "131  Study classified as out of scope by the Ethics...          16           6\n",
       "143  Enrollment into AWARE cohorts1-4 have conclude...          18           8"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_text['stop_words'] = df_text['WhyStopped'].apply(\n",
    "    lambda x: len([x for x in x.split() if x in stop]))\n",
    "df_text.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "79e0a2ea-f1ff-474f-84e3-85c382edbce9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2.95625"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# AVERAGE STOPWORDS\n",
    "df_text['stop_words'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "b6e68297-622f-4e40-ba0e-e1ce4fc0101e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>WhyStopped</th>\n",
       "      <th>word_count</th>\n",
       "      <th>stop_words</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>pandemic situation</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>technical problem with plasma blood samples ob...</td>\n",
       "      <td>10</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>study did not start and is currently on pause</td>\n",
       "      <td>9</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>principal investigator left the study institut...</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>the study was stopped prematurely due to insuf...</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>no participants enrolled</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>pi no longer working at indiana university;</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>temporarily paused per study team for interim ...</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>131</th>\n",
       "      <td>study classified as out of scope by the ethics...</td>\n",
       "      <td>16</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>enrollment into aware cohorts1-4 have conclude...</td>\n",
       "      <td>18</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            WhyStopped  word_count  stop_words\n",
       "32                                  pandemic situation           2           0\n",
       "47   technical problem with plasma blood samples ob...          10           3\n",
       "50       study did not start and is currently on pause           9           5\n",
       "54   principal investigator left the study institut...           6           1\n",
       "84   the study was stopped prematurely due to insuf...           9           2\n",
       "97                            no participants enrolled           3           0\n",
       "105        pi no longer working at indiana university;           7           2\n",
       "112  temporarily paused per study team for interim ...           9           1\n",
       "131  study classified as out of scope by the ethics...          16           6\n",
       "143  enrollment into aware cohorts1-4 have conclude...          18           8"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# LOWERCASE RESPONSES\n",
    "df_text['WhyStopped'] = df_text['WhyStopped'].apply(\n",
    "    lambda x: \" \".join(x.lower() for x in x.split()))\n",
    "df_text.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b37154ec-5526-48ae-95bc-8432591e24fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "9b6968c9-1515-4a5f-ae26-d44b8604e197",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>WhyStopped</th>\n",
       "      <th>word_count</th>\n",
       "      <th>stop_words</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>pandemic situation</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>technical problem with plasma blood samples ob...</td>\n",
       "      <td>10</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>study did not start and is currently on pause</td>\n",
       "      <td>9</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>principal investigator left the study institution</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>the study was stopped prematurely due to insuf...</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>no participants enrolled</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>pi no longer working at indiana university</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>temporarily paused per study team for interim ...</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>131</th>\n",
       "      <td>study classified as out of scope by the ethics...</td>\n",
       "      <td>16</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>enrollment into aware cohorts14 have concluded...</td>\n",
       "      <td>18</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            WhyStopped  word_count  stop_words\n",
       "32                                  pandemic situation           2           0\n",
       "47   technical problem with plasma blood samples ob...          10           3\n",
       "50       study did not start and is currently on pause           9           5\n",
       "54   principal investigator left the study institution           6           1\n",
       "84   the study was stopped prematurely due to insuf...           9           2\n",
       "97                            no participants enrolled           3           0\n",
       "105         pi no longer working at indiana university           7           2\n",
       "112  temporarily paused per study team for interim ...           9           1\n",
       "131  study classified as out of scope by the ethics...          16           6\n",
       "143  enrollment into aware cohorts14 have concluded...          18           8"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# REMOVING PUNCTUATION\n",
    "df_text['WhyStopped'] = df_text['WhyStopped'].str.replace('[^\\w\\s]','')\n",
    "df_text.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "02bf2432-7098-472f-8f2c-ed9fa60c1068",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>WhyStopped</th>\n",
       "      <th>word_count</th>\n",
       "      <th>stop_words</th>\n",
       "      <th>filtered_responses</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>pandemic situation</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>pandemic situation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>technical problem with plasma blood samples ob...</td>\n",
       "      <td>10</td>\n",
       "      <td>3</td>\n",
       "      <td>technical problem plasma blood samples obtaine...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>study did not start and is currently on pause</td>\n",
       "      <td>9</td>\n",
       "      <td>5</td>\n",
       "      <td>study start currently pause</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>principal investigator left the study institution</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>principal investigator left study institution</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>the study was stopped prematurely due to insuf...</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "      <td>study stopped prematurely due insufficient rec...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>no participants enrolled</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>participants enrolled</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>pi no longer working at indiana university</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>pi longer working indiana university</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>temporarily paused per study team for interim ...</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>temporarily paused per study team interim data...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>131</th>\n",
       "      <td>study classified as out of scope by the ethics...</td>\n",
       "      <td>16</td>\n",
       "      <td>6</td>\n",
       "      <td>study classified scope ethics committee projec...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>enrollment into aware cohorts14 have concluded...</td>\n",
       "      <td>18</td>\n",
       "      <td>8</td>\n",
       "      <td>enrollment aware cohorts14 concluded primary o...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            WhyStopped  word_count  \\\n",
       "32                                  pandemic situation           2   \n",
       "47   technical problem with plasma blood samples ob...          10   \n",
       "50       study did not start and is currently on pause           9   \n",
       "54   principal investigator left the study institution           6   \n",
       "84   the study was stopped prematurely due to insuf...           9   \n",
       "97                            no participants enrolled           3   \n",
       "105         pi no longer working at indiana university           7   \n",
       "112  temporarily paused per study team for interim ...           9   \n",
       "131  study classified as out of scope by the ethics...          16   \n",
       "143  enrollment into aware cohorts14 have concluded...          18   \n",
       "\n",
       "     stop_words                                 filtered_responses  \n",
       "32            0                                 pandemic situation  \n",
       "47            3  technical problem plasma blood samples obtaine...  \n",
       "50            5                        study start currently pause  \n",
       "54            1      principal investigator left study institution  \n",
       "84            2  study stopped prematurely due insufficient rec...  \n",
       "97            0                              participants enrolled  \n",
       "105           2               pi longer working indiana university  \n",
       "112           1  temporarily paused per study team interim data...  \n",
       "131           6  study classified scope ethics committee projec...  \n",
       "143           8  enrollment aware cohorts14 concluded primary o...  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# REMOVE STOPWORDS\n",
    "df_text['filtered_responses'] = df_text['WhyStopped'].apply(\n",
    "    lambda x: \" \".join(x for x in x.split() if x not in stop))\n",
    "df_text.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "21937e27-2108-48df-b7de-944ebf169ec0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "study       112\n",
       "due          76\n",
       "decision     35\n",
       "dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# WORD FREQUENCY\n",
    "freq = pd.Series(' '.join(df_text['filtered_responses']).split()).value_counts()[:3]\n",
    "freq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "25a44150-7257-4785-b34f-98ba3389715e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32                                   pandemic situation\n",
       "47    technical problem plasma blood samples obtaine...\n",
       "50                                start currently pause\n",
       "54              principal investigator left institution\n",
       "84         stopped prematurely insufficient recruitment\n",
       "Name: filtered_responses, dtype: object"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# FILTERING OUT TOP THREE MOST FREQUENT WORDS\n",
    "freq = list(freq.index)\n",
    "df_text['filtered_responses'] = df_text['filtered_responses'].apply(\n",
    "    lambda x: \" \".join(x for x in x.split() if x not in freq))\n",
    "df_text['filtered_responses'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "7e95e955-a506-4f40-9ae2-b8f4486d5655",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sponsor        30\n",
       "funding        27\n",
       "enrollment     25\n",
       "accrual        25\n",
       "recruitment    24\n",
       "covid19        23\n",
       "terminated     22\n",
       "safety         21\n",
       "patients       20\n",
       "trial          19\n",
       "dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_freq = pd.Series(' '.join(df_text['filtered_responses']).split()).value_counts()[:10]\n",
    "new_freq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c48a1b42-3488-4e30-bdc6-7ad348cb5cd6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "new_freq.plot.bar()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c2cee97-c154-4bbf-b59a-d26216ecb0bb",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "e4b2da91-9887-4d4c-b26e-4627479f6e9a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "7d432db9-e432-4edb-8d6a-e7e17bf98da5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>WhyStopped</th>\n",
       "      <th>word_count</th>\n",
       "      <th>stop_words</th>\n",
       "      <th>filtered_responses</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3413</th>\n",
       "      <td>sponsor decision</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>sponsor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3315</th>\n",
       "      <td>funding  sponsor filing of chapter 11 bankruptcy</td>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>funding sponsor filing chapter 11 bankruptcy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4127</th>\n",
       "      <td>sponsor decision based on strategic realignment</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>sponsor based strategic realignment</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1897</th>\n",
       "      <td>decision to discontinue the study based on bro...</td>\n",
       "      <td>24</td>\n",
       "      <td>9</td>\n",
       "      <td>discontinue based broader development strategi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2570</th>\n",
       "      <td>this was a sponsor decision and was not a cons...</td>\n",
       "      <td>14</td>\n",
       "      <td>8</td>\n",
       "      <td>sponsor consequence safety concern</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>896</th>\n",
       "      <td>after demonstrating the on target effect of gm...</td>\n",
       "      <td>23</td>\n",
       "      <td>6</td>\n",
       "      <td>demonstrating target effect gmi1359 via pharma...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4221</th>\n",
       "      <td>sponsor decision based on strategic realignment</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>sponsor based strategic realignment</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2267</th>\n",
       "      <td>administrative closure based on sponsor recomm...</td>\n",
       "      <td>10</td>\n",
       "      <td>2</td>\n",
       "      <td>administrative closure based sponsor recommend...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4610</th>\n",
       "      <td>based on the overall results from the phase 1 ...</td>\n",
       "      <td>28</td>\n",
       "      <td>12</td>\n",
       "      <td>based overall results phase 1 part sponsor dec...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1242</th>\n",
       "      <td>terminated by sponsor due to lack of interest</td>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "      <td>terminated sponsor lack interest</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1921</th>\n",
       "      <td>the study was prematurely discontinued by the ...</td>\n",
       "      <td>23</td>\n",
       "      <td>11</td>\n",
       "      <td>prematurely discontinued sponsor probability s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2030</th>\n",
       "      <td>due to the fact that the sponsor decided not t...</td>\n",
       "      <td>17</td>\n",
       "      <td>9</td>\n",
       "      <td>fact sponsor decided move forward development ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3117</th>\n",
       "      <td>sponsor decision not safety related</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>sponsor safety related</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1695</th>\n",
       "      <td>sponsor withdrew support</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>sponsor withdrew support</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4392</th>\n",
       "      <td>the sponsor terminated study after dosing 2 do...</td>\n",
       "      <td>36</td>\n",
       "      <td>9</td>\n",
       "      <td>sponsor terminated dosing 2 dose groups 7 pts ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2292</th>\n",
       "      <td>sponsors decision no safety concerns</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>sponsors safety concerns</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2075</th>\n",
       "      <td>per sponsor</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>per sponsor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3113</th>\n",
       "      <td>the study was terminated due to the review of ...</td>\n",
       "      <td>26</td>\n",
       "      <td>11</td>\n",
       "      <td>terminated review asset vbir2 within sponsors ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2711</th>\n",
       "      <td>the sponsor has discontinued the development o...</td>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "      <td>sponsor discontinued development tesetaxel</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2474</th>\n",
       "      <td>sponsor decision</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>sponsor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             WhyStopped  word_count  \\\n",
       "3413                                   sponsor decision           2   \n",
       "3315   funding  sponsor filing of chapter 11 bankruptcy           8   \n",
       "4127    sponsor decision based on strategic realignment           6   \n",
       "1897  decision to discontinue the study based on bro...          24   \n",
       "2570  this was a sponsor decision and was not a cons...          14   \n",
       "896   after demonstrating the on target effect of gm...          23   \n",
       "4221    sponsor decision based on strategic realignment           6   \n",
       "2267  administrative closure based on sponsor recomm...          10   \n",
       "4610  based on the overall results from the phase 1 ...          28   \n",
       "1242      terminated by sponsor due to lack of interest           8   \n",
       "1921  the study was prematurely discontinued by the ...          23   \n",
       "2030  due to the fact that the sponsor decided not t...          17   \n",
       "3117                sponsor decision not safety related           5   \n",
       "1695                           sponsor withdrew support           3   \n",
       "4392  the sponsor terminated study after dosing 2 do...          36   \n",
       "2292               sponsors decision no safety concerns           5   \n",
       "2075                                        per sponsor           2   \n",
       "3113  the study was terminated due to the review of ...          26   \n",
       "2711  the sponsor has discontinued the development o...           8   \n",
       "2474                                   sponsor decision           2   \n",
       "\n",
       "      stop_words                                 filtered_responses  \n",
       "3413           0                                            sponsor  \n",
       "3315           1       funding sponsor filing chapter 11 bankruptcy  \n",
       "4127           1                sponsor based strategic realignment  \n",
       "1897           9  discontinue based broader development strategi...  \n",
       "2570           8                 sponsor consequence safety concern  \n",
       "896            6  demonstrating target effect gmi1359 via pharma...  \n",
       "4221           1                sponsor based strategic realignment  \n",
       "2267           2  administrative closure based sponsor recommend...  \n",
       "4610          12  based overall results phase 1 part sponsor dec...  \n",
       "1242           3                   terminated sponsor lack interest  \n",
       "1921          11  prematurely discontinued sponsor probability s...  \n",
       "2030           9  fact sponsor decided move forward development ...  \n",
       "3117           1                             sponsor safety related  \n",
       "1695           0                           sponsor withdrew support  \n",
       "4392           9  sponsor terminated dosing 2 dose groups 7 pts ...  \n",
       "2292           1                           sponsors safety concerns  \n",
       "2075           0                                        per sponsor  \n",
       "3113          11  terminated review asset vbir2 within sponsors ...  \n",
       "2711           3         sponsor discontinued development tesetaxel  \n",
       "2474           0                                            sponsor  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rate_freq = df_text.loc[df_text['WhyStopped'].str.contains(\"sponsor\", case=False)]\n",
    "rate_freq.sample(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "06cac714-6419-4231-9c26-0c7f7beaa2a2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9983     confirmed objective response rate by recist 11...\n",
       "9984     level of decisional conflict level of decision...\n",
       "9985     cancerrelated fatigue severity as assessed usi...\n",
       "9986             detection rate of sentinel node technique\n",
       "9987     risk of reporting cancer overall specific canc...\n",
       "9988     time until adverse liver outcome assessed by i...\n",
       "9989     the feasibility of hpv testing as a single vis...\n",
       "9990                                genomics outcome scale\n",
       "9991     fatigue severity impact of fatigue state of fa...\n",
       "9992     the role of vitamin d in response to neoadjuva...\n",
       "9993     rate of men diagnosed with pc and aggressive p...\n",
       "9994                 doubling of progression free survival\n",
       "9995                           objective response rate orr\n",
       "9996     icg transit time detected by the smartgoggles ...\n",
       "9997     number of participants with grade 2 radiation ...\n",
       "9998                 the number of circulating tumor cells\n",
       "9999     to evaluate the impact of visceral adipose tis...\n",
       "10000    feasibility defined by percent randomized numb...\n",
       "10001           number of patients with complete data safe\n",
       "10002    the psychosocial factors predictive of uptake ...\n",
       "10003                                  overall survival os\n",
       "10004         occult cancer missed by screening strategies\n",
       "10005    the role of magnetic resonance imaging mri in ...\n",
       "10006    objective response or durable clinical benefit...\n",
       "10007      change in mdasi scores incidents of dehydration\n",
       "10008    the primary outcome of the study will be durat...\n",
       "10009                                     overall survival\n",
       "10010                                    psa response rate\n",
       "10011                          3year local recurrence rate\n",
       "10012    initially targeted drug therapy risks in patie...\n",
       "Name: WhyStopped, dtype: object"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_text['WhyStopped'].tail(30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "2650179f-de07-497b-abd7-54fd53a882a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# IMPORT WORDCLOUD LIBRARY\n",
    "from wordcloud import WordCloud"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "526319e9-c2b9-480b-bbbf-9dcffb5d6841",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<PIL.Image.Image image mode=RGB size=400x200>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# GENERATE A WORD CLOUD OF FILTERED RESPONSES\n",
    "long_string = ','.join(list(df_text['filtered_responses'].values))\n",
    "\n",
    "wordcloud = WordCloud(\n",
    "    background_color=\"white\",\n",
    "    max_words = 5000,\n",
    "    contour_width=3,\n",
    "    contour_color='steelblue')\n",
    "\n",
    "wordcloud.generate(long_string)\n",
    "\n",
    "wordcloud.to_image()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "c9d4476d-229e-4412-a701-49dfdca5e03a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# IMPORT GENSIM FOR LATENT DIRICHLET ALLOCATION\n",
    "import gensim\n",
    "from gensim import corpora\n",
    "from gensim.utils import simple_preprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "fe4c7e08-2d4d-4310-a949-f697cd88170d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['pandemic', 'situation'], ['technical', 'problem', 'plasma', 'blood', 'samples', 'obtained', 'patients'], ['start', 'currently', 'pause'], ['principal', 'investigator', 'left', 'institution'], ['stopped', 'prematurely', 'insufficient', 'recruitment'], ['participants', 'enrolled'], ['pi', 'longer', 'working', 'indiana', 'university'], ['temporarily', 'paused', 'per', 'team', 'interim', 'data', 'review'], ['classified', 'scope', 'ethics', 'committee', 'project', 'involving', 'human', 'person'], ['enrollment', 'aware', 'cohorts', 'concluded', 'primary', 'objective', 'core', 'goals', 'met'], ['slow', 'recruitment', 'rate'], ['sponsor', 'prematurely', 'stop', 'linked', 'safety', 'concern'], ['one', 'participant', 'accrued', 'stopped', 'new', 'safety', 'data', 'company', 'slow', 'accrual'], ['researcher', 'able', 'recruit', 'patients', 'abandoned', 'project'], ['unable', 'achieve', 'devices', 'used'], ['moving', 'different', 'institution', 'pi'], ['decided', 'halt', 'potentially', 'reopen', 'future'], ['covid', 'trial', 'never', 'got', 'running'], ['halted', 'prematurely', 'low', 'recruitment'], ['participant', 'recruitment', 'stopped', 'corona', 'pandemic'], ['poor', 'enrolled', 'patients'], ['termination', 'collaboration', 'puma'], ['terminated', 'based', 'pfizers', 'change', 'clinical', 'development', 'strategy', 'related', 'safety', 'efficacy'], ['clinical', 'treatment', 'patients', 'stage', 'iv', 'breast', 'cancer', 'received', 'surgical', 'treatment', 'made', 'difficult', 'enroll', 'project', 'terminated'], ['logistics'], ['prematurely', 'discontinued', 'significant', 'data', 'quality', 'issues', 'august', 'safety', 'concerns', 'led', 'terminate'], ['recruitment', 'difficulties'], ['lack', 'accrual'], ['covid', 'pandemic', 'pursuing'], ['pi', 'moving', 'new', 'institution'], ['institutional', 'conflict', 'interest'], ['approved', 'treatment', 'regimen', 'based', 'current', 'guidelines', 'however', 'reimbursement', 'imp', 'feasible'], ['surgery', 'wait', 
'times', 'covid', 'pandemic', 'eventually', 'reduced', 'prepandemic', 'wait', 'times', 'became', 'irrelevant'], ['withdrawn', 'change', 'plan', 'safety', 'reasons'], ['limited', 'staff', 'carry'], ['strategic', 'changes', 'regarding', 'product', 'development'], ['premature', 'termination', 'lack', 'funding'], ['security', 'effect', 'data', 'another', 'ongoing'], ['relocation', 'principal', 'investigator'], ['covid', 'never', 'started'], ['dose', 'limiting', 'toxicity'], ['insurance', 'nonpayment', 'subjects', 'enrolled'], ['enrollment'], ['demonstrating', 'target', 'effect', 'gmi', 'via', 'pharmacodynamic', 'markers', 'cxcr', 'eselectin', 'sponsor', 'terminated', 'trial', 'covidrelated', 'slow', 'enrollment'], ['treatment', 'standard', 'changed'], ['strategic', 'considerations'], ['terminated', 'part', 'dose', 'escalation', 'part', 'expansion', 'initiated', 'voluntarily', 'terminated', 'business', 'proceed', 'isb', 'asset', 'safety', 'issue'], ['initiating', 'new', 'revised', 'statistics'], ['inadequate', 'fna', 'samples', 'low', 'cell', 'counts'], ['difficulty', 'accruing', 'subjects', 'accrual', 'closed'], ['funder'], ['camad', 'clinical', 'trial', 'terminated', 'difficulties', 'recruiting', 'patients'], ['data', 'safety', 'monitoring', 'board', 'agreement', 'findings', 'far', 'stopping', 'rule', 'met', 'suspends', 'treatment', 'arms', 'march'], ['dompé', 'decided', 'withdraw', 'numerous', 'difficulties', 'encountered', 'enrollment', 'mainly', 'rapidly', 'continuously', 'changing', 'oncology', 'drug', 'scenario', 'patients', 'enrolled'], ['funding', 'withdrawn', 'funding', 'source'], ['never', 'activated', 'enrollment'], ['business', 'regarding', 'drug'], ['lack', 'funding'], ['slow', 'patient', 'accrual'], ['financial', 'support'], ['determined', 'sufficient', 'enrollment', 'data', 'explore', 'impact', 'endopredict', 'test', 'endocrine', 'therapy', 'decisionmaking'], ['slow', 'recruitment', 'rate'], ['accrual', 'rate', 'date', 'low', 'finish', 'trial', 
'reasonable', 'timeframe'], ['terminated', 'sponsor', 'lack', 'interest'], ['slow', 'accrual'], ['participants', 'enrolled'], ['difficulty', 'recruiting', 'retaining', 'participants'], ['delayed', 'start', 'technical', 'problems', 'well', 'covid', 'pandemic', 'investigator'], ['needle', 'production', 'stopped'], ['slow', 'recruitment', 'subject', 'profile'], ['ip', 'breach'], ['unable', 'recruit', 'patients', 'competing', 'studies'], ['halted', 'prematurely', 'prior', 'enrollment', 'first', 'participant'], ['funding', 'withdrawn'], ['pi', 'leaving', 'duke', 'position', 'another', 'state'], ['interim', 'analysis', 'provided', 'protocol', 'results', 'unsatisfactory'], ['limited', 'operating', 'room', 'availability'], ['enough', 'eligble', 'patient', 'found', 'many', 'screen', 'failures'], ['funder', 'decided', 'continue'], ['slow', 'accrual', 'partially', 'pandemic'], ['terminated', 'trial', 'initiated', 'new', 'one', 'including', 'pertuzumab'], ['funding'], ['low', 'accrual'], ['lack', 'funding'], ['drug', 'provider', 'decided', 'move', 'forward'], ['cessation', 'main', 'trial', 'biomarker', 'imaging', 'side'], ['unable', 'enroll', 'covid'], ['funding', 'discontinued'], ['sponsor'], ['pi', 'cancel', 'research'], ['disapproved', 'relevant', 'danish', 'ethical', 'committee'], ['recruitment', 'challenges', 'safety', 'concerns'], ['logistical', 'problems', 'meant', 'longer', 'feasible'], ['sponsor', 'withdrew', 'support'], ['china', 'experienced', 'new', 'waves', 'covid', 'since', 'january', 'chinese', 'government', 'adopted', 'strict', 'covid', 'policies', 'control', 'occasional', 'outbreaks', 'covid'], ['strategic', 'emerging', 'new', 'data', 'patients', 'hr', 'her', 'metastatic', 'breast', 'cancer'], ['insufficient', 'resources'], ['lack', 'funding'], ['business', 'perform'], ['design', 'modification'], ['business', 'based', 'inability', 'enroll', 'subjects', 'trial'], ['new', 'insights'], ['never', 'initiated', 'covid', 'pandemic', 'staffing', 'changes'], 
['pharmaceutical', 'company', 'decided', 'withdraw', 'funding', 'provide', 'drug'], ['meeting', 'recruitment', 'goals'], ['pi', 'decided', 'withdraw'], ['prematurely', 'terminated', 'lack', 'recruitment', 'several', 'screening', 'failures', 'since', 'may', 'potential', 'candidate', 'cohort', 'found', 'gbrca', 'wild', 'type', 'hrd', 'score', 'mychoice', 'cdx', 'plus', 'test'], ['discontinue', 'based', 'broader', 'development', 'strategic', 'prioritisation', 'sponsor', 'concludes', 'benefitrisk', 'impact', 'co'], ['prematurely', 'discontinued', 'sponsor', 'probability', 'success', 'low', 'justify', 'continuation', 'recruitment'], ['company'], ['lost', 'funding'], ['sponsor', 'strategy', 'adjustment'], ['lack', 'effectiveness'], ['fact', 'sponsor', 'decided', 'move', 'forward', 'development'], ['per', 'sponsor'], ['low', 'accrual'], ['closure', 'radiolaboratory'], ['modification', 'care', 'habits', 'believe', 'today', 'longer', 'able', 'carry', 'initially', 'described'], ['insufficient', 'patient', 'inclusion'], ['funding'], ['ctp', 'lares', 'expire', 'date', 'trying', 'rws', 'pathway', 'indication', 'expend'], ['participants', 'enrolled'], ['meeting', 'fda', 'sparc', 'concludes', 'required'], ['early', 'terminated', 'low', 'enrollment', 'compared', 'anticipated', 'figures'], ['principal', 'investigator', 'wishes', 'revisit', 'design', 'start', 'new'], ['trial', 'initiated'], ['low', 'accrualloss', 'funding'], ['drug', 'manufacturer', 'terminate', 'development'], ['administrative', 'closure', 'based', 'sponsor', 'recommendation', 'prior', 'subject', 'enrollment'], ['principal', 'investigator', 'decided', 'move', 'forward'], ['sponsors', 'safety', 'concerns'], ['never', 'initiated', 'never', 'established', 'sites', 'enrolled', 'subjects'], ['inadequate', 'accrual', 'rate'], ['difficulty', 'recruiting', 'staffing', 'covid'], ['related', 'covid', 'continuing', 'data', 'analysis', 'time'], ['patients', 'enrolled', 'efforts', 'discontinued'], ['enrollment', 'incomplete', 
'end', 'contracted', 'enrollment', 'period'], ['limited', 'recruitment'], ['participants', 'enrolled'], ['early', 'discontinuation', 'based', 'strategic', 'sponsor', 'driven', 'safety', 'concerns'], ['stopped', 'covid', 'pandemic', 'university', 'closed', 'since', 'beginning', 'pandemic', 'present', 'date', 'social', 'distance', 'imposed', 'government'], ['withdrawn'], ['application', 'withdrawn'], ['sponsor'], ['pi', 'left'], ['funding', 'terminated'], ['patients', 'screened', 'enrolled'], ['funding', 'issues', 'pharmacy', 'preparation', 'drug'], ['decided', 'halt', 'potentially', 'reopen', 'future'], ['unforeseen', 'slow', 'enrollment', 'shift', 'corporate', 'resources', 'covid', 'impact'], ['lack', 'resources'], ['sponsor', 'consequence', 'safety', 'concern'], ['accrual', 'suspended', 'pending', 'completion', 'amendment'], ['change', 'design'], ['dropped'], ['funding'], ['team', 'determined', 'data', 'collected', 'appropriate', 'outcome', 'goals'], ['low', 'enrollment'], ['logistic', 'reasons', 'operating', 'room', 'performing', 'surgery', 'investigated', 'moved', 'another', 'structure'], ['sponsor', 'discontinued', 'development', 'tesetaxel'], ['withdrawn', 'change', 'treatment', 'landscape', 'her', 'metastatic', 'breast', 'cancer'], ['per', 'sponsor', 'requestno', 'longer', 'manufacturing', 'drug'], ['funding', 'pi', 'leaving', 'institution'], ['sponsor', 'discontinued', 'development', 'tesetaxel'], ['hold', 'staffing', 'issues'], ['covid'], ['patients', 'recruited'], ['low', 'recruitment'], ['slow', 'recruitment', 'subject', 'profile'], ['trial', 'withdrawn', 'based', 'portfolio', 'prioritization', 'oral', 'atri', 'combination', 'niraparib', 'investigation', 'ddriver', 'solid', 'tumor'], ['manufacturer', 'supporting'], ['insufficient', 'recruitment'], ['terminated', 'early', 'given', 'first', 'four', 'patients', 'enrolled', 'experienced', 'grade', 'neutropenia', 'alopecia', 'cycle', 'failed', 'meet', 'primary', 'endpoint', 'main', 'secondary', 'endpoint'], 
['principal', 'investigator', 'moving', 'new', 'institution', 'closing'], ['feasability', 'recruitment', 'issues'], ['lack', 'site', 'participation'], ['device', 'sent', 'back', 'repairs', 'returned', 'site', 'date'], ['trial', 'handovered', 'another', 'sponser'], ['feasibility', 'low', 'patient', 'accrual', 'financial', 'reasons'], ['halted', 'funding'], ['lack', 'enrollment'], ['lack', 'financial', 'human', 'resources', 'investigators', 'unable', 'continue', 'targeted', 'sample', 'size', 'achieved', 'recruitment', 'difficulties', 'mainly', 'related', 'covid', 'pandemic'], ['her', 'patient', 'develop', 'brain', 'mets', 'onafter', 'received', 'tdm', 'proven', 'insurmountable', 'challenge', 'recruit', 'patients'], ['pi', 'left', 'institution'], ['replaced', 'nct'], ['closed', 'prematurely', 'slow', 'accrual'], ['principal', 'investigator', 'retired', 'completed'], ['interim', 'analysis', 'algorithm', 'development'], ['recruitment', 'issues'], ['change', 'business', 'strategy'], ['agent', 'longer', 'available'], ['intervention', 'supply', 'interruption'], ['changes', 'standard', 'adjuvant', 'treatment', 'allow', 'iterative', 'picc', 'placement'], ['pi', 'withdrawn'], ['insufficient', 'staff'], ['terminated', 'review', 'asset', 'vbir', 'within', 'sponsors', 'oncology', 'portfolio', 'terminated', 'safety', 'concerns'], ['sponsor', 'safety', 'related'], ['closed', 'low', 'accrual'], ['reduction', 'available', 'resources'], ['low', 'accrual'], ['change', 'business', 'need'], ['pi'], ['terminated', 'slow', 'accrual'], ['funding', 'sponsor', 'filing', 'chapter', 'bankruptcy'], ['patient', 'enrolled', 'authorized', 'period'], ['sufficiently', 'staff', 'available', 'perform', 'trial'], ['diagnostic', 'issues'], ['revised', 'listed', 'us'], ['leading', 'entity', 'clinical', 'trial', 'replaced', 'patients', 'enrolled'], ['slow', 'inclusion', 'rate'], ['sponsor'], ['closing', 'clinical', 'partners', 'relocate', 'osu', 'could', 'find', 'continued', 'interest', 'however', 
'demonstrated', 'feasibility', 'imaging', 'approach', 'also', 'filed', 'patent'], ['principal', 'investigator', 'departed', 'institution'], ['funding', 'unavailable', 'company', 'shutting'], ['enrollment', 'temporarily', 'halted', 'interim', 'analysis', 'ensure', 'adequate', 'evaluable', 'subjects'], ['funding'], ['suspended', 'qtultrasound', 'studies', 'reprioritized'], ['benefit', 'completing', 'worth', 'exposing', 'subjects', 'risk', 'covid', 'delaying', 'return', 'visits', 'would', 'make', 'difficult', 'analyze', 'changes', 'data', 'overtime'], ['slow', 'accrual', 'result', 'covid'], ['covid'], ['unable', 'meet', 'accrual', 'goal'], ['recruitment', 'temporarily', 'suspended', 'covid', 'resume', 'appropriate'], ['low', 'accrual', 'temporarily', 'suspended'], ['difficulty', 'recruiting', 'research', 'subjects'], ['quality', 'data', 'originating', 'prior', 'versions', 'protocol', 'affected', 'protocol', 'deviations', 'triggered', 'covid', 'pandemics'], ['abandoned', 'prior', 'opening', 'accrual', 'start'], ['change', 'development', 'priorities', 'clinical', 'development', 'lucitanib', 'plus', 'rucaparib', 'lucitanib', 'plus', 'sacituzumab', 'govitecan', 'combinations', 'planned', 'time'], ['sponsor'], ['stopped', 'early', 'increased', 'global', 'access', 'genomic', 'screening', 'longer', 'economical', 'continue', 'particular', 'singlegene', 'screening', 'protocol'], ['termination', 'business', 'safety', 'concerns'], ['progress', 'doesnt', 'meet', 'sponsors', 'requirement'], ['pi', 'left', 'institution', 'never', 'submitted', 'irb'], ['accrual'], ['strategic', 'business', 'unrelated', 'safety'], ['new', 'medical', 'team', 'surgical', 'center', 'location'], ['data', 'initial', 'patients', 'sufficient'], ['low', 'accrual'], ['stopped', 'unacceptable', 'toxicity', 'doseescalation', 'portion', 'phase', 'progress', 'phase'], ['clinical', 'treatment', 'patient', 'score', 'less', 'fact', 'ntx', 'score', 'points', 'weeks', 'application', 'albumin', 'bound', 'paclitaxel', 
'project', 'therefore', 'suitable', 'subject', 'screening'], ['pi', 'left', 'institution'], ['halted', 'prematurely', 'slow', 'enrollment'], ['product', 'development', 'discontinued', 'unrelated', 'safety'], ['secondary', 'medicare', 'coverage', 'determination'], ['closed', 'portfolio', 'prioritization'], ['lack', 'funding'], ['unforeseen', 'complications', 'covid', 'funding'], ['award', 'yet', 'received', 'pi', 'transferring', 'different', 'institution'], ['withdrawn', 'scientific', 'interest', 'pursuing', 'syd', 'paclitaxel', 'combination', 'diminished'], ['high', 'number', 'screen', 'failures'], ['company', 'liquidated'], ['enrollment', 'challenges'], ['funding', 'withdrawn', 'sponsor'], ['unable', 'recruit', 'limited', 'number', 'patients', 'adh'], ['suspended', 'covid'], ['futility', 'recruitment'], ['resume', 'based', 'results', 'planned', 'interim', 'analysis', 'showed', 'futility'], ['sponsor', 'based', 'strategic', 'realignment'], ['mycotoxin', 'potential', 'contamination', 'one', 'lot', 'drug'], ['mainly', 'insufficient', 'recruitment'], ['slow', 'enrollment'], ['constraints', 'covid', 'unable', 'recruit', 'onsite'], ['never', 'started', 'change', 'standard', 'care', 'guideline'], ['clinical', 'hold', 'fda'], ['research', 'cancelled', 'inadequate', 'staffing'], ['similar', 'clinical', 'trials', 'showed', 'encouraging', 'results', 'made', 'us', 'decide', 'several', 'modifications', 'protocol', 'numerous', 'difficulties', 'encountered', 'abandon', 'trial'], ['sponsor', 'based', 'strategic', 'realignment'], ['funding', 'sought'], ['suspended', 'covid', 'pandemic', 'terminated', 'prevent', 'inconsistencies', 'baseline', 'anxiety', 'patients', 'enrolled', 'vs', 'covid', 'pandemic'], ['funding', 'available'], ['patient', 'feedback', 'principal', 'investigator', 'disponibility'], ['trial', 'initiated'], ['despite', 'demonstrated', 'safety', 'tolerability', 'trial', 'terminated', 'early', 'program', 'light', 'competitive', 'landscape'], ['departure', 
'department', 'principal', 'investigator'], ['lack', 'human', 'ressources'], ['pi', 'left', 'nih'], ['pi', 'slow', 'accrual'], ['sponsor', 'terminated', 'dosing', 'dose', 'groups', 'pts', 'closed', 'trial', 'rtx', 'welltolerated', 'dlts', 'related', 'deaths', 'saes', 'gr', 'aes', 'cleared', 'rapidly', 'win', 'min'], ['enough', 'patients', 'initialize', 'clinical', 'trial'], ['design', 'changed', 'covid'], ['sponsor'], ['review', 'data', 'showed', 'low', 'likelihood', 'efficacy', 'patients', 'novartis', 'decided', 'terminate', 'trial', 'early', 'termination', 'safety', 'related'], ['critical', 'personnel', 'left', 'institution'], ['suspended', 'covid', 'pandemic'], ['terminated', 'mtd', 'reached'], ['stop', 'enrollment', 'strategic', 'considerations', 'specific', 'safety', 'reasons', 'request', 'regulatory', 'authority'], ['disapproved', 'moving', 'forward'], ['recruitment', 'stopped', 'target', 'sample', 'size', 'achieved'], ['business', 'strategy', 'change'], ['terminated', 'strategic', 'business', 'eli', 'lilly', 'company'], ['investigator', 'left', 'nih'], ['funded'], ['based', 'overall', 'results', 'phase', 'part', 'sponsor', 'decided', 'end', 'safety', 'reasons'], ['cami', 'combination', 'pembrolizumab', 'solid', 'tumors', 'showed', 'signals', 'activity', 'however', 'signals', 'insufficiently', 'compelling', 'tested', 'doseschedule', 'justify', 'continuation'], ['slow', 'accrual'], ['terminated', 'lack', 'enrollment', 'compounded', 'global', 'covid', 'pandemic', 'safety', 'andor', 'efficacy', 'concerns', 'involved', 'stop', 'enrollment'], ['participants', 'enrolled'], ['clear', 'benefit', 'gb', 'observed', 'either', 'monotherapy', 'combination', 'pembrolizumab'], ['zero', 'accrual'], ['business'], ['please', 'refer', 'nct'], ['prospective', 'recruitment', 'possible'], ['manufacturer', 'clovis', 'supplying', 'rucaparib', 'gone', 'bankrupt', 'longer', 'able', 'fund', 'trial', 'supply', 'product'], ['sponsor', 'based', 'portfolio', 'prioritization'], 
['insufficient', 'fundingstaff'], ['subjects', 'eligible', 'closed'], ['unable', 'enroll', 'subjects'], ['data', 'longer', 'needed'], ['terminated', 'change', 'development', 'priorities'], ['business', 'reasons'], ['development', 'bdtx', 'discontinued', 'sponsor'], ['poor', 'enrollment'], ['closed', 'enrollment', 'data', 'analysis', 'recruitment', 'conducted', 'kaiser', 'permanente', 'msk', 'patients', 'recruited'], ['part', 'reached', 'original', 'enrollment', 'goal', 'protocol', 'amended', 'begin', 'enrollment', 'part', 'soon'], ['recommended', 'closure'], ['business', 'priorities'], ['technical', 'problem', 'blood', 'plasma', 'samples', 'obtained', 'hospital'], ['evolving', 'data', 'ipatasertib', 'changes', 'known', 'risk', 'benefit', 'background', 'pursuing', 'future', 'studies'], ['part', 'phd', 'trajectory', 'currently', 'achievability', 'question'], ['funding'], ['sponsor', 'campus', 'training', 'restrictions']]\n"
     ]
    }
   ],
   "source": [
    "# TOKENIZE FILTERED RESPONSES\n",
    "def sent_to_words(sentences):\n",
    "    for sentence in sentences:\n",
    "        yield(gensim.utils.simple_preprocess(str(sentence)))\n",
    "\n",
    "data = df_text.filtered_responses.values.tolist()\n",
    "data_words = list(sent_to_words(data))\n",
    "\n",
    "print(data_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "ceffb2c6-18d2-4100-9518-59f86c8c9f06",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CREATING TERM DICTINOARY OF CORPUS\n",
    "dictionary = corpora.Dictionary(data_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "ec996c0f-2d87-4c0f-85a1-e93351dac814",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[(0, 1), (1, 1)], [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(9, 1), (10, 1), (11, 1)], [(12, 1), (13, 1), (14, 1), (15, 1)], [(16, 1), (17, 1), (18, 1), (19, 1)], [(20, 1), (21, 1)], [(22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)], [(34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)], [(42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1)], [(18, 1), (51, 1), (52, 1)], [(17, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)], [(19, 1), (27, 1), (52, 1), (55, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1)], [(4, 1), (40, 1), (64, 1), (65, 1), (66, 1), (67, 1)], [(68, 1), (69, 1), (70, 1), (71, 1)], [(12, 1), (24, 1), (72, 1), (73, 1)], [(74, 1), (75, 1), (76, 1), (77, 1), (78, 1)], [(79, 1), (80, 1), (81, 1), (82, 1), (83, 1)], [(17, 1), (18, 1), (84, 1), (85, 1)], [(0, 1), (18, 1), (19, 1), (63, 1), (86, 1)], [(4, 1), (20, 1), (87, 1)], [(88, 1), (89, 1), (90, 1)], [(55, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1)], [(4, 1), (40, 1), (93, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 2)], [(110, 1)], [(17, 1), (27, 1), (55, 1), (111, 1), (112, 1), (113, 1), (114, 1), (115, 1), (116, 1), (117, 1), (118, 1)], [(18, 1), (119, 1)], [(58, 1), (120, 1)], [(0, 1), (79, 1), (121, 1)], [(12, 1), (24, 1), (61, 1), (73, 1)], [(122, 1), (123, 1), (124, 1)], [(91, 1), (109, 1), (125, 1), (126, 1), (127, 1), (128, 1), (129, 1), (130, 1), (131, 1), (132, 1)], [(0, 1), (79, 1), (133, 1), (134, 1), (135, 1), (136, 1), (137, 1), (138, 1), (139, 2), (140, 2)], [(55, 1), (92, 1), (141, 1), (142, 1), (143, 1)], [(144, 1), (145, 1), (146, 1)], [(94, 1), (147, 1), (148, 1), (149, 1), (150, 1)], [(90, 1), (120, 1), (151, 1), (152, 1)], [(27, 1), (153, 1), (154, 1), (155, 1), (156, 1)], [(13, 1), (15, 1), (157, 1)], [(79, 1), (81, 1), (158, 1)], 
[(159, 1), (160, 1), (161, 1)], [(20, 1), (162, 1), (163, 1), (164, 1)], [(46, 1)], [(46, 1), (52, 1), (56, 1), (83, 1), (99, 1), (154, 1), (165, 1), (166, 1), (167, 1), (168, 1), (169, 1), (170, 1), (171, 1), (172, 1), (173, 1)], [(109, 1), (174, 1), (175, 1)], [(150, 1), (176, 1)], [(55, 1), (99, 2), (159, 1), (177, 1), (178, 1), (179, 1), (180, 1), (181, 1), (182, 1), (183, 1), (184, 2), (185, 1), (186, 1)], [(61, 1), (187, 1), (188, 1), (189, 1)], [(7, 1), (85, 1), (190, 1), (191, 1), (192, 1), (193, 1)], [(58, 1), (164, 1), (194, 1), (195, 1), (196, 1)], [(197, 1)], [(4, 1), (83, 1), (93, 1), (99, 1), (119, 1), (198, 1), (199, 1)], [(27, 1), (48, 1), (55, 1), (109, 1), (200, 1), (201, 1), (202, 1), (203, 1), (204, 1), (205, 1), (206, 1), (207, 1), (208, 1), (209, 1)], [(4, 1), (20, 1), (46, 1), (74, 1), (119, 1), (210, 1), (211, 1), (212, 1), (213, 1), (214, 1), (215, 1), (216, 1), (217, 1), (218, 1), (219, 1), (220, 1)], [(143, 1), (151, 2), (221, 1)], [(46, 1), (81, 1), (222, 1)], [(149, 1), (178, 1), (213, 1)], [(120, 1), (151, 1)], [(52, 1), (58, 1), (223, 1)], [(224, 1), (225, 1)], [(27, 1), (46, 1), (226, 1), (227, 1), (228, 1), (229, 1), (230, 1), (231, 1), (232, 1), (233, 1), (234, 1)], [(18, 1), (51, 1), (52, 1)], [(51, 1), (58, 1), (83, 1), (85, 1), (235, 1), (236, 1), (237, 1), (238, 1)], [(56, 1), (99, 1), (120, 1), (124, 1)], [(52, 1), (58, 1)], [(20, 1), (21, 1)], [(21, 1), (196, 1), (199, 1), (239, 1)], [(0, 1), (8, 1), (11, 1), (13, 1), (79, 1), (240, 1), (241, 1), (242, 1)], [(19, 1), (243, 1), (244, 1)], [(18, 1), (52, 1), (245, 1), (246, 1)], [(247, 1), (248, 1)], [(4, 1), (66, 1), (70, 1), (249, 1), (250, 1)], [(17, 1), (46, 1), (63, 1), (84, 1), (251, 1), (252, 1)], [(143, 1), (151, 1)], [(24, 1), (153, 1), (253, 1), (254, 1), (255, 1), (256, 1)], [(28, 1), (257, 1), (258, 1), (259, 1), (260, 1), (261, 1)], [(145, 1), (262, 1), (263, 1), (264, 1)], [(223, 1), (265, 1), (266, 1), (267, 1), (268, 1), (269, 1), (270, 1)], [(74, 1), (197, 1), 
(271, 1)], [(0, 1), (52, 1), (58, 1), (272, 1)], [(61, 1), (62, 1), (83, 1), (99, 1), (181, 1), (273, 1), (274, 1)], [(151, 1)], [(58, 1), (85, 1)], [(120, 1), (151, 1)], [(74, 1), (213, 1), (275, 1), (276, 1), (277, 1)], [(83, 1), (278, 1), (279, 1), (280, 1), (281, 1), (282, 1)], [(70, 1), (79, 1), (103, 1)], [(113, 1), (151, 1)], [(56, 1)], [(24, 1), (283, 1), (284, 1)], [(35, 1), (285, 1), (286, 1), (287, 1), (288, 1)], [(18, 1), (55, 1), (112, 1), (289, 1)], [(23, 1), (127, 1), (241, 1), (290, 1), (291, 1)], [(56, 1), (225, 1), (292, 1)], [(61, 1), (79, 3), (293, 1), (294, 1), (295, 1), (296, 1), (297, 1), (298, 1), (299, 1), (300, 1), (301, 1), (302, 1), (303, 1), (304, 1), (305, 1)], [(4, 1), (27, 1), (61, 1), (100, 1), (101, 1), (150, 1), (306, 1), (307, 1), (308, 1), (309, 1)], [(16, 1), (310, 1)], [(120, 1), (151, 1)], [(178, 1), (311, 1)], [(312, 1), (313, 1)], [(83, 1), (91, 1), (103, 1), (164, 1), (178, 1), (314, 1)], [(61, 1), (315, 1)], [(0, 1), (79, 1), (81, 1), (147, 1), (181, 1), (316, 1)], [(60, 1), (74, 1), (151, 1), (213, 1), (220, 1), (317, 1), (318, 1)], [(18, 1), (47, 1), (319, 1)], [(24, 1), (74, 1), (220, 1)], [(17, 1), (18, 1), (99, 1), (120, 1), (233, 1), (267, 1), (268, 1), (303, 1), (320, 1), (321, 1), (322, 1), (323, 1), (324, 1), (325, 1), (326, 1), (327, 1), (328, 1), (329, 1), (330, 1), (331, 1), (332, 1), (333, 1)], [(56, 1), (91, 1), (94, 1), (150, 1), (231, 1), (334, 1), (335, 1), (336, 1), (337, 1), (338, 1), (339, 1)], [(17, 1), (18, 1), (56, 1), (85, 1), (113, 1), (340, 1), (341, 1), (342, 1), (343, 1)], [(60, 1)], [(151, 1), (344, 1)], [(56, 1), (98, 1), (345, 1)], [(120, 1), (346, 1)], [(56, 1), (74, 1), (94, 1), (275, 1), (276, 1), (347, 1)], [(30, 1), (56, 1)], [(58, 1), (85, 1)], [(348, 1), (349, 1)], [(23, 1), (65, 1), (144, 1), (313, 1), (350, 1), (351, 1), (352, 1), (353, 1), (354, 1), (355, 1)], [(16, 1), (223, 1), (356, 1)], [(151, 1)], [(235, 1), (357, 1), (358, 1), (359, 1), (360, 1), (361, 1), (362, 1), (363, 1), 
(364, 1)], [(20, 1), (21, 1)], [(319, 1), (337, 1), (365, 1), (366, 1), (367, 1)], [(46, 1), (85, 1), (99, 1), (368, 1), (369, 1), (370, 1), (371, 1)], [(11, 1), (13, 1), (15, 1), (61, 1), (312, 1), (372, 1), (373, 1)], [(83, 1), (181, 1)], [(85, 1), (151, 1), (374, 1)], [(94, 1), (118, 1), (213, 1), (375, 1)], [(46, 1), (56, 1), (91, 1), (246, 1), (252, 1), (348, 1), (376, 1), (377, 1)], [(13, 1), (15, 1), (74, 1), (275, 1), (276, 1)], [(55, 1), (112, 1), (378, 1)], [(20, 1), (81, 2), (164, 1), (181, 1), (379, 1), (380, 1)], [(51, 1), (58, 1), (193, 1)], [(79, 1), (196, 1), (199, 1), (316, 1)], [(27, 1), (79, 1), (97, 1), (257, 1), (381, 1), (382, 1)], [(4, 1), (20, 1), (113, 1), (383, 1)], [(46, 2), (384, 1), (385, 1), (386, 1), (387, 1)], [(18, 1), (145, 1)], [(20, 1), (21, 1)], [(55, 1), (56, 1), (91, 1), (112, 1), (150, 1), (370, 1), (388, 1), (389, 1)], [(0, 2), (19, 1), (25, 1), (79, 1), (195, 1), (235, 1), (298, 1), (303, 1), (390, 1), (391, 1), (392, 1), (393, 1), (394, 1)], [(143, 1)], [(143, 1), (395, 1)], [(56, 1)], [(14, 1), (24, 1)], [(99, 1), (151, 1)], [(4, 1), (20, 1), (396, 1)], [(114, 1), (151, 1), (213, 1), (397, 1), (398, 1)], [(74, 1), (75, 1), (76, 1), (77, 1), (78, 1)], [(46, 1), (52, 1), (79, 1), (231, 1), (310, 1), (399, 1), (400, 1), (401, 1)], [(120, 1), (310, 1)], [(53, 1), (55, 1), (56, 1), (402, 1)], [(58, 1), (403, 1), (404, 1), (405, 1), (406, 1)], [(92, 1), (312, 1)], [(407, 1)], [(151, 1)], [(27, 1), (32, 1), (47, 1), (227, 1), (408, 1), (409, 1), (410, 1)], [(46, 1), (85, 1)], [(138, 1), (142, 1), (153, 1), (263, 1), (264, 1), (411, 1), (412, 1), (413, 1), (414, 1), (415, 1)], [(56, 1), (94, 1), (113, 1), (416, 1)], [(92, 1), (100, 1), (101, 1), (109, 1), (143, 1), (307, 1), (309, 1), (417, 1)], [(23, 1), (30, 1), (56, 1), (213, 1), (418, 1), (419, 1)], [(12, 1), (24, 1), (151, 1), (254, 1)], [(56, 1), (94, 1), (113, 1), (416, 1)], [(114, 1), (316, 1), (420, 1)], [(79, 1)], [(4, 1), (421, 1)], [(18, 1), (85, 1)], [(18, 1), (52, 
1), (245, 1), (246, 1)], [(83, 1), (91, 1), (143, 1), (422, 1), (423, 1), (424, 1), (425, 1), (426, 1), (427, 1), (428, 1), (429, 1), (430, 1), (431, 1)], [(375, 1), (432, 1)], [(16, 1), (18, 1)], [(4, 1), (20, 1), (50, 1), (99, 1), (251, 1), (281, 1), (297, 1), (370, 1), (433, 1), (434, 1), (435, 2), (436, 1), (437, 1), (438, 1), (439, 1), (440, 1), (441, 1), (442, 1)], [(12, 1), (13, 1), (15, 1), (61, 1), (73, 1), (443, 1)], [(18, 1), (114, 1), (444, 1)], [(120, 1), (445, 1), (446, 1)], [(235, 1), (446, 1), (447, 1), (448, 1), (449, 1), (450, 1), (451, 1)], [(83, 1), (153, 1), (452, 1), (453, 1)], [(58, 1), (85, 1), (142, 1), (223, 1), (224, 1), (454, 1)], [(84, 1), (151, 1)], [(46, 1), (120, 1)], [(0, 1), (18, 1), (37, 1), (70, 1), (79, 1), (97, 1), (119, 1), (120, 1), (215, 1), (224, 1), (271, 1), (310, 1), (455, 1), (456, 1), (457, 1), (458, 1), (459, 1)], [(4, 1), (66, 1), (106, 1), (223, 1), (307, 1), (460, 1), (461, 1), (462, 1), (463, 1), (464, 1), (465, 1), (466, 1), (467, 1)], [(12, 1), (14, 1), (24, 1)], [(468, 1), (469, 1)], [(17, 1), (52, 1), (58, 1), (195, 1)], [(13, 1), (15, 1), (470, 1), (471, 1)], [(28, 1), (94, 1), (257, 1), (472, 1)], [(18, 1), (114, 1)], [(92, 1), (98, 1), (178, 1)], [(23, 1), (473, 1), (474, 1)], [(475, 1), (476, 1), (477, 1)], [(109, 1), (147, 1), (175, 1), (478, 1), (479, 1), (480, 1), (481, 1), (482, 1)], [(24, 1), (143, 1)], [(16, 1), (146, 1)], [(31, 1), (55, 1), (99, 2), (112, 1), (177, 1), (217, 1), (378, 1), (428, 1), (483, 1), (484, 1)], [(55, 1), (56, 1), (97, 1)], [(58, 1), (85, 1), (195, 1)], [(310, 1), (474, 1), (485, 1)], [(58, 1), (85, 1)], [(92, 1), (178, 1), (486, 1)], [(24, 1)], [(52, 1), (58, 1), (99, 1)], [(56, 1), (151, 1), (487, 1), (488, 1), (489, 1)], [(20, 1), (223, 1), (387, 1), (490, 1)], [(83, 1), (146, 1), (311, 1), (474, 1), (491, 1)], [(114, 1), (492, 1)], [(188, 1), (493, 1), (494, 1)], [(4, 1), (20, 1), (83, 1), (93, 1), (469, 1), (495, 1), (496, 1)], [(51, 1), (52, 1), (356, 1)], [(56, 1)], 
[(93, 1), (124, 1), (129, 1), (280, 1), (443, 1), (454, 1), (497, 1), (498, 1), (499, 1), (500, 1), (501, 1), (502, 1), (503, 1), (504, 1), (505, 1), (506, 1), (507, 1)], [(12, 1), (13, 1), (15, 1), (508, 1)], [(60, 1), (151, 1), (509, 1), (510, 1)], [(28, 1), (33, 1), (46, 1), (84, 1), (164, 1), (257, 1), (511, 1), (512, 1), (513, 1)], [(151, 1)], [(250, 1), (406, 1), (514, 1), (515, 1)], [(27, 1), (79, 1), (102, 1), (147, 1), (164, 1), (516, 1), (517, 1), (518, 1), (519, 1), (520, 1), (521, 1), (522, 1), (523, 1), (524, 1), (525, 1), (526, 1), (527, 1)], [(52, 1), (58, 1), (79, 1), (528, 1)], [(79, 1)], [(58, 1), (70, 1), (440, 1), (529, 1)], [(18, 1), (33, 1), (79, 1), (406, 1), (408, 1), (530, 1)], [(33, 1), (58, 1), (85, 1), (406, 1)], [(164, 1), (196, 1), (199, 1), (284, 1)], [(27, 1), (79, 1), (116, 1), (252, 1), (258, 2), (531, 1), (532, 1), (533, 1), (534, 1), (535, 1), (536, 1)], [(11, 1), (58, 1), (64, 1), (252, 1), (537, 1)], [(92, 1), (93, 1), (94, 2), (327, 2), (382, 1), (538, 1), (539, 1), (540, 2), (541, 1), (542, 1), (543, 1), (544, 1)], [(56, 1)], [(19, 1), (23, 1), (258, 1), (271, 1), (330, 2), (370, 1), (545, 1), (546, 1), (547, 1), (548, 1), (549, 1), (550, 1), (551, 1)], [(55, 1), (90, 1), (112, 1), (178, 1)], [(378, 1), (440, 1), (552, 1), (553, 1), (554, 1)], [(12, 1), (14, 1), (24, 1), (81, 1), (555, 1), (556, 1)], [(58, 1)], [(55, 1), (150, 1), (178, 1), (557, 1)], [(32, 1), (61, 1), (108, 1), (558, 1), (559, 1), (560, 1)], [(4, 1), (27, 1), (232, 1), (561, 1)], [(58, 1), (85, 1)], [(19, 1), (161, 1), (553, 1), (562, 1), (563, 2), (564, 1), (565, 1)], [(40, 1), (93, 1), (109, 1), (223, 1), (246, 1), (329, 2), (330, 1), (347, 1), (395, 1), (566, 1), (567, 1), (568, 1), (569, 1), (570, 1), (571, 1), (572, 1), (573, 1), (574, 1)], [(12, 1), (14, 1), (24, 1)], [(17, 1), (46, 1), (52, 1), (84, 1)], [(55, 1), (94, 1), (113, 1), (148, 1), (557, 1)], [(442, 1), (575, 1), (576, 1), (577, 1)], [(195, 1), (428, 1), (429, 1)], [(120, 1), (151, 1)], 
[(79, 1), (151, 1), (401, 1), (578, 1)], [(12, 1), (24, 1), (72, 1), (106, 1), (579, 1), (580, 1), (581, 1)], [(121, 1), (124, 1), (143, 1), (423, 1), (570, 1), (582, 1), (583, 1), (584, 1)], [(267, 1), (270, 1), (585, 1), (586, 1)], [(60, 1), (587, 1)], [(46, 1), (289, 1)], [(56, 1), (143, 1), (151, 1)], [(4, 1), (66, 1), (70, 1), (145, 1), (586, 1), (588, 1)], [(79, 1), (406, 1)], [(18, 1), (589, 1)], [(28, 1), (91, 1), (257, 1), (260, 1), (530, 1), (541, 1), (589, 1), (590, 1)], [(56, 1), (91, 1), (150, 1), (591, 1)], [(62, 1), (213, 1), (328, 1), (592, 1), (593, 1), (594, 1)], [(16, 1), (18, 1), (215, 1)], [(46, 1), (52, 1)], [(66, 1), (70, 1), (79, 1), (595, 1), (596, 1)], [(81, 1), (92, 1), (158, 1), (175, 1), (351, 1), (597, 1)], [(93, 1), (365, 1), (420, 1)], [(193, 1), (284, 1), (316, 1), (598, 1)], [(83, 1), (93, 1), (105, 1), (119, 1), (214, 1), (216, 1), (258, 1), (260, 1), (331, 1), (494, 1), (590, 1), (599, 1), (600, 1), (601, 1), (602, 1), (603, 1), (604, 1)], [(56, 1), (91, 1), (150, 1), (591, 1)], [(151, 1), (605, 1)], [(0, 2), (4, 1), (20, 1), (79, 2), (99, 1), (406, 1), (606, 1), (607, 1), (608, 1), (609, 1), (610, 1)], [(151, 1), (474, 1)], [(13, 1), (15, 1), (223, 1), (611, 1), (612, 1)], [(83, 1), (181, 1)], [(55, 1), (83, 1), (99, 1), (370, 1), (417, 1), (501, 1), (613, 1), (614, 1), (615, 1), (616, 1), (617, 1)], [(13, 1), (15, 1), (618, 1), (619, 1)], [(37, 1), (120, 1), (620, 1)], [(14, 1), (24, 1), (621, 1)], [(24, 1), (52, 1), (58, 1)], [(56, 1), (83, 1), (97, 1), (99, 1), (159, 1), (195, 1), (218, 1), (622, 1), (623, 1), (624, 1), (625, 1), (626, 1), (627, 1), (628, 1), (629, 1), (630, 1), (631, 1), (632, 1), (633, 1), (634, 1)], [(4, 1), (83, 1), (93, 1), (266, 1), (635, 1)], [(79, 1), (174, 1), (312, 1)], [(56, 1)], [(4, 1), (27, 1), (31, 1), (55, 1), (74, 1), (83, 1), (85, 1), (90, 1), (95, 1), (97, 1), (118, 1), (370, 1), (590, 1), (636, 1), (637, 1)], [(12, 1), (14, 1), (638, 1), (639, 1)], [(0, 1), (79, 1), (406, 1)], [(99, 1), 
(640, 1), (641, 1)], [(46, 1), (55, 1), (57, 1), (142, 1), (150, 1), (176, 1), (642, 1), (643, 1), (644, 1), (645, 1)], [(73, 1), (275, 1), (286, 1)], [(18, 1), (19, 1), (172, 1), (455, 1), (457, 1), (458, 1)], [(92, 1), (98, 1), (178, 1)], [(60, 1), (99, 1), (150, 1), (178, 1), (646, 1), (647, 1)], [(13, 1), (14, 1), (621, 1)], [(648, 1)], [(55, 1), (56, 1), (74, 1), (91, 1), (142, 1), (184, 1), (260, 1), (385, 1), (563, 1), (649, 1)], [(129, 1), (340, 1), (341, 1), (423, 1), (430, 1), (590, 1), (650, 1), (651, 1), (652, 1), (653, 1), (654, 1), (655, 1), (656, 2), (657, 1), (658, 1)], [(52, 1), (58, 1)], [(0, 1), (46, 2), (55, 1), (57, 1), (79, 1), (95, 1), (99, 1), (112, 1), (120, 1), (548, 1), (659, 1), (660, 1), (661, 1)], [(20, 1), (21, 1)], [(423, 1), (517, 1), (655, 1), (662, 1), (663, 1), (664, 1), (665, 1), (666, 1)], [(58, 1), (667, 1)], [(178, 1)], [(468, 1), (668, 1), (669, 1)], [(18, 1), (670, 1), (671, 1)], [(23, 1), (65, 1), (83, 1), (148, 1), (375, 1), (477, 1), (543, 1), (672, 1), (673, 1), (674, 1), (675, 1), (676, 1)], [(56, 1), (91, 1), (428, 1), (429, 1)], [(16, 1), (677, 1)], [(164, 1), (195, 1), (678, 1)], [(70, 1), (103, 1), (164, 1)], [(23, 1), (27, 1), (679, 1)], [(92, 1), (94, 1), (99, 1), (542, 1)], [(142, 1), (178, 1)], [(56, 1), (94, 1), (113, 1), (680, 1)], [(46, 1), (87, 1)], [(4, 1), (18, 1), (27, 1), (46, 1), (195, 1), (257, 1), (421, 1), (681, 1), (682, 1), (683, 1), (684, 1)], [(46, 2), (184, 2), (258, 1), (529, 1), (641, 1), (685, 1), (686, 1), (687, 1), (688, 1)], [(348, 1), (689, 1)], [(178, 1), (542, 1)], [(2, 1), (3, 1), (5, 1), (6, 1), (7, 1), (8, 1), (690, 1)], [(27, 1), (75, 1), (121, 1), (147, 1), (250, 1), (517, 1), (524, 1), (691, 1), (692, 1), (693, 1), (694, 1)], [(9, 1), (184, 1), (695, 1), (696, 1), (697, 1), (698, 1)], [(151, 1)], [(56, 1), (699, 1), (700, 1), (701, 1)]]\n"
     ]
    }
   ],
   "source": [
    "# CONVERT CORPUS INTO DOC-TERM MATRIX\n",
    "doc_term_matrix = [dictionary.doc2bow(doc) for doc in data_words]\n",
    "\n",
    "print(doc_term_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "67479119-b9d3-4a2e-afe4-8a8d4b362305",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('pandemic', 1), ('situation', 1)],\n",
       " [('blood', 1),\n",
       "  ('obtained', 1),\n",
       "  ('patients', 1),\n",
       "  ('plasma', 1),\n",
       "  ('problem', 1),\n",
       "  ('samples', 1),\n",
       "  ('technical', 1)],\n",
       " [('currently', 1), ('pause', 1), ('start', 1)],\n",
       " [('institution', 1), ('investigator', 1), ('left', 1), ('principal', 1)],\n",
       " [('insufficient', 1), ('prematurely', 1), ('recruitment', 1), ('stopped', 1)],\n",
       " [('enrolled', 1), ('participants', 1)],\n",
       " [('indiana', 1), ('longer', 1), ('pi', 1), ('university', 1), ('working', 1)],\n",
       " [('data', 1),\n",
       "  ('interim', 1),\n",
       "  ('paused', 1),\n",
       "  ('per', 1),\n",
       "  ('review', 1),\n",
       "  ('team', 1),\n",
       "  ('temporarily', 1)],\n",
       " [('classified', 1),\n",
       "  ('committee', 1),\n",
       "  ('ethics', 1),\n",
       "  ('human', 1),\n",
       "  ('involving', 1),\n",
       "  ('person', 1),\n",
       "  ('project', 1),\n",
       "  ('scope', 1)],\n",
       " [('aware', 1),\n",
       "  ('cohorts', 1),\n",
       "  ('concluded', 1),\n",
       "  ('core', 1),\n",
       "  ('enrollment', 1),\n",
       "  ('goals', 1),\n",
       "  ('met', 1),\n",
       "  ('objective', 1),\n",
       "  ('primary', 1)]]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[[(dictionary[i], freq) for i, freq in doc] for doc in doc_term_matrix[:10]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "2053e842-a272-49bd-965d-09038a6c2fc3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CREATING THE OBJECT FOR LDA MODEL \n",
    "Lda = gensim.models.ldamodel.LdaModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "45bad93d-c0ec-4e46-81b0-4eec58112590",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, '0.032*\"enrollment\" + 0.022*\"terminated\" + 0.021*\"pi\" + 0.019*\"accrual\" + 0.015*\"institution\" + 0.013*\"enrolled\" + 0.012*\"low\" + 0.011*\"patients\" + 0.011*\"left\" + 0.008*\"slow\"'), (1, '0.036*\"sponsor\" + 0.022*\"safety\" + 0.018*\"development\" + 0.016*\"business\" + 0.016*\"slow\" + 0.014*\"accrual\" + 0.014*\"data\" + 0.014*\"strategic\" + 0.014*\"change\" + 0.013*\"based\"'), (2, '0.035*\"funding\" + 0.028*\"recruitment\" + 0.028*\"covid\" + 0.012*\"lack\" + 0.011*\"pandemic\" + 0.009*\"patients\" + 0.009*\"insufficient\" + 0.009*\"subjects\" + 0.009*\"trial\" + 0.008*\"never\"')]\n"
     ]
    }
   ],
   "source": [
    "# RUNNING AND TRAINING LDA MODEL\n",
    "ldamodel = Lda(\n",
    "    doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)\n",
    "\n",
    "print(ldamodel.print_topics())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "c02bb604-dfa1-4de5-bc96-d0040cce74ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyLDAvis\n",
    "import pickle\n",
    "import pyLDAvis.gensim_models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "9ab87623-c839-4682-a9c3-74a837079069",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<link rel=\"stylesheet\" type=\"text/css\" href=\"https://cdn.jsdelivr.net/gh/bmabey/pyLDAvis@3.3.1/pyLDAvis/js/ldavis.v1.0.0.css\">\n",
       "\n",
       "\n",
       "<div id=\"ldavis_el302711402825237329446870855297\"></div>\n",
       "<script type=\"text/javascript\">\n",
       "\n",
       "var ldavis_el302711402825237329446870855297_data = {\"mdsDat\": {\"x\": [0.12495862603628331, -0.009108257919659966, -0.11585036811662332], \"y\": [0.05200471301188651, -0.11732204477215984, 0.06531733176027336], \"topics\": [1, 2, 3], \"cluster\": [1, 1, 1], \"Freq\": [34.07361161861341, 33.859390967883556, 32.06699741350303]}, \"tinfo\": {\"Term\": [\"funding\", \"enrollment\", \"sponsor\", \"recruitment\", \"pi\", \"development\", \"safety\", \"institution\", \"business\", \"strategic\", \"accrual\", \"change\", \"covid\", \"slow\", \"terminated\", \"left\", \"data\", \"based\", \"insufficient\", \"enrolled\", \"lack\", \"participants\", \"issues\", \"pandemic\", \"rate\", \"investigator\", \"halted\", \"recruit\", \"resources\", \"principal\", \"funding\", \"insufficient\", \"issues\", \"recruit\", \"recruitment\", \"resources\", \"limited\", \"available\", \"difficulty\", \"enroll\", \"continue\", \"staff\", \"standard\", \"manufacturer\", \"endpoint\", \"experienced\", \"signals\", \"government\", \"pembrolizumab\", \"achieved\", \"sample\", \"size\", \"continuation\", \"justify\", \"screen\", \"operating\", \"room\", \"supply\", \"number\", \"reopen\", \"covid\", \"subjects\", \"unable\", \"never\", \"longer\", \"lack\", \"pandemic\", \"stopped\", \"treatment\", \"initiated\", \"design\", \"recruiting\", \"staffing\", \"patients\", \"trial\", \"another\", \"closed\", \"enrolled\", \"decided\", \"withdrawn\", \"patient\", \"low\", \"discontinued\", \"sponsor\", \"pi\", \"institution\", \"enrollment\", \"left\", \"participants\", \"halted\", \"interest\", \"moving\", \"prior\", \"portfolio\", \"score\", \"prioritization\", \"plus\", \"test\", \"demonstrated\", \"times\", \"wait\", \"paclitaxel\", \"oncology\", \"imaging\", \"asset\", \"closing\", \"different\", \"sufficient\", \"reached\", \"application\", \"nih\", \"poor\", \"recruited\", \"disapproved\", \"part\", \"terminated\", \"enrolled\", \"accrual\", \"low\", \"protocol\", \"patients\", 
\"prematurely\", \"withdrawn\", \"new\", \"clinical\", \"closed\", \"lack\", \"data\", \"pandemic\", \"slow\", \"investigator\", \"trial\", \"safety\", \"covid\", \"development\", \"strategic\", \"change\", \"rate\", \"business\", \"strategy\", \"sponsor\", \"termination\", \"forward\", \"phase\", \"team\", \"move\", \"priorities\", \"per\", \"technical\", \"closure\", \"terminate\", \"risk\", \"blood\", \"obtained\", \"plasma\", \"problem\", \"concludes\", \"progress\", \"us\", \"concern\", \"realignment\", \"unrelated\", \"tesetaxel\", \"regarding\", \"reasons\", \"safety\", \"based\", \"slow\", \"data\", \"concerns\", \"investigator\", \"principal\", \"related\", \"results\", \"accrual\", \"showed\", \"discontinued\", \"trial\", \"drug\", \"decided\", \"clinical\", \"new\", \"patients\", \"patient\", \"low\", \"changes\"], \"Freq\": [19.0, 18.0, 21.0, 17.0, 11.0, 9.0, 15.0, 8.0, 9.0, 7.0, 18.0, 7.0, 20.0, 13.0, 16.0, 6.0, 11.0, 9.0, 5.0, 11.0, 11.0, 4.0, 4.0, 10.0, 4.0, 8.0, 4.0, 4.0, 4.0, 6.0, 18.971204942313882, 5.084983915003155, 4.38678935279237, 3.702190618787388, 15.398679656016036, 3.6874322602335803, 3.0074056854930555, 3.007381402207422, 3.001600190931958, 3.000271767401048, 2.313619689885618, 2.3130443038228745, 2.3019516989455684, 2.253286972082237, 1.6197032035695935, 1.6197005196274972, 1.6196951517433045, 1.6196868443034826, 1.6195876662526854, 1.619527724879201, 1.619527724879201, 1.619527724879201, 1.6194362152343933, 1.6194362152343933, 1.6192457831523215, 1.6192296794997434, 1.6192296794997434, 1.6192261009102817, 1.6192186881178252, 1.6191997727163845, 15.12392169186733, 5.04462969532415, 4.471423504789925, 4.488497466339021, 4.42161516298003, 6.526586472365728, 6.172671131777149, 3.723003183869885, 3.6926237710884537, 2.992012382923319, 2.2557983750437898, 2.2495542476600945, 2.225076184514629, 5.118778068971699, 4.674024324713015, 2.118362391151582, 2.8295032675808627, 3.6807569128197435, 3.009505295052973, 2.8781743847825183, 
2.6626528119942323, 2.8684702726163724, 2.574002460166456, 2.6715885498802328, 11.467777392682903, 7.976082233390903, 17.268289959769522, 5.870286626342992, 4.44162203312778, 3.747067189984101, 3.0495755237823197, 3.048599630840854, 3.0480944119286795, 3.0451685111497246, 2.3482856985087697, 2.337626062074217, 2.323145151977142, 1.6436596249687743, 1.6436445115825127, 1.643612125754809, 1.643612125754809, 1.6436070456249732, 1.64354989416432, 1.6435175083366165, 1.6435012519211418, 1.643481566418028, 1.6432864894323316, 1.6432546116176117, 1.6432252738678095, 1.64288236510389, 1.6427476146599942, 1.6421227586901854, 1.6407390583261472, 1.635140247234063, 3.78056988423202, 11.76883299893682, 7.280926410887643, 10.511348460670225, 6.702449994428093, 3.0715021261733, 6.080246868419495, 3.693067441881008, 3.8004593545588268, 3.7780194050479343, 3.6700427694260096, 3.2338742520125296, 4.410686074479421, 4.425481444613335, 4.066480353412825, 4.474619508463524, 3.1989755380857203, 3.9322074417216184, 3.7356577263857775, 3.9769372229067543, 9.199364881624566, 7.129781252217018, 7.117035885985779, 3.668471805149421, 8.485879366601518, 2.9916796755657593, 18.511669941067485, 2.9519003824970964, 2.9326897174317104, 2.3015313971291924, 2.301412560337108, 2.3012557150163806, 2.2842776901682784, 2.2752441694873777, 2.271010067508667, 2.2696068792328976, 2.2685178627191984, 1.6110228796315997, 1.6108746944819556, 1.610874453921648, 1.610874333641494, 1.610874333641494, 1.6108627867467165, 1.610817681688992, 1.6106596335667256, 1.6105830151086709, 1.610573994097126, 1.61055607235419, 1.610550900307571, 1.6104551573050414, 3.7023975442468346, 11.296354103545188, 6.751422226561471, 8.460862056824917, 7.1636723506301, 3.6700154806449827, 4.911672320890061, 4.2429247685616716, 2.9778108926967963, 2.3369512562987484, 7.4138646932207495, 2.3065651215712424, 3.4238299105421124, 5.308194612030555, 3.2823219933633965, 3.7680298536016186, 3.108651403583601, 2.982376486779743, 
3.450958137020219, 2.6436240318918123, 2.317066060130112, 2.3097450882808936], \"Total\": [19.0, 18.0, 21.0, 17.0, 11.0, 9.0, 15.0, 8.0, 9.0, 7.0, 18.0, 7.0, 20.0, 13.0, 16.0, 6.0, 11.0, 9.0, 5.0, 11.0, 11.0, 4.0, 4.0, 10.0, 4.0, 8.0, 4.0, 4.0, 4.0, 6.0, 19.441984076812478, 5.556318247631983, 4.862072842915488, 4.167805087777845, 17.380954176419454, 4.167755565897867, 3.4735190743996958, 3.473517993380824, 3.4735941215577446, 3.473467861189911, 2.7792431748582485, 2.7792371344537066, 2.7791601333278395, 2.7788676653499977, 2.084966088865954, 2.0849658273476135, 2.08496553058467, 2.0849655841655395, 2.084964350746015, 2.084963347187942, 2.084963347187942, 2.084963347187942, 2.0849633585669665, 2.084963371921213, 2.084960883185358, 2.084960747008057, 2.084960792113115, 2.0849604578881618, 2.084960578232119, 2.084959988495199, 20.876741983190126, 6.950878961057098, 6.243595456575556, 6.269650182551674, 6.270660621410497, 11.171440370472972, 10.472158605332355, 6.239622799221234, 6.281483900691905, 4.882790332336458, 3.4835629040676426, 3.4846621650587135, 3.485017086623965, 14.649983074411413, 13.914426378465187, 3.46876167179728, 6.294337080605352, 11.213238049531585, 8.333723193061106, 7.68692055918461, 6.247915676399513, 11.887986327174577, 6.233516437598883, 21.427189767616788, 11.968795541759619, 8.446432341705876, 18.29900656439761, 6.332897102254228, 4.923436043568161, 4.219038993699141, 3.5145862010070554, 3.514562162629366, 3.5145562861471884, 3.5144898451932574, 2.8100982061962254, 2.8098581491585874, 2.809560772833407, 2.1055302435940657, 2.1055293764561416, 2.105528484653165, 2.105528484653165, 2.105528200670382, 2.1055271954103922, 2.105525389226777, 2.1055260637493145, 2.1055243812637707, 2.105517565739929, 2.1055173550631623, 2.1055157182278403, 2.105505809953429, 2.105500736720792, 2.1054866707397735, 2.1054611604831313, 2.1053332075081963, 4.910204168902013, 16.129866349033687, 11.213238049531585, 18.164938708213985, 11.887986327174577, 
4.899687730585263, 14.649983074411413, 6.990776302004966, 7.68692055918461, 7.676073349465323, 7.673827907098074, 6.294337080605352, 11.171440370472972, 11.828936910538904, 10.472158605332355, 13.211234824631198, 8.351652663176294, 13.914426378465187, 15.268202776295741, 20.876741983190126, 9.674232698686348, 7.602427504563451, 7.602450284593578, 4.1495190902979555, 9.688499225447499, 3.458777418649345, 21.427189767616788, 3.4589343449168983, 3.4599192782325945, 2.768186894518321, 2.7681867201104264, 2.7681864813853627, 2.7684935444989587, 2.768288966405711, 2.7683096326105705, 2.768765519675505, 2.768326318144333, 2.0775943517239925, 2.0775936825407726, 2.077593505883848, 2.077593465482923, 2.077593465482923, 2.0775936143369367, 2.0775932478213206, 2.0775939592753367, 2.077594166744971, 2.0775925220960882, 2.0775930712614366, 2.077592337746371, 2.0775925419451715, 4.843472261027012, 15.268202776295741, 9.019082890916623, 13.211234824631198, 11.828936910538904, 5.558371942036082, 8.351652663176294, 6.956204196011482, 4.847342611378358, 3.471949704431525, 18.164938708213985, 3.4623648824613613, 6.233516437598883, 13.914426378465187, 6.244324966801354, 8.333723193061106, 7.673827907098074, 7.676073349465323, 14.649983074411413, 6.247915676399513, 11.887986327174577, 4.156629090279561], \"Category\": [\"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Default\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", 
\"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic1\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic2\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\", \"Topic3\"], \"logprob\": [30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, -3.365, -4.6817, -4.8294, -4.999, -3.5737, -5.003, -5.2069, -5.2069, -5.2088, -5.2093, -5.4692, -5.4694, -5.4742, -5.4956, -5.8257, -5.8257, -5.8257, -5.8257, -5.8258, -5.8258, -5.8258, -5.8258, -5.8259, -5.8259, -5.826, -5.826, 
-5.826, -5.826, -5.826, -5.826, -3.5917, -4.6896, -4.8103, -4.8064, -4.8215, -4.4321, -4.4878, -4.9934, -5.0016, -5.212, -5.4945, -5.4972, -5.5082, -4.675, -4.7659, -5.5573, -5.2679, -5.0048, -5.2062, -5.2508, -5.3286, -5.2542, -5.3625, -5.3253, -3.8621, -4.2252, -3.4528, -4.5318, -4.8106, -4.9807, -5.1867, -5.187, -5.1871, -5.1881, -5.448, -5.4525, -5.4587, -5.8047, -5.8047, -5.8048, -5.8048, -5.8048, -5.8048, -5.8048, -5.8048, -5.8048, -5.805, -5.805, -5.805, -5.8052, -5.8053, -5.8057, -5.8065, -5.8099, -4.9718, -3.8362, -4.3164, -3.9492, -4.3992, -5.1795, -4.4966, -4.9952, -4.9665, -4.9725, -5.0015, -5.128, -4.8176, -4.8143, -4.8989, -4.8032, -5.1388, -4.9325, -4.9837, -4.9211, -4.0281, -4.283, -4.2848, -4.9475, -4.1089, -5.1514, -3.3289, -5.1648, -5.1713, -5.4137, -5.4137, -5.4138, -5.4212, -5.4252, -5.427, -5.4277, -5.4281, -5.7704, -5.7705, -5.7705, -5.7705, -5.7705, -5.7705, -5.7705, -5.7706, -5.7707, -5.7707, -5.7707, -5.7707, -5.7708, -4.9383, -3.8228, -4.3375, -4.1118, -4.2782, -4.9471, -4.6557, -4.802, -5.1561, -5.3984, -4.2439, -5.4115, -5.0165, -4.578, -5.0587, -4.9207, -5.1131, -5.1545, -5.0086, -5.2751, -5.407, -5.4101], \"loglift\": [30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 1.0521, 0.988, 0.9738, 0.9582, 0.9556, 0.9542, 0.9326, 0.9325, 0.9306, 0.9302, 0.8933, 0.893, 0.8883, 0.867, 0.8241, 0.8241, 0.8241, 0.8241, 0.8241, 0.824, 0.824, 0.824, 0.824, 0.824, 0.8239, 0.8238, 0.8238, 0.8238, 0.8238, 0.8238, 0.7543, 0.7561, 0.7428, 0.7424, 0.7273, 0.5392, 0.5481, 0.5603, 0.5454, 0.5869, 0.6421, 0.639, 0.628, 0.0251, -0.0143, 0.5835, 0.2771, -0.0373, 0.0581, 0.0943, 0.2237, -0.3451, 0.1922, -1.0053, 1.0402, 1.0257, 1.025, 1.0071, 0.98, 0.9643, 0.941, 0.9407, 0.9406, 0.9396, 0.9034, 0.899, 0.8928, 0.8353, 0.8353, 0.8353, 0.8353, 0.8353, 0.8352, 0.8352, 0.8352, 0.8352, 0.8351, 0.8351, 0.8351, 0.8349, 0.8348, 0.8344, 0.8336, 0.8302, 
0.8215, 0.7677, 0.6511, 0.5359, 0.5099, 0.6159, 0.2036, 0.4448, 0.3786, 0.374, 0.3453, 0.417, 0.1536, 0.0998, 0.137, 0.0003, 0.1233, -0.1808, -0.3249, -0.5752, 1.087, 1.0732, 1.0714, 1.0141, 1.0048, 0.9923, 0.9911, 0.9788, 0.972, 0.9527, 0.9527, 0.9526, 0.9451, 0.9412, 0.9393, 0.9385, 0.9382, 0.883, 0.8829, 0.8829, 0.8829, 0.8829, 0.8829, 0.8829, 0.8828, 0.8827, 0.8827, 0.8827, 0.8827, 0.8826, 0.8687, 0.8361, 0.8478, 0.6917, 0.6358, 0.7222, 0.6065, 0.643, 0.6501, 0.7415, 0.2412, 0.7312, 0.5382, 0.1737, 0.4942, 0.3436, 0.2337, 0.192, -0.3084, 0.2772, -0.4979, 0.5498]}, \"token.table\": {\"Topic\": [2, 3, 1, 1, 3, 2, 2, 1, 2, 3, 3, 2, 3, 3, 1, 3, 1, 2, 3, 1, 2, 2, 3, 3, 2, 3, 3, 1, 1, 1, 2, 3, 2, 3, 1, 2, 3, 2, 1, 2, 3, 2, 1, 2, 1, 3, 1, 2, 3, 1, 1, 1, 2, 2, 3, 1, 3, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 3, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 3, 1, 3, 2, 1, 2, 1, 2, 3, 2, 1, 3, 2, 1, 2, 1, 2, 2, 3, 2, 1, 2, 3, 1, 2, 3, 1, 3, 3, 2, 3, 2, 2, 2, 1, 2, 3, 2, 3, 2, 3, 2, 3, 3, 1, 2, 3, 3, 2, 3, 1, 3, 1, 2, 1, 2, 1, 2, 3, 1, 3, 1, 1, 2, 3, 3, 1, 2, 3, 1, 2, 1, 1, 3, 1, 1, 2, 3, 1, 3, 1, 1, 2, 1, 1, 3, 3, 3, 1, 2, 3, 2, 1, 3, 3, 3, 1, 2, 3, 3, 3, 2, 2, 1, 2, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3], \"Freq\": [0.6055621864017587, 0.3853577549829374, 0.9592494768301155, 0.576574636493759, 0.2882873182468795, 0.9498905158776252, 0.9498813785466023, 0.8636776909510285, 0.2217520366748439, 0.7761321283619536, 0.962652137810758, 0.10321516023590448, 0.8257212818872358, 0.9207557745146393, 0.4811591211438802, 0.4811591211438802, 0.1303130604577447, 0.5212522418309788, 0.3909391813732342, 0.476618897523594, 0.476618897523594, 0.9498821375792224, 0.7223435808440712, 0.9626519134550034, 0.35981759062841373, 0.7196351812568275, 0.962652169412977, 0.9592494715948565, 0.7196203693482156, 0.7185029164070689, 0.1916007777085517, 0.09580038885427584, 0.3381538028524127, 0.5917691549917222, 0.3599831588476427, 0.23998877256509513, 0.47997754513019025, 0.9498798840632847, 0.5741248414560465, 0.28706242072802324, 
0.9303063385297833, 0.9498852123312267, 0.8636587623699226, 0.9499683911636652, 0.4812692851670066, 0.4812692851670066, 0.3202908257711156, 0.1601454128855578, 0.48043623865667345, 0.9592482154411595, 0.8636901563189606, 0.3567212238187607, 0.6242621416828312, 0.9290121810806414, 0.05464777535768479, 0.9592483357601584, 0.8670722519088562, 0.977266513794772, 0.9592484476430603, 0.948083202353365, 0.9498816828489873, 0.6144027893502595, 0.40960185956683964, 0.9471454545960748, 0.8998764608436393, 0.8535855513062653, 0.3592103408739034, 0.5986839014565057, 0.8226943793794427, 0.9592494654508378, 0.6265978036727986, 0.3580558878130278, 0.9474336789499183, 0.863677422159678, 0.6378913230198474, 0.3189456615099237, 0.25235560652878136, 0.5888297485671564, 0.16823707101918756, 0.7197176119389264, 0.7224946778148713, 0.8535913895333113, 0.6379941278274073, 0.31899706391370364, 0.13027494064652925, 0.521099762586117, 0.3908248219395878, 0.9498928046517314, 0.9592507507723915, 0.9626522196646748, 0.9498808680123347, 0.9592506731217955, 0.9498804145027443, 0.5729477776382071, 0.3819651850921381, 0.814630076959601, 0.20365751923990025, 0.8124407354139367, 0.48016012945437353, 0.16005337648479118, 0.48016012945437353, 0.34129732263877605, 0.40955678716653127, 0.20477839358326563, 0.9592490151135609, 0.7224679302886354, 0.722494569987483, 0.919056555157998, 0.9626522383844296, 0.7118550413070528, 0.9498991505357238, 0.8536089538295518, 0.28609125991149176, 0.5721825198229835, 0.14304562995574588, 0.28751312406078466, 0.5750262481215693, 0.8535928167731046, 0.7224145434523523, 0.7117797033985152, 0.9626522383844296, 0.962652339237871, 0.20409463928848198, 0.6122839178654459, 0.20409463928848198, 0.9639671279865785, 0.9498860458203322, 0.9626526755026029, 0.20646345144711525, 0.825853805788461, 0.959737779420171, 0.9499106597345488, 0.5739437297693682, 0.2869718648846841, 0.8630136094801016, 0.11506848126401355, 0.9626526663055286, 0.4125972022908637, 0.6188958034362956, 
0.9592510220992211, 0.9597491831645537, 0.2880226054898263, 0.5760452109796526, 0.9626518277451012, 0.959250652369819, 0.2619823733419429, 0.720451526690343, 0.9592494768301155, 0.7117188985032726, 0.9592506104692207, 0.2888199349137084, 0.5776398698274168, 0.9592484722944826, 0.9592494768301155, 0.3027726062776772, 0.6055452125553544, 0.14000902743363677, 0.8867238404130329, 0.7196219333738589, 0.5738852781170886, 0.2869426390585443, 0.7196418716632738, 0.6410643926263041, 0.32053219631315205, 0.9207585334813339, 0.8673585018290949, 0.7193334868889149, 0.14386669737778296, 0.14386669737778296, 0.9498853073761546, 0.9592508061403632, 0.722494615507807, 0.7224625368636819, 0.7224581823650912, 0.12399358784022514, 0.7439615270413508, 0.12399358784022514, 0.8673191511740231, 0.962652760921068, 0.9498794928664006, 0.9498802863877911, 0.6367922075800274, 0.3183961037900137, 0.35933928312979574, 0.28747142650383656, 0.35933928312979574, 0.6406564979778322, 0.3203282489889161, 0.9626524210468583, 0.962652009585934, 0.9498802863877911, 0.39027331906214274, 0.5203644254161903, 0.13009110635404758], \"Term\": [\"accrual\", \"accrual\", \"achieved\", \"another\", \"another\", \"application\", \"asset\", \"available\", \"based\", \"based\", \"blood\", \"business\", \"business\", \"change\", \"changes\", \"changes\", \"clinical\", \"clinical\", \"clinical\", \"closed\", \"closed\", \"closing\", \"closure\", \"concern\", \"concerns\", \"concerns\", \"concludes\", \"continuation\", \"continue\", \"covid\", \"covid\", \"covid\", \"data\", \"data\", \"decided\", \"decided\", \"decided\", \"demonstrated\", \"design\", \"design\", \"development\", \"different\", \"difficulty\", \"disapproved\", \"discontinued\", \"discontinued\", \"drug\", \"drug\", \"drug\", \"endpoint\", \"enroll\", \"enrolled\", \"enrolled\", \"enrollment\", \"enrollment\", \"experienced\", \"forward\", \"funding\", \"government\", \"halted\", \"imaging\", \"initiated\", \"initiated\", \"institution\", 
\"insufficient\", \"interest\", \"investigator\", \"investigator\", \"issues\", \"justify\", \"lack\", \"lack\", \"left\", \"limited\", \"longer\", \"longer\", \"low\", \"low\", \"low\", \"manufacturer\", \"move\", \"moving\", \"never\", \"never\", \"new\", \"new\", \"new\", \"nih\", \"number\", \"obtained\", \"oncology\", \"operating\", \"paclitaxel\", \"pandemic\", \"pandemic\", \"part\", \"part\", \"participants\", \"patient\", \"patient\", \"patient\", \"patients\", \"patients\", \"patients\", \"pembrolizumab\", \"per\", \"phase\", \"pi\", \"plasma\", \"plus\", \"poor\", \"portfolio\", \"prematurely\", \"prematurely\", \"prematurely\", \"principal\", \"principal\", \"prior\", \"priorities\", \"prioritization\", \"problem\", \"progress\", \"protocol\", \"protocol\", \"protocol\", \"rate\", \"reached\", \"realignment\", \"reasons\", \"reasons\", \"recruit\", \"recruited\", \"recruiting\", \"recruiting\", \"recruitment\", \"recruitment\", \"regarding\", \"related\", \"related\", \"reopen\", \"resources\", \"results\", \"results\", \"risk\", \"room\", \"safety\", \"safety\", \"sample\", \"score\", \"screen\", \"showed\", \"showed\", \"signals\", \"size\", \"slow\", \"slow\", \"sponsor\", \"sponsor\", \"staff\", \"staffing\", \"staffing\", \"standard\", \"stopped\", \"stopped\", \"strategic\", \"strategy\", \"subjects\", \"subjects\", \"subjects\", \"sufficient\", \"supply\", \"team\", \"technical\", \"terminate\", \"terminated\", \"terminated\", \"terminated\", \"termination\", \"tesetaxel\", \"test\", \"times\", \"treatment\", \"treatment\", \"trial\", \"trial\", \"trial\", \"unable\", \"unable\", \"unrelated\", \"us\", \"wait\", \"withdrawn\", \"withdrawn\", \"withdrawn\"]}, \"R\": 30, \"lambda.step\": 0.01, \"plot.opts\": {\"xlab\": \"PC1\", \"ylab\": \"PC2\"}, \"topic.order\": [3, 1, 2]};\n",
       "\n",
       "function LDAvis_load_lib(url, callback){\n",
       "  var s = document.createElement('script');\n",
       "  s.src = url;\n",
       "  s.async = true;\n",
       "  s.onreadystatechange = s.onload = callback;\n",
       "  s.onerror = function(){console.warn(\"failed to load library \" + url);};\n",
       "  document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
       "}\n",
       "\n",
       "if(typeof(LDAvis) !== \"undefined\"){\n",
       "   // already loaded: just create the visualization\n",
       "   !function(LDAvis){\n",
       "       new LDAvis(\"#\" + \"ldavis_el302711402825237329446870855297\", ldavis_el302711402825237329446870855297_data);\n",
       "   }(LDAvis);\n",
       "}else if(typeof define === \"function\" && define.amd){\n",
       "   // require.js is available: use it to load d3/LDAvis\n",
       "   require.config({paths: {d3: \"https://d3js.org/d3.v5\"}});\n",
       "   require([\"d3\"], function(d3){\n",
       "      window.d3 = d3;\n",
       "      LDAvis_load_lib(\"https://cdn.jsdelivr.net/gh/bmabey/pyLDAvis@3.3.1/pyLDAvis/js/ldavis.v3.0.0.js\", function(){\n",
       "        new LDAvis(\"#\" + \"ldavis_el302711402825237329446870855297\", ldavis_el302711402825237329446870855297_data);\n",
       "      });\n",
       "    });\n",
       "}else{\n",
       "    // require.js not available: dynamically load d3 & LDAvis\n",
       "    LDAvis_load_lib(\"https://d3js.org/d3.v5.js\", function(){\n",
       "         LDAvis_load_lib(\"https://cdn.jsdelivr.net/gh/bmabey/pyLDAvis@3.3.1/pyLDAvis/js/ldavis.v3.0.0.js\", function(){\n",
       "                 new LDAvis(\"#\" + \"ldavis_el302711402825237329446870855297\", ldavis_el302711402825237329446870855297_data);\n",
       "            })\n",
       "         });\n",
       "}\n",
       "</script>"
      ],
      "text/plain": [
       "PreparedData(topic_coordinates=              x         y  topics  cluster       Freq\n",
       "topic                                                \n",
       "2      0.124959  0.052005       1        1  34.073612\n",
       "0     -0.009108 -0.117322       2        1  33.859391\n",
       "1     -0.115850  0.065317       3        1  32.066997, topic_info=            Term       Freq      Total Category  logprob  loglift\n",
       "151      funding  19.000000  19.000000  Default  30.0000  30.0000\n",
       "46    enrollment  18.000000  18.000000  Default  29.0000  29.0000\n",
       "56       sponsor  21.000000  21.000000  Default  28.0000  28.0000\n",
       "18   recruitment  17.000000  17.000000  Default  27.0000  27.0000\n",
       "24            pi  11.000000  11.000000  Default  26.0000  26.0000\n",
       "..           ...        ...        ...      ...      ...      ...\n",
       "61           new   2.982376   7.676073   Topic3  -5.1545   0.1920\n",
       "4       patients   3.450958  14.649983   Topic3  -5.0086  -0.3084\n",
       "223      patient   2.643624   6.247916   Topic3  -5.2751   0.2772\n",
       "85           low   2.317066  11.887986   Topic3  -5.4070  -0.4979\n",
       "147      changes   2.309745   4.156629   Topic3  -5.4101   0.5498\n",
       "\n",
       "[186 rows x 6 columns], token_table=      Topic      Freq       Term\n",
       "term                            \n",
       "58        2  0.605562    accrual\n",
       "58        3  0.385358    accrual\n",
       "455       1  0.959249   achieved\n",
       "153       1  0.576575    another\n",
       "153       3  0.288287    another\n",
       "...     ...       ...        ...\n",
       "494       3  0.962652         us\n",
       "140       2  0.949880       wait\n",
       "143       1  0.390273  withdrawn\n",
       "143       2  0.520364  withdrawn\n",
       "143       3  0.130091  withdrawn\n",
       "\n",
       "[194 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[3, 1, 2])"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/alejandra/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n",
      "  from imp import reload\n",
      "/Users/alejandra/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n",
      "  from imp import reload\n",
      "/Users/alejandra/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n",
      "  from imp import reload\n",
      "/Users/alejandra/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n",
      "  from imp import reload\n",
      "/Users/alejandra/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n",
      "  from imp import reload\n",
      "/Users/alejandra/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n",
      "  from imp import reload\n",
      "/Users/alejandra/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n",
      "  from imp import reload\n",
      "/Users/alejandra/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n",
      "  from imp import reload\n",
      "/Users/alejandra/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n",
      "  from imp import reload\n",
      "/Users/alejandra/opt/anaconda3/envs/mlenv/lib/python3.7/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n",
      "  from imp import reload\n"
     ]
    }
   ],
   "source": [
    "# VISUALIZE THE TOPICS\n",
    "pyLDAvis.enable_notebook()\n",
    "p = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary)\n",
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "4966dfd0-1e9f-41f0-9b7c-23d8abde57d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the prepared pyLDAvis object `p` to an HTML file; the relative path\n",
    "# is resolved against the kernel's current working directory.\n",
    "lda_html_path = 'lda.html'\n",
    "pyLDAvis.save_html(p, lda_html_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "b27ab81d-20e6-45bc-95da-76c8d66b7332",
   "metadata": {},
   "outputs": [],
   "source": [
    "import imageio.v2 as imageio\n",
    "import os\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "0b6383cd-4661-40ab-93bc-065366da3a8f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['topic1.png', 'topic2.png', 'topic3.png']\n"
     ]
    }
   ],
   "source": [
    "# Collect the filenames of the GIF frames stored in Images/Gif.\n",
    "# os.listdir() returns entries in arbitrary order, so the names are sorted to\n",
    "# keep the frame sequence (topic1, topic2, ...) reproducible across runs and\n",
    "# machines. Hidden entries such as .DS_Store are skipped because they are not\n",
    "# images and would make the later imageio.imread() call fail.\n",
    "# NOTE: the sort is lexicographic, so zero-pad the frame numbers once there\n",
    "# are ten or more frames (topic10.png would otherwise sort before topic2.png).\n",
    "filepath = sorted(\n",
    "    name for name in os.listdir('Images/Gif') if not name.startswith('.')\n",
    ")\n",
    "print(filepath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "9b2d8cd7-61aa-4905-9f95-95b947949ae1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prefix every frame filename from `filepath` with the frames directory so\n",
    "# each entry of `image_path` can be opened directly.\n",
    "frames_dir = 'Images/Gif'\n",
    "image_path = [os.path.join(frames_dir, frame_name) for frame_name in filepath]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "ac3ed395-95d8-4513-bc67-d8831a72e7cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every frame listed in `image_path`, in order, then write them out as a\n",
    "# single animated GIF (fps=1).\n",
    "images = [imageio.imread(frame_file) for frame_file in image_path]\n",
    "imageio.mimwrite('Images/LDAvis.gif', images, fps=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "804a3698-0846-482a-90a6-3d914f7eb6af",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mlenv",
   "language": "python",
   "name": "mlenv"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}