Clinical_Trials_Research_ / Git / [b4fdb6] /Unsupervised Machine Learning -NLP/ML

Models:
joseph-gordon/
Clinical_Trials_Research_
Downloads: 1
[b4fdb6]: / Unsupervised Machine Learning -NLP / ML_text.ipynb
History
Download this file
1551 lines (1550 with data), 60.5 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9aaf82f6-279b-457e-8ad8-265e6253b406",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import dependencies\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "from collections import Counter\n",
    "from sklearn.metrics import balanced_accuracy_score\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from imblearn.metrics import classification_report_imbalanced"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "id": "d9b11686-9c23-4b3e-b6f1-b491a79a9d81",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read in file\n",
    "#file_path = Path('/Tables/free_text_df.csv')\n",
    "df = pd.read_csv('Tables/free_text_df.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "bbf8ba12-e96c-4800-9706-3dc0637a8067",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Rank</th>\n",
       "      <th>OrgStudyId</th>\n",
       "      <th>WhyStopped</th>\n",
       "      <th>EnrollmentCount</th>\n",
       "      <th>PrimaryOutcomeMeasure</th>\n",
       "      <th>FlowDropWithdrawType</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3000</td>\n",
       "      <td>To determine BCI test performance by evaluatin...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2300</td>\n",
       "      <td>Diagnostic potential of SEMA4C as a biomarker ...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80</td>\n",
       "      <td>Role of SORCIN in patients with breast cancer</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30</td>\n",
       "      <td>SENP1 expression</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>241391</td>\n",
       "      <td>NaN</td>\n",
       "      <td>600</td>\n",
       "      <td>Performance of the Syantra DX Breast Cancer te...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>IL-TM-B1-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>200</td>\n",
       "      <td>This study is intended to evaluate the sensiti...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>FH-Risk 2.0 Research Protocol</td>\n",
       "      <td>NaN</td>\n",
       "      <td>271</td>\n",
       "      <td>To explore how much new risk models change bre...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>ID-RPSBC-01-20201012</td>\n",
       "      <td>NaN</td>\n",
       "      <td>316</td>\n",
       "      <td>Absolute risk difference between breast cancer...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>IRST174.22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>60000</td>\n",
       "      <td>To compare the cumulative incidence of stage 2...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>ANILERGİNN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>300</td>\n",
       "      <td>breast cancer incidence after laparoscopic sle...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Rank                     OrgStudyId WhyStopped  EnrollmentCount  \\\n",
       "0     1                BTX-BCI-016-PRT        NaN             3000   \n",
       "1     2                    2018-TJ-BCD        NaN             2300   \n",
       "2     3                  Breast cancer        NaN               80   \n",
       "3     4                       BC-BOMET        NaN               30   \n",
       "4     5                         241391        NaN              600   \n",
       "5     6                    IL-TM-B1-01        NaN              200   \n",
       "6     7  FH-Risk 2.0 Research Protocol        NaN              271   \n",
       "7     8           ID-RPSBC-01-20201012        NaN              316   \n",
       "8     9                     IRST174.22        NaN            60000   \n",
       "9    10                     ANILERGİNN        NaN              300   \n",
       "\n",
       "                               PrimaryOutcomeMeasure FlowDropWithdrawType  \n",
       "0  To determine BCI test performance by evaluatin...                  NaN  \n",
       "1  Diagnostic potential of SEMA4C as a biomarker ...                  NaN  \n",
       "2      Role of SORCIN in patients with breast cancer                  NaN  \n",
       "3                                   SENP1 expression                  NaN  \n",
       "4  Performance of the Syantra DX Breast Cancer te...                  NaN  \n",
       "5  This study is intended to evaluate the sensiti...                  NaN  \n",
       "6  To explore how much new risk models change bre...                  NaN  \n",
       "7  Absolute risk difference between breast cancer...                  NaN  \n",
       "8  To compare the cumulative incidence of stage 2...                  NaN  \n",
       "9  breast cancer incidence after laparoscopic sle...                  NaN  "
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "3b9fad39-9555-4995-a832-5e0408219774",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([nan, 'Pandemic situation',\n",
       "       'Technical problem with plasma blood samples obtained from the patients',\n",
       "       'study did not start and is currently on pause',\n",
       "       'Principal investigator left the study institution.',\n",
       "       'The study was stopped prematurely due to insufficient recruitment',\n",
       "       'No participants enrolled',\n",
       "       'PI no longer working at Indiana University;',\n",
       "       'Temporarily paused per study team for interim data review.',\n",
       "       'Study classified as out of scope by the Ethics Committee (not a project involving human person).',\n",
       "       'Enrollment into AWARE cohorts1-4 have concluded and the primary objective and core goals for the study were met.',\n",
       "       'Slow recruitment rate',\n",
       "       'Sponsor decision to prematurely stop the study, not linked to any safety concern',\n",
       "       'One participant was accrued, and the study was stopped due to new safety data from the company for M7824 and slow accrual.',\n",
       "       'The researcher who was able to recruit the patients abandoned the project.',\n",
       "       'Unable to achieve device(s) which will be used in this study.',\n",
       "       'Study is moving to a different institution with PI.',\n",
       "       'Decided to halt and will potentially reopen in the future.',\n",
       "       'Due to COVID this trial never got up and running.',\n",
       "       'Study was halted Prematurely for low recruitment.',\n",
       "       'Participant recruitment was stopped due to corona pandemic.',\n",
       "       'due to poor enrolled patients',\n",
       "       'Termination of collaboration with PUMA',\n",
       "       'This study was terminated based on Pfizers change in clinical development strategy not related to safety and efficacy.',\n",
       "       'In clinical treatment, most patients with stage IV breast cancer received surgical treatment, which made it difficult to enroll this project, so this study was terminated.',\n",
       "       'Logistics',\n",
       "       'The study was prematurely discontinued due to significant data quality issues on August 26, 2021. There were no safety concerns that led to the decision to terminate.',\n",
       "       'recruitment difficulties', 'Lack of accrual',\n",
       "       'Due to COVID-19 pandemic, we will not be pursuing this study',\n",
       "       'PI is moving to a new institution',\n",
       "       'institutional conflict of interest',\n",
       "       'Study approved with treatment regimen based on current guidelines. However, reimbursement of IMP was not feasible.',\n",
       "       'Surgery wait times during the COVID-19 pandemic were eventually reduced to pre-pandemic wait times. This study became irrelevant.',\n",
       "       'Withdrawn due to change in plan for this study and not due to safety reasons.',\n",
       "       'Limited staff to carry out study',\n",
       "       'Strategic changes regarding the product development',\n",
       "       'premature termination due to lack of funding',\n",
       "       'Security and effect data from another ongoing study.',\n",
       "       'Relocation of Principal Investigator',\n",
       "       'Due to COVID this study was never started.',\n",
       "       'Dose limiting toxicity',\n",
       "       'Due to insurance non-payment, no subjects were enrolled.',\n",
       "       'no enrollment',\n",
       "       'After demonstrating the on target effect of GMI-1359 via pharmacodynamic markers (CXCR4 and E-selectin), Sponsor terminated the trial due to COVID-related slow enrollment.',\n",
       "       'Treatment standard has changed', 'Strategic considerations',\n",
       "       'The study was terminated during Part 1 (dose escalation), and Part 2 (expansion) of the study was not initiated. This study was voluntarily terminated due to a business decision not to proceed with the ISB 1302 asset, and not due to any safety issue.',\n",
       "       'Initiating a new study with revised Statistics.',\n",
       "       'Inadequate FNA samples (low cell counts)',\n",
       "       'Difficulty accruing subjects the study accrual was closed',\n",
       "       'Funder decision',\n",
       "       'The CAMAD clinical trial has been terminated due to difficulties in recruiting patients.',\n",
       "       'Data Safety Monitoring Board is in agreement with the study findings so far and the stopping rule has been met, which suspends the study treatment arms in March 2021.',\n",
       "       'Dompé decided to withdraw the study due to the numerous difficulties encountered in enrollment, mainly due to the rapidly and continuously changing oncology drug scenario . No patients were enrolled.',\n",
       "       'Funding withdrawn from funding source.',\n",
       "       'Study never activated to enrollment.',\n",
       "       'due to business decision regarding the study drug.',\n",
       "       'lack of funding', 'Slow patient accrual', 'No financial support.',\n",
       "       'It was determined that we had sufficient enrollment and study data to explore the impact of the Endopredict test in endocrine therapy decision-making.',\n",
       "       'Accrual rate to date was too low to finish the trial in a reasonable timeframe',\n",
       "       'Terminated by sponsor due to lack of interest', 'Slow accrual',\n",
       "       'no participants enrolled',\n",
       "       'Difficulty recruiting and retaining participants',\n",
       "       'Delayed start due to technical problems as well as the covid pandemic. Decision of the investigator',\n",
       "       'Needle production stopped',\n",
       "       'Very slow recruitment due to subject profile', 'IP breach',\n",
       "       'unable to recruit patients due to competing studies',\n",
       "       'Study halted prematurely, prior to enrollment of first participant.',\n",
       "       'Funding withdrawn.',\n",
       "       'The PI is leaving Duke for a position in another state.',\n",
       "       'an interim analysis was provided for in the protocol. The results are unsatisfactory',\n",
       "       'Limited operating room availability',\n",
       "       'Not enough eligble patient can be found, too many screen failures',\n",
       "       'Funder decided to not continue the study',\n",
       "       'slow study accrual, partially due to pandemic',\n",
       "       'We terminated this trial and initiated a new one including pertuzumab.',\n",
       "       'No funding', 'Low accrual',\n",
       "       'Drug provider decided not to move forward with the study.',\n",
       "       'Cessation of main trial of which this study was a biomarker imaging side study.',\n",
       "       'Unable to enroll due to COVID-19', 'Funding Discontinued',\n",
       "       'Sponsor decision.', 'PI decision to cancel research',\n",
       "       'Disapproved by relevant Danish ethical committee',\n",
       "       'Recruitment challenges and not due to safety concerns.',\n",
       "       'Logistical problems meant that the study was no longer feasible.',\n",
       "       'Sponsor withdrew support',\n",
       "       'China has experienced new waves of COVID-19 since January 2021, and the Chinese government has adopted strict COVID-19 policies to control the occasional outbreaks of COVID-19.',\n",
       "       'Strategic decision due to emerging new data for patients with HR+, HER2- metastatic breast cancer.',\n",
       "       'Insufficient resources', 'Lack of Funding',\n",
       "       'Business decision to not perform this study.',\n",
       "       'Design modification',\n",
       "       'Business decision based on the inability to enroll subjects into the trial',\n",
       "       'Due to new insights',\n",
       "       'Study never initiated due to COVID pandemic, staffing changes',\n",
       "       'Pharmaceutical company decided to withdraw funding and not provide drug.',\n",
       "       'Study not meeting recruitment goals', 'PI decided to withdraw',\n",
       "       'Study prematurely terminated due to lack of recruitment. After several screening failures since May 2022, no potential candidate for cohort B was found (gBRCA wild type y HRD score >25 on myChoice® CDx PLUS test)',\n",
       "       'Decision to discontinue the study based on broader development and strategic prioritisation. The Sponsor concludes there is no benefit-risk impact on the CO41863 study.',\n",
       "       'The study was prematurely discontinued by the sponsor due to probability of success which was too low to justify the continuation of recruitment.',\n",
       "       'Company Decision', 'lost funding',\n",
       "       'Sponsor R & D Strategy Adjustment', 'Lack of effectiveness',\n",
       "       'Due to the fact that the sponsor decided not to move forward with the development of M7824',\n",
       "       'Per sponsor', 'closure of radiolaboratory',\n",
       "       'Modification of the care habits. We believe today that we are no longer able to carry out this study as initially described.',\n",
       "       'Insufficient patient inclusion', 'Funding',\n",
       "       'CTP of LARES has been out of the expire date, we are trying RWS pathway for the indication expend',\n",
       "       'No participants Enrolled',\n",
       "       'After meeting with the FDA, SPARC concludes a BE study will not be required',\n",
       "       'Study early terminated due to low enrollment compared to the anticipated figures',\n",
       "       'Principal investigator wishes to revisit design and start a new study',\n",
       "       'Trial not initiated', 'Low accrual/Loss of funding',\n",
       "       'Drug manufacturer decision to terminate development.',\n",
       "       'Administrative closure based on sponsor recommendation, prior to subject enrollment',\n",
       "       'Principal Investigator decided not to move forward with the study',\n",
       "       'Sponsors decision, no safety concerns.',\n",
       "       'Study was never initiated. Never established sites or enrolled subjects.',\n",
       "       'Inadequate accrual rate',\n",
       "       'Difficulty with recruiting due to staffing and Covid-19',\n",
       "       'Related to COVID-19, we are continuing data analysis at this time',\n",
       "       'There are no patients enrolled on this study and all efforts are being discontinued',\n",
       "       'Enrollment incomplete by the end of the contracted enrollment period',\n",
       "       'Limited recruitment', 'No participants enrolled.',\n",
       "       'early discontinuation based on strategic sponsor decision not driven by any safety concerns',\n",
       "       'The study had to be stopped due to the COVID-19 pandemic. The University has been closed since the beginning of the pandemic until the present date (02/2021) due to the social distance imposed by the government.',\n",
       "       'Withdrawn', 'application has been withdrawn', 'sponsor decision',\n",
       "       'PI left', 'Funding was terminated.',\n",
       "       'Patients were screened but not enrolled',\n",
       "       'funding issues and pharmacy preparation for the drug',\n",
       "       'Due to unforeseen slow enrollment and a shift in corporate resources due to the COVID-19 impact.',\n",
       "       'lack of resources',\n",
       "       'This was a sponsor decision and was not a consequence of any safety concern',\n",
       "       'Accrual suspended pending completion of amendment',\n",
       "       'change in study design.', 'study was dropped',\n",
       "       'Study team determined that data collected was appropriate for study outcome goals.',\n",
       "       'Low enrollment',\n",
       "       'Logistic reasons (Operating room performing the surgery investigated moved to another structure)',\n",
       "       'The Sponsor has discontinued the development of tesetaxel',\n",
       "       'Study withdrawn due to change in treatment landscape for HER2+ metastatic breast cancer.',\n",
       "       'Per Sponsor Request-no longer manufacturing the study drug',\n",
       "       'Funding and PI leaving institution',\n",
       "       'The study is on hold due to staffing issues', 'COVID-19',\n",
       "       'No patients recruited', 'Low recruitment',\n",
       "       'Trial withdrawn based on portfolio prioritization; oral ATRi M1774 in combination with niraparib is under investigation in DDRiver Solid Tumor 301',\n",
       "       'Manufacturer will not be supporting this study',\n",
       "       'insufficient recruitment',\n",
       "       'The study has been terminated early given that the first four patients enrolled have experienced Grade 4 neutropenia and alopecia after cycle 1 and as such failed to meet the primary endpoint and the main secondary endpoint.',\n",
       "       'Principal investigator is moving to a new institution. Study is closing.',\n",
       "       'feasability and recruitment issues', 'Lack of site participation',\n",
       "       'Device was sent back for repairs. It has not been returned to the site to date.',\n",
       "       'trial handovered to another sponser.',\n",
       "       'Feasibility (low patient accrual and financial reasons)',\n",
       "       'Halted due to study funding', 'Lack of enrollment',\n",
       "       'Due to the lack of financial and human resources the investigators were unable to continue the study. The targeted sample size was not achieved due to recruitment difficulties (mainly related to the Covid-19 pandemic).',\n",
       "       'As most HER2+ patient develop brain mets while on/after having received TDM1, it has proven to be an insurmountable challenge to recruit patients',\n",
       "       'PI left institution', 'Replaced by other study NCT05747794',\n",
       "       'A211601 closed prematurely due to slow accrual.',\n",
       "       'Principal Investigator retired before study completed.',\n",
       "       'Interim analysis and algorithm development.',\n",
       "       'Recruitment issues', 'Change in business strategy',\n",
       "       'Study agent no longer available',\n",
       "       'Intervention supply interruption',\n",
       "       'Changes of the standard adjuvant treatment which does not allow an iterative PICC placement',\n",
       "       'PI withdrawn', 'insufficient staff',\n",
       "       'The study was terminated due to the review of the asset (VBIR-2) within the Sponsors oncology portfolio; the study was not terminated because of safety concerns.',\n",
       "       'Sponsor decision, not safety related',\n",
       "       'Closed due to low (0) accrual',\n",
       "       'Reduction in available resources', 'Low Accrual',\n",
       "       'Change in business need.', 'PI decision',\n",
       "       'Terminated due to slow accrual.',\n",
       "       'funding - sponsor filing of Chapter 11 bankruptcy',\n",
       "       'No patient enrolled during the authorized period',\n",
       "       'not sufficiently staff available to perform trial',\n",
       "       'Diagnostic issues',\n",
       "       'A revised study has been listed here for us.',\n",
       "       'leading entity of the clinical trial was replaced, and no patients were enrolled.',\n",
       "       'slow inclusion rate', 'Sponsor decision',\n",
       "       'We are closing this study as our clinical partners have all relocate from OSU and we could not find continued interest. However, we demonstrated the feasibility of this imaging approach on which we also filed a patent.',\n",
       "       'Principal Investigator departed from institution',\n",
       "       'Funding unavailable - Company shutting down',\n",
       "       'Study enrollment is temporarily halted for interim analysis to ensure adequate evaluable subjects.',\n",
       "       'This study was suspended by QTultrasound as other studies were reprioritized.',\n",
       "       'The benefit of completing the study was not worth exposing subjects to the risk during COVID. Delaying the return visits would make it difficult to analyze the changes in data overtime.',\n",
       "       'slow accrual as a result of COVID-19', 'Due to COVID-19',\n",
       "       'Unable to meet accrual goal',\n",
       "       'Recruitment is temporarily suspended due to COVID. Will resume when appropriate.',\n",
       "       'Due to low accrual study has been temporarily suspended.',\n",
       "       'Difficulty in recruiting research subjects',\n",
       "       'Quality of the data originating from prior versions of the protocol has been affected by protocol deviations triggered by the COVID-19 pandemics',\n",
       "       'study abandoned prior to opening to accrual or study start',\n",
       "       'Due to a change in development priorities, no further clinical development of the lucitanib plus rucaparib or lucitanib plus sacituzumab govitecan combinations is planned at this time.',\n",
       "       'The study was stopped early due to increased global access to genomic screening. It is no longer economical to continue with this particular single-gene screening protocol.',\n",
       "       'This termination decision is a business decision and is not due to any safety concerns.',\n",
       "       'The study progress doesnt meet the sponsors requirement',\n",
       "       'PI left institution, never submitted to IRB', 'No accrual',\n",
       "       'Strategic business decision (unrelated to safety)',\n",
       "       'New Medical Team and Surgical Center Location',\n",
       "       'Data on initial patients sufficient',\n",
       "       'The study was stopped due to unacceptable toxicity during the dose-escalation portion (Phase 1) of the study and did not progress to Phase 2',\n",
       "       'In the clinical treatment, no patient with a score of less than 38(FACT Ntx Score) points after 1-2 weeks of application of albumin bound paclitaxel in this project, therefore, there is no suitable subject after screening',\n",
       "       'PI Left Institution',\n",
       "       'This study was halted prematurely due to slow enrollment',\n",
       "       'Product development discontinued unrelated to safety.',\n",
       "       'Secondary to medicare coverage determination',\n",
       "       'Study closed due to portfolio prioritization',\n",
       "       'Unforeseen complications due to COVID-19 and funding',\n",
       "       'Award not yet received and PI is transferring to a different institution',\n",
       "       'Study withdrawn as scientific interest in pursuing the SYD985+Paclitaxel combination has diminished.',\n",
       "       'High number of screen failures', 'the company is liquidated',\n",
       "       'Enrollment challenges', 'Funding withdrawn from sponsor.',\n",
       "       'Unable to recruit to study due to limited number of patients with ADH',\n",
       "       'Suspended due to Covid-19', 'Futility in recruitment',\n",
       "       'The study will not resume based on the results of a planned interim analysis that showed futility',\n",
       "       'Sponsor decision based on strategic re-alignment',\n",
       "       'due to mycotoxin potential contamination of one lot of study drug',\n",
       "       'Mainly due to insufficient recruitment', 'Slow Enrollment',\n",
       "       'Due to constraints of COVID19, we were unable to recruit on-site for this study.',\n",
       "       'Study never started due to change in standard of care guideline',\n",
       "       'Clinical Hold by the FDA',\n",
       "       'Research cancelled because of inadequate staffing.',\n",
       "       'Similar clinical trials showed not very encouraging results, which made us decide, after several modifications to the protocol and numerous difficulties encountered, to abandon the trial.',\n",
       "       'Funding being sought',\n",
       "       'The study was suspended due to the COVID-19 pandemic. The study was then terminated to prevent inconsistencies in baseline anxiety for patients enrolled before vs after the COVID-19 pandemic.',\n",
       "       'No study funding available.',\n",
       "       'few patient feedback and principal investigator disponibility',\n",
       "       'Despite demonstrated safety and tolerability the trial was terminated early due to program re-prioritization in light of the competitive landscape.',\n",
       "       'Departure from the department of the principal investigator',\n",
       "       'Lack of human ressources', 'PI left the NIH',\n",
       "       'PI decision due to slow accrual',\n",
       "       'The Sponsor terminated study after dosing 2 dose groups (7 pts) and closed trial on 11/30/22. RTX-224 was well-tolerated with no DLTs, no related deaths, SAEs or Gr. 3/4 AEs and cleared rapidly (w/in 10 min).',\n",
       "       'Not enough patients to initialize this clinical trial',\n",
       "       'study design changed due to COVID-19',\n",
       "       'After review of data which showed low likelihood of efficacy in these patients Novartis decided to terminate the trial early. Termination was not safety related',\n",
       "       'Critical study personnel left the institution',\n",
       "       'Suspended due to COVID-19 pandemic',\n",
       "       'Study was terminated due to MTD was reached',\n",
       "       'The decision to stop enrollment was due to strategic considerations and not due to any specific safety reasons or request from a regulatory authority.',\n",
       "       'The study itself was Disapproved on 04/20/2018 and will not be moving forward.',\n",
       "       'Recruitment was stopped before the target sample size was achieved.',\n",
       "       'Business strategy change',\n",
       "       'Study terminated due to strategic business decision by Eli Lilly and Company.',\n",
       "       'Investigator left NIH', 'Study not funded',\n",
       "       'Based on the overall results from the Phase 1 part of the study the sponsor decided to end the study. The decision was not due to safety reasons.',\n",
       "       'Cami in combination with pembrolizumab in solid tumors showed signals of immunomodulatory activity. However, the signals were insufficiently compelling at the tested dose/schedule to justify continuation of the study.',\n",
       "       'slow accrual',\n",
       "       'Study terminated due to lack of enrollment that has been compounded by the global COVID-19 pandemic. There were no safety and/or efficacy concerns involved in the decision to stop enrollment.',\n",
       "       'No clear benefit of GB1275 was observed either as monotherapy or in combination with pembrolizumab.',\n",
       "       'Zero accrual', 'Business decision', 'please refer to NCT05621837',\n",
       "       'Prospective recruitment not possible',\n",
       "       'The manufacturer (Clovis) supplying Rucaparib has gone bankrupt and is no longer able to fund the trial and supply the product.',\n",
       "       'Sponsor decision based on portfolio prioritization',\n",
       "       'Insufficient funding/staff',\n",
       "       'No subjects were eligible for the study. The study closed on 07-15-2021.',\n",
       "       'Unable to enroll subjects',\n",
       "       'The data from this study is no longer needed.',\n",
       "       'The study was terminated due to a change in development priorities.',\n",
       "       'Business Reasons',\n",
       "       'The development of BDTX-189 was discontinued by the sponsor.',\n",
       "       'Poor enrollment',\n",
       "       'Closed to Enrollment Data Analysis Only. Recruitment was conducted at Kaiser Permanente and there were no MSK patients recruited',\n",
       "       'Part 1 of the study reached the original enrollment goal. The protocol is being amended to begin enrollment for Part 2 soon.',\n",
       "       'Study recommended for closure', 'Business priorities',\n",
       "       'Technical problem with blood plasma samples obtained from the hospital.',\n",
       "       'Evolving data with Ipatasertib that changes the known risk / benefit background in pursuing future studies.',\n",
       "       'Study is part of PhD trajectory and currently the achievability is under question.',\n",
       "       'no funding', 'sponsor on campus training restrictions'],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['WhyStopped'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b87bcf2-5f88-4058-b827-54db0db3e4f5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "id": "d2eb4ca9-117e-48f0-ab2c-04d37b627890",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32                                     Pandemic situation\n",
       "47      Technical problem with plasma blood samples ob...\n",
       "50          study did not start and is currently on pause\n",
       "54      Principal investigator left the study institut...\n",
       "84      The study was stopped prematurely due to insuf...\n",
       "                              ...                        \n",
       "4967    Technical problem with blood plasma samples ob...\n",
       "4968    Evolving data with Ipatasertib that changes th...\n",
       "4970    Study is part of PhD trajectory and currently ...\n",
       "4981                                           no funding\n",
       "4986              sponsor on campus training restrictions\n",
       "Name: WhyStopped, Length: 320, dtype: object"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Extract WhyStopped column and drop null values\n",
    "df_text = (df['WhyStopped'])\n",
    "df_text = df_text.dropna()\n",
    "df_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "id": "8a706f28-ffdd-44e7-a08e-ceee98c57e8a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['32', 'Pandemic', 'situation', '47', 'Technical', 'problem', 'with', 'plasma', 'blood', 'samples', 'ob', '...', '50', 'study', 'did', 'not', 'start', 'and', 'is', 'currently', 'on', 'pause', '54', 'Principal', 'investigator', 'left', 'the', 'study', 'institut', '...', '84', 'The', 'study', 'was', 'stopped', 'prematurely', 'due', 'to', 'insuf', '...', '...', '4967', 'Technical', 'problem', 'with', 'blood', 'plasma', 'samples', 'ob', '...', '4968', 'Evolving', 'data', 'with', 'Ipatasertib', 'that', 'changes', 'th', '...', '4970', 'Study', 'is', 'part', 'of', 'PhD', 'trajectory', 'and', 'currently', '...', '4981', 'no', 'funding', '4986', 'sponsor', 'on', 'campus', 'training', 'restrictions', 'Name', ':', 'WhyStopped', ',', 'Length', ':', '320', ',', 'dtype', ':', 'object']\n"
     ]
    }
   ],
   "source": [
    "# Tokenize words from WhyStopped responses\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "words = str(df_text)\n",
    "stopped_text = word_tokenize(words)\n",
    "print(stopped_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "aecf2510-e1e8-4e96-b80a-f18f7c6e4ee9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/alejandra/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import stopwords\n",
    "from nltk.corpus import stopwords\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "id": "80809018-dad6-451b-8390-a8805ba9e001",
   "metadata": {},
   "outputs": [],
   "source": [
    "stopWords = set(stopwords.words('english'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "id": "7cca3238-ab37-4b8f-85f6-2224d8ff363e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['32',\n",
       " 'Pandemic',\n",
       " 'situation',\n",
       " '47',\n",
       " 'Technical',\n",
       " 'problem',\n",
       " 'plasma',\n",
       " 'blood',\n",
       " 'samples',\n",
       " 'ob',\n",
       " '...',\n",
       " '50',\n",
       " 'study',\n",
       " 'start',\n",
       " 'currently',\n",
       " 'pause',\n",
       " '54',\n",
       " 'Principal',\n",
       " 'investigator',\n",
       " 'left',\n",
       " 'study',\n",
       " 'institut',\n",
       " '...',\n",
       " '84',\n",
       " 'The',\n",
       " 'study',\n",
       " 'stopped',\n",
       " 'prematurely',\n",
       " 'due',\n",
       " 'insuf',\n",
       " '...',\n",
       " '...',\n",
       " '4967',\n",
       " 'Technical',\n",
       " 'problem',\n",
       " 'blood',\n",
       " 'plasma',\n",
       " 'samples',\n",
       " 'ob',\n",
       " '...',\n",
       " '4968',\n",
       " 'Evolving',\n",
       " 'data',\n",
       " 'Ipatasertib',\n",
       " 'changes',\n",
       " 'th',\n",
       " '...',\n",
       " '4970',\n",
       " 'Study',\n",
       " 'part',\n",
       " 'PhD',\n",
       " 'trajectory',\n",
       " 'currently',\n",
       " '...',\n",
       " '4981',\n",
       " 'funding',\n",
       " '4986',\n",
       " 'sponsor',\n",
       " 'campus',\n",
       " 'training',\n",
       " 'restrictions',\n",
       " 'Name',\n",
       " ':',\n",
       " 'WhyStopped',\n",
       " ',',\n",
       " 'Length',\n",
       " ':',\n",
       " '320',\n",
       " ',',\n",
       " 'dtype',\n",
       " ':',\n",
       " 'object']"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Loop through tokenized words to filter out stopwords and append to new list\n",
    "filtered_text = []\n",
    "\n",
    "for w in stopped_text:\n",
    "    if w not in stopWords:\n",
    "        filtered_text.append(w)\n",
    "\n",
    "filtered_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "id": "df155ea4-7ade-4d4e-a609-3333e9d1cbdf",
   "metadata": {},
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "invalid syntax (1124360912.py, line 2)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;36m  File \u001b[0;32m\"/var/folders/lq/l3dyt3mn5_j1r4dkkpkp7k_c0000gn/T/ipykernel_10142/1124360912.py\"\u001b[0;36m, line \u001b[0;32m2\u001b[0m\n\u001b[0;31m    pip install wordcloud\u001b[0m\n\u001b[0m              ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
     ]
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "pip install wordcloud\n",
    "from wordcloud import WordCloud"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "704d7e63-26b3-4db6-9899-f00a2902ba83",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "id": "028f842a-6f23-46c4-b543-de1d545176ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "vec = CountVectorizer()\n",
    "X = vec.fit_transform(filtered_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "d73f6815-e6c0-4abf-a859-537959162e2b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>32</th>\n",
       "      <th>320</th>\n",
       "      <th>47</th>\n",
       "      <th>4967</th>\n",
       "      <th>4968</th>\n",
       "      <th>4970</th>\n",
       "      <th>4981</th>\n",
       "      <th>4986</th>\n",
       "      <th>50</th>\n",
       "      <th>54</th>\n",
       "      <th>...</th>\n",
       "      <th>sponsor</th>\n",
       "      <th>start</th>\n",
       "      <th>stopped</th>\n",
       "      <th>study</th>\n",
       "      <th>technical</th>\n",
       "      <th>th</th>\n",
       "      <th>the</th>\n",
       "      <th>training</th>\n",
       "      <th>trajectory</th>\n",
       "      <th>whystopped</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 50 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   32  320  47  4967  4968  4970  4981  4986  50  54  ...  sponsor  start  \\\n",
       "0   0    0   0     0     0     0     0     0   0   0  ...        0      0   \n",
       "1   1    0   0     0     0     0     0     0   0   0  ...        0      0   \n",
       "2   0    0   0     0     0     0     0     0   0   0  ...        0      0   \n",
       "3   0    0   0     0     0     0     0     0   0   0  ...        0      0   \n",
       "4   0    0   1     0     0     0     0     0   0   0  ...        0      0   \n",
       "\n",
       "   stopped  study  technical  th  the  training  trajectory  whystopped  \n",
       "0        0      0          0   0    0         0           0           0  \n",
       "1        0      0          0   0    0         0           0           0  \n",
       "2        0      0          0   0    0         0           0           0  \n",
       "3        0      0          0   0    0         0           0           0  \n",
       "4        0      0          0   0    0         0           0           0  \n",
       "\n",
       "[5 rows x 50 columns]"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_filtered = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())\n",
    "df_filtered.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "92537ee9-5541-4d62-b5a5-5f46ac7027d8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                                     Pandemic situation\n",
       "1      Technical problem with plasma blood samples ob...\n",
       "2          study did not start and is currently on pause\n",
       "3      Principal investigator left the study institut...\n",
       "4      The study was stopped prematurely due to insuf...\n",
       "                             ...                        \n",
       "315    Technical problem with blood plasma samples ob...\n",
       "316    Evolving data with Ipatasertib that changes th...\n",
       "317    Study is part of PhD trajectory and currently ...\n",
       "318                                           no funding\n",
       "319              sponsor on campus training restrictions\n",
       "Name: WhyStopped, Length: 320, dtype: object"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_text_stopped = df_text_stopped.reset_index(drop=True)\n",
    "df_text_stopped"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "f354eed4-acef-413d-b665-09fc443aa635",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dtype('O')"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_text_stopped.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "822485eb-84d6-4c48-aa5b-351b80969ca7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'str'>\n"
     ]
    }
   ],
   "source": [
    "# Change datatype to string\n",
    "df_text_stopped = str(df_text_stopped)\n",
    "print(type(df_text_stopped))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "0726633f-c545-461c-92a4-c8f147689b96",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                             whystopped\n",
      "32                                   pandemic situation\n",
      "47    technical problem with plasma blood samples ob...\n",
      "50        study did not start and is currently on pause\n",
      "54    principal investigator left the study institut...\n",
      "84    the study was stopped prematurely due to insuf...\n",
      "...                                                 ...\n",
      "4967  technical problem with blood plasma samples ob...\n",
      "4968  evolving data with ipatasertib that changes th...\n",
      "4970  study is part of phd trajectory and currently ...\n",
      "4981                                         no funding\n",
      "4986            sponsor on campus training restrictions\n",
      "\n",
      "[320 rows x 1 columns]\n"
     ]
    }
   ],
   "source": [
    "# Lowercase\n",
    "print(df_text.lower())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "61036baa-509d-497f-b156-21790645d3c9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "with -->     3\n",
      "study -->     3\n",
      "Technical -->     2\n",
      "problem -->     2\n",
      "plasma -->     2\n",
      "blood -->     2\n",
      "samples -->     2\n",
      "ob   -->     2\n",
      "and  -->     2\n",
      "is   -->     2\n",
      "currently -->     2\n",
      "on   -->     2\n",
      "32   -->     1\n",
      "Pandemic -->     1\n",
      "situation -->     1\n",
      "47   -->     1\n",
      "50   -->     1\n",
      "did  -->     1\n",
      "not  -->     1\n",
      "start -->     1\n",
      "pause -->     1\n",
      "54   -->     1\n",
      "Principal -->     1\n",
      "investigator -->     1\n",
      "left -->     1\n",
      "the  -->     1\n",
      "institut -->     1\n",
      "84   -->     1\n",
      "The  -->     1\n",
      "was  -->     1\n",
      "stopped -->     1\n",
      "prematurely -->     1\n",
      "due  -->     1\n",
      "to   -->     1\n",
      "insuf -->     1\n",
      "4967 -->     1\n",
      "4968 -->     1\n",
      "Evolving -->     1\n",
      "data -->     1\n",
      "Ipatasertib -->     1\n",
      "that -->     1\n",
      "changes -->     1\n",
      "th   -->     1\n",
      "4970 -->     1\n",
      "Study -->     1\n",
      "part -->     1\n",
      "of   -->     1\n",
      "PhD  -->     1\n",
      "trajectory -->     1\n",
      "4981 -->     1\n",
      "no   -->     1\n",
      "funding -->     1\n",
      "4986 -->     1\n",
      "sponsor -->     1\n",
      "campus -->     1\n",
      "training -->     1\n",
      "restrictions -->     1\n",
      "Name -->     1\n",
      "WhyStopped -->     1\n",
      "Length -->     1\n",
      "320  -->     1\n",
      "dtype -->     1\n",
      "object -->     1\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "import re\n",
    "\n",
    "text = str(df_text_stopped)\n",
    " \n",
    "words = re.findall('\\w+', text)\n",
    "freq = Counter(words).most_common()\n",
    "for word,count in freq:\n",
    "    print(f'{word:<4} {\"-->\":^4} {count:>4}')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "7f8104ed-cbcc-46db-a63e-2a4bebc04ff5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "23806159-3e14-4102-b2f7-0f4320166935",
   "metadata": {},
   "outputs": [],
   "source": [
    "stop_words = nltk.corpus.stopwords.words('english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "ed4c4847-4e82-4478-9fb0-38542510e5ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "20eb017e-f4e3-4e21-95ef-640bbdb9685c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['WhyStopped',\n",
       " '32',\n",
       " 'Pandemic',\n",
       " 'situation',\n",
       " '47',\n",
       " 'Technical',\n",
       " 'problem',\n",
       " 'plasma',\n",
       " 'blood',\n",
       " 'samples',\n",
       " 'ob',\n",
       " '...',\n",
       " '50',\n",
       " 'study',\n",
       " 'start',\n",
       " 'currently',\n",
       " 'pause',\n",
       " '54',\n",
       " 'Principal',\n",
       " 'investigator',\n",
       " 'left',\n",
       " 'study',\n",
       " 'institut',\n",
       " '...',\n",
       " '84',\n",
       " 'study',\n",
       " 'stopped',\n",
       " 'prematurely',\n",
       " 'due',\n",
       " 'insuf',\n",
       " '...',\n",
       " '...',\n",
       " '...',\n",
       " '4967',\n",
       " 'Technical',\n",
       " 'problem',\n",
       " 'blood',\n",
       " 'plasma',\n",
       " 'samples',\n",
       " 'ob',\n",
       " '...',\n",
       " '4968',\n",
       " 'Evolving',\n",
       " 'data',\n",
       " 'Ipatasertib',\n",
       " 'changes',\n",
       " 'th',\n",
       " '...',\n",
       " '4970',\n",
       " 'Study',\n",
       " 'part',\n",
       " 'PhD',\n",
       " 'trajectory',\n",
       " 'currently',\n",
       " '...',\n",
       " '4981',\n",
       " 'funding',\n",
       " '4986',\n",
       " 'sponsor',\n",
       " 'campus',\n",
       " 'training',\n",
       " 'restrictions',\n",
       " '[',\n",
       " '320',\n",
       " 'rows',\n",
       " 'x',\n",
       " '1',\n",
       " 'columns',\n",
       " ']',\n",
       " 'WhyStopped',\n",
       " '32',\n",
       " 'Pandemic',\n",
       " 'situation',\n",
       " '47',\n",
       " 'Technical',\n",
       " 'problem',\n",
       " 'plasma',\n",
       " 'blood',\n",
       " 'samples',\n",
       " 'ob',\n",
       " '...',\n",
       " '50',\n",
       " 'study',\n",
       " 'start',\n",
       " 'currently',\n",
       " 'pause',\n",
       " '54',\n",
       " 'Principal',\n",
       " 'investigator',\n",
       " 'left',\n",
       " 'study',\n",
       " 'institut',\n",
       " '...',\n",
       " '84',\n",
       " 'study',\n",
       " 'stopped',\n",
       " 'prematurely',\n",
       " 'due',\n",
       " 'insuf',\n",
       " '...',\n",
       " '...',\n",
       " '...',\n",
       " '4967',\n",
       " 'Technical',\n",
       " 'problem',\n",
       " 'blood',\n",
       " 'plasma',\n",
       " 'samples',\n",
       " 'ob',\n",
       " '...',\n",
       " '4968',\n",
       " 'Evolving',\n",
       " 'data',\n",
       " 'Ipatasertib',\n",
       " 'changes',\n",
       " 'th',\n",
       " '...',\n",
       " '4970',\n",
       " 'Study',\n",
       " 'part',\n",
       " 'PhD',\n",
       " 'trajectory',\n",
       " 'currently',\n",
       " '...',\n",
       " '4981',\n",
       " 'funding',\n",
       " '4986',\n",
       " 'sponsor',\n",
       " 'campus',\n",
       " 'training',\n",
       " 'restrictions',\n",
       " '[',\n",
       " '320',\n",
       " 'rows',\n",
       " 'x',\n",
       " '1',\n",
       " 'columns',\n",
       " ']']"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tokenize the sentence\n",
    "words = word_tokenize(df_text)\n",
    "for w in words:\n",
    "    if w.lower() not in stop_words:\n",
    "        filtered_list.append(w)\n",
    "        \n",
    "filtered_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "e72a70f1-cafb-419a-b583-ec4109f9b27f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>WhyStopped</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Pandemic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>situation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>47</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            0\n",
       "0  WhyStopped\n",
       "1          32\n",
       "2    Pandemic\n",
       "3   situation\n",
       "4          47"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filtered_list = pd.DataFrame(filtered_list)\n",
    "filtered_list.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "fa96faae-5434-4c58-b7bf-651a2feec739",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "vect = CountVectorizer()  \n",
    "vects = vect.fit_transform(df_text)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46d746d2-96b1-4e5d-b874-5f10057371f5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "644c76c0-ec8e-41ab-89bf-5e969b8510e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "td = pd.DataFrame(vects.todense()).iloc[:5]  \n",
    "td.columns = vect.get_feature_names()\n",
    "term_document_matrix = td.T\n",
    "term_document_matrix.columns = ['Doc '+str(i) for i in range(1)]\n",
    "term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "cb7105fd-da70-41f9-8a06-6b3872f9425c",
   "metadata": {},
   "outputs": [],
   "source": [
    "term_document_matrix = term_document_matrix.sort_values(by ='total_count',ascending=False)[:25] \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "e4fd7e94-a5a2-4c16-9e5a-8b9dce849e85",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "            Doc 0\n",
      "whystopped      1\n"
     ]
    }
   ],
   "source": [
    "print(term_document_matrix.drop(columns=['total_count']).head(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76510805-d23e-49bf-b76a-85e0cee01f93",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aaeb122a-07d5-4913-a3ba-95346ad8ad87",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reduce dimensionality/complexity\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08019cff-15e1-49f8-98b3-ee4e36d36983",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "988ef047-97d7-487e-970f-7066ad188d5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove stop words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd9a1eec-220f-4e66-ada6-6df19296ac30",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create equivalence classes(lemmatize/stem)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a696263d-6bc4-4d39-92a8-fc7d328884e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter by Frequency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "758dfa0b-9a3b-4460-9caa-34cda2f6b469",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the document feature matrix"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mlenv",
   "language": "python",
   "name": "mlenv"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}