Download this file

1734 lines (1733 with data), 57.5 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "77bf9149-a3d2-4246-9f11-f0fbec29b227",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import dependencies\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "06a126db-bf6e-4a50-9ec4-5be50de9a426",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Rank</th>\n",
       "      <th>OrgStudyId</th>\n",
       "      <th>Gender</th>\n",
       "      <th>MinimumAge</th>\n",
       "      <th>HealthyVolunteers</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>Female</td>\n",
       "      <td>18 Years</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>Female</td>\n",
       "      <td>18 Years</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>Female</td>\n",
       "      <td>20 Years</td>\n",
       "      <td>Accepts Healthy Volunteers</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>Female</td>\n",
       "      <td>18 Years</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>241391</td>\n",
       "      <td>Female</td>\n",
       "      <td>30 Years</td>\n",
       "      <td>Accepts Healthy Volunteers</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Rank       OrgStudyId  Gender MinimumAge           HealthyVolunteers\n",
       "0     1  BTX-BCI-016-PRT  Female   18 Years                          No\n",
       "1     2      2018-TJ-BCD  Female   18 Years                          No\n",
       "2     3    Breast cancer  Female   20 Years  Accepts Healthy Volunteers\n",
       "3     4         BC-BOMET  Female   18 Years                          No\n",
       "4     5           241391  Female   30 Years  Accepts Healthy Volunteers"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "participant_df = pd.read_csv('Tables/participant_df.csv')\n",
    "participant_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "04e3f08d-4393-4b5e-a740-013f14937303",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Rank</th>\n",
       "      <th>OrgStudyId</th>\n",
       "      <th>BriefTitle</th>\n",
       "      <th>StartDate</th>\n",
       "      <th>CompletionDate</th>\n",
       "      <th>OverallStatus</th>\n",
       "      <th>StudyType</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>Breast Cancer Index (BCI) Registry</td>\n",
       "      <td>April 14, 2021</td>\n",
       "      <td>December 2028</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>Diagnosis Value of SEMA4C in Breast Cancer</td>\n",
       "      <td>September 1, 2023</td>\n",
       "      <td>September 1, 2024</td>\n",
       "      <td>Not yet recruiting</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>Role of Sorcin and Annexin A3 in Breast Cancer...</td>\n",
       "      <td>January 20, 2019</td>\n",
       "      <td>September 30, 2019</td>\n",
       "      <td>Unknown status</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>Evaluation of Prognostic Factors: From Breast ...</td>\n",
       "      <td>January 13, 2020</td>\n",
       "      <td>November 12, 2024</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>241391</td>\n",
       "      <td>A Study to Identify Breast Cancer (IDBC)</td>\n",
       "      <td>January 24, 2019</td>\n",
       "      <td>December 31, 2022</td>\n",
       "      <td>Unknown status</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Rank       OrgStudyId                                         BriefTitle  \\\n",
       "0     1  BTX-BCI-016-PRT                 Breast Cancer Index (BCI) Registry   \n",
       "1     2      2018-TJ-BCD         Diagnosis Value of SEMA4C in Breast Cancer   \n",
       "2     3    Breast cancer  Role of Sorcin and Annexin A3 in Breast Cancer...   \n",
       "3     4         BC-BOMET  Evaluation of Prognostic Factors: From Breast ...   \n",
       "4     5           241391           A Study to Identify Breast Cancer (IDBC)   \n",
       "\n",
       "           StartDate      CompletionDate       OverallStatus      StudyType  \n",
       "0     April 14, 2021       December 2028          Recruiting  Observational  \n",
       "1  September 1, 2023   September 1, 2024  Not yet recruiting  Observational  \n",
       "2   January 20, 2019  September 30, 2019      Unknown status  Observational  \n",
       "3   January 13, 2020   November 12, 2024          Recruiting  Observational  \n",
       "4   January 24, 2019   December 31, 2022      Unknown status  Observational  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "registration_df = pd.read_csv('Tables/registration_df.csv')\n",
    "registration_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "864e15c8-f9c3-4b36-a582-e999078f9f2a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Rank</th>\n",
       "      <th>OrgStudyId</th>\n",
       "      <th>IsFDARegulatedDrug</th>\n",
       "      <th>IsFDARegulatedDevice</th>\n",
       "      <th>ResponsiblePartyType</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Principal Investigator</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Principal Investigator</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>241391</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Rank       OrgStudyId IsFDARegulatedDrug IsFDARegulatedDevice  \\\n",
       "0     1  BTX-BCI-016-PRT                 No                   No   \n",
       "1     2      2018-TJ-BCD                 No                   No   \n",
       "2     3    Breast cancer                 No                   No   \n",
       "3     4         BC-BOMET                 No                   No   \n",
       "4     5           241391                 No                   No   \n",
       "\n",
       "     ResponsiblePartyType  \n",
       "0                 Sponsor  \n",
       "1  Principal Investigator  \n",
       "2  Principal Investigator  \n",
       "3                 Sponsor  \n",
       "4                 Sponsor  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "study_info_df = pd.read_csv('Tables/study_details_df.csv')\n",
    "study_info_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "68139eaa-2286-4400-867d-01f43ed93e32",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Rank</th>\n",
       "      <th>OrgStudyId</th>\n",
       "      <th>StudyType</th>\n",
       "      <th>ArmGroupType</th>\n",
       "      <th>InterventionType</th>\n",
       "      <th>DesignInterventionModel</th>\n",
       "      <th>DesignObservationalModel</th>\n",
       "      <th>TargetDuration</th>\n",
       "      <th>SamplingMethod</th>\n",
       "      <th>Phase</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Diagnostic Test</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Only</td>\n",
       "      <td>5 Years</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Diagnostic Test, Diagnostic Test</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Genetic, Other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Other, Other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>241391</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Diagnostic Test</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cohort</td>\n",
       "      <td>1 Year</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Rank       OrgStudyId      StudyType ArmGroupType  \\\n",
       "0     1  BTX-BCI-016-PRT  Observational          NaN   \n",
       "1     2      2018-TJ-BCD  Observational          NaN   \n",
       "2     3    Breast cancer  Observational          NaN   \n",
       "3     4         BC-BOMET  Observational          NaN   \n",
       "4     5           241391  Observational          NaN   \n",
       "\n",
       "                   InterventionType DesignInterventionModel  \\\n",
       "0                   Diagnostic Test                     NaN   \n",
       "1  Diagnostic Test, Diagnostic Test                     NaN   \n",
       "2                    Genetic, Other                     NaN   \n",
       "3                      Other, Other                     NaN   \n",
       "4                   Diagnostic Test                     NaN   \n",
       "\n",
       "  DesignObservationalModel TargetDuration          SamplingMethod Phase  \n",
       "0                Case-Only        5 Years  Non-Probability Sample   NaN  \n",
       "1             Case-Control            NaN  Non-Probability Sample   NaN  \n",
       "2             Case-Control            NaN      Probability Sample   NaN  \n",
       "3             Case-Control            NaN  Non-Probability Sample   NaN  \n",
       "4                   Cohort         1 Year  Non-Probability Sample   NaN  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "study_method_df = pd.read_csv('Tables/study_method_df.csv')\n",
    "study_method_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "f9a563a8-cc94-45f1-b9e1-66225dcf8c5c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Rank</th>\n",
       "      <th>OrgStudyId</th>\n",
       "      <th>Gender</th>\n",
       "      <th>MinimumAge</th>\n",
       "      <th>HealthyVolunteers</th>\n",
       "      <th>Rank</th>\n",
       "      <th>OrgStudyId</th>\n",
       "      <th>BriefTitle</th>\n",
       "      <th>StartDate</th>\n",
       "      <th>CompletionDate</th>\n",
       "      <th>...</th>\n",
       "      <th>Rank</th>\n",
       "      <th>OrgStudyId</th>\n",
       "      <th>StudyType</th>\n",
       "      <th>ArmGroupType</th>\n",
       "      <th>InterventionType</th>\n",
       "      <th>DesignInterventionModel</th>\n",
       "      <th>DesignObservationalModel</th>\n",
       "      <th>TargetDuration</th>\n",
       "      <th>SamplingMethod</th>\n",
       "      <th>Phase</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>Female</td>\n",
       "      <td>18 Years</td>\n",
       "      <td>No</td>\n",
       "      <td>1</td>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>Breast Cancer Index (BCI) Registry</td>\n",
       "      <td>April 14, 2021</td>\n",
       "      <td>December 2028</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Diagnostic Test</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Only</td>\n",
       "      <td>5 Years</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>Female</td>\n",
       "      <td>18 Years</td>\n",
       "      <td>No</td>\n",
       "      <td>2</td>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>Diagnosis Value of SEMA4C in Breast Cancer</td>\n",
       "      <td>September 1, 2023</td>\n",
       "      <td>September 1, 2024</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Diagnostic Test, Diagnostic Test</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>Female</td>\n",
       "      <td>20 Years</td>\n",
       "      <td>Accepts Healthy Volunteers</td>\n",
       "      <td>3</td>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>Role of Sorcin and Annexin A3 in Breast Cancer...</td>\n",
       "      <td>January 20, 2019</td>\n",
       "      <td>September 30, 2019</td>\n",
       "      <td>...</td>\n",
       "      <td>3</td>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Genetic, Other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>Female</td>\n",
       "      <td>18 Years</td>\n",
       "      <td>No</td>\n",
       "      <td>4</td>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>Evaluation of Prognostic Factors: From Breast ...</td>\n",
       "      <td>January 13, 2020</td>\n",
       "      <td>November 12, 2024</td>\n",
       "      <td>...</td>\n",
       "      <td>4</td>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Other, Other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>241391</td>\n",
       "      <td>Female</td>\n",
       "      <td>30 Years</td>\n",
       "      <td>Accepts Healthy Volunteers</td>\n",
       "      <td>5</td>\n",
       "      <td>241391</td>\n",
       "      <td>A Study to Identify Breast Cancer (IDBC)</td>\n",
       "      <td>January 24, 2019</td>\n",
       "      <td>December 31, 2022</td>\n",
       "      <td>...</td>\n",
       "      <td>5</td>\n",
       "      <td>241391</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Diagnostic Test</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cohort</td>\n",
       "      <td>1 Year</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 27 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  Rank       OrgStudyId  Gender MinimumAge           HealthyVolunteers Rank  \\\n",
       "0    1  BTX-BCI-016-PRT  Female   18 Years                          No    1   \n",
       "1    2      2018-TJ-BCD  Female   18 Years                          No    2   \n",
       "2    3    Breast cancer  Female   20 Years  Accepts Healthy Volunteers    3   \n",
       "3    4         BC-BOMET  Female   18 Years                          No    4   \n",
       "4    5           241391  Female   30 Years  Accepts Healthy Volunteers    5   \n",
       "\n",
       "        OrgStudyId                                         BriefTitle  \\\n",
       "0  BTX-BCI-016-PRT                 Breast Cancer Index (BCI) Registry   \n",
       "1      2018-TJ-BCD         Diagnosis Value of SEMA4C in Breast Cancer   \n",
       "2    Breast cancer  Role of Sorcin and Annexin A3 in Breast Cancer...   \n",
       "3         BC-BOMET  Evaluation of Prognostic Factors: From Breast ...   \n",
       "4           241391           A Study to Identify Breast Cancer (IDBC)   \n",
       "\n",
       "           StartDate      CompletionDate  ... Rank       OrgStudyId  \\\n",
       "0     April 14, 2021       December 2028  ...    1  BTX-BCI-016-PRT   \n",
       "1  September 1, 2023   September 1, 2024  ...    2      2018-TJ-BCD   \n",
       "2   January 20, 2019  September 30, 2019  ...    3    Breast cancer   \n",
       "3   January 13, 2020   November 12, 2024  ...    4         BC-BOMET   \n",
       "4   January 24, 2019   December 31, 2022  ...    5           241391   \n",
       "\n",
       "       StudyType ArmGroupType                  InterventionType  \\\n",
       "0  Observational          NaN                   Diagnostic Test   \n",
       "1  Observational          NaN  Diagnostic Test, Diagnostic Test   \n",
       "2  Observational          NaN                    Genetic, Other   \n",
       "3  Observational          NaN                      Other, Other   \n",
       "4  Observational          NaN                   Diagnostic Test   \n",
       "\n",
       "  DesignInterventionModel DesignObservationalModel  TargetDuration  \\\n",
       "0                     NaN                Case-Only         5 Years   \n",
       "1                     NaN             Case-Control             NaN   \n",
       "2                     NaN             Case-Control             NaN   \n",
       "3                     NaN             Case-Control             NaN   \n",
       "4                     NaN                   Cohort          1 Year   \n",
       "\n",
       "           SamplingMethod Phase  \n",
       "0  Non-Probability Sample   NaN  \n",
       "1  Non-Probability Sample   NaN  \n",
       "2      Probability Sample   NaN  \n",
       "3  Non-Probability Sample   NaN  \n",
       "4  Non-Probability Sample   NaN  \n",
       "\n",
       "[5 rows x 27 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ensemble_df = pd.concat([participant_df, registration_df, study_info_df, study_method_df], axis = 1)\n",
    "ensemble_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "38d999fa-6f3a-41c8-bf21-c4a5af127d96",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Gender',\n",
       " 'MinimumAge',\n",
       " 'HealthyVolunteers',\n",
       " 'StartDate',\n",
       " 'CompletionDate',\n",
       " 'OverallStatus',\n",
       " 'StudyType',\n",
       " 'IsFDARegulatedDrug',\n",
       " 'IsFDARegulatedDevice',\n",
       " 'ResponsiblePartyType',\n",
       " 'StudyType',\n",
       " 'ArmGroupType',\n",
       " 'InterventionType',\n",
       " 'DesignInterventionModel',\n",
       " 'DesignObservationalModel',\n",
       " 'TargetDuration',\n",
       " 'SamplingMethod',\n",
       " 'Phase']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ensemble_df = ensemble_df.drop(['BriefTitle', 'Rank', 'OrgStudyId'], axis = 1)\n",
    "list(ensemble_df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "8a2cb06c-e200-478a-862f-50501e48f6d7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5003, 18)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ensemble_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "6a18a028-83a3-4faa-a746-197f0094bd30",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Gender</th>\n",
       "      <th>MinimumAge</th>\n",
       "      <th>HealthyVolunteers</th>\n",
       "      <th>StartDate</th>\n",
       "      <th>CompletionDate</th>\n",
       "      <th>OverallStatus</th>\n",
       "      <th>StudyType</th>\n",
       "      <th>IsFDARegulatedDrug</th>\n",
       "      <th>IsFDARegulatedDevice</th>\n",
       "      <th>ResponsiblePartyType</th>\n",
       "      <th>StudyType</th>\n",
       "      <th>ArmGroupType</th>\n",
       "      <th>InterventionType</th>\n",
       "      <th>DesignInterventionModel</th>\n",
       "      <th>DesignObservationalModel</th>\n",
       "      <th>TargetDuration</th>\n",
       "      <th>SamplingMethod</th>\n",
       "      <th>Phase</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Female</td>\n",
       "      <td>18 Years</td>\n",
       "      <td>No</td>\n",
       "      <td>April 14, 2021</td>\n",
       "      <td>December 2028</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>Observational</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Diagnostic Test</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Only</td>\n",
       "      <td>5 Years</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Female</td>\n",
       "      <td>18 Years</td>\n",
       "      <td>No</td>\n",
       "      <td>September 1, 2023</td>\n",
       "      <td>September 1, 2024</td>\n",
       "      <td>Not yet recruiting</td>\n",
       "      <td>Observational</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Principal Investigator</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Diagnostic Test, Diagnostic Test</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Female</td>\n",
       "      <td>20 Years</td>\n",
       "      <td>Accepts Healthy Volunteers</td>\n",
       "      <td>January 20, 2019</td>\n",
       "      <td>September 30, 2019</td>\n",
       "      <td>Unknown status</td>\n",
       "      <td>Observational</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Principal Investigator</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Genetic, Other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Female</td>\n",
       "      <td>18 Years</td>\n",
       "      <td>No</td>\n",
       "      <td>January 13, 2020</td>\n",
       "      <td>November 12, 2024</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>Observational</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Other, Other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Female</td>\n",
       "      <td>30 Years</td>\n",
       "      <td>Accepts Healthy Volunteers</td>\n",
       "      <td>January 24, 2019</td>\n",
       "      <td>December 31, 2022</td>\n",
       "      <td>Unknown status</td>\n",
       "      <td>Observational</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Diagnostic Test</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cohort</td>\n",
       "      <td>1 Year</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Gender MinimumAge           HealthyVolunteers          StartDate  \\\n",
       "0  Female   18 Years                          No     April 14, 2021   \n",
       "1  Female   18 Years                          No  September 1, 2023   \n",
       "2  Female   20 Years  Accepts Healthy Volunteers   January 20, 2019   \n",
       "3  Female   18 Years                          No   January 13, 2020   \n",
       "4  Female   30 Years  Accepts Healthy Volunteers   January 24, 2019   \n",
       "\n",
       "       CompletionDate       OverallStatus      StudyType IsFDARegulatedDrug  \\\n",
       "0       December 2028          Recruiting  Observational                 No   \n",
       "1   September 1, 2024  Not yet recruiting  Observational                 No   \n",
       "2  September 30, 2019      Unknown status  Observational                 No   \n",
       "3   November 12, 2024          Recruiting  Observational                 No   \n",
       "4   December 31, 2022      Unknown status  Observational                 No   \n",
       "\n",
       "  IsFDARegulatedDevice    ResponsiblePartyType      StudyType ArmGroupType  \\\n",
       "0                   No                 Sponsor  Observational          NaN   \n",
       "1                   No  Principal Investigator  Observational          NaN   \n",
       "2                   No  Principal Investigator  Observational          NaN   \n",
       "3                   No                 Sponsor  Observational          NaN   \n",
       "4                   No                 Sponsor  Observational          NaN   \n",
       "\n",
       "                   InterventionType DesignInterventionModel  \\\n",
       "0                   Diagnostic Test                     NaN   \n",
       "1  Diagnostic Test, Diagnostic Test                     NaN   \n",
       "2                    Genetic, Other                     NaN   \n",
       "3                      Other, Other                     NaN   \n",
       "4                   Diagnostic Test                     NaN   \n",
       "\n",
       "  DesignObservationalModel TargetDuration          SamplingMethod Phase  \n",
       "0                Case-Only        5 Years  Non-Probability Sample   NaN  \n",
       "1             Case-Control            NaN  Non-Probability Sample   NaN  \n",
       "2             Case-Control            NaN      Probability Sample   NaN  \n",
       "3             Case-Control            NaN  Non-Probability Sample   NaN  \n",
       "4                   Cohort         1 Year  Non-Probability Sample   NaN  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ensemble_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "a0b0e07c-bf15-41d7-88d2-e30b101e3000",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Gender                         3\n",
       "MinimumAge                    46\n",
       "HealthyVolunteers              2\n",
       "StartDate                   1531\n",
       "CompletionDate              1566\n",
       "OverallStatus                  9\n",
       "StudyType                      2\n",
       "IsFDARegulatedDrug             2\n",
       "IsFDARegulatedDevice           2\n",
       "ResponsiblePartyType           3\n",
       "StudyType                      2\n",
       "ArmGroupType                 161\n",
       "InterventionType             422\n",
       "DesignInterventionModel        5\n",
       "DesignObservationalModel       7\n",
       "TargetDuration                27\n",
       "SamplingMethod                 2\n",
       "Phase                          8\n",
       "dtype: int64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ensemble_df.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "5648c1c4-9380-409a-8d12-d8267c3cda5e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Gender</th>\n",
       "      <th>HealthyVolunteers</th>\n",
       "      <th>OverallStatus</th>\n",
       "      <th>IsFDARegulatedDrug</th>\n",
       "      <th>IsFDARegulatedDevice</th>\n",
       "      <th>ResponsiblePartyType</th>\n",
       "      <th>StudyType</th>\n",
       "      <th>StudyType</th>\n",
       "      <th>DesignInterventionModel</th>\n",
       "      <th>DesignObservationalModel</th>\n",
       "      <th>SamplingMethod</th>\n",
       "      <th>Phase</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "      <td>Observational</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Only</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Not yet recruiting</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Principal Investigator</td>\n",
       "      <td>Observational</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Female</td>\n",
       "      <td>Accepts Healthy Volunteers</td>\n",
       "      <td>Unknown status</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Principal Investigator</td>\n",
       "      <td>Observational</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "      <td>Observational</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Female</td>\n",
       "      <td>Accepts Healthy Volunteers</td>\n",
       "      <td>Unknown status</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "      <td>Observational</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cohort</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Gender           HealthyVolunteers       OverallStatus IsFDARegulatedDrug  \\\n",
       "0  Female                          No          Recruiting                 No   \n",
       "1  Female                          No  Not yet recruiting                 No   \n",
       "2  Female  Accepts Healthy Volunteers      Unknown status                 No   \n",
       "3  Female                          No          Recruiting                 No   \n",
       "4  Female  Accepts Healthy Volunteers      Unknown status                 No   \n",
       "\n",
       "  IsFDARegulatedDevice    ResponsiblePartyType      StudyType      StudyType  \\\n",
       "0                   No                 Sponsor  Observational  Observational   \n",
       "1                   No  Principal Investigator  Observational  Observational   \n",
       "2                   No  Principal Investigator  Observational  Observational   \n",
       "3                   No                 Sponsor  Observational  Observational   \n",
       "4                   No                 Sponsor  Observational  Observational   \n",
       "\n",
       "  DesignInterventionModel DesignObservationalModel          SamplingMethod  \\\n",
       "0                     NaN                Case-Only  Non-Probability Sample   \n",
       "1                     NaN             Case-Control  Non-Probability Sample   \n",
       "2                     NaN             Case-Control      Probability Sample   \n",
       "3                     NaN             Case-Control  Non-Probability Sample   \n",
       "4                     NaN                   Cohort  Non-Probability Sample   \n",
       "\n",
       "  Phase  \n",
       "0   NaN  \n",
       "1   NaN  \n",
       "2   NaN  \n",
       "3   NaN  \n",
       "4   NaN  "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model = ensemble_df[['Gender','HealthyVolunteers', 'OverallStatus', 'IsFDARegulatedDrug', 'IsFDARegulatedDevice', 'ResponsiblePartyType','StudyType',\n",
    "                         'DesignInterventionModel', 'DesignObservationalModel', 'SamplingMethod', 'Phase']].copy()\n",
    "cat_model.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "2079cf2d-1000-4fb2-b30b-71324afa601e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Gender</th>\n",
       "      <th>HealthyVolunteers</th>\n",
       "      <th>OverallStatus</th>\n",
       "      <th>IsFDARegulatedDrug</th>\n",
       "      <th>IsFDARegulatedDevice</th>\n",
       "      <th>ResponsiblePartyType</th>\n",
       "      <th>StudyType</th>\n",
       "      <th>DesignInterventionModel</th>\n",
       "      <th>DesignObservationalModel</th>\n",
       "      <th>SamplingMethod</th>\n",
       "      <th>Phase</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Only</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Not yet recruiting</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Principal Investigator</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Female</td>\n",
       "      <td>Accepts Healthy Volunteers</td>\n",
       "      <td>Unknown status</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Principal Investigator</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Female</td>\n",
       "      <td>Accepts Healthy Volunteers</td>\n",
       "      <td>Unknown status</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cohort</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Gender           HealthyVolunteers       OverallStatus IsFDARegulatedDrug  \\\n",
       "0  Female                          No          Recruiting                 No   \n",
       "1  Female                          No  Not yet recruiting                 No   \n",
       "2  Female  Accepts Healthy Volunteers      Unknown status                 No   \n",
       "3  Female                          No          Recruiting                 No   \n",
       "4  Female  Accepts Healthy Volunteers      Unknown status                 No   \n",
       "\n",
       "  IsFDARegulatedDevice    ResponsiblePartyType      StudyType  \\\n",
       "0                   No                 Sponsor  Observational   \n",
       "1                   No  Principal Investigator  Observational   \n",
       "2                   No  Principal Investigator  Observational   \n",
       "3                   No                 Sponsor  Observational   \n",
       "4                   No                 Sponsor  Observational   \n",
       "\n",
       "  DesignInterventionModel DesignObservationalModel          SamplingMethod  \\\n",
       "0                     NaN                Case-Only  Non-Probability Sample   \n",
       "1                     NaN             Case-Control  Non-Probability Sample   \n",
       "2                     NaN             Case-Control      Probability Sample   \n",
       "3                     NaN             Case-Control  Non-Probability Sample   \n",
       "4                     NaN                   Cohort  Non-Probability Sample   \n",
       "\n",
       "  Phase  \n",
       "0   NaN  \n",
       "1   NaN  \n",
       "2   NaN  \n",
       "3   NaN  \n",
       "4   NaN  "
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model = cat_model.loc[:,~cat_model.columns.duplicated()].copy()\n",
    "cat_model.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "f2891082-fa29-4eee-9222-085362204e7d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Gender                         1\n",
       "HealthyVolunteers             89\n",
       "OverallStatus                  0\n",
       "IsFDARegulatedDrug            51\n",
       "IsFDARegulatedDevice          52\n",
       "ResponsiblePartyType           0\n",
       "StudyType                      0\n",
       "DesignInterventionModel      534\n",
       "DesignObservationalModel    1233\n",
       "SamplingMethod              1232\n",
       "Phase                        534\n",
       "dtype: int64"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "0bd82701-b7f2-4191-be57-d35cdfa0e04c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "No                            1311\n",
       "Accepts Healthy Volunteers     366\n",
       "Name: HealthyVolunteers, dtype: int64"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model['HealthyVolunteers'].value_counts()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "399e609c-bce5-4aa3-9153-39eda87dbedb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Parallel Assignment        533\n",
       "Single Group Assignment    477\n",
       "Sequential Assignment      144\n",
       "Crossover Assignment        57\n",
       "Factorial Assignment        21\n",
       "Name: DesignInterventionModel, dtype: int64"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model['DesignInterventionModel'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "813b0463-48f8-4502-821d-1587f3231945",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cohort                   271\n",
       "Case-Only                 83\n",
       "Case-Control              81\n",
       "Other                     79\n",
       "Ecologic or Community      9\n",
       "Case-Crossover             8\n",
       "Family-Based               2\n",
       "Name: DesignObservationalModel, dtype: int64"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model['DesignObservationalModel'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "52cd2eb6-6f20-4540-8cd8-ea764ab1b197",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Non-Probability Sample    362\n",
       "Probability Sample        172\n",
       "Name: SamplingMethod, dtype: int64"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model['SamplingMethod'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "9811b61e-8d0c-4574-a590-fe25d0d87106",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Recruiting                 522\n",
       "Completed                  309\n",
       "Not yet recruiting         255\n",
       "Active, not recruiting     212\n",
       "Unknown status             202\n",
       "Withdrawn                   97\n",
       "Terminated                  86\n",
       "Enrolling by invitation     55\n",
       "Suspended                   28\n",
       "Name: OverallStatus, dtype: int64"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model['OverallStatus'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "f199b698-c3b7-4a56-a212-6dea51eb5b0c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "No     1255\n",
       "Yes     460\n",
       "Name: IsFDARegulatedDrug, dtype: int64"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model['IsFDARegulatedDrug'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "4279c2e7-eb6f-4cc8-baed-e933fa53d3cc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "No     1512\n",
       "Yes     202\n",
       "Name: IsFDARegulatedDevice, dtype: int64"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model['IsFDARegulatedDevice'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "20949ea3-0de0-42ea-9803-9a1fc99aa24d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Not Applicable      409\n",
       "Phase 2             225\n",
       "Phase 1             193\n",
       "Phase 1, Phase 2    134\n",
       "Phase 3             100\n",
       "Early Phase 1        72\n",
       "Phase 4              63\n",
       "Phase 2, Phase 3     36\n",
       "Name: Phase, dtype: int64"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model['Phase'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "9b5adbbc-bf00-4d1c-bf68-0e5ee442ba66",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Interventional    1237\n",
       "Observational      529\n",
       "Name: StudyType, dtype: int64"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model['StudyType'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "ca652c63-4220-4e78-8920-27bfb3bbc843",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1766, 11)"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_model.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de7f7674-0cdb-4aa0-92e9-ab8401e23d5d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1533f628-4c63-4d9e-85a4-dc0b302b3942",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39ab7359-4893-4402-b83a-84c3d960c583",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50c643d5-d9be-409f-880f-3476bb7a4e55",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the names of the categorical (object-dtype) columns\n",
    "trial_cat = cat_model.select_dtypes(include=\"object\").columns.tolist()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "018850bd-0503-45e8-a04d-2aba12c11472",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode labels with pd.get_dummies or create a OneHotEncoder instance\n",
    "\n",
    "# Fit and transform the OHE using the categorical variable list\n",
    "\n",
    "# Add the encoded variable names to the dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f041f78-3a76-42ae-9eac-f36d343c5e47",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge OHE features and drop the originals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53030acd-0124-402b-a001-db2b0676d0f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split preprocessed data into features and target arrays\n",
    "X = \n",
    "\n",
    "y = "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae631b6d-0435-42b2-8df4-b4f821f475f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split data into training and testing datasets\n",
    "# A fixed random_state keeps the split identical across kernel restarts, so results are reproducible.\n",
    "# NOTE(review): train_test_split is not imported in the visible imports cell;\n",
    "# it presumably comes from sklearn.model_selection -- confirm and add it to the imports cell.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "051a1912-1d40-40b7-a173-7b4f01bc8b01",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a StandardScaler instance\n",
    "scaler = StandardScaler()\n",
    "\n",
    "# Fit the StandardScaler\n",
    "X_scaler = scaler.fit(X_train)\n",
    "\n",
    "# Scale the data\n",
    "X_train_scaled = X_scaler.transform(X_train)\n",
    "X_test_scaled = X_scaler.transform(X_test)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mlenv",
   "language": "python",
   "name": "mlenv"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}