1734 lines (1733 with data), 57.5 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "77bf9149-a3d2-4246-9f11-f0fbec29b227",
"metadata": {},
"outputs": [],
"source": [
"# Import dependencies\n",
"import numpy as np\n",
"import pandas as pd\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "06a126db-bf6e-4a50-9ec4-5be50de9a426",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>OrgStudyId</th>\n",
" <th>Gender</th>\n",
" <th>MinimumAge</th>\n",
" <th>HealthyVolunteers</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>Female</td>\n",
" <td>18 Years</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>Female</td>\n",
" <td>18 Years</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Breast cancer</td>\n",
" <td>Female</td>\n",
" <td>20 Years</td>\n",
" <td>Accepts Healthy Volunteers</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>BC-BOMET</td>\n",
" <td>Female</td>\n",
" <td>18 Years</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>241391</td>\n",
" <td>Female</td>\n",
" <td>30 Years</td>\n",
" <td>Accepts Healthy Volunteers</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Rank OrgStudyId Gender MinimumAge HealthyVolunteers\n",
"0 1 BTX-BCI-016-PRT Female 18 Years No\n",
"1 2 2018-TJ-BCD Female 18 Years No\n",
"2 3 Breast cancer Female 20 Years Accepts Healthy Volunteers\n",
"3 4 BC-BOMET Female 18 Years No\n",
"4 5 241391 Female 30 Years Accepts Healthy Volunteers"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Participant eligibility table: Gender, MinimumAge and HealthyVolunteers,\n",
"# plus the Rank and OrgStudyId columns.\n",
"participant_csv = 'Tables/participant_df.csv'\n",
"participant_df = pd.read_csv(participant_csv)\n",
"participant_df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "04e3f08d-4393-4b5e-a740-013f14937303",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>OrgStudyId</th>\n",
" <th>BriefTitle</th>\n",
" <th>StartDate</th>\n",
" <th>CompletionDate</th>\n",
" <th>OverallStatus</th>\n",
" <th>StudyType</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>Breast Cancer Index (BCI) Registry</td>\n",
" <td>April 14, 2021</td>\n",
" <td>December 2028</td>\n",
" <td>Recruiting</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>Diagnosis Value of SEMA4C in Breast Cancer</td>\n",
" <td>September 1, 2023</td>\n",
" <td>September 1, 2024</td>\n",
" <td>Not yet recruiting</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Breast cancer</td>\n",
" <td>Role of Sorcin and Annexin A3 in Breast Cancer...</td>\n",
" <td>January 20, 2019</td>\n",
" <td>September 30, 2019</td>\n",
" <td>Unknown status</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>BC-BOMET</td>\n",
" <td>Evaluation of Prognostic Factors: From Breast ...</td>\n",
" <td>January 13, 2020</td>\n",
" <td>November 12, 2024</td>\n",
" <td>Recruiting</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>241391</td>\n",
" <td>A Study to Identify Breast Cancer (IDBC)</td>\n",
" <td>January 24, 2019</td>\n",
" <td>December 31, 2022</td>\n",
" <td>Unknown status</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Rank OrgStudyId BriefTitle \\\n",
"0 1 BTX-BCI-016-PRT Breast Cancer Index (BCI) Registry \n",
"1 2 2018-TJ-BCD Diagnosis Value of SEMA4C in Breast Cancer \n",
"2 3 Breast cancer Role of Sorcin and Annexin A3 in Breast Cancer... \n",
"3 4 BC-BOMET Evaluation of Prognostic Factors: From Breast ... \n",
"4 5 241391 A Study to Identify Breast Cancer (IDBC) \n",
"\n",
" StartDate CompletionDate OverallStatus StudyType \n",
"0 April 14, 2021 December 2028 Recruiting Observational \n",
"1 September 1, 2023 September 1, 2024 Not yet recruiting Observational \n",
"2 January 20, 2019 September 30, 2019 Unknown status Observational \n",
"3 January 13, 2020 November 12, 2024 Recruiting Observational \n",
"4 January 24, 2019 December 31, 2022 Unknown status Observational "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Registration table: BriefTitle, StartDate, CompletionDate, OverallStatus and\n",
"# StudyType, plus the Rank and OrgStudyId columns.\n",
"registration_csv = 'Tables/registration_df.csv'\n",
"registration_df = pd.read_csv(registration_csv)\n",
"registration_df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "864e15c8-f9c3-4b36-a582-e999078f9f2a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>OrgStudyId</th>\n",
" <th>IsFDARegulatedDrug</th>\n",
" <th>IsFDARegulatedDevice</th>\n",
" <th>ResponsiblePartyType</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Principal Investigator</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Breast cancer</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Principal Investigator</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>BC-BOMET</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>241391</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Rank OrgStudyId IsFDARegulatedDrug IsFDARegulatedDevice \\\n",
"0 1 BTX-BCI-016-PRT No No \n",
"1 2 2018-TJ-BCD No No \n",
"2 3 Breast cancer No No \n",
"3 4 BC-BOMET No No \n",
"4 5 241391 No No \n",
"\n",
" ResponsiblePartyType \n",
"0 Sponsor \n",
"1 Principal Investigator \n",
"2 Principal Investigator \n",
"3 Sponsor \n",
"4 Sponsor "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Study details table: IsFDARegulatedDrug, IsFDARegulatedDevice and\n",
"# ResponsiblePartyType, plus the Rank and OrgStudyId columns.\n",
"# Note the file on disk is named study_details_df.csv, not study_info_df.csv.\n",
"study_info_csv = 'Tables/study_details_df.csv'\n",
"study_info_df = pd.read_csv(study_info_csv)\n",
"study_info_df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "68139eaa-2286-4400-867d-01f43ed93e32",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>OrgStudyId</th>\n",
" <th>StudyType</th>\n",
" <th>ArmGroupType</th>\n",
" <th>InterventionType</th>\n",
" <th>DesignInterventionModel</th>\n",
" <th>DesignObservationalModel</th>\n",
" <th>TargetDuration</th>\n",
" <th>SamplingMethod</th>\n",
" <th>Phase</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Diagnostic Test</td>\n",
" <td>NaN</td>\n",
" <td>Case-Only</td>\n",
" <td>5 Years</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Diagnostic Test, Diagnostic Test</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>NaN</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Breast cancer</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Genetic, Other</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>NaN</td>\n",
" <td>Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>BC-BOMET</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Other, Other</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>NaN</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>241391</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Diagnostic Test</td>\n",
" <td>NaN</td>\n",
" <td>Cohort</td>\n",
" <td>1 Year</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Rank OrgStudyId StudyType ArmGroupType \\\n",
"0 1 BTX-BCI-016-PRT Observational NaN \n",
"1 2 2018-TJ-BCD Observational NaN \n",
"2 3 Breast cancer Observational NaN \n",
"3 4 BC-BOMET Observational NaN \n",
"4 5 241391 Observational NaN \n",
"\n",
" InterventionType DesignInterventionModel \\\n",
"0 Diagnostic Test NaN \n",
"1 Diagnostic Test, Diagnostic Test NaN \n",
"2 Genetic, Other NaN \n",
"3 Other, Other NaN \n",
"4 Diagnostic Test NaN \n",
"\n",
" DesignObservationalModel TargetDuration SamplingMethod Phase \n",
"0 Case-Only 5 Years Non-Probability Sample NaN \n",
"1 Case-Control NaN Non-Probability Sample NaN \n",
"2 Case-Control NaN Probability Sample NaN \n",
"3 Case-Control NaN Non-Probability Sample NaN \n",
"4 Cohort 1 Year Non-Probability Sample NaN "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Study method table: StudyType, arm/intervention types, design models,\n",
"# TargetDuration, SamplingMethod and Phase, plus the Rank and OrgStudyId columns.\n",
"study_method_csv = 'Tables/study_method_df.csv'\n",
"study_method_df = pd.read_csv(study_method_csv)\n",
"study_method_df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "f9a563a8-cc94-45f1-b9e1-66225dcf8c5c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>OrgStudyId</th>\n",
" <th>Gender</th>\n",
" <th>MinimumAge</th>\n",
" <th>HealthyVolunteers</th>\n",
" <th>Rank</th>\n",
" <th>OrgStudyId</th>\n",
" <th>BriefTitle</th>\n",
" <th>StartDate</th>\n",
" <th>CompletionDate</th>\n",
" <th>...</th>\n",
" <th>Rank</th>\n",
" <th>OrgStudyId</th>\n",
" <th>StudyType</th>\n",
" <th>ArmGroupType</th>\n",
" <th>InterventionType</th>\n",
" <th>DesignInterventionModel</th>\n",
" <th>DesignObservationalModel</th>\n",
" <th>TargetDuration</th>\n",
" <th>SamplingMethod</th>\n",
" <th>Phase</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>Female</td>\n",
" <td>18 Years</td>\n",
" <td>No</td>\n",
" <td>1</td>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>Breast Cancer Index (BCI) Registry</td>\n",
" <td>April 14, 2021</td>\n",
" <td>December 2028</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Diagnostic Test</td>\n",
" <td>NaN</td>\n",
" <td>Case-Only</td>\n",
" <td>5 Years</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>Female</td>\n",
" <td>18 Years</td>\n",
" <td>No</td>\n",
" <td>2</td>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>Diagnosis Value of SEMA4C in Breast Cancer</td>\n",
" <td>September 1, 2023</td>\n",
" <td>September 1, 2024</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Diagnostic Test, Diagnostic Test</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>NaN</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Breast cancer</td>\n",
" <td>Female</td>\n",
" <td>20 Years</td>\n",
" <td>Accepts Healthy Volunteers</td>\n",
" <td>3</td>\n",
" <td>Breast cancer</td>\n",
" <td>Role of Sorcin and Annexin A3 in Breast Cancer...</td>\n",
" <td>January 20, 2019</td>\n",
" <td>September 30, 2019</td>\n",
" <td>...</td>\n",
" <td>3</td>\n",
" <td>Breast cancer</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Genetic, Other</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>NaN</td>\n",
" <td>Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>BC-BOMET</td>\n",
" <td>Female</td>\n",
" <td>18 Years</td>\n",
" <td>No</td>\n",
" <td>4</td>\n",
" <td>BC-BOMET</td>\n",
" <td>Evaluation of Prognostic Factors: From Breast ...</td>\n",
" <td>January 13, 2020</td>\n",
" <td>November 12, 2024</td>\n",
" <td>...</td>\n",
" <td>4</td>\n",
" <td>BC-BOMET</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Other, Other</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>NaN</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>241391</td>\n",
" <td>Female</td>\n",
" <td>30 Years</td>\n",
" <td>Accepts Healthy Volunteers</td>\n",
" <td>5</td>\n",
" <td>241391</td>\n",
" <td>A Study to Identify Breast Cancer (IDBC)</td>\n",
" <td>January 24, 2019</td>\n",
" <td>December 31, 2022</td>\n",
" <td>...</td>\n",
" <td>5</td>\n",
" <td>241391</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Diagnostic Test</td>\n",
" <td>NaN</td>\n",
" <td>Cohort</td>\n",
" <td>1 Year</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 27 columns</p>\n",
"</div>"
],
"text/plain": [
" Rank OrgStudyId Gender MinimumAge HealthyVolunteers Rank \\\n",
"0 1 BTX-BCI-016-PRT Female 18 Years No 1 \n",
"1 2 2018-TJ-BCD Female 18 Years No 2 \n",
"2 3 Breast cancer Female 20 Years Accepts Healthy Volunteers 3 \n",
"3 4 BC-BOMET Female 18 Years No 4 \n",
"4 5 241391 Female 30 Years Accepts Healthy Volunteers 5 \n",
"\n",
" OrgStudyId BriefTitle \\\n",
"0 BTX-BCI-016-PRT Breast Cancer Index (BCI) Registry \n",
"1 2018-TJ-BCD Diagnosis Value of SEMA4C in Breast Cancer \n",
"2 Breast cancer Role of Sorcin and Annexin A3 in Breast Cancer... \n",
"3 BC-BOMET Evaluation of Prognostic Factors: From Breast ... \n",
"4 241391 A Study to Identify Breast Cancer (IDBC) \n",
"\n",
" StartDate CompletionDate ... Rank OrgStudyId \\\n",
"0 April 14, 2021 December 2028 ... 1 BTX-BCI-016-PRT \n",
"1 September 1, 2023 September 1, 2024 ... 2 2018-TJ-BCD \n",
"2 January 20, 2019 September 30, 2019 ... 3 Breast cancer \n",
"3 January 13, 2020 November 12, 2024 ... 4 BC-BOMET \n",
"4 January 24, 2019 December 31, 2022 ... 5 241391 \n",
"\n",
" StudyType ArmGroupType InterventionType \\\n",
"0 Observational NaN Diagnostic Test \n",
"1 Observational NaN Diagnostic Test, Diagnostic Test \n",
"2 Observational NaN Genetic, Other \n",
"3 Observational NaN Other, Other \n",
"4 Observational NaN Diagnostic Test \n",
"\n",
" DesignInterventionModel DesignObservationalModel TargetDuration \\\n",
"0 NaN Case-Only 5 Years \n",
"1 NaN Case-Control NaN \n",
"2 NaN Case-Control NaN \n",
"3 NaN Case-Control NaN \n",
"4 NaN Cohort 1 Year \n",
"\n",
" SamplingMethod Phase \n",
"0 Non-Probability Sample NaN \n",
"1 Non-Probability Sample NaN \n",
"2 Probability Sample NaN \n",
"3 Non-Probability Sample NaN \n",
"4 Non-Probability Sample NaN \n",
"\n",
"[5 rows x 27 columns]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Combine the four per-study tables on the key columns they share instead of\n",
"# placing them side by side by row position. A positional pd.concat(axis=1)\n",
"# silently pairs up the wrong studies if any CSV is ordered or sized differently,\n",
"# and it repeats Rank / OrgStudyId once per table.\n",
"key_cols = ['Rank', 'OrgStudyId']\n",
"ensemble_df = participant_df\n",
"# StudyType exists in both registration_df and study_method_df; keep the\n",
"# registration copy only so the combined frame has unique column labels.\n",
"for table in (registration_df, study_info_df, study_method_df.drop(columns='StudyType')):\n",
"    # how='outer' keeps studies missing from one table (as concat did);\n",
"    # validate raises MergeError on repeated keys instead of multiplying rows.\n",
"    ensemble_df = ensemble_df.merge(table, on=key_cols, how='outer', validate='one_to_one')\n",
"ensemble_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "38d999fa-6f3a-41c8-bf21-c4a5af127d96",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Gender',\n",
" 'MinimumAge',\n",
" 'HealthyVolunteers',\n",
" 'StartDate',\n",
" 'CompletionDate',\n",
" 'OverallStatus',\n",
" 'StudyType',\n",
" 'IsFDARegulatedDrug',\n",
" 'IsFDARegulatedDevice',\n",
" 'ResponsiblePartyType',\n",
" 'StudyType',\n",
" 'ArmGroupType',\n",
" 'InterventionType',\n",
" 'DesignInterventionModel',\n",
" 'DesignObservationalModel',\n",
" 'TargetDuration',\n",
" 'SamplingMethod',\n",
" 'Phase']"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Remove the title and the Rank / OrgStudyId columns (every copy of a repeated\n",
"# label is dropped). errors='ignore' makes the cell safe to re-run: once the\n",
"# columns are gone a second execution is a no-op instead of a KeyError.\n",
"ensemble_df = ensemble_df.drop(columns=['BriefTitle', 'Rank', 'OrgStudyId'], errors='ignore')\n",
"list(ensemble_df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "8a2cb06c-e200-478a-862f-50501e48f6d7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(5003, 18)"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# (rows, columns) of the combined table at this point in the notebook\n",
"ensemble_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "6a18a028-83a3-4faa-a746-197f0094bd30",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Gender</th>\n",
" <th>MinimumAge</th>\n",
" <th>HealthyVolunteers</th>\n",
" <th>StartDate</th>\n",
" <th>CompletionDate</th>\n",
" <th>OverallStatus</th>\n",
" <th>StudyType</th>\n",
" <th>IsFDARegulatedDrug</th>\n",
" <th>IsFDARegulatedDevice</th>\n",
" <th>ResponsiblePartyType</th>\n",
" <th>StudyType</th>\n",
" <th>ArmGroupType</th>\n",
" <th>InterventionType</th>\n",
" <th>DesignInterventionModel</th>\n",
" <th>DesignObservationalModel</th>\n",
" <th>TargetDuration</th>\n",
" <th>SamplingMethod</th>\n",
" <th>Phase</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Female</td>\n",
" <td>18 Years</td>\n",
" <td>No</td>\n",
" <td>April 14, 2021</td>\n",
" <td>December 2028</td>\n",
" <td>Recruiting</td>\n",
" <td>Observational</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Diagnostic Test</td>\n",
" <td>NaN</td>\n",
" <td>Case-Only</td>\n",
" <td>5 Years</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Female</td>\n",
" <td>18 Years</td>\n",
" <td>No</td>\n",
" <td>September 1, 2023</td>\n",
" <td>September 1, 2024</td>\n",
" <td>Not yet recruiting</td>\n",
" <td>Observational</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Principal Investigator</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Diagnostic Test, Diagnostic Test</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>NaN</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Female</td>\n",
" <td>20 Years</td>\n",
" <td>Accepts Healthy Volunteers</td>\n",
" <td>January 20, 2019</td>\n",
" <td>September 30, 2019</td>\n",
" <td>Unknown status</td>\n",
" <td>Observational</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Principal Investigator</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Genetic, Other</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>NaN</td>\n",
" <td>Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Female</td>\n",
" <td>18 Years</td>\n",
" <td>No</td>\n",
" <td>January 13, 2020</td>\n",
" <td>November 12, 2024</td>\n",
" <td>Recruiting</td>\n",
" <td>Observational</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Other, Other</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>NaN</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Female</td>\n",
" <td>30 Years</td>\n",
" <td>Accepts Healthy Volunteers</td>\n",
" <td>January 24, 2019</td>\n",
" <td>December 31, 2022</td>\n",
" <td>Unknown status</td>\n",
" <td>Observational</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Diagnostic Test</td>\n",
" <td>NaN</td>\n",
" <td>Cohort</td>\n",
" <td>1 Year</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Gender MinimumAge HealthyVolunteers StartDate \\\n",
"0 Female 18 Years No April 14, 2021 \n",
"1 Female 18 Years No September 1, 2023 \n",
"2 Female 20 Years Accepts Healthy Volunteers January 20, 2019 \n",
"3 Female 18 Years No January 13, 2020 \n",
"4 Female 30 Years Accepts Healthy Volunteers January 24, 2019 \n",
"\n",
" CompletionDate OverallStatus StudyType IsFDARegulatedDrug \\\n",
"0 December 2028 Recruiting Observational No \n",
"1 September 1, 2024 Not yet recruiting Observational No \n",
"2 September 30, 2019 Unknown status Observational No \n",
"3 November 12, 2024 Recruiting Observational No \n",
"4 December 31, 2022 Unknown status Observational No \n",
"\n",
" IsFDARegulatedDevice ResponsiblePartyType StudyType ArmGroupType \\\n",
"0 No Sponsor Observational NaN \n",
"1 No Principal Investigator Observational NaN \n",
"2 No Principal Investigator Observational NaN \n",
"3 No Sponsor Observational NaN \n",
"4 No Sponsor Observational NaN \n",
"\n",
" InterventionType DesignInterventionModel \\\n",
"0 Diagnostic Test NaN \n",
"1 Diagnostic Test, Diagnostic Test NaN \n",
"2 Genetic, Other NaN \n",
"3 Other, Other NaN \n",
"4 Diagnostic Test NaN \n",
"\n",
" DesignObservationalModel TargetDuration SamplingMethod Phase \n",
"0 Case-Only 5 Years Non-Probability Sample NaN \n",
"1 Case-Control NaN Non-Probability Sample NaN \n",
"2 Case-Control NaN Probability Sample NaN \n",
"3 Case-Control NaN Non-Probability Sample NaN \n",
"4 Cohort 1 Year Non-Probability Sample NaN "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the combined table\n",
"ensemble_df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "a0b0e07c-bf15-41d7-88d2-e30b101e3000",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Gender 3\n",
"MinimumAge 46\n",
"HealthyVolunteers 2\n",
"StartDate 1531\n",
"CompletionDate 1566\n",
"OverallStatus 9\n",
"StudyType 2\n",
"IsFDARegulatedDrug 2\n",
"IsFDARegulatedDevice 2\n",
"ResponsiblePartyType 3\n",
"StudyType 2\n",
"ArmGroupType 161\n",
"InterventionType 422\n",
"DesignInterventionModel 5\n",
"DesignObservationalModel 7\n",
"TargetDuration 27\n",
"SamplingMethod 2\n",
"Phase 8\n",
"dtype: int64"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct non-missing values per column (NaN is not counted as a value)\n",
"ensemble_df.nunique(axis=0, dropna=True)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "5648c1c4-9380-409a-8d12-d8267c3cda5e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Gender</th>\n",
" <th>HealthyVolunteers</th>\n",
" <th>OverallStatus</th>\n",
" <th>IsFDARegulatedDrug</th>\n",
" <th>IsFDARegulatedDevice</th>\n",
" <th>ResponsiblePartyType</th>\n",
" <th>StudyType</th>\n",
" <th>StudyType</th>\n",
" <th>DesignInterventionModel</th>\n",
" <th>DesignObservationalModel</th>\n",
" <th>SamplingMethod</th>\n",
" <th>Phase</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Female</td>\n",
" <td>No</td>\n",
" <td>Recruiting</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" <td>Observational</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Case-Only</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Female</td>\n",
" <td>No</td>\n",
" <td>Not yet recruiting</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Principal Investigator</td>\n",
" <td>Observational</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Female</td>\n",
" <td>Accepts Healthy Volunteers</td>\n",
" <td>Unknown status</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Principal Investigator</td>\n",
" <td>Observational</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Female</td>\n",
" <td>No</td>\n",
" <td>Recruiting</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" <td>Observational</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Female</td>\n",
" <td>Accepts Healthy Volunteers</td>\n",
" <td>Unknown status</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" <td>Observational</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Cohort</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Gender HealthyVolunteers OverallStatus IsFDARegulatedDrug \\\n",
"0 Female No Recruiting No \n",
"1 Female No Not yet recruiting No \n",
"2 Female Accepts Healthy Volunteers Unknown status No \n",
"3 Female No Recruiting No \n",
"4 Female Accepts Healthy Volunteers Unknown status No \n",
"\n",
" IsFDARegulatedDevice ResponsiblePartyType StudyType StudyType \\\n",
"0 No Sponsor Observational Observational \n",
"1 No Principal Investigator Observational Observational \n",
"2 No Principal Investigator Observational Observational \n",
"3 No Sponsor Observational Observational \n",
"4 No Sponsor Observational Observational \n",
"\n",
" DesignInterventionModel DesignObservationalModel SamplingMethod \\\n",
"0 NaN Case-Only Non-Probability Sample \n",
"1 NaN Case-Control Non-Probability Sample \n",
"2 NaN Case-Control Probability Sample \n",
"3 NaN Case-Control Non-Probability Sample \n",
"4 NaN Cohort Non-Probability Sample \n",
"\n",
" Phase \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Categorical columns carried into cat_model.\n",
"cat_cols = ['Gender', 'HealthyVolunteers', 'OverallStatus', 'IsFDARegulatedDrug', 'IsFDARegulatedDevice',\n",
"            'ResponsiblePartyType', 'StudyType', 'DesignInterventionModel', 'DesignObservationalModel',\n",
"            'SamplingMethod', 'Phase']\n",
"# ensemble_df can hold the same label more than once (StudyType comes from two\n",
"# source tables), and selecting a repeated label returns every copy. Keep only\n",
"# the first occurrence of each label so cat_model gets one column per name.\n",
"unique_cols = ~ensemble_df.columns.duplicated()\n",
"cat_model = ensemble_df.loc[:, unique_cols][cat_cols].copy()\n",
"cat_model.head()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "2079cf2d-1000-4fb2-b30b-71324afa601e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Gender</th>\n",
" <th>HealthyVolunteers</th>\n",
" <th>OverallStatus</th>\n",
" <th>IsFDARegulatedDrug</th>\n",
" <th>IsFDARegulatedDevice</th>\n",
" <th>ResponsiblePartyType</th>\n",
" <th>StudyType</th>\n",
" <th>DesignInterventionModel</th>\n",
" <th>DesignObservationalModel</th>\n",
" <th>SamplingMethod</th>\n",
" <th>Phase</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Female</td>\n",
" <td>No</td>\n",
" <td>Recruiting</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Case-Only</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Female</td>\n",
" <td>No</td>\n",
" <td>Not yet recruiting</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Principal Investigator</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Female</td>\n",
" <td>Accepts Healthy Volunteers</td>\n",
" <td>Unknown status</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Principal Investigator</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Female</td>\n",
" <td>No</td>\n",
" <td>Recruiting</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Female</td>\n",
" <td>Accepts Healthy Volunteers</td>\n",
" <td>Unknown status</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Cohort</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Gender HealthyVolunteers OverallStatus IsFDARegulatedDrug \\\n",
"0 Female No Recruiting No \n",
"1 Female No Not yet recruiting No \n",
"2 Female Accepts Healthy Volunteers Unknown status No \n",
"3 Female No Recruiting No \n",
"4 Female Accepts Healthy Volunteers Unknown status No \n",
"\n",
" IsFDARegulatedDevice ResponsiblePartyType StudyType \\\n",
"0 No Sponsor Observational \n",
"1 No Principal Investigator Observational \n",
"2 No Principal Investigator Observational \n",
"3 No Sponsor Observational \n",
"4 No Sponsor Observational \n",
"\n",
" DesignInterventionModel DesignObservationalModel SamplingMethod \\\n",
"0 NaN Case-Only Non-Probability Sample \n",
"1 NaN Case-Control Non-Probability Sample \n",
"2 NaN Case-Control Probability Sample \n",
"3 NaN Case-Control Non-Probability Sample \n",
"4 NaN Cohort Non-Probability Sample \n",
"\n",
" Phase \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep the first occurrence of every column label and discard later repeats\n",
"first_occurrence = ~cat_model.columns.duplicated(keep='first')\n",
"cat_model = cat_model.loc[:, first_occurrence].copy()\n",
"cat_model.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "f2891082-fa29-4eee-9222-085362204e7d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Gender 1\n",
"HealthyVolunteers 89\n",
"OverallStatus 0\n",
"IsFDARegulatedDrug 51\n",
"IsFDARegulatedDevice 52\n",
"ResponsiblePartyType 0\n",
"StudyType 0\n",
"DesignInterventionModel 534\n",
"DesignObservationalModel 1233\n",
"SamplingMethod 1232\n",
"Phase 534\n",
"dtype: int64"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_model.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "0bd82701-b7f2-4191-be57-d35cdfa0e04c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"No 1311\n",
"Accepts Healthy Volunteers 366\n",
"Name: HealthyVolunteers, dtype: int64"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_model['HealthyVolunteers'].value_counts()\n"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "399e609c-bce5-4aa3-9153-39eda87dbedb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Parallel Assignment 533\n",
"Single Group Assignment 477\n",
"Sequential Assignment 144\n",
"Crossover Assignment 57\n",
"Factorial Assignment 21\n",
"Name: DesignInterventionModel, dtype: int64"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_model['DesignInterventionModel'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "813b0463-48f8-4502-821d-1587f3231945",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Cohort 271\n",
"Case-Only 83\n",
"Case-Control 81\n",
"Other 79\n",
"Ecologic or Community 9\n",
"Case-Crossover 8\n",
"Family-Based 2\n",
"Name: DesignObservationalModel, dtype: int64"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_model['DesignObservationalModel'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "52cd2eb6-6f20-4540-8cd8-ea764ab1b197",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Non-Probability Sample 362\n",
"Probability Sample 172\n",
"Name: SamplingMethod, dtype: int64"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_model['SamplingMethod'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "9811b61e-8d0c-4574-a590-fe25d0d87106",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Recruiting 522\n",
"Completed 309\n",
"Not yet recruiting 255\n",
"Active, not recruiting 212\n",
"Unknown status 202\n",
"Withdrawn 97\n",
"Terminated 86\n",
"Enrolling by invitation 55\n",
"Suspended 28\n",
"Name: OverallStatus, dtype: int64"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_model['OverallStatus'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "f199b698-c3b7-4a56-a212-6dea51eb5b0c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"No 1255\n",
"Yes 460\n",
"Name: IsFDARegulatedDrug, dtype: int64"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_model['IsFDARegulatedDrug'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "4279c2e7-eb6f-4cc8-baed-e933fa53d3cc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"No 1512\n",
"Yes 202\n",
"Name: IsFDARegulatedDevice, dtype: int64"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_model['IsFDARegulatedDevice'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "20949ea3-0de0-42ea-9803-9a1fc99aa24d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Not Applicable 409\n",
"Phase 2 225\n",
"Phase 1 193\n",
"Phase 1, Phase 2 134\n",
"Phase 3 100\n",
"Early Phase 1 72\n",
"Phase 4 63\n",
"Phase 2, Phase 3 36\n",
"Name: Phase, dtype: int64"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_model['Phase'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "9b5adbbc-bf00-4d1c-bf68-0e5ee442ba66",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Interventional 1237\n",
"Observational 529\n",
"Name: StudyType, dtype: int64"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_model['StudyType'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "ca652c63-4220-4e78-8920-27bfb3bbc843",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1766, 11)"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_model.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "de7f7674-0cdb-4aa0-92e9-ab8401e23d5d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1533f628-4c63-4d9e-85a4-dc0b302b3942",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "39ab7359-4893-4402-b83a-84c3d960c583",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "50c643d5-d9be-409f-880f-3476bb7a4e55",
"metadata": {},
"outputs": [],
"source": [
"# Generate categorical variables list\n",
"trial_cat = cat_model.dtypes[cat_model.dtypes == \"object\"].index.tolist()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "018850bd-0503-45e8-a04d-2aba12c11472",
"metadata": {},
"outputs": [],
"source": [
"# Encode labels with getdummies or create a OneHotEncoder instance\n",
"\n",
"# Fit and transform the OHE using thr categorical variable list\n",
"\n",
"# Add the encoded variable names to the dataframe"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f041f78-3a76-42ae-9eac-f36d343c5e47",
"metadata": {},
"outputs": [],
"source": [
"# Merge OHE features and drop the originals"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53030acd-0124-402b-a001-db2b0676d0f3",
"metadata": {},
"outputs": [],
"source": [
"# Split preprocessed data into features and target arrays\n",
"X = \n",
"\n",
"y = "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae631b6d-0435-42b2-8df4-b4f821f475f4",
"metadata": {},
"outputs": [],
"source": [
"# Split data into training and testing datset\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "051a1912-1d40-40b7-a173-7b4f01bc8b01",
"metadata": {},
"outputs": [],
"source": [
"# Create a StandardScaler instances\n",
"scaler = StandardScaler()\n",
"\n",
"# Fit the StandardScaler\n",
"X_scaler = scaler.fit(X_train)\n",
"\n",
"# Scale the data\n",
"X_train_scaled = X_scaler.transform(X_train)\n",
"X_test_scaled = X_scaler.transform(X_test)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "mlenv",
"language": "python",
"name": "mlenv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}