1582 lines (1581 with data), 57.1 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "71d97e4b",
"metadata": {},
"outputs": [],
"source": [
"# Import dependencies\n",
"import requests\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3be0e3f3",
"metadata": {},
"outputs": [],
"source": [
"# Base url for the api call\n",
"url = \"https://clinicaltrials.gov/api/query/study_fields\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "76d2866c",
"metadata": {},
"outputs": [],
"source": [
"# Headers for json call\n",
"headers = {'Content-Type': 'application/json', \n",
" 'Accept': 'application/json'}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ef615845",
"metadata": {},
"outputs": [],
"source": [
"# Extract data and create DataFrame\n",
"def create_df(x):\n",
"\n",
" # Create a list to hold the dictionaries from the json response\n",
" sourced_data = []\n",
"\n",
" # Set a counter to increase the ranks with each iteration\n",
" # Table search criteria limited to 'breast cancer' and from 01/01/2018\n",
" counter = 0\n",
"\n",
" while True:\n",
" # Define the parameters of the url\n",
" params = {'expr': 'breast cancer AND AREA[StartDate]RANGE[01/01/2018,MAX]', \n",
" 'fields': ','.join(x), \n",
" 'min_rnk': 1 + 1000 * counter, \n",
" 'max_rnk': 1000 + 1000 * counter, \n",
" 'fmt': 'json'}\n",
"\n",
" # Make the call \n",
" response = requests.get(url, \n",
" headers = headers, \n",
" params = params)\n",
"\n",
" \n",
"\n",
" # Increment the counter\n",
" counter += 1\n",
" \n",
" # Check to see if more data was returned for the current response\n",
" if response.json()['StudyFieldsResponse']['NStudiesReturned'] == 0:\n",
" # If there was no new data returned, break out of the while loop\n",
" break\n",
"\n",
" # Add the data from the api call to the sourced_data list\n",
" sourced_data += response.json()['StudyFieldsResponse']['StudyFields']\n",
" return sourced_data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6b40dded",
"metadata": {},
"outputs": [],
"source": [
"# Define function to clean DataFrame removing unnecessary characters for analysis\n",
"\n",
"def clean_data(df, fields):\n",
" # Change datatype of the values, preparing for reg expression\n",
" df[fields] = df[fields].astype(str)\n",
"\n",
" # Remove any unnecessary characters and turn blank values with NaN (null)\n",
" for i in fields:\n",
" df[i] = df[i].str.replace(\"^\\[.|.\\]$|'\",\"\")\n",
" df[i] = df[i].replace('',np.nan)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1381ab8d",
"metadata": {},
"outputs": [],
"source": [
"# Define function to clean DataFrame removing unnecessary characters for analysis\n",
"\n",
"def clean_data_years(df, fields):\n",
" # Change datatype of the values, preparing for reg expression\n",
" df[fields] = df[fields].astype(str)\n",
"\n",
" # Remove any unnecessary characters and turn blank values with NaN (null)\n",
" for i in fields:\n",
" df[i] = df[i].str.replace('Years', '')\n",
" df[i] = df[i].str.replace('Year', '')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "616c10f9",
"metadata": {},
"outputs": [],
"source": [
"# List of fields for registration_info table\n",
"reg_info_cols = ['OrgStudyId',\n",
" 'BriefTitle',\n",
" 'StartDate',\n",
" 'CompletionDate',\n",
" 'OverallStatus',\n",
" 'StudyType']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "bfa052e9",
"metadata": {},
"outputs": [],
"source": [
"# Call the function the create DataFrame for registration_info table\n",
"registration_df = pd.DataFrame(create_df(reg_info_cols))\n",
"registration_df.set_index('Rank', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "6d61f82d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\gahyo\\anaconda3\\envs\\mlenv\\lib\\site-packages\\ipykernel_launcher.py:9: FutureWarning: The default value of regex will change from True to False in a future version.\n",
" if __name__ == \"__main__\":\n"
]
}
],
"source": [
"# Clean DataFrame for table A\n",
"clean_data(registration_df, reg_info_cols)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "087b4636",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Title</th>\n",
" <th>Start_Date</th>\n",
" <th>Completion_Date</th>\n",
" <th>Status</th>\n",
" <th>Study_Type</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>Breast Cancer Index (BCI) Registry</td>\n",
" <td>2021-04-14</td>\n",
" <td>2028-12-01</td>\n",
" <td>Recruiting</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>Diagnosis Value of SEMA4C in Breast Cancer</td>\n",
" <td>2023-09-01</td>\n",
" <td>2024-09-01</td>\n",
" <td>Not yet recruiting</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Breast cancer</td>\n",
" <td>Role of Sorcin and Annexin A3 in Breast Cancer...</td>\n",
" <td>2019-01-20</td>\n",
" <td>2019-09-30</td>\n",
" <td>Unknown status</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>BC-BOMET</td>\n",
" <td>Evaluation of Prognostic Factors: From Breast ...</td>\n",
" <td>2020-01-13</td>\n",
" <td>2024-11-12</td>\n",
" <td>Recruiting</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>241391</td>\n",
" <td>A Study to Identify Breast Cancer (IDBC)</td>\n",
" <td>2019-01-24</td>\n",
" <td>2022-12-31</td>\n",
" <td>Unknown status</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5025</th>\n",
" <td>32900654326</td>\n",
" <td>TPVB, PECSB, ESPB for Postmastectmy Pain</td>\n",
" <td>2019-04-10</td>\n",
" <td>2021-08-10</td>\n",
" <td>Completed</td>\n",
" <td>Interventional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5026</th>\n",
" <td>ReDA 13176</td>\n",
" <td>A Randomized Phase III Trial of Stereotactic A...</td>\n",
" <td>2023-04-01</td>\n",
" <td>2029-04-01</td>\n",
" <td>Not yet recruiting</td>\n",
" <td>Interventional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5027</th>\n",
" <td>64121317.4.1001.5330</td>\n",
" <td>Nivolumab in Prostate Cancer With DNA Repair D...</td>\n",
" <td>2018-06-01</td>\n",
" <td>2022-03-01</td>\n",
" <td>Active, not recruiting</td>\n",
" <td>Interventional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5028</th>\n",
" <td>3-2020-0038</td>\n",
" <td>Intraoperative Radiation Therapy for Resectabl...</td>\n",
" <td>2020-04-02</td>\n",
" <td>2026-04-01</td>\n",
" <td>Recruiting</td>\n",
" <td>Interventional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5029</th>\n",
" <td>2018PS02</td>\n",
" <td>Pharmacist and Data Driven Quality Improvement...</td>\n",
" <td>2020-09-16</td>\n",
" <td>2021-06-30</td>\n",
" <td>Completed</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5019 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" ID \\\n",
"Index \n",
"1 BTX-BCI-016-PRT \n",
"2 2018-TJ-BCD \n",
"3 Breast cancer \n",
"4 BC-BOMET \n",
"5 241391 \n",
"... ... \n",
"5025 32900654326 \n",
"5026 ReDA 13176 \n",
"5027 64121317.4.1001.5330 \n",
"5028 3-2020-0038 \n",
"5029 2018PS02 \n",
"\n",
" Title Start_Date \\\n",
"Index \n",
"1 Breast Cancer Index (BCI) Registry 2021-04-14 \n",
"2 Diagnosis Value of SEMA4C in Breast Cancer 2023-09-01 \n",
"3 Role of Sorcin and Annexin A3 in Breast Cancer... 2019-01-20 \n",
"4 Evaluation of Prognostic Factors: From Breast ... 2020-01-13 \n",
"5 A Study to Identify Breast Cancer (IDBC) 2019-01-24 \n",
"... ... ... \n",
"5025 TPVB, PECSB, ESPB for Postmastectmy Pain 2019-04-10 \n",
"5026 A Randomized Phase III Trial of Stereotactic A... 2023-04-01 \n",
"5027 Nivolumab in Prostate Cancer With DNA Repair D... 2018-06-01 \n",
"5028 Intraoperative Radiation Therapy for Resectabl... 2020-04-02 \n",
"5029 Pharmacist and Data Driven Quality Improvement... 2020-09-16 \n",
"\n",
" Completion_Date Status Study_Type \n",
"Index \n",
"1 2028-12-01 Recruiting Observational \n",
"2 2024-09-01 Not yet recruiting Observational \n",
"3 2019-09-30 Unknown status Observational \n",
"4 2024-11-12 Recruiting Observational \n",
"5 2022-12-31 Unknown status Observational \n",
"... ... ... ... \n",
"5025 2021-08-10 Completed Interventional \n",
"5026 2029-04-01 Not yet recruiting Interventional \n",
"5027 2022-03-01 Active, not recruiting Interventional \n",
"5028 2026-04-01 Recruiting Interventional \n",
"5029 2021-06-30 Completed Observational \n",
"\n",
"[5019 rows x 6 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Renaming the columns for registration_info table and convertion to datetime and removing duplicates\n",
"registration_df.columns = ['ID', 'Title', 'Start_Date', 'Completion_Date', 'Status', 'Study_Type']\n",
"registration_df.index.names = ['Index']\n",
"registration_df['Start_Date'] = pd.to_datetime(registration_df['Start_Date'])\n",
"registration_df['Completion_Date'] = pd.to_datetime(registration_df['Completion_Date'])\n",
"registration_df = registration_df.drop_duplicates(subset=['ID'], keep='first')\n",
"registration_df"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2f40948f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Title</th>\n",
" <th>Start_Date</th>\n",
" <th>Completion_Date</th>\n",
" <th>Actual_Duration</th>\n",
" <th>Status</th>\n",
" <th>Study_Type</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>Breast Cancer Index (BCI) Registry</td>\n",
" <td>2021-04-14</td>\n",
" <td>2028-12-01</td>\n",
" <td>7.638356</td>\n",
" <td>Recruiting</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>Diagnosis Value of SEMA4C in Breast Cancer</td>\n",
" <td>2023-09-01</td>\n",
" <td>2024-09-01</td>\n",
" <td>1.002740</td>\n",
" <td>Not yet recruiting</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Breast cancer</td>\n",
" <td>Role of Sorcin and Annexin A3 in Breast Cancer...</td>\n",
" <td>2019-01-20</td>\n",
" <td>2019-09-30</td>\n",
" <td>0.693151</td>\n",
" <td>Unknown status</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>BC-BOMET</td>\n",
" <td>Evaluation of Prognostic Factors: From Breast ...</td>\n",
" <td>2020-01-13</td>\n",
" <td>2024-11-12</td>\n",
" <td>4.835616</td>\n",
" <td>Recruiting</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>241391</td>\n",
" <td>A Study to Identify Breast Cancer (IDBC)</td>\n",
" <td>2019-01-24</td>\n",
" <td>2022-12-31</td>\n",
" <td>3.936986</td>\n",
" <td>Unknown status</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5025</th>\n",
" <td>32900654326</td>\n",
" <td>TPVB, PECSB, ESPB for Postmastectmy Pain</td>\n",
" <td>2019-04-10</td>\n",
" <td>2021-08-10</td>\n",
" <td>2.336986</td>\n",
" <td>Completed</td>\n",
" <td>Interventional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5026</th>\n",
" <td>ReDA 13176</td>\n",
" <td>A Randomized Phase III Trial of Stereotactic A...</td>\n",
" <td>2023-04-01</td>\n",
" <td>2029-04-01</td>\n",
" <td>6.005479</td>\n",
" <td>Not yet recruiting</td>\n",
" <td>Interventional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5027</th>\n",
" <td>64121317.4.1001.5330</td>\n",
" <td>Nivolumab in Prostate Cancer With DNA Repair D...</td>\n",
" <td>2018-06-01</td>\n",
" <td>2022-03-01</td>\n",
" <td>3.750685</td>\n",
" <td>Active, not recruiting</td>\n",
" <td>Interventional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5028</th>\n",
" <td>3-2020-0038</td>\n",
" <td>Intraoperative Radiation Therapy for Resectabl...</td>\n",
" <td>2020-04-02</td>\n",
" <td>2026-04-01</td>\n",
" <td>6.000000</td>\n",
" <td>Recruiting</td>\n",
" <td>Interventional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5029</th>\n",
" <td>2018PS02</td>\n",
" <td>Pharmacist and Data Driven Quality Improvement...</td>\n",
" <td>2020-09-16</td>\n",
" <td>2021-06-30</td>\n",
" <td>0.786301</td>\n",
" <td>Completed</td>\n",
" <td>Observational</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5019 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" ID \\\n",
"Index \n",
"1 BTX-BCI-016-PRT \n",
"2 2018-TJ-BCD \n",
"3 Breast cancer \n",
"4 BC-BOMET \n",
"5 241391 \n",
"... ... \n",
"5025 32900654326 \n",
"5026 ReDA 13176 \n",
"5027 64121317.4.1001.5330 \n",
"5028 3-2020-0038 \n",
"5029 2018PS02 \n",
"\n",
" Title Start_Date \\\n",
"Index \n",
"1 Breast Cancer Index (BCI) Registry 2021-04-14 \n",
"2 Diagnosis Value of SEMA4C in Breast Cancer 2023-09-01 \n",
"3 Role of Sorcin and Annexin A3 in Breast Cancer... 2019-01-20 \n",
"4 Evaluation of Prognostic Factors: From Breast ... 2020-01-13 \n",
"5 A Study to Identify Breast Cancer (IDBC) 2019-01-24 \n",
"... ... ... \n",
"5025 TPVB, PECSB, ESPB for Postmastectmy Pain 2019-04-10 \n",
"5026 A Randomized Phase III Trial of Stereotactic A... 2023-04-01 \n",
"5027 Nivolumab in Prostate Cancer With DNA Repair D... 2018-06-01 \n",
"5028 Intraoperative Radiation Therapy for Resectabl... 2020-04-02 \n",
"5029 Pharmacist and Data Driven Quality Improvement... 2020-09-16 \n",
"\n",
" Completion_Date Actual_Duration Status Study_Type \n",
"Index \n",
"1 2028-12-01 7.638356 Recruiting Observational \n",
"2 2024-09-01 1.002740 Not yet recruiting Observational \n",
"3 2019-09-30 0.693151 Unknown status Observational \n",
"4 2024-11-12 4.835616 Recruiting Observational \n",
"5 2022-12-31 3.936986 Unknown status Observational \n",
"... ... ... ... ... \n",
"5025 2021-08-10 2.336986 Completed Interventional \n",
"5026 2029-04-01 6.005479 Not yet recruiting Interventional \n",
"5027 2022-03-01 3.750685 Active, not recruiting Interventional \n",
"5028 2026-04-01 6.000000 Recruiting Interventional \n",
"5029 2021-06-30 0.786301 Completed Observational \n",
"\n",
"[5019 rows x 7 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# add new column 'Actual Duration' calculating the difference between start date and completion date \n",
"registration_df['Actual_Duration'] = (registration_df['Completion_Date'] - registration_df['Start_Date'])\n",
"registration_df['Actual_Duration'] = registration_df['Actual_Duration'].dt.total_seconds() / (365*24*60*60)\n",
"\n",
"#------------------------------------------------------\n",
"\n",
"# Reorder the columns \n",
"registration_df = registration_df[['ID', 'Title', 'Start_Date', 'Completion_Date', 'Actual_Duration', 'Status', 'Study_Type']]\n",
"registration_df\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "951d30a2",
"metadata": {},
"outputs": [],
"source": [
"# List of fields for participant table\n",
"participant_cols = ['OrgStudyId',\n",
" 'Gender',\n",
" 'MinimumAge',\n",
" 'HealthyVolunteers']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "dfad12a1",
"metadata": {},
"outputs": [],
"source": [
"# Call the function the create DataFrame for participant table\n",
"participant_df = pd.DataFrame(create_df(participant_cols))\n",
"participant_df.set_index('Rank', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "26b5fa0d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\gahyo\\anaconda3\\envs\\mlenv\\lib\\site-packages\\ipykernel_launcher.py:9: FutureWarning: The default value of regex will change from True to False in a future version.\n",
" if __name__ == \"__main__\":\n"
]
}
],
"source": [
"# Clean DataFrame for participant table\n",
"clean_data(participant_df, participant_cols)\n",
"clean_data_years(participant_df, participant_cols)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "dacb6b5b",
"metadata": {},
"outputs": [],
"source": [
"# Minimum Age: Change any months values into 0 years\n",
"for i, rows in participant_df.iterrows():\n",
" if 'Months' in participant_df.loc[i].MinimumAge:\n",
" participant_df.loc[i].MinimumAge = 0\n",
" elif 'Month' in participant_df.loc[i].MinimumAge:\n",
" participant_df.loc[i].MinimumAge = 0\n",
" else:\n",
" continue"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "f307440f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Gender</th>\n",
" <th>Minimum_Age</th>\n",
" <th>Healthy_Volunteers</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Breast cancer</td>\n",
" <td>Female</td>\n",
" <td>20</td>\n",
" <td>Accepts Healthy Volunteers</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>BC-BOMET</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>241391</td>\n",
" <td>Female</td>\n",
" <td>30</td>\n",
" <td>Accepts Healthy Volunteers</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5025</th>\n",
" <td>32900654326</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5026</th>\n",
" <td>ReDA 13176</td>\n",
" <td>All</td>\n",
" <td>18</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5027</th>\n",
" <td>64121317.4.1001.5330</td>\n",
" <td>Male</td>\n",
" <td>18</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5028</th>\n",
" <td>3-2020-0038</td>\n",
" <td>All</td>\n",
" <td>20</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5029</th>\n",
" <td>2018PS02</td>\n",
" <td>All</td>\n",
" <td>nan</td>\n",
" <td>nan</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5019 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" ID Gender Minimum_Age Healthy_Volunteers\n",
"Index \n",
"1 BTX-BCI-016-PRT Female 18 No\n",
"2 2018-TJ-BCD Female 18 No\n",
"3 Breast cancer Female 20 Accepts Healthy Volunteers\n",
"4 BC-BOMET Female 18 No\n",
"5 241391 Female 30 Accepts Healthy Volunteers\n",
"... ... ... ... ...\n",
"5025 32900654326 Female 18 No\n",
"5026 ReDA 13176 All 18 No\n",
"5027 64121317.4.1001.5330 Male 18 No\n",
"5028 3-2020-0038 All 20 No\n",
"5029 2018PS02 All nan nan\n",
"\n",
"[5019 rows x 4 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Renaming the columns for participant table and getting rid of rows\n",
"participant_df.columns = ['ID', 'Gender', 'Minimum_Age', 'Healthy_Volunteers']\n",
"participant_df.index.names = ['Index']\n",
"participant_df = participant_df.drop_duplicates(subset=['ID'], keep='first')\n",
"participant_df\n",
"\n",
"# ** below code can be used if Minimum Age needs to be converted into integer\n",
"# participant_df['Minimum_Age'] = participant_df['Minimum_Age'].astype(float).astype('Int64')\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "fd1e1bf7",
"metadata": {},
"outputs": [],
"source": [
"# List of fields for study detail table\n",
"study_detail_cols = ['OrgStudyId',\n",
" 'IsFDARegulatedDrug',\n",
" 'IsFDARegulatedDevice',\n",
" 'ResponsiblePartyType']"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "c5919043",
"metadata": {},
"outputs": [],
"source": [
"# Call the function the create DataFrame for study method table\n",
"study_details_df = pd.DataFrame(create_df(study_detail_cols))\n",
"study_details_df.set_index('Rank', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "1b609c17",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\gahyo\\anaconda3\\envs\\mlenv\\lib\\site-packages\\ipykernel_launcher.py:9: FutureWarning: The default value of regex will change from True to False in a future version.\n",
" if __name__ == \"__main__\":\n"
]
}
],
"source": [
"# Clean DataFrame for study method table\n",
"clean_data(study_details_df, study_detail_cols)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "e9fa270b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>FDA_Regulated_Drug</th>\n",
" <th>FDA_Regulated_Device</th>\n",
" <th>Responsible_Party</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Principal Investigator</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Breast cancer</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Principal Investigator</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>BC-BOMET</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>241391</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5025</th>\n",
" <td>32900654326</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Principal Investigator</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5026</th>\n",
" <td>ReDA 13176</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor-Investigator</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5027</th>\n",
" <td>64121317.4.1001.5330</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>Sponsor</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5028</th>\n",
" <td>3-2020-0038</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Sponsor-Investigator</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5029</th>\n",
" <td>2018PS02</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Principal Investigator</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5019 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" ID FDA_Regulated_Drug FDA_Regulated_Device \\\n",
"Index \n",
"1 BTX-BCI-016-PRT No No \n",
"2 2018-TJ-BCD No No \n",
"3 Breast cancer No No \n",
"4 BC-BOMET No No \n",
"5 241391 No No \n",
"... ... ... ... \n",
"5025 32900654326 No No \n",
"5026 ReDA 13176 No No \n",
"5027 64121317.4.1001.5330 Yes No \n",
"5028 3-2020-0038 No No \n",
"5029 2018PS02 No No \n",
"\n",
" Responsible_Party \n",
"Index \n",
"1 Sponsor \n",
"2 Principal Investigator \n",
"3 Principal Investigator \n",
"4 Sponsor \n",
"5 Sponsor \n",
"... ... \n",
"5025 Principal Investigator \n",
"5026 Sponsor-Investigator \n",
"5027 Sponsor \n",
"5028 Sponsor-Investigator \n",
"5029 Principal Investigator \n",
"\n",
"[5019 rows x 4 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Renaming the columns for study detail table and removing duplicates\n",
"study_details_df.columns = ['ID', 'FDA_Regulated_Drug', 'FDA_Regulated_Device', 'Responsible_Party']\n",
"study_details_df.index.names = ['Index']\n",
"study_details_df = study_details_df.drop_duplicates(subset=['ID'], keep='first')\n",
"study_details_df"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "cf276005",
"metadata": {},
"outputs": [],
"source": [
"# List of fields for study method table\n",
"study_method_cols = ['OrgStudyId',\n",
" 'StudyType',\n",
" 'ArmGroupType',\n",
" 'InterventionType',\n",
" 'DesignInterventionModel',\n",
" 'DesignObservationalModel',\n",
" 'TargetDuration',\n",
" 'SamplingMethod',\n",
" 'Phase']"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "3c65381a",
"metadata": {},
"outputs": [],
"source": [
"# Call function to create DataFrame for table D\n",
"study_method_df = pd.DataFrame(create_df(study_method_cols))\n",
"study_method_df.set_index('Rank',inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "cb290e3e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\gahyo\\anaconda3\\envs\\mlenv\\lib\\site-packages\\ipykernel_launcher.py:9: FutureWarning: The default value of regex will change from True to False in a future version.\n",
" if __name__ == \"__main__\":\n"
]
}
],
"source": [
"# Clean DataFrame for study method table\n",
"clean_data(study_method_df, study_method_cols)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "294adaef",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Study_Type</th>\n",
" <th>Arm_Group_Type</th>\n",
" <th>Intervention_Type</th>\n",
" <th>Interventional_Study_Model</th>\n",
" <th>Observational_Study_Model</th>\n",
" <th>Target_Duration</th>\n",
" <th>Sampling_Method</th>\n",
" <th>Phase</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Diagnostic Test</td>\n",
" <td>NaN</td>\n",
" <td>Case-Only</td>\n",
" <td>5 Years</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Diagnostic Test</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>NaN</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Breast cancer</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Genetic</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>NaN</td>\n",
" <td>Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>BC-BOMET</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Other</td>\n",
" <td>NaN</td>\n",
" <td>Case-Control</td>\n",
" <td>NaN</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>241391</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Diagnostic Test</td>\n",
" <td>NaN</td>\n",
" <td>Cohort</td>\n",
" <td>1 Year</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5025</th>\n",
" <td>32900654326</td>\n",
" <td>Interventional</td>\n",
" <td>Placebo Comparator</td>\n",
" <td>Procedure</td>\n",
" <td>Parallel Assignment</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Not Applicable</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5026</th>\n",
" <td>ReDA 13176</td>\n",
" <td>Interventional</td>\n",
" <td>Active Comparator</td>\n",
" <td>Radiation</td>\n",
" <td>Parallel Assignment</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Phase 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5027</th>\n",
" <td>64121317.4.1001.5330</td>\n",
" <td>Interventional</td>\n",
" <td>Experimental</td>\n",
" <td>Drug</td>\n",
" <td>Single Group Assignment</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Phase 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5028</th>\n",
" <td>3-2020-0038</td>\n",
" <td>Interventional</td>\n",
" <td>Experimental</td>\n",
" <td>Radiation</td>\n",
" <td>Single Group Assignment</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Not Applicable</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5029</th>\n",
" <td>2018PS02</td>\n",
" <td>Observational</td>\n",
" <td>NaN</td>\n",
" <td>Behavioral</td>\n",
" <td>NaN</td>\n",
" <td>Ecologic or Community</td>\n",
" <td>NaN</td>\n",
" <td>Non-Probability Sample</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5019 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" ID Study_Type Arm_Group_Type \\\n",
"Index \n",
"1 BTX-BCI-016-PRT Observational NaN \n",
"2 2018-TJ-BCD Observational NaN \n",
"3 Breast cancer Observational NaN \n",
"4 BC-BOMET Observational NaN \n",
"5 241391 Observational NaN \n",
"... ... ... ... \n",
"5025 32900654326 Interventional Placebo Comparator \n",
"5026 ReDA 13176 Interventional Active Comparator \n",
"5027 64121317.4.1001.5330 Interventional Experimental \n",
"5028 3-2020-0038 Interventional Experimental \n",
"5029 2018PS02 Observational NaN \n",
"\n",
" Intervention_Type Interventional_Study_Model Observational_Study_Model \\\n",
"Index \n",
"1 Diagnostic Test NaN Case-Only \n",
"2 Diagnostic Test NaN Case-Control \n",
"3 Genetic NaN Case-Control \n",
"4 Other NaN Case-Control \n",
"5 Diagnostic Test NaN Cohort \n",
"... ... ... ... \n",
"5025 Procedure Parallel Assignment NaN \n",
"5026 Radiation Parallel Assignment NaN \n",
"5027 Drug Single Group Assignment NaN \n",
"5028 Radiation Single Group Assignment NaN \n",
"5029 Behavioral NaN Ecologic or Community \n",
"\n",
" Target_Duration Sampling_Method Phase \n",
"Index \n",
"1 5 Years Non-Probability Sample NaN \n",
"2 NaN Non-Probability Sample NaN \n",
"3 NaN Probability Sample NaN \n",
"4 NaN Non-Probability Sample NaN \n",
"5 1 Year Non-Probability Sample NaN \n",
"... ... ... ... \n",
"5025 NaN NaN Not Applicable \n",
"5026 NaN NaN Phase 3 \n",
"5027 NaN NaN Phase 2 \n",
"5028 NaN NaN Not Applicable \n",
"5029 NaN Non-Probability Sample NaN \n",
"\n",
"[5019 rows x 9 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Renaming the columns for table D and removing duplicates and simplifyingstudy_method_df.columns = ['ID', 'Study_Type', 'Arm_Group_Type', 'Intervention_Type', 'Interventional_Study_Model', 'Observational_Study_Model', 'Target_Duration', 'Sampling_Method', 'Phase']\n",
"study_method_df.index.names = ['Index']\n",
"study_method_df.columns = ['ID', 'Study_Type', 'Arm_Group_Type', 'Intervention_Type', 'Interventional_Study_Model', 'Observational_Study_Model', 'Target_Duration', 'Sampling_Method', 'Phase']\n",
"study_method_df = study_method_df.drop_duplicates(subset=['ID'], keep='first')\n",
"study_method_df['Arm_Group_Type'] = study_method_df['Arm_Group_Type'].str.split(',').str[0]\n",
"study_method_df['Intervention_Type'] = study_method_df['Intervention_Type'].str.split(',').str[0]\n",
"study_method_df"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "31753f41",
"metadata": {},
"outputs": [],
"source": [
"# List of fields for study detail table and export dataset into DataFrame\n",
"text_analysis_cols = ['OrgStudyId',\n",
" 'WhyStopped', \n",
" 'EnrollmentCount',\n",
" 'PrimaryOutcomeMeasure',]\n",
"text_analysis_df = pd.DataFrame(create_df(text_analysis_cols))\n",
"text_analysis_df.set_index('Rank', inplace=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "8651a223",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\gahyo\\anaconda3\\envs\\mlenv\\lib\\site-packages\\ipykernel_launcher.py:9: FutureWarning: The default value of regex will change from True to False in a future version.\n",
" if __name__ == \"__main__\":\n"
]
}
],
"source": [
"clean_data(text_analysis_df, text_analysis_cols)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "09e7c3bc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Why_Stopped</th>\n",
" <th>Enrollment_Count</th>\n",
" <th>Primary_Outcome_Measure</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BTX-BCI-016-PRT</td>\n",
" <td>NaN</td>\n",
" <td>3000</td>\n",
" <td>To determine BCI test performance by evaluatin...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2018-TJ-BCD</td>\n",
" <td>NaN</td>\n",
" <td>2300</td>\n",
" <td>Diagnostic potential of SEMA4C as a biomarker ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Breast cancer</td>\n",
" <td>NaN</td>\n",
" <td>80</td>\n",
" <td>Role of SORCIN in patients with breast cancer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>BC-BOMET</td>\n",
" <td>NaN</td>\n",
" <td>30</td>\n",
" <td>SENP1 expression</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>241391</td>\n",
" <td>NaN</td>\n",
" <td>600</td>\n",
" <td>Performance of the Syantra DX Breast Cancer te...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5025</th>\n",
" <td>32900654326</td>\n",
" <td>NaN</td>\n",
" <td>80</td>\n",
" <td>The primary outcome of the study will be durat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5026</th>\n",
" <td>ReDA 13176</td>\n",
" <td>NaN</td>\n",
" <td>180</td>\n",
" <td>Overall Survival</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5027</th>\n",
" <td>64121317.4.1001.5330</td>\n",
" <td>NaN</td>\n",
" <td>38</td>\n",
" <td>PSA response rate</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5028</th>\n",
" <td>3-2020-0038</td>\n",
" <td>NaN</td>\n",
" <td>80</td>\n",
" <td>3-year local recurrence rate</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5029</th>\n",
" <td>2018PS02</td>\n",
" <td>NaN</td>\n",
" <td>19</td>\n",
" <td>Initially targeted drug therapy risks in patie...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5019 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" ID Why_Stopped Enrollment_Count \\\n",
"Index \n",
"1 BTX-BCI-016-PRT NaN 3000 \n",
"2 2018-TJ-BCD NaN 2300 \n",
"3 Breast cancer NaN 80 \n",
"4 BC-BOMET NaN 30 \n",
"5 241391 NaN 600 \n",
"... ... ... ... \n",
"5025 32900654326 NaN 80 \n",
"5026 ReDA 13176 NaN 180 \n",
"5027 64121317.4.1001.5330 NaN 38 \n",
"5028 3-2020-0038 NaN 80 \n",
"5029 2018PS02 NaN 19 \n",
"\n",
" Primary_Outcome_Measure \n",
"Index \n",
"1 To determine BCI test performance by evaluatin... \n",
"2 Diagnostic potential of SEMA4C as a biomarker ... \n",
"3 Role of SORCIN in patients with breast cancer \n",
"4 SENP1 expression \n",
"5 Performance of the Syantra DX Breast Cancer te... \n",
"... ... \n",
"5025 The primary outcome of the study will be durat... \n",
"5026 Overall Survival \n",
"5027 PSA response rate \n",
"5028 3-year local recurrence rate \n",
"5029 Initially targeted drug therapy risks in patie... \n",
"\n",
"[5019 rows x 4 columns]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Renaming the columns for table E and removing duplicates\n",
"text_analysis_df.columns = ['ID', 'Why_Stopped','Enrollment_Count', 'Primary_Outcome_Measure']\n",
"text_analysis_df.index.names = ['Index']\n",
"text_analysis_df = text_analysis_df.drop_duplicates(subset=['ID'], keep='first')\n",
"text_analysis_df"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "0b08919a",
"metadata": {},
"outputs": [],
"source": [
"# Export DataFrames as csv files \n",
"study_method_df.to_csv('Tables/study_method_df.csv',index=False)\n",
"text_analysis_df.to_csv('Tables/free_text_df.csv', index=False)\n",
"registration_df.to_csv('Tables/registration_df.csv', index=False)\n",
"participant_df.to_csv('Tables/participant_df.csv', index=False)\n",
"study_details_df.to_csv('Tables/study_details_df.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}