[954dc3]: / ETL / ETL Official.ipynb

Download this file

1582 lines (1581 with data), 57.1 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "71d97e4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import dependencies\n",
    "import requests\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3be0e3f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Base url for the api call\n",
    "url = \"https://clinicaltrials.gov/api/query/study_fields\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "76d2866c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Headers for json call\n",
    "headers = {'Content-Type': 'application/json', \n",
    "           'Accept': 'application/json'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ef615845",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract data and create DataFrame\n",
    "def create_df(x):\n",
    "\n",
    "    # Create a list to hold the dictionaries from the json response\n",
    "    sourced_data = []\n",
    "\n",
    "    # Set a counter to increase the ranks with each iteration\n",
    "    # Table search criteria limited to 'breast cancer' and from 01/01/2018\n",
    "    counter = 0\n",
    "\n",
    "    while True:\n",
    "        # Define the parameters of the url\n",
    "        params = {'expr': 'breast cancer AND AREA[StartDate]RANGE[01/01/2018,MAX]', \n",
    "                'fields': ','.join(x), \n",
    "                'min_rnk': 1 + 1000 * counter,  \n",
    "                'max_rnk': 1000 + 1000 * counter, \n",
    "                'fmt': 'json'}\n",
    "\n",
    "        # Make the call \n",
    "        response = requests.get(url, \n",
    "                                headers = headers, \n",
    "                                params = params)\n",
    "\n",
    "        \n",
    "\n",
    "        # Increment the counter\n",
    "        counter += 1\n",
    "        \n",
    "        # Check to see if more data was returned for the current response\n",
    "        if response.json()['StudyFieldsResponse']['NStudiesReturned'] == 0:\n",
    "            # If there was no new data returned, break out of the while loop\n",
    "            break\n",
    "\n",
    "        # Add the data from the api call to the sourced_data list\n",
    "        sourced_data += response.json()['StudyFieldsResponse']['StudyFields']\n",
    "    return sourced_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "6b40dded",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define function to clean DataFrame removing unnecessary characters for analysis\n",
    "\n",
    "def clean_data(df, fields):\n",
    "    # Change datatype of the values, preparing for reg expression\n",
    "    df[fields] = df[fields].astype(str)\n",
    "\n",
    "    # Remove any unnecessary characters and turn blank values with NaN (null)\n",
    "    for i in fields:\n",
    "        df[i] = df[i].str.replace(\"^\\[.|.\\]$|'\",\"\")\n",
    "        df[i] = df[i].replace('',np.nan)\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1381ab8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define function to clean DataFrame removing unnecessary characters for analysis\n",
    "\n",
    "def clean_data_years(df, fields):\n",
    "    # Change datatype of the values, preparing for reg expression\n",
    "    df[fields] = df[fields].astype(str)\n",
    "\n",
    "    # Remove any unnecessary characters and turn blank values with NaN (null)\n",
    "    for i in fields:\n",
    "        df[i] = df[i].str.replace('Years', '')\n",
    "        df[i] = df[i].str.replace('Year', '')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "616c10f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List of fields for registration_info table\n",
    "reg_info_cols = ['OrgStudyId',\n",
    "               'BriefTitle',\n",
    "               'StartDate',\n",
    "               'CompletionDate',\n",
    "               'OverallStatus',\n",
    "               'StudyType']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "bfa052e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call the function the create DataFrame for registration_info table\n",
    "registration_df = pd.DataFrame(create_df(reg_info_cols))\n",
    "registration_df.set_index('Rank', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6d61f82d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\gahyo\\anaconda3\\envs\\mlenv\\lib\\site-packages\\ipykernel_launcher.py:9: FutureWarning: The default value of regex will change from True to False in a future version.\n",
      "  if __name__ == \"__main__\":\n"
     ]
    }
   ],
   "source": [
    "# Clean DataFrame for table A\n",
    "clean_data(registration_df, reg_info_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "087b4636",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Title</th>\n",
       "      <th>Start_Date</th>\n",
       "      <th>Completion_Date</th>\n",
       "      <th>Status</th>\n",
       "      <th>Study_Type</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Index</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>Breast Cancer Index (BCI) Registry</td>\n",
       "      <td>2021-04-14</td>\n",
       "      <td>2028-12-01</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>Diagnosis Value of SEMA4C in Breast Cancer</td>\n",
       "      <td>2023-09-01</td>\n",
       "      <td>2024-09-01</td>\n",
       "      <td>Not yet recruiting</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>Role of Sorcin and Annexin A3 in Breast Cancer...</td>\n",
       "      <td>2019-01-20</td>\n",
       "      <td>2019-09-30</td>\n",
       "      <td>Unknown status</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>Evaluation of Prognostic Factors: From Breast ...</td>\n",
       "      <td>2020-01-13</td>\n",
       "      <td>2024-11-12</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>241391</td>\n",
       "      <td>A Study to Identify Breast Cancer (IDBC)</td>\n",
       "      <td>2019-01-24</td>\n",
       "      <td>2022-12-31</td>\n",
       "      <td>Unknown status</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5025</th>\n",
       "      <td>32900654326</td>\n",
       "      <td>TPVB, PECSB, ESPB for Postmastectmy Pain</td>\n",
       "      <td>2019-04-10</td>\n",
       "      <td>2021-08-10</td>\n",
       "      <td>Completed</td>\n",
       "      <td>Interventional</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5026</th>\n",
       "      <td>ReDA 13176</td>\n",
       "      <td>A Randomized Phase III Trial of Stereotactic A...</td>\n",
       "      <td>2023-04-01</td>\n",
       "      <td>2029-04-01</td>\n",
       "      <td>Not yet recruiting</td>\n",
       "      <td>Interventional</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5027</th>\n",
       "      <td>64121317.4.1001.5330</td>\n",
       "      <td>Nivolumab in Prostate Cancer With DNA Repair D...</td>\n",
       "      <td>2018-06-01</td>\n",
       "      <td>2022-03-01</td>\n",
       "      <td>Active, not recruiting</td>\n",
       "      <td>Interventional</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5028</th>\n",
       "      <td>3-2020-0038</td>\n",
       "      <td>Intraoperative Radiation Therapy for Resectabl...</td>\n",
       "      <td>2020-04-02</td>\n",
       "      <td>2026-04-01</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>Interventional</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5029</th>\n",
       "      <td>2018PS02</td>\n",
       "      <td>Pharmacist and Data Driven Quality Improvement...</td>\n",
       "      <td>2020-09-16</td>\n",
       "      <td>2021-06-30</td>\n",
       "      <td>Completed</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5019 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                         ID  \\\n",
       "Index                         \n",
       "1           BTX-BCI-016-PRT   \n",
       "2               2018-TJ-BCD   \n",
       "3             Breast cancer   \n",
       "4                  BC-BOMET   \n",
       "5                    241391   \n",
       "...                     ...   \n",
       "5025            32900654326   \n",
       "5026             ReDA 13176   \n",
       "5027   64121317.4.1001.5330   \n",
       "5028            3-2020-0038   \n",
       "5029               2018PS02   \n",
       "\n",
       "                                                   Title Start_Date  \\\n",
       "Index                                                                 \n",
       "1                     Breast Cancer Index (BCI) Registry 2021-04-14   \n",
       "2             Diagnosis Value of SEMA4C in Breast Cancer 2023-09-01   \n",
       "3      Role of Sorcin and Annexin A3 in Breast Cancer... 2019-01-20   \n",
       "4      Evaluation of Prognostic Factors: From Breast ... 2020-01-13   \n",
       "5               A Study to Identify Breast Cancer (IDBC) 2019-01-24   \n",
       "...                                                  ...        ...   \n",
       "5025            TPVB, PECSB, ESPB for Postmastectmy Pain 2019-04-10   \n",
       "5026   A Randomized Phase III Trial of Stereotactic A... 2023-04-01   \n",
       "5027   Nivolumab in Prostate Cancer With DNA Repair D... 2018-06-01   \n",
       "5028   Intraoperative Radiation Therapy for Resectabl... 2020-04-02   \n",
       "5029   Pharmacist and Data Driven Quality Improvement... 2020-09-16   \n",
       "\n",
       "      Completion_Date                  Status      Study_Type  \n",
       "Index                                                          \n",
       "1          2028-12-01              Recruiting   Observational  \n",
       "2          2024-09-01      Not yet recruiting   Observational  \n",
       "3          2019-09-30          Unknown status   Observational  \n",
       "4          2024-11-12              Recruiting   Observational  \n",
       "5          2022-12-31          Unknown status   Observational  \n",
       "...               ...                     ...             ...  \n",
       "5025       2021-08-10               Completed  Interventional  \n",
       "5026       2029-04-01      Not yet recruiting  Interventional  \n",
       "5027       2022-03-01  Active, not recruiting  Interventional  \n",
       "5028       2026-04-01              Recruiting  Interventional  \n",
       "5029       2021-06-30               Completed   Observational  \n",
       "\n",
       "[5019 rows x 6 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Renaming the columns for registration_info table and convertion to datetime and removing duplicates\n",
    "registration_df.columns = ['ID', 'Title', 'Start_Date', 'Completion_Date', 'Status', 'Study_Type']\n",
    "registration_df.index.names = ['Index']\n",
    "registration_df['Start_Date'] = pd.to_datetime(registration_df['Start_Date'])\n",
    "registration_df['Completion_Date'] = pd.to_datetime(registration_df['Completion_Date'])\n",
    "registration_df = registration_df.drop_duplicates(subset=['ID'], keep='first')\n",
    "registration_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "2f40948f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Title</th>\n",
       "      <th>Start_Date</th>\n",
       "      <th>Completion_Date</th>\n",
       "      <th>Actual_Duration</th>\n",
       "      <th>Status</th>\n",
       "      <th>Study_Type</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Index</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>Breast Cancer Index (BCI) Registry</td>\n",
       "      <td>2021-04-14</td>\n",
       "      <td>2028-12-01</td>\n",
       "      <td>7.638356</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>Diagnosis Value of SEMA4C in Breast Cancer</td>\n",
       "      <td>2023-09-01</td>\n",
       "      <td>2024-09-01</td>\n",
       "      <td>1.002740</td>\n",
       "      <td>Not yet recruiting</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>Role of Sorcin and Annexin A3 in Breast Cancer...</td>\n",
       "      <td>2019-01-20</td>\n",
       "      <td>2019-09-30</td>\n",
       "      <td>0.693151</td>\n",
       "      <td>Unknown status</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>Evaluation of Prognostic Factors: From Breast ...</td>\n",
       "      <td>2020-01-13</td>\n",
       "      <td>2024-11-12</td>\n",
       "      <td>4.835616</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>241391</td>\n",
       "      <td>A Study to Identify Breast Cancer (IDBC)</td>\n",
       "      <td>2019-01-24</td>\n",
       "      <td>2022-12-31</td>\n",
       "      <td>3.936986</td>\n",
       "      <td>Unknown status</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5025</th>\n",
       "      <td>32900654326</td>\n",
       "      <td>TPVB, PECSB, ESPB for Postmastectmy Pain</td>\n",
       "      <td>2019-04-10</td>\n",
       "      <td>2021-08-10</td>\n",
       "      <td>2.336986</td>\n",
       "      <td>Completed</td>\n",
       "      <td>Interventional</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5026</th>\n",
       "      <td>ReDA 13176</td>\n",
       "      <td>A Randomized Phase III Trial of Stereotactic A...</td>\n",
       "      <td>2023-04-01</td>\n",
       "      <td>2029-04-01</td>\n",
       "      <td>6.005479</td>\n",
       "      <td>Not yet recruiting</td>\n",
       "      <td>Interventional</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5027</th>\n",
       "      <td>64121317.4.1001.5330</td>\n",
       "      <td>Nivolumab in Prostate Cancer With DNA Repair D...</td>\n",
       "      <td>2018-06-01</td>\n",
       "      <td>2022-03-01</td>\n",
       "      <td>3.750685</td>\n",
       "      <td>Active, not recruiting</td>\n",
       "      <td>Interventional</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5028</th>\n",
       "      <td>3-2020-0038</td>\n",
       "      <td>Intraoperative Radiation Therapy for Resectabl...</td>\n",
       "      <td>2020-04-02</td>\n",
       "      <td>2026-04-01</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>Recruiting</td>\n",
       "      <td>Interventional</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5029</th>\n",
       "      <td>2018PS02</td>\n",
       "      <td>Pharmacist and Data Driven Quality Improvement...</td>\n",
       "      <td>2020-09-16</td>\n",
       "      <td>2021-06-30</td>\n",
       "      <td>0.786301</td>\n",
       "      <td>Completed</td>\n",
       "      <td>Observational</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5019 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                         ID  \\\n",
       "Index                         \n",
       "1           BTX-BCI-016-PRT   \n",
       "2               2018-TJ-BCD   \n",
       "3             Breast cancer   \n",
       "4                  BC-BOMET   \n",
       "5                    241391   \n",
       "...                     ...   \n",
       "5025            32900654326   \n",
       "5026             ReDA 13176   \n",
       "5027   64121317.4.1001.5330   \n",
       "5028            3-2020-0038   \n",
       "5029               2018PS02   \n",
       "\n",
       "                                                   Title Start_Date  \\\n",
       "Index                                                                 \n",
       "1                     Breast Cancer Index (BCI) Registry 2021-04-14   \n",
       "2             Diagnosis Value of SEMA4C in Breast Cancer 2023-09-01   \n",
       "3      Role of Sorcin and Annexin A3 in Breast Cancer... 2019-01-20   \n",
       "4      Evaluation of Prognostic Factors: From Breast ... 2020-01-13   \n",
       "5               A Study to Identify Breast Cancer (IDBC) 2019-01-24   \n",
       "...                                                  ...        ...   \n",
       "5025            TPVB, PECSB, ESPB for Postmastectmy Pain 2019-04-10   \n",
       "5026   A Randomized Phase III Trial of Stereotactic A... 2023-04-01   \n",
       "5027   Nivolumab in Prostate Cancer With DNA Repair D... 2018-06-01   \n",
       "5028   Intraoperative Radiation Therapy for Resectabl... 2020-04-02   \n",
       "5029   Pharmacist and Data Driven Quality Improvement... 2020-09-16   \n",
       "\n",
       "      Completion_Date  Actual_Duration                  Status      Study_Type  \n",
       "Index                                                                           \n",
       "1          2028-12-01         7.638356              Recruiting   Observational  \n",
       "2          2024-09-01         1.002740      Not yet recruiting   Observational  \n",
       "3          2019-09-30         0.693151          Unknown status   Observational  \n",
       "4          2024-11-12         4.835616              Recruiting   Observational  \n",
       "5          2022-12-31         3.936986          Unknown status   Observational  \n",
       "...               ...              ...                     ...             ...  \n",
       "5025       2021-08-10         2.336986               Completed  Interventional  \n",
       "5026       2029-04-01         6.005479      Not yet recruiting  Interventional  \n",
       "5027       2022-03-01         3.750685  Active, not recruiting  Interventional  \n",
       "5028       2026-04-01         6.000000              Recruiting  Interventional  \n",
       "5029       2021-06-30         0.786301               Completed   Observational  \n",
       "\n",
       "[5019 rows x 7 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# add new column 'Actual Duration' calculating the difference between start date and completion date \n",
    "registration_df['Actual_Duration'] = (registration_df['Completion_Date'] - registration_df['Start_Date'])\n",
    "registration_df['Actual_Duration'] = registration_df['Actual_Duration'].dt.total_seconds() / (365*24*60*60)\n",
    "\n",
    "#------------------------------------------------------\n",
    "\n",
    "# Reorder the columns \n",
    "registration_df = registration_df[['ID', 'Title', 'Start_Date', 'Completion_Date', 'Actual_Duration', 'Status', 'Study_Type']]\n",
    "registration_df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "951d30a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List of fields for participant table\n",
    "participant_cols = ['OrgStudyId',\n",
    "               'Gender',\n",
    "               'MinimumAge',\n",
    "               'HealthyVolunteers']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "dfad12a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call the function the create DataFrame for participant table\n",
    "participant_df = pd.DataFrame(create_df(participant_cols))\n",
    "participant_df.set_index('Rank', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "26b5fa0d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\gahyo\\anaconda3\\envs\\mlenv\\lib\\site-packages\\ipykernel_launcher.py:9: FutureWarning: The default value of regex will change from True to False in a future version.\n",
      "  if __name__ == \"__main__\":\n"
     ]
    }
   ],
   "source": [
    "# Clean DataFrame for participant table\n",
    "clean_data(participant_df, participant_cols)\n",
    "clean_data_years(participant_df, participant_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "dacb6b5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimum Age: Change any months values into 0 years\n",
    "for i, rows in participant_df.iterrows():\n",
    "    if 'Months' in participant_df.loc[i].MinimumAge:\n",
    "        participant_df.loc[i].MinimumAge = 0\n",
    "    elif 'Month' in participant_df.loc[i].MinimumAge:\n",
    "        participant_df.loc[i].MinimumAge = 0\n",
    "    else:\n",
    "        continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f307440f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Minimum_Age</th>\n",
       "      <th>Healthy_Volunteers</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Index</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>Female</td>\n",
       "      <td>18</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>Female</td>\n",
       "      <td>18</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>Female</td>\n",
       "      <td>20</td>\n",
       "      <td>Accepts Healthy Volunteers</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>Female</td>\n",
       "      <td>18</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>241391</td>\n",
       "      <td>Female</td>\n",
       "      <td>30</td>\n",
       "      <td>Accepts Healthy Volunteers</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5025</th>\n",
       "      <td>32900654326</td>\n",
       "      <td>Female</td>\n",
       "      <td>18</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5026</th>\n",
       "      <td>ReDA 13176</td>\n",
       "      <td>All</td>\n",
       "      <td>18</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5027</th>\n",
       "      <td>64121317.4.1001.5330</td>\n",
       "      <td>Male</td>\n",
       "      <td>18</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5028</th>\n",
       "      <td>3-2020-0038</td>\n",
       "      <td>All</td>\n",
       "      <td>20</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5029</th>\n",
       "      <td>2018PS02</td>\n",
       "      <td>All</td>\n",
       "      <td>nan</td>\n",
       "      <td>nan</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5019 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                         ID  Gender Minimum_Age          Healthy_Volunteers\n",
       "Index                                                                      \n",
       "1           BTX-BCI-016-PRT  Female         18                           No\n",
       "2               2018-TJ-BCD  Female         18                           No\n",
       "3             Breast cancer  Female         20   Accepts Healthy Volunteers\n",
       "4                  BC-BOMET  Female         18                           No\n",
       "5                    241391  Female         30   Accepts Healthy Volunteers\n",
       "...                     ...     ...         ...                         ...\n",
       "5025            32900654326  Female         18                           No\n",
       "5026             ReDA 13176     All         18                           No\n",
       "5027   64121317.4.1001.5330    Male         18                           No\n",
       "5028            3-2020-0038     All         20                           No\n",
       "5029               2018PS02     All         nan                         nan\n",
       "\n",
       "[5019 rows x 4 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Renaming the columns for participant table and getting rid of rows\n",
    "participant_df.columns = ['ID', 'Gender', 'Minimum_Age', 'Healthy_Volunteers']\n",
    "participant_df.index.names = ['Index']\n",
    "participant_df = participant_df.drop_duplicates(subset=['ID'], keep='first')\n",
    "participant_df\n",
    "\n",
    "# ** below code can be used if Minimum Age needs to be converted into integer\n",
    "#  participant_df['Minimum_Age'] = participant_df['Minimum_Age'].astype(float).astype('Int64')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "fd1e1bf7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List of fields for study detail table\n",
    "study_detail_cols = ['OrgStudyId',\n",
    "               'IsFDARegulatedDrug',\n",
    "               'IsFDARegulatedDevice',\n",
    "               'ResponsiblePartyType']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c5919043",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call the function the create DataFrame for study method table\n",
    "study_details_df = pd.DataFrame(create_df(study_detail_cols))\n",
    "study_details_df.set_index('Rank', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "1b609c17",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\gahyo\\anaconda3\\envs\\mlenv\\lib\\site-packages\\ipykernel_launcher.py:9: FutureWarning: The default value of regex will change from True to False in a future version.\n",
      "  if __name__ == \"__main__\":\n"
     ]
    }
   ],
   "source": [
    "# Clean DataFrame for study method table\n",
    "clean_data(study_details_df, study_detail_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "e9fa270b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>FDA_Regulated_Drug</th>\n",
       "      <th>FDA_Regulated_Device</th>\n",
       "      <th>Responsible_Party</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Index</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Principal Investigator</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Principal Investigator</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>241391</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5025</th>\n",
       "      <td>32900654326</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Principal Investigator</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5026</th>\n",
       "      <td>ReDA 13176</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor-Investigator</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5027</th>\n",
       "      <td>64121317.4.1001.5330</td>\n",
       "      <td>Yes</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5028</th>\n",
       "      <td>3-2020-0038</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Sponsor-Investigator</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5029</th>\n",
       "      <td>2018PS02</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Principal Investigator</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5019 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                         ID FDA_Regulated_Drug FDA_Regulated_Device  \\\n",
       "Index                                                                 \n",
       "1           BTX-BCI-016-PRT                 No                   No   \n",
       "2               2018-TJ-BCD                 No                   No   \n",
       "3             Breast cancer                 No                   No   \n",
       "4                  BC-BOMET                 No                   No   \n",
       "5                    241391                 No                   No   \n",
       "...                     ...                ...                  ...   \n",
       "5025            32900654326                 No                   No   \n",
       "5026             ReDA 13176                 No                   No   \n",
       "5027   64121317.4.1001.5330                Yes                   No   \n",
       "5028            3-2020-0038                 No                   No   \n",
       "5029               2018PS02                 No                   No   \n",
       "\n",
       "            Responsible_Party  \n",
       "Index                          \n",
       "1                     Sponsor  \n",
       "2      Principal Investigator  \n",
       "3      Principal Investigator  \n",
       "4                     Sponsor  \n",
       "5                     Sponsor  \n",
       "...                       ...  \n",
       "5025   Principal Investigator  \n",
       "5026     Sponsor-Investigator  \n",
       "5027                  Sponsor  \n",
       "5028     Sponsor-Investigator  \n",
       "5029   Principal Investigator  \n",
       "\n",
       "[5019 rows x 4 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Renaming the columns for study detail table and removing duplicates\n",
    "study_details_df.columns = ['ID', 'FDA_Regulated_Drug', 'FDA_Regulated_Device', 'Responsible_Party']\n",
    "study_details_df.index.names = ['Index']\n",
    "study_details_df = study_details_df.drop_duplicates(subset=['ID'], keep='first')\n",
    "study_details_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "cf276005",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List of fields for study method table\n",
    "study_method_cols = ['OrgStudyId',\n",
    "          'StudyType',\n",
    "          'ArmGroupType',\n",
    "          'InterventionType',\n",
    "          'DesignInterventionModel',\n",
    "          'DesignObservationalModel',\n",
    "          'TargetDuration',\n",
    "          'SamplingMethod',\n",
    "          'Phase']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "3c65381a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call function to create DataFrame for table D\n",
    "study_method_df = pd.DataFrame(create_df(study_method_cols))\n",
    "study_method_df.set_index('Rank',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "cb290e3e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\gahyo\\anaconda3\\envs\\mlenv\\lib\\site-packages\\ipykernel_launcher.py:9: FutureWarning: The default value of regex will change from True to False in a future version.\n",
      "  if __name__ == \"__main__\":\n"
     ]
    }
   ],
   "source": [
    "# Clean DataFrame for study method table\n",
    "clean_data(study_method_df, study_method_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "294adaef",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Study_Type</th>\n",
       "      <th>Arm_Group_Type</th>\n",
       "      <th>Intervention_Type</th>\n",
       "      <th>Interventional_Study_Model</th>\n",
       "      <th>Observational_Study_Model</th>\n",
       "      <th>Target_Duration</th>\n",
       "      <th>Sampling_Method</th>\n",
       "      <th>Phase</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Index</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Diagnostic Test</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Only</td>\n",
       "      <td>5 Years</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Diagnostic Test</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Genetic</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Case-Control</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>241391</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Diagnostic Test</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cohort</td>\n",
       "      <td>1 Year</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5025</th>\n",
       "      <td>32900654326</td>\n",
       "      <td>Interventional</td>\n",
       "      <td>Placebo Comparator</td>\n",
       "      <td>Procedure</td>\n",
       "      <td>Parallel Assignment</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Not Applicable</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5026</th>\n",
       "      <td>ReDA 13176</td>\n",
       "      <td>Interventional</td>\n",
       "      <td>Active Comparator</td>\n",
       "      <td>Radiation</td>\n",
       "      <td>Parallel Assignment</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Phase 3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5027</th>\n",
       "      <td>64121317.4.1001.5330</td>\n",
       "      <td>Interventional</td>\n",
       "      <td>Experimental</td>\n",
       "      <td>Drug</td>\n",
       "      <td>Single Group Assignment</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Phase 2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5028</th>\n",
       "      <td>3-2020-0038</td>\n",
       "      <td>Interventional</td>\n",
       "      <td>Experimental</td>\n",
       "      <td>Radiation</td>\n",
       "      <td>Single Group Assignment</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Not Applicable</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5029</th>\n",
       "      <td>2018PS02</td>\n",
       "      <td>Observational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Behavioral</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Ecologic or Community</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Non-Probability Sample</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5019 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                         ID      Study_Type      Arm_Group_Type  \\\n",
       "Index                                                             \n",
       "1           BTX-BCI-016-PRT   Observational                 NaN   \n",
       "2               2018-TJ-BCD   Observational                 NaN   \n",
       "3             Breast cancer   Observational                 NaN   \n",
       "4                  BC-BOMET   Observational                 NaN   \n",
       "5                    241391   Observational                 NaN   \n",
       "...                     ...             ...                 ...   \n",
       "5025            32900654326  Interventional  Placebo Comparator   \n",
       "5026             ReDA 13176  Interventional   Active Comparator   \n",
       "5027   64121317.4.1001.5330  Interventional        Experimental   \n",
       "5028            3-2020-0038  Interventional        Experimental   \n",
       "5029               2018PS02   Observational                 NaN   \n",
       "\n",
       "      Intervention_Type Interventional_Study_Model Observational_Study_Model  \\\n",
       "Index                                                                          \n",
       "1       Diagnostic Test                        NaN                 Case-Only   \n",
       "2       Diagnostic Test                        NaN              Case-Control   \n",
       "3               Genetic                        NaN              Case-Control   \n",
       "4                 Other                        NaN              Case-Control   \n",
       "5       Diagnostic Test                        NaN                    Cohort   \n",
       "...                 ...                        ...                       ...   \n",
       "5025          Procedure        Parallel Assignment                       NaN   \n",
       "5026          Radiation        Parallel Assignment                       NaN   \n",
       "5027               Drug    Single Group Assignment                       NaN   \n",
       "5028          Radiation    Single Group Assignment                       NaN   \n",
       "5029         Behavioral                        NaN     Ecologic or Community   \n",
       "\n",
       "      Target_Duration         Sampling_Method           Phase  \n",
       "Index                                                          \n",
       "1             5 Years  Non-Probability Sample             NaN  \n",
       "2                 NaN  Non-Probability Sample             NaN  \n",
       "3                 NaN      Probability Sample             NaN  \n",
       "4                 NaN  Non-Probability Sample             NaN  \n",
       "5              1 Year  Non-Probability Sample             NaN  \n",
       "...               ...                     ...             ...  \n",
       "5025              NaN                     NaN  Not Applicable  \n",
       "5026              NaN                     NaN         Phase 3  \n",
       "5027              NaN                     NaN         Phase 2  \n",
       "5028              NaN                     NaN  Not Applicable  \n",
       "5029              NaN  Non-Probability Sample             NaN  \n",
       "\n",
       "[5019 rows x 9 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Renaming the columns for table D and removing duplicates and simplifyingstudy_method_df.columns = ['ID', 'Study_Type', 'Arm_Group_Type', 'Intervention_Type', 'Interventional_Study_Model', 'Observational_Study_Model', 'Target_Duration', 'Sampling_Method', 'Phase']\n",
    "study_method_df.index.names = ['Index']\n",
    "study_method_df.columns = ['ID', 'Study_Type', 'Arm_Group_Type', 'Intervention_Type', 'Interventional_Study_Model', 'Observational_Study_Model', 'Target_Duration', 'Sampling_Method', 'Phase']\n",
    "study_method_df = study_method_df.drop_duplicates(subset=['ID'], keep='first')\n",
    "study_method_df['Arm_Group_Type'] = study_method_df['Arm_Group_Type'].str.split(',').str[0]\n",
    "study_method_df['Intervention_Type'] = study_method_df['Intervention_Type'].str.split(',').str[0]\n",
    "study_method_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "31753f41",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List of fields for study detail table and export dataset into DataFrame\n",
    "text_analysis_cols = ['OrgStudyId',\n",
    "          'WhyStopped',            \n",
    "          'EnrollmentCount',\n",
    "          'PrimaryOutcomeMeasure',]\n",
    "text_analysis_df = pd.DataFrame(create_df(text_analysis_cols))\n",
    "text_analysis_df.set_index('Rank', inplace=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "8651a223",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\gahyo\\anaconda3\\envs\\mlenv\\lib\\site-packages\\ipykernel_launcher.py:9: FutureWarning: The default value of regex will change from True to False in a future version.\n",
      "  if __name__ == \"__main__\":\n"
     ]
    }
   ],
   "source": [
    "clean_data(text_analysis_df, text_analysis_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "09e7c3bc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Why_Stopped</th>\n",
       "      <th>Enrollment_Count</th>\n",
       "      <th>Primary_Outcome_Measure</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Index</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BTX-BCI-016-PRT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3000</td>\n",
       "      <td>To determine BCI test performance by evaluatin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-TJ-BCD</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2300</td>\n",
       "      <td>Diagnostic potential of SEMA4C as a biomarker ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Breast cancer</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80</td>\n",
       "      <td>Role of SORCIN in patients with breast cancer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BC-BOMET</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30</td>\n",
       "      <td>SENP1 expression</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>241391</td>\n",
       "      <td>NaN</td>\n",
       "      <td>600</td>\n",
       "      <td>Performance of the Syantra DX Breast Cancer te...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5025</th>\n",
       "      <td>32900654326</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80</td>\n",
       "      <td>The primary outcome of the study will be durat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5026</th>\n",
       "      <td>ReDA 13176</td>\n",
       "      <td>NaN</td>\n",
       "      <td>180</td>\n",
       "      <td>Overall Survival</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5027</th>\n",
       "      <td>64121317.4.1001.5330</td>\n",
       "      <td>NaN</td>\n",
       "      <td>38</td>\n",
       "      <td>PSA response rate</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5028</th>\n",
       "      <td>3-2020-0038</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80</td>\n",
       "      <td>3-year local recurrence rate</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5029</th>\n",
       "      <td>2018PS02</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19</td>\n",
       "      <td>Initially targeted drug therapy risks in patie...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5019 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                         ID Why_Stopped Enrollment_Count  \\\n",
       "Index                                                      \n",
       "1           BTX-BCI-016-PRT         NaN             3000   \n",
       "2               2018-TJ-BCD         NaN             2300   \n",
       "3             Breast cancer         NaN               80   \n",
       "4                  BC-BOMET         NaN               30   \n",
       "5                    241391         NaN              600   \n",
       "...                     ...         ...              ...   \n",
       "5025            32900654326         NaN               80   \n",
       "5026             ReDA 13176         NaN              180   \n",
       "5027   64121317.4.1001.5330         NaN               38   \n",
       "5028            3-2020-0038         NaN               80   \n",
       "5029               2018PS02         NaN               19   \n",
       "\n",
       "                                 Primary_Outcome_Measure  \n",
       "Index                                                     \n",
       "1      To determine BCI test performance by evaluatin...  \n",
       "2      Diagnostic potential of SEMA4C as a biomarker ...  \n",
       "3          Role of SORCIN in patients with breast cancer  \n",
       "4                                       SENP1 expression  \n",
       "5      Performance of the Syantra DX Breast Cancer te...  \n",
       "...                                                  ...  \n",
       "5025   The primary outcome of the study will be durat...  \n",
       "5026                                    Overall Survival  \n",
       "5027                                   PSA response rate  \n",
       "5028                        3-year local recurrence rate  \n",
       "5029   Initially targeted drug therapy risks in patie...  \n",
       "\n",
       "[5019 rows x 4 columns]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Renaming the columns for table E and removing duplicates\n",
    "text_analysis_df.columns = ['ID', 'Why_Stopped','Enrollment_Count', 'Primary_Outcome_Measure']\n",
    "text_analysis_df.index.names = ['Index']\n",
    "text_analysis_df = text_analysis_df.drop_duplicates(subset=['ID'], keep='first')\n",
    "text_analysis_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "0b08919a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export DataFrames as csv files \n",
    "study_method_df.to_csv('Tables/study_method_df.csv',index=False)\n",
    "text_analysis_df.to_csv('Tables/free_text_df.csv', index=False)\n",
    "registration_df.to_csv('Tables/registration_df.csv', index=False)\n",
    "participant_df.to_csv('Tables/participant_df.csv', index=False)\n",
    "study_details_df.to_csv('Tables/study_details_df.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}