--- a +++ b/notebooks/scraping.ipynb @@ -0,0 +1,1027 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.support.ui import Select\n",
+    "from selenium.webdriver.common.by import By\n",
+    "from selenium.webdriver.support.ui import WebDriverWait\n",
+    "from selenium.webdriver.support import expected_conditions as EC\n",
+    "from selenium.common.exceptions import TimeoutException\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "XPath of the 'Recruiting and not yet recruiting studies' radio option on the clinicaltrials.gov home page:\n",
+    "\n",
+    "`//*[@id=\"adv-check-status\"]/div/div[2]`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Radio button for 'Recruiting and not yet recruiting studies' is detected and visible: True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Experiment: locate and click the status radio option on the home page.\n",
+    "driver = webdriver.Chrome()\n",
+    "driver.set_window_size(1120, 1000)\n",
+    "\n",
+    "try:\n",
+    "    driver.get(\"https://clinicaltrials.gov\")\n",
+    "\n",
+    "    # Wait for the radio group container to be visible\n",
+    "    WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"main-content\"]/ctg-home/div/div[2]/ctg-home-search-panel/div/div[2]/ctg-search-filters-form/div[2]'))\n",
+    "    )\n",
+    "\n",
+    "    # Option 1: Wait for the radio button using ID\n",
+    "    recruiting_radio_button = WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"adv-check-status\"]/div/div[2]'))\n",
+    "    )\n",
+    "\n",
+    "    # Option 2: Wait for the label and locate the radio button\n",
+    "    # recruiting_radio_label = WebDriverWait(driver, 20).until(\n",
+    "    #     EC.element_to_be_clickable((By.XPATH, \"//label[@for='adv-radio-status1']\"))\n",
+    "    # )\n",
+    "    # recruiting_radio_button = driver.find_element(By.XPATH, \"//input[@id='adv-radio-status1']\")\n",
+    "\n",
+    "    # Print if the radio button is displayed\n",
+    "    print(\"Radio button for 'Recruiting and not yet recruiting studies' is detected and visible:\", recruiting_radio_button.is_displayed())\n",
+    "\n",
+    "    # Scroll the page to bring the element into view before clicking\n",
+    "    driver.execute_script(\"arguments[0].scrollIntoView(true);\", recruiting_radio_button)\n",
+    "\n",
+    "    # Click the button after scrolling to it\n",
+    "    recruiting_radio_button.click()\n",
+    "\n",
+    "except TimeoutException:\n",
+    "    print(\"The radio button was not found or is not interactable.\")\n",
+    "    print(driver.page_source)\n",
+    "\n",
+    "finally:\n",
+    "    # Always release the browser; otherwise every run leaves a Chrome session behind.\n",
+    "    time.sleep(5)  # Pause to visually inspect the browser before it closes\n",
+    "    driver.quit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Recruiting option selected successfully.\n",
+      "Search button clicked, navigating to the results page.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.common.by import By\n",
+    "from selenium.webdriver.support.ui import WebDriverWait\n",
+    "from selenium.webdriver.support import expected_conditions as EC\n",
+    "from selenium.common.exceptions import TimeoutException\n",
+    "import time\n",
+    "\n",
+    "# Experiment: select the status radio option on the home page, then run the search.\n",
+    "driver = webdriver.Chrome()\n",
+    "driver.set_window_size(1120, 1000)\n",
+    "\n",
+    "try:\n",
+    "    driver.get(\"https://clinicaltrials.gov\")\n",
+    "\n",
+    "    # Wait for the radio group container to be visible\n",
+    "    WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"main-content\"]/ctg-home/div/div[2]/ctg-home-search-panel/div/div[2]/ctg-search-filters-form/div[2]'))\n",
+    "    )\n",
+    "\n",
+    "    # Scroll down to the radio button section\n",
+    "    driver.execute_script(\"window.scrollBy(0, 600);\")  # Adjust scrolling value as needed\n",
+    "    time.sleep(1)  # Give time for the scrolling action to be visually clear\n",
+    "\n",
+    "    # Locate the 'Recruiting and not yet recruiting studies' radio option\n",
+    "    recruiting_radio_button = WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"adv-check-status\"]/div/div[2]'))\n",
+    "    )\n",
+    "\n",
+    "    # Scroll the page to bring the radio button into view and click it\n",
+    "    driver.execute_script(\"arguments[0].scrollIntoView(true);\", recruiting_radio_button)\n",
+    "    time.sleep(1)  # Pause before interacting\n",
+    "    recruiting_radio_button.click()\n",
+    "    time.sleep(1)  # Wait a bit to confirm the selection\n",
+    "\n",
+    "    # Verify the selection on the underlying <input>\n",
+    "    selected_status = driver.find_element(By.XPATH, '//*[@id=\"adv-radio-status1\"]').is_selected()\n",
+    "    if selected_status:\n",
+    "        print(\"Recruiting option selected successfully.\")\n",
+    "    else:\n",
+    "        print(\"Failed to select Recruiting option.\")\n",
+    "\n",
+    "    # Wait for the search button to become clickable\n",
+    "    search_button = WebDriverWait(driver, 20).until(\n",
+    "        EC.element_to_be_clickable((By.XPATH, '//*[@id=\"main-content\"]/ctg-home/div/div[2]/ctg-home-search-panel/div/div[3]/div/div/button'))\n",
+    "    )\n",
+    "\n",
+    "    # Scroll the page to bring the search button into view (just in case)\n",
+    "    driver.execute_script(\"arguments[0].scrollIntoView(true);\", search_button)\n",
+    "    time.sleep(1)  # Wait a bit before clicking\n",
+    "\n",
+    "    # Click the search button\n",
+    "    search_button.click()\n",
+    "    print(\"Search button clicked, navigating to the results page.\")\n",
+    "\n",
+    "except TimeoutException:\n",
+    "    # Any of the three waits above can time out, so do not blame the search button alone.\n",
+    "    print(\"Timed out waiting for the filter form, the radio option or the search button.\")\n",
+    "    print(driver.page_source)\n",
+    "\n",
+    "finally:\n",
+    "    # Always release the browser; otherwise every run leaves a Chrome session behind.\n",
+    "    time.sleep(5)  # Pause to visually inspect the browser before it closes\n",
+    "    driver.quit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Recruiting option selected successfully.\n",
+      "Search 
button clicked, navigating to the results page.\n",
+      "'Not yet recruiting' checkbox was already unselected.\n",
+      "An error occurred: Message: \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.common.by import By\n",
+    "from selenium.webdriver.support.ui import WebDriverWait\n",
+    "from selenium.webdriver.support import expected_conditions as EC\n",
+    "import time\n",
+    "\n",
+    "# End-to-end experiment: search from the home page, then adjust the status filter\n",
+    "# on the results page.\n",
+    "driver = webdriver.Chrome()\n",
+    "driver.set_window_size(1120, 1000)\n",
+    "\n",
+    "try:\n",
+    "    driver.get(\"https://clinicaltrials.gov\")\n",
+    "\n",
+    "    # ----- FIRST PAGE -----\n",
+    "    # Wait for the radio group container to be visible\n",
+    "    WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"main-content\"]/ctg-home/div/div[2]/ctg-home-search-panel/div/div[2]/ctg-search-filters-form/div[2]'))\n",
+    "    )\n",
+    "\n",
+    "    # Scroll down to the radio button section\n",
+    "    driver.execute_script(\"window.scrollBy(0, 600);\")  # Adjust scrolling value as needed\n",
+    "    time.sleep(1)  # Give time for the scrolling action to be visually clear\n",
+    "\n",
+    "    # Locate the 'Recruiting and not yet recruiting studies' radio option\n",
+    "    recruiting_radio_button = WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"adv-check-status\"]/div/div[2]'))\n",
+    "    )\n",
+    "\n",
+    "    # Scroll the page to bring the radio button into view and click it\n",
+    "    driver.execute_script(\"arguments[0].scrollIntoView(true);\", recruiting_radio_button)\n",
+    "    time.sleep(1)  # Pause before interacting\n",
+    "    recruiting_radio_button.click()\n",
+    "    time.sleep(1)  # Wait a bit to confirm the selection\n",
+    "\n",
+    "    # Verify the selection on the underlying <input>\n",
+    "    selected_status = driver.find_element(By.XPATH, '//*[@id=\"adv-radio-status1\"]').is_selected()\n",
+    "    if selected_status:\n",
+    "        print(\"Recruiting option selected successfully.\")\n",
+    "    else:\n",
+    "        print(\"Failed to select Recruiting option.\")\n",
+    "\n",
+    "    # Wait for the search button to become clickable\n",
+    "    search_button = WebDriverWait(driver, 20).until(\n",
+    "        EC.element_to_be_clickable((By.XPATH, '//*[@id=\"main-content\"]/ctg-home/div/div[2]/ctg-home-search-panel/div/div[3]/div/div/button'))\n",
+    "    )\n",
+    "\n",
+    "    # Scroll the page to bring the search button into view (just in case)\n",
+    "    driver.execute_script(\"arguments[0].scrollIntoView(true);\", search_button)\n",
+    "    time.sleep(1)  # Wait a bit before clicking\n",
+    "\n",
+    "    # Click the search button\n",
+    "    search_button.click()\n",
+    "    print(\"Search button clicked, navigating to the results page.\")\n",
+    "\n",
+    "    # ----- SECOND PAGE -----\n",
+    "    # The explicit wait below blocks until the results page has rendered its filter\n",
+    "    # form, so no fixed sleep is needed after the click.\n",
+    "    filter_container = WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"main-content\"]/ctg-search-results-page/div[2]/section/div[1]/ctg-focus-your-search-panel/div/div[2]/ctg-search-filters-form'))\n",
+    "    )\n",
+    "\n",
+    "    # Scroll to the filter container\n",
+    "    driver.execute_script(\"arguments[0].scrollIntoView(true);\", filter_container)\n",
+    "    time.sleep(1)\n",
+    "\n",
+    "    # Locate the wrapper element of the \"Not yet recruiting\" checkbox\n",
+    "    not_yet_recruiting_checkbox = WebDriverWait(driver, 20).until(\n",
+    "        EC.element_to_be_clickable((By.XPATH, '//*[@id=\"adv-check-status\"]/div[2]/div[1]'))\n",
+    "    )\n",
+    "\n",
+    "    # Read the state from the nested <input>: is_selected() is only meaningful for\n",
+    "    # checkbox/radio inputs and <option>s, so on the wrapper <div> it always returns False.\n",
+    "    checkbox_input = not_yet_recruiting_checkbox.find_element(By.TAG_NAME, 'input')\n",
+    "\n",
+    "    # Unselect the checkbox if it's selected\n",
+    "    if checkbox_input.is_selected():\n",
+    "        driver.execute_script(\"arguments[0].click();\", checkbox_input)\n",
+    "        print(\"Unselected the 'Not yet recruiting' checkbox.\")\n",
+    "    else:\n",
+    "        print(\"'Not yet recruiting' checkbox was already unselected.\")\n",
+    "\n",
+    "    # Wait for the Apply Filters button and scroll to it\n",
+    "    apply_filters_button = WebDriverWait(driver, 20).until(\n",
+    "        EC.element_to_be_clickable((By.XPATH, '//*[@id=\"apply-filters\"]'))\n",
+    "    )\n",
+    "    driver.execute_script(\"arguments[0].scrollIntoView(true);\", apply_filters_button)\n",
+    "    time.sleep(1)\n",
+    "\n",
+    "    # Click the Apply Filters button\n",
+    "    apply_filters_button.click()\n",
+    "    print(\"Clicked the 'Apply filters' button.\")\n",
+    "\n",
+    "except Exception as e:\n",
+    "    # Include the exception type: a Selenium timeout carries an empty message.\n",
+    "    print(\"An error occurred:\", type(e).__name__, e)\n",
+    "\n",
+    "finally:\n",
+    "    # Always release the browser; otherwise every run leaves a Chrome session behind.\n",
+    "    time.sleep(5)  # Pause to visually inspect the browser before it closes\n",
+    "    driver.quit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Waiting for the filter container to load...\n",
+      "Filter container loaded successfully.\n",
+      "Verifying the 'Not yet recruiting' checkbox...\n",
+      "Checkbox 'Not yet recruiting' found.\n",
+      "Checkbox is already unselected.\n",
+      "Verifying the 'Apply Filters' button...\n",
+      "An error occurred: Message: \n",
+      "\n",
+      "Script completed. 
Keeping the browser window open.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.common.by import By\n",
+    "from selenium.webdriver.support.ui import WebDriverWait\n",
+    "from selenium.webdriver.support import expected_conditions as EC\n",
+    "import time\n",
+    "\n",
+    "# Step-by-step check of the status filter on the results page.\n",
+    "driver = webdriver.Chrome()\n",
+    "driver.set_window_size(1120, 1000)\n",
+    "\n",
+    "try:\n",
+    "    # Open the results page directly\n",
+    "    driver.get(\"https://clinicaltrials.gov/search?aggFilters=status:not%20rec\")\n",
+    "\n",
+    "    # ----- STEP 1: Verify filter container -----\n",
+    "    print(\"Waiting for the filter container to load...\")\n",
+    "    filter_container = WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"main-content\"]/ctg-search-results-page/div[2]/section/div[1]/ctg-focus-your-search-panel/div/div[2]/ctg-search-filters-form'))\n",
+    "    )\n",
+    "    print(\"Filter container loaded successfully.\")\n",
+    "    time.sleep(2)  # Pause for visual confirmation\n",
+    "\n",
+    "    # ----- STEP 2: Verify 'Not yet recruiting' checkbox -----\n",
+    "    print(\"Verifying the 'Not yet recruiting' checkbox...\")\n",
+    "    not_yet_recruiting_checkbox = WebDriverWait(driver, 20).until(\n",
+    "        EC.presence_of_element_located((By.XPATH, '//*[@id=\"adv-check-status\"]/div[2]/div[1]'))\n",
+    "    )\n",
+    "    print(\"Checkbox 'Not yet recruiting' found.\")\n",
+    "    time.sleep(2)  # Pause for visual confirmation\n",
+    "\n",
+    "    # Read the state from the nested <input>: is_selected() is only meaningful for\n",
+    "    # checkbox/radio inputs and <option>s, so on the wrapper <div> it always returns False.\n",
+    "    checkbox_input = not_yet_recruiting_checkbox.find_element(By.TAG_NAME, 'input')\n",
+    "\n",
+    "    # Check if it's selected, and unselect if it is\n",
+    "    if checkbox_input.is_selected():\n",
+    "        print(\"Checkbox is selected, unselecting it...\")\n",
+    "        driver.execute_script(\"arguments[0].click();\", checkbox_input)\n",
+    "        print(\"Checkbox unselected successfully.\")\n",
+    "    else:\n",
+    "        print(\"Checkbox is already unselected.\")\n",
+    "    time.sleep(2)  # Pause for visual confirmation\n",
+    "\n",
+    "    # ----- STEP 3: Verify 'Apply Filters' button -----\n",
+    "    print(\"Verifying the 'Apply Filters' button...\")\n",
+    "    apply_filters_button = WebDriverWait(driver, 20).until(\n",
+    "        EC.element_to_be_clickable((By.XPATH, '//*[@id=\"apply-filters\"]'))\n",
+    "    )\n",
+    "    print(\"'Apply Filters' button is available.\")\n",
+    "    time.sleep(2)  # Pause for visual confirmation\n",
+    "\n",
+    "    # ----- STEP 4: Click 'Apply Filters' -----\n",
+    "    print(\"Clicking the 'Apply Filters' button...\")\n",
+    "    apply_filters_button.click()\n",
+    "    print(\"Clicked 'Apply Filters'.\")\n",
+    "    time.sleep(2)  # Pause for visual confirmation of the results after applying the filter\n",
+    "\n",
+    "except Exception as e:\n",
+    "    # Include the exception type: a Selenium timeout carries an empty message.\n",
+    "    print(\"An error occurred:\", type(e).__name__, e)\n",
+    "\n",
+    "finally:\n",
+    "    # Keep the browser open until the user confirms, but always quit the driver,\n",
+    "    # even when the prompt is interrupted or no interactive stdin is available.\n",
+    "    print(\"Script completed. Keeping the browser window open.\")\n",
+    "    try:\n",
+    "        input(\"Press Enter to close the browser window...\")\n",
+    "    except (EOFError, NotImplementedError):\n",
+    "        pass  # no stdin (e.g. headless execution); skip the pause\n",
+    "    finally:\n",
+    "        driver.quit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Filter container loaded successfully.\n",
+      "Scrolling the filter container...\n",
+      "Script completed. 
Keeping the browser window open.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.common.by import By\n",
+    "from selenium.webdriver.support.ui import WebDriverWait\n",
+    "from selenium.webdriver.support import expected_conditions as EC\n",
+    "import time\n",
+    "\n",
+    "# Experiment: scroll inside the filter panel of the results page.\n",
+    "driver = webdriver.Chrome()\n",
+    "driver.set_window_size(1120, 1000)\n",
+    "\n",
+    "try:\n",
+    "    # Navigate to the page\n",
+    "    driver.get(\"https://clinicaltrials.gov/search?aggFilters=status:not%20rec\")\n",
+    "\n",
+    "    # Step 1: Wait for the filter container to load\n",
+    "    filter_container = WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"main-content\"]/ctg-search-results-page/div[2]/section/div[1]/ctg-focus-your-search-panel/div/div[2]/ctg-search-filters-form'))\n",
+    "    )\n",
+    "    print(\"Filter container loaded successfully.\")\n",
+    "    time.sleep(2)\n",
+    "\n",
+    "    # Step 2: Scroll within the filter container using JavaScript\n",
+    "    print(\"Scrolling the filter container...\")\n",
+    "\n",
+    "    # Scroll by a specific amount (e.g., 500 pixels)\n",
+    "    driver.execute_script(\"arguments[0].scrollTop = arguments[0].scrollTop + 500;\", filter_container)\n",
+    "\n",
+    "    # Alternatively, you can scroll to the bottom of the container:\n",
+    "    # driver.execute_script(\"arguments[0].scrollTop = arguments[0].scrollHeight;\", filter_container)\n",
+    "\n",
+    "    time.sleep(2)  # Pause for visual confirmation\n",
+    "\n",
+    "except Exception as e:\n",
+    "    # Include the exception type: a Selenium timeout carries an empty message.\n",
+    "    print(\"An error occurred:\", type(e).__name__, e)\n",
+    "\n",
+    "finally:\n",
+    "    # Keep the browser open until the user confirms, but always quit the driver,\n",
+    "    # even when the prompt is interrupted or no interactive stdin is available.\n",
+    "    print(\"Script completed. Keeping the browser window open.\")\n",
+    "    try:\n",
+    "        input(\"Press Enter to close the browser window...\")\n",
+    "    except (EOFError, NotImplementedError):\n",
+    "        pass  # no stdin (e.g. headless execution); skip the pause\n",
+    "    finally:\n",
+    "        driver.quit()\n"
+
]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Filter container loaded successfully.\n",
+      "Script completed. Keeping the browser window open.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.common.by import By\n",
+    "from selenium.webdriver.support.ui import WebDriverWait\n",
+    "from selenium.webdriver.support import expected_conditions as EC\n",
+    "import time\n",
+    "\n",
+    "# Experiment: outline the filter container to confirm the locator hits the right element.\n",
+    "driver = webdriver.Chrome()\n",
+    "driver.set_window_size(1120, 1000)\n",
+    "\n",
+    "try:\n",
+    "    # Navigate to the page\n",
+    "    driver.get(\"https://clinicaltrials.gov/search?aggFilters=status:not%20rec\")\n",
+    "\n",
+    "    # Wait for the filter container to be visible\n",
+    "    filter_container = WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"main-content\"]/ctg-search-results-page/div[2]/section/div[1]/ctg-focus-your-search-panel/div/div[2]/ctg-search-filters-form'))\n",
+    "    )\n",
+    "\n",
+    "    print(\"Filter container loaded successfully.\")\n",
+    "\n",
+    "    # Highlight the filter container by adding a red border around it\n",
+    "    driver.execute_script(\"arguments[0].style.border='3px solid red'\", filter_container)\n",
+    "\n",
+    "    # Pause to visually inspect the browser\n",
+    "    time.sleep(5)  # Adjust this if needed for more time to inspect\n",
+    "\n",
+    "except Exception as e:\n",
+    "    # Include the exception type: a Selenium timeout carries an empty message.\n",
+    "    print(\"An error occurred:\", type(e).__name__, e)\n",
+    "\n",
+    "finally:\n",
+    "    # Keep the browser open until the user confirms, but always quit the driver,\n",
+    "    # even when the prompt is interrupted or no interactive stdin is available.\n",
+    "    print(\"Script completed. Keeping the browser window open.\")\n",
+    "    try:\n",
+    "        input(\"Press Enter to close the browser window...\")\n",
+    "    except (EOFError, NotImplementedError):\n",
+    "        pass  # no stdin (e.g. headless execution); skip the pause\n",
+    "    finally:\n",
+    "        driver.quit()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Not Yet Recruiting button loaded successfully.\n",
+      "Script completed. Keeping the browser window open.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.common.by import By\n",
+    "from selenium.webdriver.support.ui import WebDriverWait\n",
+    "from selenium.webdriver.support import expected_conditions as EC\n",
+    "import time\n",
+    "\n",
+    "# Experiment: outline the \"Not Yet Recruiting\" filter option to confirm its locator.\n",
+    "driver = webdriver.Chrome()\n",
+    "driver.set_window_size(1120, 1000)\n",
+    "\n",
+    "try:\n",
+    "    # Navigate to the page\n",
+    "    driver.get(\"https://clinicaltrials.gov/search?aggFilters=status:not%20rec\")\n",
+    "\n",
+    "    # Wait for the wrapper element of the \"Not Yet Recruiting\" option to be visible\n",
+    "    not_yet_recruiting_button = WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"adv-check-status\"]/div[2]/div[1]'))\n",
+    "    )\n",
+    "\n",
+    "    print(\"Not Yet Recruiting button loaded successfully.\")\n",
+    "\n",
+    "    # Highlight the option by adding a red border around it\n",
+    "    driver.execute_script(\"arguments[0].style.border='3px solid red'\", not_yet_recruiting_button)\n",
+    "\n",
+    "    # Pause to visually inspect the browser\n",
+    "    time.sleep(5)  # Adjust this if needed for more time to inspect\n",
+    "\n",
+    "except Exception as e:\n",
+    "    # Include the exception type: a Selenium timeout carries an empty message.\n",
+    "    print(\"An error occurred:\", type(e).__name__, e)\n",
+    "\n",
+    "finally:\n",
+    "    # Keep the browser open until the user confirms, but always quit the driver,\n",
+    "    # even when the prompt is interrupted or no interactive stdin is available.\n",
+    "    print(\"Script completed. Keeping the browser window open.\")\n",
+    "    try:\n",
+    "        input(\"Press Enter to close the browser window...\")\n",
+    "    except (EOFError, NotImplementedError):\n",
+    "        pass  # no stdin (e.g. headless execution); skip the pause\n",
+    "    finally:\n",
+    "        driver.quit()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Not Yet Recruiting checkbox loaded successfully.\n",
+      "Not Yet Recruiting checkbox is currently selected. Unselecting it...\n",
+      "Not Yet Recruiting checkbox has been unselected.\n",
+      "Script completed. Keeping the browser window open.\n"
+     ]
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "Interrupted by user",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[35], line 50\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 48\u001b[0m \u001b[38;5;66;03m# Keep the browser open for visual confirmation\u001b[39;00m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mScript completed. 
Keeping the browser window open.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 50\u001b[0m \u001b[38;5;28;43minput\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPress Enter to close the browser window...\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 51\u001b[0m driver\u001b[38;5;241m.\u001b[39mquit()\n", + "File \u001b[0;32m~/Desktop/patient-trials-matching/env/lib/python3.9/site-packages/ipykernel/kernelbase.py:1282\u001b[0m, in \u001b[0;36mKernel.raw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 1280\u001b[0m msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraw_input was called, but this frontend does not support input requests.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1281\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m StdinNotImplementedError(msg)\n\u001b[0;32m-> 1282\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_input_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1283\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1284\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_parent_ident\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshell\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1285\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_parent\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshell\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1286\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mpassword\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1287\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/patient-trials-matching/env/lib/python3.9/site-packages/ipykernel/kernelbase.py:1325\u001b[0m, in \u001b[0;36mKernel._input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m 1323\u001b[0m \u001b[38;5;66;03m# re-raise KeyboardInterrupt, to truncate traceback\u001b[39;00m\n\u001b[1;32m 1324\u001b[0m msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInterrupted by user\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1325\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m(msg) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1326\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1327\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlog\u001b[38;5;241m.\u001b[39mwarning(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid Message:\u001b[39m\u001b[38;5;124m\"\u001b[39m, exc_info\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: Interrupted by user" + ] + } + ], + "source": [ + "from selenium import webdriver\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "import time\n", + "\n", + "# Set up the driver\n", + "driver = webdriver.Chrome()\n", + "driver.set_window_size(1120, 1000)\n", + "\n", + "# Navigate to the page\n", + "driver.get(\"https://clinicaltrials.gov/search?aggFilters=status:not%20rec\")\n", + "\n", 
+    "try:\n",
+    "    # Wait for the wrapper element of the \"Not Yet Recruiting\" checkbox to be visible\n",
+    "    not_yet_recruiting_checkbox = WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"adv-check-status\"]/div[2]/div[1]'))\n",
+    "    )\n",
+    "\n",
+    "    print(\"Not Yet Recruiting checkbox loaded successfully.\")\n",
+    "\n",
+    "    # The selected state lives on the nested <input>, not on the wrapper element\n",
+    "    checkbox_input = not_yet_recruiting_checkbox.find_element(By.TAG_NAME, 'input')\n",
+    "\n",
+    "    if checkbox_input.is_selected():\n",
+    "        print(\"Not Yet Recruiting checkbox is currently selected. Unselecting it...\")\n",
+    "\n",
+    "        # Scroll into view\n",
+    "        driver.execute_script(\"arguments[0].scrollIntoView(true);\", not_yet_recruiting_checkbox)\n",
+    "\n",
+    "        # Use JavaScript to click the checkbox to unselect it\n",
+    "        driver.execute_script(\"arguments[0].click();\", checkbox_input)\n",
+    "\n",
+    "        # Highlight it after unselecting\n",
+    "        driver.execute_script(\"arguments[0].style.border='3px solid red'\", not_yet_recruiting_checkbox)\n",
+    "\n",
+    "        print(\"Not Yet Recruiting checkbox has been unselected.\")\n",
+    "    else:\n",
+    "        print(\"Not Yet Recruiting checkbox is already unselected.\")\n",
+    "\n",
+    "    # Pause to visually inspect the browser\n",
+    "    time.sleep(5)  # Adjust this if needed for more time to inspect\n",
+    "\n",
+    "except Exception as e:\n",
+    "    # Include the exception type: a Selenium timeout carries an empty message.\n",
+    "    print(\"An error occurred:\", type(e).__name__, e)\n",
+    "\n",
+    "finally:\n",
+    "    # Keep the browser open until the user confirms, but always quit the driver.\n",
+    "    # The recorded run was interrupted at the prompt, so the quit call after it never ran.\n",
+    "    print(\"Script completed. Keeping the browser window open.\")\n",
+    "    try:\n",
+    "        input(\"Press Enter to close the browser window...\")\n",
+    "    except (EOFError, NotImplementedError):\n",
+    "        pass  # no stdin (e.g. headless execution); skip the pause\n",
+    "    finally:\n",
+    "        driver.quit()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Eligibility Criteria container loaded successfully.\n",
+      "Eligibility Criteria:\n",
+      "Description\n",
+      "Inclusion Criteria:\n",
+      "Age ≥ 18 years at inclusion date\n",
+      "Schirmer's test ≤ 5 mm for both eyes\n",
+      "NIBUT ≤ 10 s for both eyes\n",
+      "Previous positive result of blood test for sample anti-Ro (SS-A) or anti-La (SS-B), as indicated by medical record or lab result shown by the subject.\n",
+      "The study subject reports having understood and have signed the Informed Consent Form (ICF) and is willing to comply with all investigation visits and assessments.\n",
+      "Women of childbearing potential must agree to use a reliable, medically approved form of contraception during the study participation until end of study.\n",
+      "Anticipated compliance with prescribed treatment and follow-up.\n",
+      "Exclusion Criteria:\n",
+      "Recently (12 months prior enrolment) undergone nasal, sinus, or ocular surgery.\n",
+      "Presence of an ocular or respiratory condition that could affect the study parameters such as active ocular infection/inflammation, glaucoma, diabetic retinopathy, or upper respiratory tract infection per the Investigator's judgement.\n",
+      "The study subject has a cognitive incapacity or language barrier precluding adequate understanding or cooperation.\n",
+      "Any severe diseases interfering with the performance, evaluation, and outcome of the clinical evaluation.\n",
+      "The study subject is considered by the Investigator to be unsuitable to participate in the investigation for any other reason.\n",
+      "Previous (within 30 days prior to enrolment) and concurrent treatment with another investigational drug/s or device/s.\n",
+      "Subject is pregnant or lactating or planning to get 
pregnant during the duration of the study.\n",
+      "Show less\n",
+      "Ages Eligible for Study\n",
+      "18 Years and older (Adult, Older Adult )\n",
+      "Sexes Eligible for Study\n",
+      "All\n",
+      "Accepts Healthy Volunteers\n",
+      "No\n"
+     ]
+    }
+   ],
+   "source": [
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.common.by import By\n",
+    "from selenium.webdriver.support.ui import WebDriverWait\n",
+    "from selenium.webdriver.support import expected_conditions as EC\n",
+    "import time\n",
+    "\n",
+    "# Scrape the visible text of one study's eligibility section and save it to disk.\n",
+    "driver = webdriver.Chrome()\n",
+    "\n",
+    "try:\n",
+    "    # Navigate to the webpage (inside the try so a failed load still quits the driver)\n",
+    "    driver.get('https://clinicaltrials.gov/study/NCT06626477#participation-criteria')\n",
+    "\n",
+    "    # Wait for the \"Eligibility Criteria\" container to load\n",
+    "    eligibility_criteria_container = WebDriverWait(driver, 20).until(\n",
+    "        EC.visibility_of_element_located((By.XPATH, '//*[@id=\"participation-criteria\"]/ctg-participation-criteria/div[2]/div/div[2]'))\n",
+    "    )\n",
+    "\n",
+    "    print(\"Eligibility Criteria container loaded successfully.\")\n",
+    "\n",
+    "    # Extract the text from the eligibility container\n",
+    "    eligibility_text = eligibility_criteria_container.text\n",
+    "    print(\"Eligibility Criteria:\")\n",
+    "    print(eligibility_text)\n",
+    "\n",
+    "    # Save the data to a text file. The scraped text contains non-ASCII characters\n",
+    "    # (e.g. the comparison signs in the criteria), so pin UTF-8 instead of relying on\n",
+    "    # the platform default encoding.\n",
+    "    with open('eligibility_criteria.txt', 'w', encoding='utf-8') as file:\n",
+    "        file.write(eligibility_text)\n",
+    "\n",
+    "except Exception as e:\n",
+    "    # Include the exception type: a Selenium timeout carries an empty message.\n",
+    "    print(\"An error occurred:\", type(e).__name__, e)\n",
+    "\n",
+    "finally:\n",
+    "    # Pause to visually inspect the browser\n",
+    "    time.sleep(5)\n",
+    "\n",
+    "    # Close the browser\n",
+    "    driver.quit()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Eligibility Criteria container loaded successfully.\n",
+      "[]\n",
+      "[]\n",
+      "\n",
+      "Data successfully written to eligibility_criteria_dynamic.txt\n"
+
] + } + ], + "source": [ + "from selenium import webdriver\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "import time\n", + "\n", + "# Set up the Selenium WebDriver\n", + "driver = webdriver.Chrome()\n", + "\n", + "# Navigate to the webpage\n", + "driver.get('https://clinicaltrials.gov/study/NCT06626386?aggFilters=status:rec&rank=2#participation-criteria')\n", + "\n", + "# Wait for the Eligibility Criteria container to load\n", + "try:\n", + " eligibility_container = WebDriverWait(driver, 20).until(\n", + " EC.visibility_of_element_located((By.XPATH, '//*[@id=\"participation-criteria\"]/ctg-participation-criteria/div[2]/div/div[2]'))\n", + " )\n", + " \n", + " print(\"Eligibility Criteria container loaded successfully.\")\n", + " \n", + " # Find and extract the headers (like \"Description\", \"Ages Eligible for Study\", etc.)\n", + " headers = driver.find_elements(By.XPATH, '//*[@id=\"participation-criteria\"]/ctg-participation-criteria/div[2]/div/div[2]//dt')\n", + " \n", + " # Find and extract the content corresponding to each header\n", + " contents = driver.find_elements(By.XPATH, '//*[@id=\"participation-criteria\"]/ctg-participation-criteria/div[2]/div/div[2]//dd')\n", + " \n", + " print(headers)\n", + " print(contents)\n", + " \n", + " # Create a dictionary to store criteria dynamically\n", + " criteria_data = {}\n", + " \n", + " # Loop over the headers and their respective contents\n", + " for header, content in zip(headers, contents):\n", + " header_text = header.text.strip() # Get the header text\n", + " content_text = content.text.strip() # Get the corresponding content text\n", + " \n", + " # Store in dictionary\n", + " criteria_data[header_text] = content_text\n", + "\n", + " # Dynamically generate formatted criteria text\n", + " formatted_criteria = \"\"\n", + " for header, content in 
criteria_data.items():\n", + " formatted_criteria += f\"{header}: {content}\\n\\n\" # Add each header-content pair to the formatted string\n", + "\n", + " # Print the formatted output to verify\n", + " print(formatted_criteria)\n", + " \n", + " # Save the output to a text file\n", + " with open('eligibility_criteria_dynamic.txt', 'w') as file:\n", + " file.write(formatted_criteria)\n", + " \n", + " print(\"Data successfully written to eligibility_criteria_dynamic.txt\")\n", + " \n", + "except Exception as e:\n", + " print(\"An error occurred:\", e)\n", + "\n", + "finally:\n", + " # Pause to visually inspect the browser\n", + " time.sleep(5)\n", + "\n", + " # Close the browser\n", + " driver.quit()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inclusion/Exclusion Criteria container loaded successfully.\n", + "Inclusion/Exclusion Criteria:\n", + "Description\n", + "Inclusion Criteria:\n", + "Men aged from 45 to 70 years\n", + "Informed consent for PSA measurement for prostate cancer screening\n", + "Exclusion Criteria:\n", + "Patients unable to give consent\n", + "Show less\n", + "Study Population\n", + "Male patients between the ages of 40 and 70 who receive a PSA test as part of their routine examination\n", + "Other Criteria container loaded successfully.\n", + "Other Criteria:\n", + "Ages Eligible for Study\n", + "45 Years to 70 Years (Adult, Older Adult )\n", + "Sexes Eligible for Study\n", + "Male\n", + "Accepts Healthy Volunteers\n", + "No\n", + "Sampling Method\n", + "Non-Probability Sample\n", + "Data successfully written to clinical_trial_criteria.txt\n" + ] + } + ], + "source": [ + "from selenium import webdriver\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "import time\n", + "\n", + "# Initialize 
the WebDriver (Make sure to specify the correct path to your chromedriver)\n", + "driver = webdriver.Chrome()\n", + "\n", + "# Open the clinical trials page\n", + "driver.get(\"https://clinicaltrials.gov/study/NCT06626386?aggFilters=status:rec&rank=2#participation-criteria\")\n", + "\n", + "# Wait for the page to load completely\n", + "time.sleep(5)\n", + "\n", + "try:\n", + " # Wait for the inclusion/exclusion criteria container to load\n", + " inclusion_exclusion_criteria = WebDriverWait(driver, 20).until(\n", + " EC.presence_of_element_located((By.XPATH, '//*[@id=\"participation-criteria\"]/ctg-participation-criteria/div[2]/div/div[2]/div[1]'))\n", + " )\n", + " print(\"Inclusion/Exclusion Criteria container loaded successfully.\")\n", + "\n", + " # Scraping inclusion/exclusion criteria\n", + " inclusion_exclusion_text = inclusion_exclusion_criteria.text\n", + " print(\"Inclusion/Exclusion Criteria:\")\n", + " print(inclusion_exclusion_text)\n", + "\n", + " # Wait for the other criteria container to load\n", + " other_criteria = WebDriverWait(driver, 20).until(\n", + " EC.presence_of_element_located((By.XPATH, '//*[@id=\"participation-criteria\"]/ctg-participation-criteria/div[2]/div/div[2]/div[2]'))\n", + " )\n", + " print(\"Other Criteria container loaded successfully.\")\n", + "\n", + " # Scraping other criteria\n", + " other_criteria_text = other_criteria.text\n", + " print(\"Other Criteria:\")\n", + " print(other_criteria_text)\n", + "\n", + " # Writing the scraped data to a text file\n", + " with open('clinical_trial_criteria.txt', 'w') as file:\n", + " file.write(\"Inclusion/Exclusion Criteria:\\n\")\n", + " file.write(inclusion_exclusion_text + \"\\n\\n\")\n", + " file.write(\"Other Criteria:\\n\")\n", + " file.write(other_criteria_text)\n", + "\n", + " print(\"Data successfully written to clinical_trial_criteria.txt\")\n", + "\n", + "except Exception as e:\n", + " print(f\"An error occurred: {str(e)}\")\n", + "\n", + "finally:\n", + " # Optionally, keep 
the browser open to see the result\n", + " time.sleep(10) # Adjust the sleep time as needed\n", + "\n", + " # Close the browser\n", + " driver.quit()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inclusion Criteria element is now in view.\n", + "Inclusion Criteria:\n", + "Men aged from 45 to 70 years\n", + "Informed consent for PSA measurement for prostate cancer screening\n", + "Exclusion Criteria element is now in view.\n", + "Exclusion Criteria:\n", + "Patients unable to give consent\n", + "Other Criteria container loaded successfully.\n", + "Other Criteria:\n", + "Ages Eligible for Study\n", + "45 Years to 70 Years (Adult, Older Adult )\n", + "Sexes Eligible for Study\n", + "Male\n", + "Accepts Healthy Volunteers\n", + "No\n", + "Sampling Method\n", + "Non-Probability Sample\n", + "Data successfully written to clinical_trial_criteria.txt\n" + ] + } + ], + "source": [ + "from selenium import webdriver\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "import time\n", + "\n", + "# Initialize the WebDriver (Make sure to specify the correct path to your chromedriver)\n", + "driver = webdriver.Chrome()\n", + "\n", + "# Open the clinical trials page\n", + "driver.get(\"https://clinicaltrials.gov/study/NCT06626386?aggFilters=status:rec&rank=2#participation-criteria\")\n", + "\n", + "# Wait for the page to load completely\n", + "time.sleep(5)\n", + "\n", + "try:\n", + " # Scroll the page to bring inclusion criteria into view\n", + " inclusion_element = WebDriverWait(driver, 20).until(\n", + " EC.presence_of_element_located((By.XPATH, '//*[@id=\"eligibility-criteria-description\"]/div/div/ul[1]')) #//*[@id=\"eligibility-criteria-description\"]/div/div/ul[1]\n", + " )\n", + " 
driver.execute_script(\"arguments[0].scrollIntoView(true);\", inclusion_element)\n", + " print(\"Inclusion Criteria element is now in view.\")\n", + "\n", + " # Scraping inclusion criteria\n", + " inclusion_criteria_text = inclusion_element.text\n", + " print(\"Inclusion Criteria:\")\n", + " print(inclusion_criteria_text)\n", + "\n", + " # Scroll to exclusion criteria\n", + " exclusion_element = WebDriverWait(driver, 20).until(\n", + " EC.presence_of_element_located((By.XPATH, '//*[@id=\"eligibility-criteria-description\"]/div/div/ul[2]'))\n", + " )\n", + " driver.execute_script(\"arguments[0].scrollIntoView(true);\", exclusion_element)\n", + " print(\"Exclusion Criteria element is now in view.\")\n", + "\n", + " # Scraping exclusion criteria\n", + " exclusion_criteria_text = exclusion_element.text\n", + " print(\"Exclusion Criteria:\")\n", + " print(exclusion_criteria_text)\n", + " \n", + " other_criteria = WebDriverWait(driver, 20).until(\n", + " EC.presence_of_element_located((By.XPATH, '//*[@id=\"participation-criteria\"]/ctg-participation-criteria/div[2]/div/div[2]/div[2]'))\n", + " )\n", + " print(\"Other Criteria container loaded successfully.\")\n", + " \n", + " other_criteria_text = other_criteria.text\n", + " print(\"Other Criteria:\")\n", + " print(other_criteria_text)\n", + "\n", + " # Writing the scraped data to a text file\n", + " with open('clinical_trial_criteria.txt', 'w') as file:\n", + " file.write(\"Inclusion Criteria:\\n\")\n", + " file.write(inclusion_criteria_text + \"\\n\\n\")\n", + " file.write(\"Exclusion Criteria:\\n\")\n", + " file.write(exclusion_criteria_text + \"\\n\\n\")\n", + " file.write(\"Other Criteria:\\n\")\n", + " file.write(other_criteria_text)\n", + "\n", + " print(\"Data successfully written to clinical_trial_criteria.txt\")\n", + "\n", + "except Exception as e:\n", + " print(f\"An error occurred: {str(e)}\")\n", + "\n", + "finally:\n", + " # Optionally, keep the browser open to see the result\n", + " time.sleep(5) # Adjust 
the sleep time as needed\n", + "\n", + " # Close the browser\n", + " driver.quit()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scraping Environment", + "language": "python", + "name": "scraping_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}