In [1]:
import os
import json
import xml.etree.ElementTree as ET

In [12]:
# Prefix-to-URI map handed to every find()/findall() call so paths can use the 'hl7:' prefix
ns = {'hl7': 'urn:hl7-org:v3'}

# Section titles, one per line.
# NOTE(review): parse_patient_data builds its own local list, so the cells shown here
# never read this one — confirm whether it is still needed.
sections = [
    'Allergies and Adverse Reactions',
    'Medications',
    'Diagnostic Results',
    'Problems',
    'Surgeries',
    'Vital Signs',
    'Immunizations',
]

# Location of the XML document to load
xml_path = '/Users/bharathbeeravelly/Desktop/patient-trials-matching/data/xml-exploration-2.xml'

# Parse the document; both the tree and its root element are kept as module-level names
tree = ET.parse(xml_path)
root = tree.getroot()

In [13]:
# Look up the <patient> element nested under recordTarget/patientRole
# (Element.find returns None when nothing matches the path).
# NOTE(review): the functions in the cells shown here do not read this variable;
# extract_patient_details performs its own lookup — confirm whether it is still needed.
record_target = root.find('hl7:recordTarget/hl7:patientRole/hl7:patient', ns)

In [14]:
# Initialize the patient_data dictionary.
# The extraction functions defined below write their results into this module-level
# dict instead of returning them, so it must exist before any of them is called.
patient_data = {}

In [15]:
def extract_section_data(section_title, section):
    """Collect the table rows of one section into the module-level ``patient_data``.

    Prints a progress line, then reads every ``tbody/tr`` row found anywhere
    beneath ``section`` and appends one dict per row to the list stored in
    ``patient_data`` under ``section_title``. The list is created on first use
    and is appended to (not replaced) on later calls with the same title.

    Args:
        section_title: Key under which the rows are stored; also echoed in the
            progress message.
        section: Element whose descendant table rows are read.

    Each row dict has the keys 'Start', 'Stop' and 'Description', filled from
    the ``.text`` of the row's first, second and third ``td`` child. A cell
    that is absent produces None.
    """
    print(f"Extracting Section: {section_title}")

    def cell_text(row, cell_path):
        # Single lookup per cell; a missing <td> maps to None
        cell = row.find(cell_path, ns)
        return None if cell is None else cell.text

    # Output key -> positional path of the cell that supplies it
    columns = (
        ('Start', 'hl7:td[1]'),
        ('Stop', 'hl7:td[2]'),
        ('Description', 'hl7:td[3]'),
    )

    # Reuse the existing list for this title, or start a new one
    records = patient_data.setdefault(section_title, [])
    records.extend(
        {label: cell_text(row, cell_path) for label, cell_path in columns}
        for row in section.findall('.//hl7:tbody/hl7:tr', ns)
    )

In [16]:
# Extract general patient details (ID, Gender, Birth Time, Race, etc.)
def extract_patient_details(root):
    """Copy header-level patient fields from the document into ``patient_data``.

    Looks up ``recordTarget/patientRole`` beneath ``root``. When it is absent the
    function returns without touching ``patient_data``. Otherwise it writes:

    * 'Patient ID'   - ``extension`` attribute of the role's ``id`` child
    * 'Given Name'   - text of the first ``name/given`` descendant; this key is
                       always written, with None when no such element exists
    * 'Gender'       - ``code`` of ``patient/administrativeGenderCode``
    * 'Birth Time'   - ``value`` of ``patient/birthTime``
    * 'Race'         - ``displayName`` of ``patient/raceCode``
    * 'Ethnic Group' - ``displayName`` of ``patient/ethnicGroupCode``
    * 'Language'     - ``code`` of ``patient/languageCommunication/languageCode``

    Apart from 'Given Name', a key is written only when its element is found.

    Args:
        root: Root element of the parsed document.
    """
    role = root.find('hl7:recordTarget/hl7:patientRole', ns)
    if role is None:
        return

    id_element = role.find('hl7:id', ns)
    if id_element is not None:
        patient_data['Patient ID'] = id_element.get('extension')

    # Unlike the other fields, the given name is recorded even when missing
    given_element = role.find('.//hl7:name/hl7:given', ns)
    patient_data['Given Name'] = None if given_element is None else given_element.text

    # (output key, path relative to patientRole, attribute to read), in output order
    attribute_fields = (
        ('Gender', 'hl7:patient/hl7:administrativeGenderCode', 'code'),
        ('Birth Time', 'hl7:patient/hl7:birthTime', 'value'),
        ('Race', 'hl7:patient/hl7:raceCode', 'displayName'),
        ('Ethnic Group', 'hl7:patient/hl7:ethnicGroupCode', 'displayName'),
        ('Language', 'hl7:patient/hl7:languageCommunication/hl7:languageCode', 'code'),
    )
    for key, path, attribute in attribute_fields:
        element = role.find(path, ns)
        if element is not None:
            patient_data[key] = element.get(attribute)

In [17]:
# Main logic for parsing the XML and capturing all data
def parse_patient_data(root):
    """Parse patient details and the wanted sections out of the XML document.

    First calls ``extract_patient_details(root)``. Then it finds the first
    ``structuredBody`` descendant of ``root`` and walks its direct ``component``
    children. For each component that holds a ``section`` with a non-empty
    ``title``, the stripped title is compared against a list of wanted names.
    When at least one name occurs as a substring of the title,
    ``extract_section_data(title, section)`` is called exactly once.

    Documents without a ``structuredBody``, components without a ``section``,
    and sections whose ``title`` is missing or has no text are skipped rather
    than raising.

    Args:
        root: Root element of the parsed document.
    """
    # Extract general patient details
    extract_patient_details(root)

    # Substrings to look for in section titles (add more names if needed)
    section_names = ['Allergies', 'Medications', 'Diagnostic Results']

    # Navigate to the structuredBody; without one there are no sections to read
    structured_body = root.find('.//hl7:structuredBody', ns)
    if structured_body is None:
        return

    for component in structured_body.findall('hl7:component', ns):
        section = component.find('hl7:section', ns)
        if section is None:
            continue

        title = section.find('hl7:title', ns)
        # An empty <title/> has text None, which cannot be stripped or matched
        if title is None or title.text is None:
            continue
        section_title = title.text.strip()

        # any() keeps a title that matches several names from being extracted
        # (and its rows appended) more than once
        if any(section_name in section_title for section_name in section_names):
            extract_section_data(section_title, section)

# Empty the dictionary in place first: extract_section_data appends to whatever
# lists are already stored, so re-running this cell without clearing would
# duplicate every section row.
patient_data.clear()

# Call the main parsing function
parse_patient_data(root)

# Output the extracted patient data
print(patient_data)

Extracting Section: Allergies and Adverse Reactions
Extracting Section: Medications
Extracting Section: Diagnostic Results
{'Patient ID': '466df3cc-4a2e-70a8-5ebb-29f1ce64763f', 'Given Name': 'Carolina179 Francisca486', 'Gender': 'F', 'Birth Time': '19530616173200', 'Race': 'black', 'Ethnic Group': 'non-hispanic', 'Language': 'en-US', 'Allergies and Adverse Reactions': [{'Start': '1954-09-17T22:32:00Z', 'Stop': None, 'Description': 'Allergy to substance (finding)'}, {'Start': '1954-09-17T22:32:00Z', 'Stop': None, 'Description': 'Bee venom (substance)'}, {'Start': '1954-09-17T22:32:00Z', 'Stop': None, 'Description': 'Mold (organism)'}, {'Start': '1954-09-17T22:32:00Z', 'Stop': None, 'Description': 'House dust mite (organism)'}, {'Start': '1954-09-17T22:32:00Z', 'Stop': None, 'Description': 'Animal dander (substance)'}, {'Start': '1954-09-17T22:32:00Z', 'Stop': None, 'Description': 'Grass pollen (substance)'}, {'Start': '1954-09-17T22:32:00Z', 'Stop': None, 'Description': 'Tree pollen (s

In [18]:
# Destination of the JSON export
output_json_path = '/Users/bharathbeeravelly/Desktop/patient-trials-matching/data/patient_data.json'

# Open (or overwrite) the target file, render patient_data as JSON text with a
# 4-space indent, and write that text out
with open(output_json_path, 'w') as json_file:
    json_file.write(json.dumps(patient_data, indent=4))

print(f"Patient data has been written to {output_json_path}")

Patient data has been written to /Users/bharathbeeravelly/Desktop/patient-trials-matching/data/patient_data.json
