benchmark/data_collection.py

import csv
import xml.etree.ElementTree as ET


def pull_tag_content(root_node, tag_path):
    # Walk the tree iteratively and collect the text of every element whose
    # absolute path (e.g. '/clinical_study/condition') matches tag_path.
    content_list = []
    stack = [(root_node, '')]
    while stack:
        node, prefix = stack.pop()
        path = prefix + '/' + node.tag
        if path == tag_path and node.text:
            content_list.append(node.text)
        for child in node:
            stack.append((child, path))
    return content_list


def extract_criteria(criteria_textblock):
    # Find the positions of "Inclusion Criteria:" and "Exclusion Criteria:"
    c1 = criteria_textblock.find("Inclusion Criteria:")
    c2 = criteria_textblock.find("Exclusion Criteria:")
    # Extract the Inclusion Criteria and handle missing criteria
    if c1 >= 0:
        if c2 >= 0:
            inclusion_criteria = criteria_textblock[c1 + len("Inclusion Criteria:"):c2].strip()
        else:
            inclusion_criteria = criteria_textblock[c1 + len("Inclusion Criteria:"):].strip()
    else:
        inclusion_criteria = ""
    # Extract the Exclusion Criteria and handle missing criteria
    if c2 >= 0:
        exclusion_criteria = criteria_textblock[c2 + len("Exclusion Criteria:"):].strip()
    else:
        exclusion_criteria = ""
    return inclusion_criteria, exclusion_criteria

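# Example (hypothetical criteria textblock, for illustration only):
#   extract_criteria("Inclusion Criteria:\n- Age >= 18\nExclusion Criteria:\n- Pregnancy")
#   returns ("- Age >= 18", "- Pregnancy")
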

def read_xml_file(file_path):
    try:
        # Parse the XML file
        tree = ET.parse(file_path)
        root = tree.getroot()
        clinical_data = {
            'nct_id': root.findtext('id_info/nct_id', ''),
            'brief_title': root.findtext('brief_title', ''),
            'official_title': root.findtext('official_title', ''),
            'agency': root.findtext('sponsors/lead_sponsor/agency', ''),
            'agency_class': root.findtext('sponsors/lead_sponsor/agency_class', ''),
            'collaborator_agency': root.findtext('sponsors/collaborator/agency', ''),
            'brief_summary': root.findtext('brief_summary/textblock', ''),
            'detailed_description': root.findtext('detailed_description/textblock', ''),
            # 'conditions': root.findtext('condition', ''),
            'overall_status': root.findtext('overall_status', ''),
            'phase': root.findtext('phase', ''),
            'study_type': root.findtext('study_type', ''),
            'has_expanded_access': root.findtext('has_expanded_access', ''),
            'intervention': root.findtext('intervention', ''),
            'intervention_type': root.findtext('intervention/intervention_type', ''),
            'intervention_name': root.findtext('intervention/intervention_name', ''),
            'lead_sponsor_agency': root.findtext('sponsors/lead_sponsor/agency', ''),
            'primary_completion_date': root.findtext('primary_completion_date', ''),
            'start_date': root.findtext('start_date', ''),
            'completion_date': root.findtext('completion_date', ''),
            'gender': root.findtext('eligibility/gender', ''),
            'minimum_age': root.findtext('eligibility/minimum_age', ''),
            'maximum_age': root.findtext('eligibility/maximum_age', ''),
            'healthy_volunteers': root.findtext('eligibility/healthy_volunteers', ''),
            'why_stopped': root.findtext('why_stopped', ''),
        }
        # Tags that may appear multiple times are collected as lists
        conditions = pull_tag_content(root, '/clinical_study/condition')
        keywords = pull_tag_content(root, '/clinical_study/keyword')
        clinical_data['conditions'] = conditions
        clinical_data['keywords'] = keywords
        # Extract Inclusion and Exclusion Criteria
        criteria_textblock = root.findtext('eligibility/criteria/textblock', '')
        inclusion_criteria, exclusion_criteria = extract_criteria(criteria_textblock)
        clinical_data['inclusion_criteria'] = inclusion_criteria
        clinical_data['exclusion_criteria'] = exclusion_criteria
        return clinical_data
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None
    except ET.ParseError:
        print(f"Error: Invalid XML format in '{file_path}'.")
        return None


def save_to_csv(data_list, csv_file):
    # Write the extracted records to CSV, using the keys of the first record
    # as column headers. Skip writing if there are no records.
    if not data_list:
        return
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        fieldnames = data_list[0].keys()
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data_list)
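

# A minimal usage sketch (the input directory "raw_xml/" and the output file
# "clinical_trials.csv" are assumptions for illustration, not part of the
# original pipeline):
if __name__ == "__main__":
    import glob

    records = []
    for xml_path in glob.glob("raw_xml/*.xml"):
        record = read_xml_file(xml_path)
        if record is not None:
            records.append(record)
    save_to_csv(records, "clinical_trials.csv")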