|
a |
|
b/src/MISC/jsonify.py |
|
|
1 |
import xml.etree.ElementTree as ET |
|
|
2 |
import json |
|
|
3 |
import os |
|
|
4 |
|
|
|
5 |
def parse_xml(xml_file):
    """Read one XML record and return selected fields as a plain dict.

    Args:
        xml_file: Passed unchanged to ``ET.parse``.

    Returns:
        dict: Scalar entries hold the text of the first element matching
        each path, or ``None`` when no such element exists.
        ``intervention`` and ``location`` are nested dicts built from the
        first matching elements, and ``reference`` is a list with one
        ``{'citation': ..., 'PMID': ...}`` dict per ``reference`` element.
    """
    root = ET.parse(xml_file).getroot()
    text_of = root.findtext

    # Output key -> element path, listed in the order the keys must appear
    # in the returned dict.
    leading_fields = (
        ('nct_id', 'id_info/nct_id'),
        ('brief_title', 'brief_title'),
        ('official_title', 'official_title'),
        ('brief_summary', 'brief_summary/textblock'),
        ('detailed_description', 'detailed_description/textblock'),
        ('overall_status', 'overall_status'),
        ('start_date', 'start_date'),
        ('completion_date', 'completion_date'),
        ('phase', 'phase'),
        ('study_type', 'study_type'),
        ('condition', 'condition'),
    )
    eligibility_fields = (
        ('gender', 'eligibility/gender'),
        ('minimum_age', 'eligibility/minimum_age'),
        ('maximum_age', 'eligibility/maximum_age'),
    )

    record = {key: text_of(path) for key, path in leading_fields}

    record['intervention'] = {
        'intervention_type': text_of('intervention/intervention_type'),
        'intervention_name': text_of('intervention/intervention_name'),
    }

    for key, path in eligibility_fields:
        record[key] = text_of(path)

    # Join whichever address parts are present; missing or empty parts are
    # dropped so the result has no dangling separators.
    address_parts = (
        text_of('location/facility/address/city'),
        text_of('location/facility/address/state'),
        text_of('location/facility/address/country'),
    )
    record['location'] = {
        'location_name': text_of('location/facility/name'),
        'location_address': ', '.join(part for part in address_parts if part),
    }

    references = []
    for ref in root.findall('reference'):
        references.append({
            'citation': ref.findtext('citation'),
            'PMID': ref.findtext('PMID'),
        })
    record['reference'] = references

    return record
|
|
37 |
|
|
|
38 |
def convert_to_json(data):
    """Serialize ``data`` to a JSON string indented by four spaces.

    Args:
        data: Any object ``json.dumps`` can serialize.

    Returns:
        str: The pretty-printed JSON text.
    """
    return json.dumps(data, indent=4)
|
|
41 |
|
|
|
42 |
def process_files(input_dir, output_dir):
    """Convert every ``.xml`` file in ``input_dir`` to JSON in ``output_dir``.

    Each entry of ``input_dir`` whose name ends with ``.xml`` is parsed with
    ``parse_xml``, serialized with ``convert_to_json``, and written to
    ``output_dir`` under the same base name with a ``.json`` extension.
    Other entries are ignored.

    Args:
        input_dir: Directory to scan (not recursive).
        output_dir: Directory that receives the JSON files; it is created
            if it does not exist yet.
    """
    # Create the destination up front so the first write cannot fail on a
    # missing directory. An empty string means "current directory" to
    # os.path.join, and os.makedirs('') would raise, so skip it.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith('.xml'):
            continue

        xml_file = os.path.join(input_dir, filename)
        json_data = convert_to_json(parse_xml(xml_file))

        # Swap only the trailing extension: str.replace would also rewrite
        # any '.xml' that occurs earlier in the file name.
        stem, _ext = os.path.splitext(filename)
        json_file = os.path.join(output_dir, stem + '.json')

        # Explicit encoding keeps the output independent of the locale.
        with open(json_file, 'w', encoding='utf-8') as f:
            f.write(json_data)
|
|
53 |
|
|
|
54 |
if __name__ == "__main__":
    # Script entry point: convert the XML files in the input directory to
    # JSON files in the output directory. Both paths are relative to the
    # current working directory.
    input_dir, output_dir = '../data/trials_xmls/', '../data/trials_jsons/'
    process_files(input_dir, output_dir)