718 lines (717 with data), 21.5 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "BUhZwrAQJcon"
},
"source": [
"# Task 1 (prompt engineering): Reorganize chest X-ray report findings into predefined anatomical regions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "iUuRHPZwJcoq"
},
"outputs": [],
"source": [
"# import packages\n",
"from openai import OpenAI\n",
"import json\n",
"import os\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "q89Q5xXiJcor"
},
"outputs": [],
"source": [
"# Create the OpenAI client. The API key is read from the OPENAI_API_KEY environment\n",
"# variable instead of being typed into the notebook, so the secret is never saved\n",
"# or committed with the file. Set the variable before starting Jupyter.\n",
"client = OpenAI(api_key=os.environ.get(\"OPENAI_API_KEY\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "34cEjbglJcos"
},
"outputs": [],
"source": [
"def categorize_findings(report, model=\"gpt-4o-mini\", temperature=0):\n",
"    \"\"\"Ask the chat model to sort a chest X-ray report's sentences by anatomical region.\n",
"\n",
"    Args:\n",
"        report: Free-text findings of one chest X-ray report.\n",
"        model: Name of the OpenAI chat model to query.\n",
"        temperature: Sampling temperature passed to the API. The default of 0 keeps\n",
"            repeated runs on the same report as consistent as possible.\n",
"\n",
"    Returns:\n",
"        The content of the model's reply, unparsed. JSON mode is requested and the\n",
"        prompt asks for the keys bone, heart, lung, mediastinal and others, so the\n",
"        reply is meant to be passed to json.loads(). Saved outputs in this notebook\n",
"        show category values returned both as one string and as a list of sentences,\n",
"        so callers should not rely on the value type.\n",
"        NOTE(review): the API appears to type this field as optional, so the return\n",
"        value may be None - confirm against the client library.\n",
"    \"\"\"\n",
"    # The prompt text below is sent verbatim, so its lines are deliberately not\n",
"    # indented to match the surrounding code.\n",
"    chat_completion = client.chat.completions.create(\n",
"        model=model,\n",
"        temperature=temperature,\n",
"        messages=[\n",
"            {\n",
"                \"role\": \"system\",\n",
"                \"content\": \"\"\"Categorize the findings of a chest X-ray report into predefined anatomical regions: bone, heart, lung, and mediastinal.\n",
" If a finding does not clearly belong to these categories, classify it under 'others'. Read each sentence carefully. Determine the main anatomical focus of each sentence:\n",
" - If a sentence discusses any findings related to bones, categorize it under 'bone'.\n",
" - If it pertains to the heart, categorize it under 'heart'.\n",
" - If a sentence discusses any findings related to the lungs or associated structures, categorize it under 'lung'.\n",
" - If it mentions any findings related to the mediastinal area, categorize it under 'mediastinal'.\n",
" - If a sentence does not fit any of the above categories or is ambiguous, place it under 'others'.\n",
" Provide the output as a JSON object with each category listing relevant sentences from the report in **plain text** without extra double quotes around the sentences.\n",
" The format should be: {\"bone\": \"\", \"heart\": \"\", \"lung\": \"\", \"mediastinal\": \"\", \"others\": \"\"}.\n",
" \"\"\"\n",
"            },\n",
"            {\n",
"                \"role\": \"user\",\n",
"                \"content\": report\n",
"            }\n",
"        ],\n",
"        # JSON mode: ask the API to return a JSON object\n",
"        response_format={\"type\": \"json_object\"}\n",
"    )\n",
"    # Only the first choice is used; its message content is the JSON text\n",
"    return chat_completion.choices[0].message.content\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "N6AfNmOjJcot",
"outputId": "dd9ec8e5-1f62-43ed-b4ba-32b3a8254045"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"bone\": \"There are mild degenerative endplate changes in the thoracic spine. There are no acute bony findings.\",\n",
" \"heart\": \"The cardiomediastinal silhouette and pulmonary vasculature are within normal limits in size.\",\n",
" \"lung\": \"The lungs are mildly hypoinflated but grossly clear of focal airspace disease, pneumothorax, or pleural effusion.\",\n",
" \"mediastinal\": \"\",\n",
" \"others\": \"\"\n",
"}\n"
]
}
],
"source": [
"# Try the function on a single report before running it on the whole dataset.\n",
"# Adjacent string literals are joined into one report text, one sentence per line.\n",
"sample_report = (\n",
"    \"The cardiomediastinal silhouette and pulmonary vasculature are within normal limits in size. \"\n",
"    \"The lungs are mildly hypoinflated but grossly clear of focal airspace disease, pneumothorax, or pleural effusion. \"\n",
"    \"There are mild degenerative endplate changes in the thoracic spine. \"\n",
"    \"There are no acute bony findings.\"\n",
")\n",
"categorized_report = categorize_findings(sample_report)\n",
"print(categorized_report)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "mAn7F0NrJcov",
"outputId": "3544ff5d-847f-43d9-836c-55fb1db2925b"
},
"outputs": [
{
"data": {
"text/plain": [
"{'lung': ['The lungs are mildly hypoinflated but grossly clear of focal airspace disease, pneumothorax, or pleural effusion.'],\n",
" 'heart': ['The cardiomediastinal silhouette and pulmonary vasculature are within normal limits in size.'],\n",
" 'mediastinal': [],\n",
" 'bone': ['There are mild degenerative endplate changes in the thoracic spine.',\n",
" 'There are no acute bony findings.'],\n",
" 'others': []}"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result = json.loads(categorized_report)\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7txaUnl5Jcow",
"outputId": "f416efb3-d4ec-4747-d276-caf36faf4fd0"
},
"outputs": [
{
"data": {
"text/plain": [
"['The lungs are mildly hypoinflated but grossly clear of focal airspace disease, pneumothorax, or pleural effusion.']"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result['lung']"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "iv5KjhIuJcow"
},
"source": [
"# Categorize all validation reports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_Tr89e9nJcow"
},
"outputs": [],
"source": [
"# Load the reports to categorize\n",
"json_file_path = 'data/annotation_quiz_all.json'\n",
"\n",
"# Decode the file as UTF-8 explicitly so the result does not depend on the\n",
"# platform's default text encoding.\n",
"with open(json_file_path, 'r', encoding='utf-8') as file:\n",
"    data = json.load(file)\n",
"\n",
"# Entries stored under the 'val' key; an empty list if the key is absent\n",
"val_reports = data.get('val', [])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KnWV8tWSJcox",
"outputId": "6c0f160d-c3ff-4f3b-a063-9955db2d3c30"
},
"outputs": [
{
"data": {
"text/plain": [
"296"
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(val_reports)\n",
"\n",
"#val_reports_sub= val_reports[:15]\n",
"#val_reports = val_reports_sub\n",
"#val_reports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "th1Pk4HPJcox",
"outputId": "9fe6a7af-4a70-471d-fdbc-4d673ecb8fa8"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 0%| | 0/30 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 3%|▎ | 1/30 [00:14<07:13, 14.94s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 7%|▋ | 2/30 [00:29<06:51, 14.68s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 10%|█ | 3/30 [00:50<07:50, 17.41s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 13%|█▎ | 4/30 [01:07<07:37, 17.59s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 17%|█▋ | 5/30 [01:22<06:51, 16.48s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 20%|██ | 6/30 [01:37<06:19, 15.82s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 23%|██▎ | 7/30 [01:54<06:17, 16.42s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 27%|██▋ | 8/30 [02:07<05:37, 15.32s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 30%|███ | 9/30 [02:22<05:20, 15.24s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 33%|███▎ | 10/30 [02:35<04:52, 14.64s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 37%|███▋ | 11/30 [02:51<04:41, 14.83s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 40%|████ | 12/30 [03:08<04:37, 15.43s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 43%|████▎ | 13/30 [03:20<04:06, 14.51s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 47%|████▋ | 14/30 [03:35<03:54, 14.65s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 50%|█████ | 15/30 [03:49<03:35, 14.38s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 53%|█████▎ | 16/30 [04:03<03:19, 14.23s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 57%|█████▋ | 17/30 [04:19<03:12, 14.78s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 60%|██████ | 18/30 [04:35<03:03, 15.27s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 63%|██████▎ | 19/30 [04:48<02:41, 14.71s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 67%|██████▋ | 20/30 [05:02<02:25, 14.50s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 70%|███████ | 21/30 [05:22<02:23, 15.91s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 73%|███████▎ | 22/30 [05:37<02:06, 15.79s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 77%|███████▋ | 23/30 [05:52<01:47, 15.36s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 80%|████████ | 24/30 [06:09<01:36, 16.06s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 83%|████████▎ | 25/30 [06:23<01:16, 15.40s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 87%|████████▋ | 26/30 [06:39<01:02, 15.50s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 90%|█████████ | 27/30 [06:55<00:46, 15.65s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 93%|█████████▎| 28/30 [07:13<00:32, 16.42s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 97%|█████████▋| 29/30 [07:27<00:15, 15.65s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing reports: 100%|██████████| 30/30 [07:35<00:00, 15.20s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File updated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Categorize findings for all reports, saving progress to disk after every batch\n",
"\n",
"input_file_path = 'data/annotation_quiz_all.json'\n",
"output_file_path = 'data/annotation.json'\n",
"\n",
"# Number of reports processed between saves. Each report is still sent to the model\n",
"# one at a time; the batch size only controls how often the output file is rewritten.\n",
"batch_size = 10\n",
"categorized_results = []\n",
"failed_ids = []  # ids of reports whose response could not be obtained or parsed\n",
"\n",
"# Read the input file once up front: only its 'val' entry is replaced below, so\n",
"# there is no need to re-read it after every batch.\n",
"with open(input_file_path, 'r', encoding='utf-8') as file:\n",
"    data = json.load(file)\n",
"\n",
"# The progress bar advances once per batch, not once per report\n",
"for i in tqdm(range(0, len(val_reports), batch_size), desc=\"Processing reports\"):\n",
"    batch = val_reports[i:i + batch_size]  # Get the current batch of reports\n",
"\n",
"    for report in batch:\n",
"        try:\n",
"            raw_response = categorize_findings(report['original_report'])  # JSON text from the model\n",
"            result = json.loads(raw_response)  # Parse the JSON text\n",
"        except Exception as e:\n",
"            # Best effort: one failed request or unparsable reply should not abort\n",
"            # the whole run. The report is left out of the output and its id is\n",
"            # remembered so it can be retried.\n",
"            print(f\"Error processing report {report['id']}: {e}\")\n",
"            failed_ids.append(report['id'])\n",
"            continue\n",
"\n",
"        dict_results = {'id': report['id'], 'report': result, 'split': report['split']}\n",
"        categorized_results.append(dict_results)\n",
"\n",
"    # Checkpoint: write everything categorized so far to the output file, keeping\n",
"    # the other top-level keys of the input file unchanged.\n",
"    data['val'] = categorized_results\n",
"    with open(output_file_path, 'w', encoding='utf-8') as file:\n",
"        json.dump(data, file, indent=4)\n",
"\n",
"    print(\"File updated successfully.\")\n",
"\n",
"# Summarize skipped reports so missing entries in the output are not overlooked\n",
"if failed_ids:\n",
"    print(f\"{len(failed_ids)} report(s) could not be categorized: {failed_ids}\")\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}