# Evaluate Model using GREEN score

## Setup and installations

In [None]:
# mount to your Google Drive
# (Colab-only import; the mount point /content/drive is the root of the paths used by the %cd cells below)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# navigate to project folder
# (the GREEN repo is cloned into, and later entered relative to, this directory)
%cd '/content/drive/My Drive/ML-Quiz-XRay-ReportGeneration'

In [None]:
# Clone the GREEN repo
# NOTE(review): once ./GREEN exists, re-running this cell does not clone again (git refuses a non-empty target).
# NOTE(review): src/green3.py, imported further down, is described there as modified code —
# presumably it has to be placed in the clone by hand; confirm.
!git clone https://github.com/Stanford-AIMI/GREEN.git

In [None]:
# navigate to the green repo
# (relative to the project folder; the editable install and the `from src ...` imports below are run from inside it)
%cd GREEN

In [None]:
!pip install -e . # run and then restart kernel and re-run all cells except this one

Obtaining file:///content/drive/My%20Drive/UHN_Test/GREEN
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence_transformers (from green_score==0.0.5)
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets (from green_score==0.0.5)
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets->green_score==0.0.5)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->green_score==0.0.5)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->green_score==0.0.5)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets->green_score==0.0.5)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [None]:
# import libraries
from src import green # import green.py
from src.green3 import compute # modified code (green3.py) to save both GREEN summary and result_df
import os
import json
import torch
import sys
import importlib
import time
import gc

# import processor
import re
import torch.distributed as dist
from transformers import AutoProcessor

In [None]:
# mount to your Google Drive
# (same mount as in the setup section at the top of the notebook)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# navigate to project folder
# (steps back out of GREEN, which was entered above for the install and the `from src ...` imports)
%cd '/content/drive/My Drive/ML-Quiz-XRay-ReportGeneration'

## Convert predicted token sentences in jsonl file to json object

In [None]:
# Model predictions (JSONL, read line by line) and test-set ground truth (JSON); both are opened by the next cell.
# NOTE(review): both paths begin with '/', so they resolve from the filesystem root and not from the
# project folder selected by the preceding %cd — confirm that is intended (otherwise drop the leading '/').
path_to_predictions = '/dataset_test/answers/llava-v1ft.5-7b.jsonl'
path_to_ground_truth = '/dataset_test/test/test_dataset.json'

In [None]:
# Load the JSON file with ground truth values
with open(path_to_ground_truth, 'r', encoding='utf-8') as json_file:
    ground_truth_data = json.load(json_file)

# Create a mapping of IDs to their corresponding "value" in the ground truth data
# (the second conversation turn, index 1, holds the text used as the reference)
id_to_value = {
    item['id']: item['conversations'][1]['value']
    for item in ground_truth_data
}

# Read the JSONL predictions and pair each one with its ground truth value
results = []
unmatched_ids = []  # question_ids of predictions that have no ground-truth entry
with open(path_to_predictions, 'r', encoding='utf-8') as jsonl_file:
    for line in jsonl_file:
        if not line.strip():
            # tolerate blank lines instead of crashing inside json.loads
            continue
        jsonl_item = json.loads(line)
        question_id = jsonl_item['question_id']

        # Predictions without a ground-truth counterpart cannot be scored;
        # remember them so the drop is visible instead of silent
        if question_id not in id_to_value:
            unmatched_ids.append(question_id)
            continue

        results.append({
            "question_id": question_id,
            "gpt_value": id_to_value[question_id],
            "text_output": jsonl_item['text'],
        })

# Output a summary and a short preview rather than every record
print(f"{len(results)} predictions matched to ground truth; {len(unmatched_ids)} without a matching id")
for result in results[:5]:
    print(result)

{'question_id': 'cf33da4a-49f3-4dd1-8e5b-038d2637751f', 'gpt_value': '<s_bone>No acute bony abnormality.</s_bone><s_heart>Normal cardiac contours.</s_heart><s_lung>No focal consolidation. No large pleural effusion. No pneumothorax.</s_lung><s_mediastinal>Normal mediastinal contours.</s_mediastinal><s_others></s_others>', 'text_output': '<s_bone>No acute bony abnormality.</s_bone><s_heart>Cardiac contours are within normal limits.</s_heart><s_lung>Lungs are clear. No pneumothorax or pleural effusion.</s_lung><s_mediastinal>Mediastinal contours are within normal limits.</s_mediastinal><s_others></s_others>'}
{'question_id': '252d413e-ac8e-4cb2-907d-7b2037804924', 'gpt_value': '<s_bone>Bony structures are intact.</s_bone><s_heart>The heart is within normal limits.</s_heart><s_lung>Lungs are clear. There is no pneumothorax or pleural effusion.</s_lung><s_mediastinal>The mediastinum is within normal limits.</s_mediastinal><s_others></s_others>', 'text_output': '<s_bone>The skeletal structur

In [None]:
# Convert tokens to json dict to retrieve the generated text on each anatomical region

# Use llava-mistral-7b model processor to parse tokens
# (token2json below only calls processor.tokenizer.get_added_vocab(), and only when no added_vocab is passed in)
MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
processor = AutoProcessor.from_pretrained(MODEL_ID)


def token2json(tokens, is_inner_value=False, added_vocab=None):
    """
    Parse a tagged token string such as ``<s_key>text</s_key>`` into a dict.

    Args:
        tokens: string made of ``<s_NAME>...</s_NAME>`` fields; fields may be
            nested and values may be separated by ``<sep/>``.
        is_inner_value: True for recursive calls; the result is then a list.
        added_vocab: mapping/collection of special tokens. When None it is
            fetched from the module-level ``processor`` tokenizer.

    Returns:
        Top-level call: a dict mapping each field name to its text (a list of
        texts when the field holds several ``<sep/>``-separated values, a
        dict/list for nested fields). If no field could be parsed the result
        is ``{"text_sequence": <remaining text>}``. A list of dicts is
        returned when parsed fields are followed by ``<sep/>``.
        Inner call: a list of dicts (empty when nothing was parsed).
    """
    if added_vocab is None:
        added_vocab = processor.tokenizer.get_added_vocab()

    parsed = {}

    while tokens:
        opening = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
        if opening is None:
            break
        key = opening.group(1)
        open_tag = opening.group()
        closing = re.search(rf"</s_{re.escape(key)}>", tokens, re.IGNORECASE)

        if closing is None:
            # No closing tag for this field: drop the opening tag and rescan.
            tokens = tokens.replace(open_tag, "")
            continue

        close_tag = closing.group()
        body = re.search(
            f"{re.escape(open_tag)}(.*?){re.escape(close_tag)}",
            tokens,
            re.IGNORECASE | re.DOTALL,
        )
        if body is not None:
            inner = body.group(1).strip()
            if r"<s_" in inner and r"</s_" in inner:
                # Field contains further tagged fields: parse them recursively.
                nested = token2json(inner, is_inner_value=True, added_vocab=added_vocab)
                if nested:
                    parsed[key] = nested[0] if len(nested) == 1 else nested
            else:
                # Plain field: one value, or several separated by <sep/>.
                leaves = []
                for piece in inner.split(r"<sep/>"):
                    piece = piece.strip()
                    if piece in added_vocab and piece[0] == "<" and piece[-2:] == "/>":
                        piece = piece[1:-2]  # categorical special token: strip "<" and "/>"
                    leaves.append(piece)
                parsed[key] = leaves[0] if len(leaves) == 1 else leaves

        # Continue after the first occurrence of the closing tag.
        tokens = tokens[tokens.find(close_tag) + len(close_tag):].strip()
        if tokens[:6] == r"<sep/>":
            # A separator after a closed field starts a sibling record.
            return [parsed] + token2json(tokens[6:], is_inner_value=True, added_vocab=added_vocab)

    if parsed:
        return [parsed] if is_inner_value else parsed
    return [] if is_inner_value else {"text_sequence": tokens}

In [None]:
# Anatomical regions that are scored separately; the 'others' field is not evaluated (no need).
REGIONS = ('bone', 'heart', 'lung', 'mediastinal')

# region -> (ground-truth texts, model-output texts); index i of every list belongs to the same report
region_texts = {region: ([], []) for region in REGIONS}
skipped_ids = []  # reports left out because a region could not be parsed

for report in results:
    # convert token to json
    ground_truth_dict = token2json(report['gpt_value'])
    output_dict = token2json(report['text_output'])

    # Keep a report only when every region is present on both sides. A model answer that
    # does not follow the token format (or follows it only partially) is skipped as a whole,
    # so the per-region lists stay aligned and no KeyError is raised for a missing region.
    if not all(region in ground_truth_dict and region in output_dict for region in REGIONS):
        skipped_ids.append(report['question_id'])
        continue

    for region in REGIONS:
        gt_texts, out_texts = region_texts[region]
        gt_texts.append(ground_truth_dict[region])
        out_texts.append(output_dict[region])

# Names used by the cells below
bone_gt, bone_out = region_texts['bone']
heart_gt, heart_out = region_texts['heart']
lung_gt, lung_out = region_texts['lung']
mediastinal_gt, mediastinal_out = region_texts['mediastinal']

print(f"kept {len(bone_gt)} of {len(results)} reports; skipped {len(skipped_ids)} with missing regions")

{'bone': 'No acute bony abnormality.', 'heart': 'Normal cardiac contours.', 'lung': 'No focal consolidation. No large pleural effusion. No pneumothorax.', 'mediastinal': 'Normal mediastinal contours.', 'others': ''}
{'bone': 'No acute bony abnormality.', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'Lungs are clear. No pneumothorax or pleural effusion.', 'mediastinal': 'Mediastinal contours are within normal limits.', 'others': ''}
{'bone': 'Bony structures are intact.', 'heart': 'The heart is within normal limits.', 'lung': 'Lungs are clear. There is no pneumothorax or pleural effusion.', 'mediastinal': 'The mediastinum is within normal limits.', 'others': ''}
{'bone': 'The skeletal structures are normal.', 'heart': 'The heart is normal.', 'lung': 'The lungs are clear.', 'mediastinal': 'The mediastinum is normal.', 'others': ''}
{'bone': 'Bony structures are intact.', 'heart': 'Cardiac contours are within normal limits.', 'lung': 'The lungs are clear.', 'mediastinal'

In [None]:
# preview the first three ground-truth bone texts and the number of reports collected
print(bone_gt[:3])
print(len(bone_gt))

['No acute bony abnormality.', 'Bony structures are intact.', 'Bony structures are intact.']
588


In [None]:
# preview the first three predicted bone texts and the number of reports collected
print(bone_out[:3])
print(len(bone_out))

['No acute bony abnormality.', 'The skeletal structures are normal.', 'No acute bony abnormality.']
588


In [None]:
# check if lens of all other categories match
region_lengths = {
    'bone_gt': len(bone_gt),
    'bone_out': len(bone_out),
    'heart_gt': len(heart_gt),
    'heart_out': len(heart_out),
    'lung_gt': len(lung_gt),
    'lung_out': len(lung_out),
    'mediastinal_gt': len(mediastinal_gt),
    'mediastinal_out': len(mediastinal_out),
}

# print all in one line
print(region_lengths)

# Fail loudly on a mismatch: each *_out list is handed to compute() together with its *_gt list below
assert len(set(region_lengths.values())) == 1, f"region lists differ in length: {region_lengths}"


588
588
588
588
588
588
588
588


# Generate GREEN evaluation metric for each anatomical region

In [None]:
# Note: green3.py is a modified copy of green.py that saves both the GREEN summary and result_df, and
# uses a batch_size of 12 instead of 16 to reduce GPU load

In [None]:
# navigate to the green repo
# (relative path: resolves against the project folder that the earlier %cd cell switched to)
%cd GREEN

In [None]:
# Uncomment to re-import src/green3.py after editing it, without restarting the kernel
# (the module must already have been imported once, otherwise the sys.modules lookup raises KeyError):
#importlib.reload(sys.modules['src.green3'])

In [None]:
from src.green3 import compute # modified code (green3.py) to save both GREEN summary and result_df

# GREEN judge checkpoint used for every region below.
# Other checkpoints noted by the author:
#   "StanfordAIMI/GREEN-RadLlama2-7b" (takes too long)
#   "StanfordAIMI/GREEN-Mistral-7b" (doesn't load) # Cuda out of memory error
model_name = "StanfordAIMI/GREEN-Phi2"

# To avoid cuda out of memory error
# Set the environment variable for the current session, before the model is loaded.
# NOTE(review): presumably this only takes effect if the CUDA allocator has not been
# initialised yet in this kernel — confirm, and restart the kernel if in doubt.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Manually free GPU memory. Garbage collection runs first so that tensors which are no
# longer referenced are released; empty_cache() can then return the now-unused cached memory.
gc.collect() # garbage collection
torch.cuda.empty_cache()

In [None]:
# GREEN for heart
# NOTE(review): the predictions (heart_out) are passed before the ground truth (heart_gt) — confirm this is the
# argument order green3.compute expects; swapped roles would presumably exchange error categories (a) and (b).
# NOTE(review): output_dir lies under 'MyDrive/UHN_Test', not under the 'My Drive/ML-Quiz-XRay-ReportGeneration'
# project folder the notebook cds into — confirm the intended location.
compute(model_name, heart_out, heart_gt, output_dir="/content/drive/MyDrive/UHN_Test/GREEN/green_test_heart")

config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.88k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Processing data...making prompts


Map:   0%|          | 0/588 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 49/49 [22:44<00:00, 27.85s/it]


==== End Inference ====
Saving generated response to prompt to  /content/drive/MyDrive/UHN_Test/GREEN/green_test_heart/results_GREEN-Phi2.csv
Computing summary ...

-------------GREEN-Phi2----------------
 [Summary]: Green average 0.8163265306122449 and standard variation 0.3657609375613893 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

 (a) False report of a finding in the candidate: 0.9081632653061225. 


 (b) Missing a finding present in the reference: 0.9098639455782312. 


 (c) Misidentification of a finding's anatomic location/position: 0.9897959183673469. 


 (d) Misassessment of the severity of a finding: 0.9778911564625851. 


 (e) Mentioning a comparison that isn't in the reference: 0.9965986394557823. 


 (f) Omitting a comparison detailing a change from a prior study: 1.0.
----------------------------------

Saving generated Summary to prompt to  /content/drive/MyDrive/UHN_Test/GREEN/green_test_heart/resultsSummary_GREEN-Phi2.txt
Seconds per

In [None]:
# GREEN for bone
# NOTE(review): the predictions (bone_out) are passed before the ground truth (bone_gt) — confirm this is the
# argument order green3.compute expects; swapped roles would presumably exchange error categories (a) and (b).
# NOTE(review): output_dir lies under 'MyDrive/UHN_Test', not under the 'My Drive/ML-Quiz-XRay-ReportGeneration'
# project folder the notebook cds into — confirm the intended location.
compute(model_name, bone_out, bone_gt, output_dir="/content/drive/MyDrive/UHN_Test/GREEN/green_test_bone")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Processing data...making prompts


Map:   0%|          | 0/588 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 49/49 [23:00<00:00, 28.18s/it]

==== End Inference ====
Saving generated response to prompt to  /content/drive/MyDrive/UHN_Test/GREEN/green_test_bone/results_GREEN-Phi2.csv
Computing summary ...

-------------GREEN-Phi2----------------
 [Summary]: Green average 0.3253968253968254 and standard variation 0.46355593167261244 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

 (a) False report of a finding in the candidate: 0.7959183673469388. 


 (b) Missing a finding present in the reference: 0.9217687074829932. 


 (c) Misidentification of a finding's anatomic location/position: 1.0. 


 (d) Misassessment of the severity of a finding: 1.0. 


 (e) Mentioning a comparison that isn't in the reference: 0.9982993197278912. 


 (f) Omitting a comparison detailing a change from a prior study: 1.0.
----------------------------------

Saving generated Summary to prompt to  /content/drive/MyDrive/UHN_Test/GREEN/green_test_bone/resultsSummary_GREEN-Phi2.txt
Seconds per example:  2.348370688302176





In [None]:
# GREEN for lung
# NOTE(review): the predictions (lung_out) are passed before the ground truth (lung_gt) — confirm this is the
# argument order green3.compute expects; swapped roles would presumably exchange error categories (a) and (b).
# NOTE(review): output_dir lies under 'MyDrive/UHN_Test', not under the 'My Drive/ML-Quiz-XRay-ReportGeneration'
# project folder the notebook cds into — confirm the intended location.
compute(model_name, lung_out, lung_gt, output_dir="/content/drive/MyDrive/UHN_Test/GREEN/green_test_lung")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Processing data...making prompts


Map:   0%|          | 0/588 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 49/49 [26:14<00:00, 32.13s/it]

==== End Inference ====
Saving generated response to prompt to  /content/drive/MyDrive/UHN_Test/GREEN/green_test_lung/results_GREEN-Phi2.csv
Computing summary ...

-------------GREEN-Phi2----------------
 [Summary]: Green average 0.6737001943634596 and standard variation 0.3195053647422164 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

 (a) False report of a finding in the candidate: 0.5425170068027211. 


 (b) Missing a finding present in the reference: 0.8095238095238095. 


 (c) Misidentification of a finding's anatomic location/position: 0.9982993197278912. 


 (d) Misassessment of the severity of a finding: 0.9931972789115646. 


 (e) Mentioning a comparison that isn't in the reference: 0.9880952380952381. 


 (f) Omitting a comparison detailing a change from a prior study: 1.0.
----------------------------------

Saving generated Summary to prompt to  /content/drive/MyDrive/UHN_Test/GREEN/green_test_lung/resultsSummary_GREEN-Phi2.txt
Seconds per e




In [None]:
# GREEN for mediastinal
# NOTE(review): the predictions (mediastinal_out) are passed before the ground truth (mediastinal_gt) — confirm
# this is the argument order green3.compute expects; swapped roles would presumably exchange error
# categories (a) and (b).
# NOTE(review): output_dir lies under 'MyDrive/UHN_Test', not under the 'My Drive/ML-Quiz-XRay-ReportGeneration'
# project folder the notebook cds into — confirm the intended location.
compute(model_name, mediastinal_out, mediastinal_gt, output_dir="/content/drive/MyDrive/UHN_Test/GREEN/green_test_mediastinal")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Processing data...making prompts


Map:   0%|          | 0/588 [00:00<?, ? examples/s]

Done.
==== Beginning Inference ====


100%|██████████| 49/49 [23:06<00:00, 28.30s/it]

==== End Inference ====
Saving generated response to prompt to  /content/drive/MyDrive/UHN_Test/GREEN/green_test_mediastinal/results_GREEN-Phi2.csv
Computing summary ...

-------------GREEN-Phi2----------------
 [Summary]: Green average 0.5939625850340136 and standard variation 0.47670400972284116 
 [Clinically Significant Errors Analyses]: <accuracy>. <representative error>

 (a) False report of a finding in the candidate: 0.8469387755102041. 


 (b) Missing a finding present in the reference: 0.79421768707483. 


 (c) Misidentification of a finding's anatomic location/position: 0.9829931972789115. 


 (d) Misassessment of the severity of a finding: 0.9948979591836735. 


 (e) Mentioning a comparison that isn't in the reference: 0.9965986394557823. 


 (f) Omitting a comparison detailing a change from a prior study: 1.0.
----------------------------------

Saving generated Summary to prompt to  /content/drive/MyDrive/UHN_Test/GREEN/green_test_mediastinal/resultsSummary_GREEN-Phi2.txt





# Create table for GREEN score results (Test Set)

In [None]:
# create a dataframe
import pandas as pd

# One record per GREEN metric; the values are transcribed from the summaries printed by the
# four compute() cells above (bone, heart, lung, mediastinal).
green_summary_rows = [
    {'GREEN Score': 'Green average',
     'Bone': 0.3253968253968254, 'Heart': 0.8163265306122449,
     'Lung': 0.6737001943634596, 'Mediastinal': 0.5939625850340136},
    {'GREEN Score': 'standard variation',
     'Bone': 0.46355593167261244, 'Heart': 0.3657609375613893,
     'Lung': 0.3195053647422164, 'Mediastinal': 0.47670400972284116},
    {'GREEN Score': '(a) False report of a finding in the candidate',
     'Bone': 0.7959183673469388, 'Heart': 0.9081632653061225,
     'Lung': 0.5425170068027211, 'Mediastinal': 0.8469387755102041},
    {'GREEN Score': '(b) Missing a finding present in the reference',
     'Bone': 0.9217687074829932, 'Heart': 0.9098639455782312,
     'Lung': 0.8095238095238095, 'Mediastinal': 0.79421768707483},
    {'GREEN Score': "(c) Misidentification of a finding's anatomic location/position",
     'Bone': 1.0, 'Heart': 0.9897959183673469,
     'Lung': 0.9982993197278912, 'Mediastinal': 0.9829931972789115},
    {'GREEN Score': '(d) Misassessment of the severity of a finding',
     'Bone': 1.0, 'Heart': 0.9778911564625851,
     'Lung': 0.9931972789115646, 'Mediastinal': 0.9948979591836735},
    {'GREEN Score': "(e) Mentioning a comparison that isn't in the reference",
     'Bone': 0.9982993197278912, 'Heart': 0.9965986394557823,
     'Lung': 0.9880952380952381, 'Mediastinal': 0.9965986394557823},
    {'GREEN Score': '(f) Omitting a comparison detailing a change from a prior study',
     'Bone': 1.0, 'Heart': 1.0, 'Lung': 1.0, 'Mediastinal': 1.0},
]

# Build the table in a single step. Appending row by row with pd.concat onto an empty frame
# triggers a pandas FutureWarning and leaves the numeric columns without a numeric dtype.
df = pd.DataFrame(
    green_summary_rows,
    columns=['GREEN Score', 'Bone', 'Heart', 'Lung', 'Mediastinal'],
)

# view full value in row
pd.set_option('display.max_colwidth', None)
df

  df = pd.concat([df, pd.DataFrame([new_row_1])], ignore_index=True)


Unnamed: 0,GREEN Score,Bone,Heart,Lung,Mediastinal
0,Green average,0.325397,0.816327,0.6737,0.593963
1,standard variation,0.463556,0.365761,0.319505,0.476704
2,(a) False report of a finding in the candidate,0.795918,0.908163,0.542517,0.846939
3,(b) Missing a finding present in the reference,0.921769,0.909864,0.809524,0.794218
4,(c) Misidentification of a finding's anatomic location/position,1.0,0.989796,0.998299,0.982993
5,(d) Misassessment of the severity of a finding,1.0,0.977891,0.993197,0.994898
6,(e) Mentioning a comparison that isn't in the reference,0.998299,0.996599,0.988095,0.996599
7,(f) Omitting a comparison detailing a change from a prior study,1.0,1.0,1.0,1.0
