NCCS-GUI-For-NLP-Tasks / Git / [ec103b] /rule_based

Models:
DanielG/
NCCS-GUI-For-NLP-Tasks
Downloads: 1
[ec103b]: / rule_based_model.py
History
Download this file
275 lines (217 with data), 9.4 kB

"""
Moving toward front end html webpage
# just need the functions, no need to run anything, all commented out
"""

import pandas as pd
import re
import copy
import pprint
import os

path = os.path.dirname(__file__)

'''
# for flaskapp integration: put this into app.py

dataset_location = os.path.join(path, r"14122021 NLP Histo 3032 Reports (Classify Cancerous and Non-cancerous Reports).xlsx")
df_initial = pd.read_excel(dataset_location, usecols="D, F, H, P")
df_initial.rename(columns={"Grade(1, 2, 3, mildly or well = 1, moderately = 2, poorly = 3)": "grades"}, inplace=True)
df_initial['grades'] = df_initial['grades'].fillna(0) # this changes all NaN values in the grade column to 0
'''

###################################
####### ESSENTIAL FUNCTIONS #######
###################################

# STEP 1 FUNCTION
def convert_df(data):
    id_list = []
    text_list = []
    grades_list = []
    for index, row in data.iterrows():
      id = row['SCM GUIDE'].lower()
      text = (row['DIAGNOSIS'] + " " + row['MICROSCOPIC DESCRIPTION']).lower()
      grades = re.findall('\d+', str(row['grades']))
      # if there are no grades assigned, it is 'Unknown' --> treat as '0' (no grade)
      if len(grades) == 0:
        grades = ['0']
      id_list.append(id)
      text_list.append(text)
      grades_list.append(grades)
    converted_df = pd.DataFrame({'id': id_list, 'text': text_list, 'grades': grades_list})
    return converted_df


# STEP 2 FUNCTION
def find_matches(data):
    matches_list = []
    for index, row in data.iterrows():
      pattern = re.compile(r"[\s\S]{25}grade[\s\S]{25}|[\s\S]{25}differentiated[\s\S]{25}")
      matches = pattern.findall(row['text'])
      matches_list.append(matches) 
    data["matches"] = matches_list
    return data


# STEP 3 FUNCTION
def determine_grade(data):

    # for id in determined_data.keys():
    #   determined_data[id]["matches"] = None    # set "matches" = None for all rows in determined_data first

    determined_list = []

    # Edit patterns for the respective grades here
    pattern1 = re.compile(r"low|grade\s1|grade\si|well")
    pattern2 = re.compile(r"intermediate|grade\s2|grade\sii|moderately")
    pattern3 = re.compile(r"high|grade\s3|grade\siii|poorly")
    pattern_false = re.compile(r"dcis|dysplasia") # removed 'nuclear' for now.
    
    for index, row in data.iterrows():
      grades_list = []
      for match_text in row["matches"]:  
        grade1_match = pattern1.findall(match_text)
        grade2_match = pattern2.findall(match_text)
        grade3_match = pattern3.findall(match_text)
        matches_false = pattern_false.findall(match_text)

        if not matches_false:
          if grade1_match:
            grades_list.append('1')
          if grade2_match:
            grades_list.append('2')
          if grade3_match:
            grades_list.append('3')
          
        # grades_list.append('9')

      # If all matches in a given row does not result in any determined grade, set 'determined' = ['0']
      if len(grades_list) == 0:
        grades_list.append('0')
      determined_list.append(grades_list)
    data["determined"] = determined_list
      
    #   determined_data[id]["determined_grades"] = determined_grades
    #   determined_data[id]["matches"] = all_matches[id]
    return data


# STEP 4 FUNCTION
def evaluate_accuracy(data):
    correct_counter = 0
    total = len(data)
    result_list = []  # holds a string "Correct" or "Wrong" for each row after comparing "grades" with "determined"

    for index, row in data.iterrows():
      grades = row["grades"]
      determined_grades = row["determined"]

      if determined_grades == grades: # if determined grades matches actual grades exacty, it is correct
        correct_counter += 1
        result = "Correct"
      else:
        result = "Wrong"
      
      result_list.append(result)
    
    data["result"] = result_list

    score = correct_counter / total
    
    return data, score



###################################
##### MISCELLANEOUS FUNCTIONS #####
###################################

def wrong_gradings(data):
  id_list = []
  text_list = []
  grades_list = []
  matches_list = []
  determined_list = []
  result_list = []
  for index, row in data.iterrows():
    if row['result'] == 'Wrong':
      id_list.append(row['id'])
      text_list.append(row['text'])
      grades_list.append(row['grades'])
      matches_list.append(row['matches'])
      determined_list.append(row['determined'])
      result_list.append(row['result'])
    wrong_df = pd.DataFrame({'id': id_list, 'text': text_list, 'grades': grades_list, 'matches': matches_list, 'determined': determined_list, 'result': result_list})
  return wrong_df


def correct_gradings(data):
  id_list = []
  text_list = []
  grades_list = []
  matches_list = []
  determined_list = []
  result_list = []
  for index, row in data.iterrows():
    if row['result'] == 'Correct':
      id_list.append(row['id'])
      text_list.append(row['text'])
      grades_list.append(row['grades'])
      matches_list.append(row['matches'])
      determined_list.append(row['determined'])
      result_list.append(row['result'])
    correct_df = pd.DataFrame({'id': id_list, 'text': text_list, 'grades': grades_list, 'matches': matches_list, 'determined': determined_list, 'result': result_list})
  return correct_df


def false_positives(data):
    id_list = []
    text_list = []
    grades_list = []
    matches_list = []
    determined_list = []
    result_list = []
    for index, row in data.iterrows():
      # if there are detected matches, yet there are not supposed to be any grades at all (grades = ['0'])
      if len(row['matches']) != 0 and row['grades'] == ['0']:
        id_list.append(row['id'])
        text_list.append(row['text'])
        grades_list.append(row['grades'])
        matches_list.append(row['matches'])
        determined_list.append(row['determined'])
        result_list.append(row['result'])
      false_positives_df = pd.DataFrame({'id': id_list, 'text': text_list, 'grades': grades_list, 'matches': matches_list, 'determined': determined_list, 'result': result_list})
    return false_positives_df



def false_negatives(data):
    id_list = []
    text_list = []
    grades_list = []
    matches_list = []
    determined_list = []
    result_list = []
    for index, row in data.iterrows():
      # if there are NO detected matches, yet there are supposed to be some grade(s) assigned to the report.
      if len(row['matches']) == 0 and row['grades'] != ['0']:
        id_list.append(row['id'])
        text_list.append(row['text'])
        grades_list.append(row['grades'])
        matches_list.append(row['matches'])
        determined_list.append(row['determined'])
        result_list.append(row['result'])
      false_negatives_df = pd.DataFrame({'id': id_list, 'text': text_list, 'grades': grades_list, 'matches': matches_list, 'determined': determined_list, 'result': result_list})
    return false_negatives_df

'''
# for flaskapp integration: don't need to run anything, will do in app.py


# This section of the code utlises all the essential functions defined above.

# The naming of varaibles will start with 'df' and then state the columns in the
# corresponding dataframe, separated by a '_' (e.g. df_id_text_grades_matches
# means that the dataframe has 4 columns: id, text, grades, matches)

# The steps that the dataframe goes through are as such:

# Step 1: Convert DF to show ID, TEXT and GRADES
df = convert_df(df_initial)

# Step 2: Find text matches to the word 'grade' and 'differentiated' and store in list (+ MATCHES)
df = find_matches(df)

# Step 3: Determine the list of grades from the list of matches (+ DETERMINED)
df = determine_grade(df)

# Step 4: Evaulate if determined grade is "Correct" or "Wrong" and calculate overall accuracy score (+ RESULT)
df, accuracy_score = evaluate_accuracy(df)

# RESULTANT DATAFRAME AFTER ALL STEPS (6 COLUMNS)
print("The total number of REPORTS is: " + str(len(df)))
print("\n")
print("The accuracy of determine_grade function is: " + str(accuracy_score))
print("\n")
df

# dataframe for wrong gradings
wrong_df = wrong_gradings(df)

print("The number of wrongly graded rows is: " + str(len(wrong_df)))
wrong_df

# dataframe for correct gradings
correct_df = correct_gradings(df)

print("The number of correctly graded reports is: " + str(len(correct_df)))
correct_df

# dataframe for false positives
false_positives_df = false_positives(df)

print("The number of false_positives is: " + str(len(false_positives_df)))
false_positives_df

# dataframe for false negatives
false_negatives_df = false_negatives(df)

print("The number of false_negatives is: " + str(len(false_negatives_df)))
false_negatives_df

# exporting dataframes

try: 
    os.mkdir(os.path.join(path,r'csvfiles')) # create directory to store csv files
except OSError as error: 
    print(error)  

df.to_csv(os.path.join(path, r'csvfiles\df.csv')) # export to to csv file
wrong_df.to_csv(os.path.join(path, r'csvfiles\wrong_df.csv')) # export to to csv file
correct_df.to_csv(os.path.join(path, r'csvfiles\correct_df.csv')) # export to to csv file
false_positives_df.to_csv(os.path.join(path, r'csvfiles\false_positives_df.csv')) # export to to csv file
false_negatives_df.to_csv(os.path.join(path, r'csvfiles\false_negatives_df.csv')) # export to to csv file
'''