Diff of /rule_based_model.py [000000] .. [ec103b]

Switch to unified view

a b/rule_based_model.py
1
"""
2
Moving toward front end html webpage
3
# just need the functions, no need to run anything, all commented out
4
"""
5
6
import pandas as pd
7
import re
8
import copy
9
import pprint
10
import os
11
12
path = os.path.dirname(__file__)
13
14
'''
15
# for flaskapp integration: put this into app.py
16
17
dataset_location = os.path.join(path, r"14122021 NLP Histo 3032 Reports (Classify Cancerous and Non-cancerous Reports).xlsx")
18
df_initial = pd.read_excel(dataset_location, usecols="D, F, H, P")
19
df_initial.rename(columns={"Grade(1, 2, 3, mildly or well = 1, moderately = 2, poorly = 3)": "grades"}, inplace=True)
20
df_initial['grades'] = df_initial['grades'].fillna(0) # this changes all NaN values in the grade column to 0
21
'''
22
23
###################################
24
####### ESSENTIAL FUNCTIONS #######
25
###################################
26
27
# STEP 1 FUNCTION
28
def convert_df(data):
    """Normalise the raw report table into ``id``, ``text`` and ``grades``.

    Args:
        data: DataFrame with the columns ``'SCM GUIDE'``, ``'DIAGNOSIS'``,
            ``'MICROSCOPIC DESCRIPTION'`` and ``'grades'``.

    Returns:
        A new DataFrame with one row per input row and three columns:
        ``id`` (lower-cased ``'SCM GUIDE'``), ``text`` (lower-cased diagnosis
        and microscopic description joined by a single space) and ``grades``
        (list of the digit strings found in the ``'grades'`` cell, or
        ``['0']`` when the cell contains no digits).
    """
    digits = re.compile(r"\d+")

    def _cell_text(value):
        # Empty spreadsheet cells arrive as None/NaN; treat them as empty
        # text instead of failing on ``float + str``.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    id_list = []
    text_list = []
    grades_list = []
    columns = zip(
        data['SCM GUIDE'],
        data['DIAGNOSIS'],
        data['MICROSCOPIC DESCRIPTION'],
        data['grades'],
    )
    for report_id, diagnosis, description, raw_grade in columns:
        # A whole-number float such as 3.0 would otherwise stringify to
        # "3.0" and be read as the two grades ['3', '0'].
        if isinstance(raw_grade, float) and raw_grade.is_integer():
            raw_grade = int(raw_grade)
        grades = digits.findall(str(raw_grade))

        id_list.append(_cell_text(report_id).lower())
        text_list.append(
            (_cell_text(diagnosis) + " " + _cell_text(description)).lower()
        )
        # No digits at all (e.g. 'Unknown') means "no grade", encoded as '0'.
        grades_list.append(grades if grades else ['0'])

    return pd.DataFrame(
        {'id': id_list, 'text': text_list, 'grades': grades_list}
    )
44
45
46
# STEP 2 FUNCTION
47
def find_matches(data):
    """Collect the text surrounding each 'grade' / 'differentiated' keyword.

    For every row, ``data['text']`` is scanned for the keywords ``grade`` and
    ``differentiated``; each hit is stored together with up to 25 characters
    of context on either side.

    Args:
        data: DataFrame with a ``'text'`` column of strings.

    Returns:
        The same DataFrame object, with a ``'matches'`` column added in place.
        Each cell is a list of context snippets (empty when no keyword is
        found).
    """
    # ``{0,25}`` rather than ``{25}``: a keyword closer than 25 characters to
    # the start or end of the text must still be captured. The quantifier is
    # greedy, so a full 25-character window is used whenever it is available.
    pattern = re.compile(
        r"[\s\S]{0,25}grade[\s\S]{0,25}|[\s\S]{0,25}differentiated[\s\S]{0,25}"
    )
    data["matches"] = [pattern.findall(text) for text in data['text']]
    return data
55
56
57
# STEP 3 FUNCTION
58
def determine_grade(data):
    """Derive tumour grades from the keyword snippets in ``data['matches']``.

    Each snippet is tested against one pattern per grade. A snippet that
    mentions an excluded term (``dcis`` or ``dysplasia``) contributes nothing.
    Grades are appended per snippet in the order 1, 2, 3, so repeated
    mentions produce repeated entries.

    Args:
        data: DataFrame with a ``'matches'`` column whose cells are lists of
            lower-case text snippets.

    Returns:
        The same DataFrame object, with a ``'determined'`` column added in
        place. Each cell is a list of grade strings, or ``['0']`` when no
        grade could be determined for the row.
    """
    # Edit patterns for the respective grades here.
    # The trailing ``\b`` keeps "grade i" / "grade ii" from matching as a
    # prefix of "grade iii" (and "grade 1" from matching "grade 10").
    # The leading ``\b`` keeps "low", "well" and "high" from matching inside
    # longer words such as "below", "following" or "swelling".
    grade_patterns = (
        ('1', re.compile(r"\blow|grade\s1\b|grade\si\b|\bwell")),
        ('2', re.compile(r"intermediate|grade\s2\b|grade\sii\b|moderately")),
        ('3', re.compile(r"\bhigh|grade\s3\b|grade\siii\b|poorly")),
    )
    pattern_false = re.compile(r"dcis|dysplasia")  # removed 'nuclear' for now.

    determined_list = []
    for snippets in data["matches"]:
        grades_list = []
        for snippet in snippets:
            if pattern_false.search(snippet):
                continue
            grades_list.extend(
                grade for grade, pattern in grade_patterns
                if pattern.search(snippet)
            )
        # If no snippet in the row yields a grade, record "no grade" as ['0'].
        determined_list.append(grades_list if grades_list else ['0'])

    data["determined"] = determined_list
    return data
98
99
100
# STEP 4 FUNCTION
101
def evaluate_accuracy(data):
    """Label each row as correctly or wrongly graded and compute accuracy.

    A row is "Correct" only when its ``'determined'`` list equals its
    ``'grades'`` list exactly (same values, same order, same length).

    Args:
        data: DataFrame with ``'grades'`` and ``'determined'`` columns.

    Returns:
        A tuple ``(data, score)``. ``data`` is the same DataFrame object with
        a ``'result'`` column ("Correct" / "Wrong") added in place. ``score``
        is the fraction of correct rows, or ``0.0`` for an empty DataFrame.
    """
    result_list = [
        "Correct" if determined == actual else "Wrong"
        for actual, determined in zip(data["grades"], data["determined"])
    ]
    data["result"] = result_list

    total = len(result_list)
    # Guard against dividing by zero when there are no reports to score.
    score = result_list.count("Correct") / total if total else 0.0

    return data, score
123
124
125
126
###################################
127
##### MISCELLANEOUS FUNCTIONS #####
128
###################################
129
130
def wrong_gradings(data):
    """Return the rows whose ``'result'`` is "Wrong".

    Args:
        data: DataFrame with the columns ``id``, ``text``, ``grades``,
            ``matches``, ``determined`` and ``result``.

    Returns:
        A new DataFrame holding only those six columns for the wrongly graded
        rows, re-indexed from 0. It is empty (but has the six columns) when
        there are no such rows or ``data`` itself is empty.
    """
    columns = ['id', 'text', 'grades', 'matches', 'determined', 'result']
    # Select once with a mask instead of rebuilding a DataFrame on every loop
    # iteration; this also gives a defined result for empty input.
    selected = data.loc[data['result'] == 'Wrong', columns]
    return selected.reset_index(drop=True)
147
148
149
def correct_gradings(data):
    """Return the rows whose ``'result'`` is "Correct".

    Args:
        data: DataFrame with the columns ``id``, ``text``, ``grades``,
            ``matches``, ``determined`` and ``result``.

    Returns:
        A new DataFrame holding only those six columns for the correctly
        graded rows, re-indexed from 0. It is empty (but has the six columns)
        when there are no such rows or ``data`` itself is empty.
    """
    columns = ['id', 'text', 'grades', 'matches', 'determined', 'result']
    # Select once with a mask instead of rebuilding a DataFrame on every loop
    # iteration; this also gives a defined result for empty input.
    selected = data.loc[data['result'] == 'Correct', columns]
    return selected.reset_index(drop=True)
166
167
168
def false_positives(data):
    """Return the rows that have keyword matches but no actual grade.

    A row qualifies when its ``'matches'`` list is non-empty while its
    ``'grades'`` value is exactly ``['0']`` (no grade assigned).

    Args:
        data: DataFrame with the columns ``id``, ``text``, ``grades``,
            ``matches``, ``determined`` and ``result``.

    Returns:
        A new DataFrame holding only those six columns for the qualifying
        rows, re-indexed from 0. It is empty (but has the six columns) when
        no row qualifies or ``data`` itself is empty.
    """
    columns = ['id', 'text', 'grades', 'matches', 'determined', 'result']
    # Cells hold Python lists, so the test is done per row in Python; rows are
    # then picked by position so the result does not depend on index labels.
    positions = [
        pos
        for pos, (matches, grades) in enumerate(
            zip(data['matches'], data['grades'])
        )
        if len(matches) != 0 and grades == ['0']
    ]
    return data.iloc[positions][columns].reset_index(drop=True)
186
187
188
189
def false_negatives(data):
    """Return the rows that have an actual grade but no keyword matches.

    A row qualifies when its ``'matches'`` list is empty while its
    ``'grades'`` value is anything other than ``['0']``.

    Args:
        data: DataFrame with the columns ``id``, ``text``, ``grades``,
            ``matches``, ``determined`` and ``result``.

    Returns:
        A new DataFrame holding only those six columns for the qualifying
        rows, re-indexed from 0. It is empty (but has the six columns) when
        no row qualifies or ``data`` itself is empty.
    """
    columns = ['id', 'text', 'grades', 'matches', 'determined', 'result']
    # Cells hold Python lists, so the test is done per row in Python; rows are
    # then picked by position so the result does not depend on index labels.
    positions = [
        pos
        for pos, (matches, grades) in enumerate(
            zip(data['matches'], data['grades'])
        )
        if len(matches) == 0 and grades != ['0']
    ]
    return data.iloc[positions][columns].reset_index(drop=True)
207
208
'''
209
# for flaskapp integration: don't need to run anything, will do in app.py
210
211
212
# This section of the code utilises all the essential functions defined above.
213
214
# The naming of variables will start with 'df' and then state the columns in the
215
# corresponding dataframe, separated by a '_' (e.g. df_id_text_grades_matches
216
# means that the dataframe has 4 columns: id, text, grades, matches)
217
218
# The steps that the dataframe goes through are as such:
219
220
# Step 1: Convert DF to show ID, TEXT and GRADES
221
df = convert_df(df_initial)
222
223
# Step 2: Find text matches to the word 'grade' and 'differentiated' and store in list (+ MATCHES)
224
df = find_matches(df)
225
226
# Step 3: Determine the list of grades from the list of matches (+ DETERMINED)
227
df = determine_grade(df)
228
229
# Step 4: Evaluate if determined grade is "Correct" or "Wrong" and calculate overall accuracy score (+ RESULT)
230
df, accuracy_score = evaluate_accuracy(df)
231
232
# RESULTANT DATAFRAME AFTER ALL STEPS (6 COLUMNS)
233
print("The total number of REPORTS is: " + str(len(df)))
234
print("\n")
235
print("The accuracy of determine_grade function is: " + str(accuracy_score))
236
print("\n")
237
df
238
239
# dataframe for wrong gradings
240
wrong_df = wrong_gradings(df)
241
242
print("The number of wrongly graded rows is: " + str(len(wrong_df)))
243
wrong_df
244
245
# dataframe for correct gradings
246
correct_df = correct_gradings(df)
247
248
print("The number of correctly graded reports is: " + str(len(correct_df)))
249
correct_df
250
251
# dataframe for false positives
252
false_positives_df = false_positives(df)
253
254
print("The number of false_positives is: " + str(len(false_positives_df)))
255
false_positives_df
256
257
# dataframe for false negatives
258
false_negatives_df = false_negatives(df)
259
260
print("The number of false_negatives is: " + str(len(false_negatives_df)))
261
false_negatives_df
262
263
# exporting dataframes
264
265
try: 
266
    os.mkdir(os.path.join(path,r'csvfiles')) # create directory to store csv files
267
except OSError as error: 
268
    print(error)  
269
270
df.to_csv(os.path.join(path, r'csvfiles\df.csv')) # export to to csv file
271
wrong_df.to_csv(os.path.join(path, r'csvfiles\wrong_df.csv')) # export to to csv file
272
correct_df.to_csv(os.path.join(path, r'csvfiles\correct_df.csv')) # export to to csv file
273
false_positives_df.to_csv(os.path.join(path, r'csvfiles\false_positives_df.csv')) # export to to csv file
274
false_negatives_df.to_csv(os.path.join(path, r'csvfiles\false_negatives_df.csv')) # export to to csv file
275
'''