|
a |
|
b/rule_based_model.py |
|
|
1 |
""" |
|
|
2 |
Moving toward front end html webpage |
|
|
3 |
# just need the functions, no need to run anything, all commented out |
|
|
4 |
""" |
|
|
5 |
|
|
|
6 |
import pandas as pd |
|
|
7 |
import re |
|
|
8 |
import copy |
|
|
9 |
import pprint |
|
|
10 |
import os |
|
|
11 |
|
|
|
12 |
# Directory containing this module; the disabled script sections in this file
# join the dataset and CSV output locations onto it.
path = os.path.dirname(__file__)
|
|
13 |
|
|
|
14 |
''' |
|
|
15 |
# for flaskapp integration: put this into app.py |
|
|
16 |
|
|
|
17 |
dataset_location = os.path.join(path, r"14122021 NLP Histo 3032 Reports (Classify Cancerous and Non-cancerous Reports).xlsx") |
|
|
18 |
df_initial = pd.read_excel(dataset_location, usecols="D, F, H, P") |
|
|
19 |
df_initial.rename(columns={"Grade(1, 2, 3, mildly or well = 1, moderately = 2, poorly = 3)": "grades"}, inplace=True) |
|
|
20 |
df_initial['grades'] = df_initial['grades'].fillna(0) # this changes all NaN values in the grade column to 0 |
|
|
21 |
''' |
|
|
22 |
|
|
|
23 |
################################### |
|
|
24 |
####### ESSENTIAL FUNCTIONS ####### |
|
|
25 |
################################### |
|
|
26 |
|
|
|
27 |
# STEP 1 FUNCTION |
|
|
28 |
def _text_or_empty(value):
    """Return *value* as a string, mapping a missing cell (NaN/None) to ''."""
    return "" if pd.isna(value) else str(value)


def convert_df(data):
    """Normalise the raw report table into ``id`` / ``text`` / ``grades``.

    Args:
        data: DataFrame providing the columns ``'SCM GUIDE'``,
            ``'DIAGNOSIS'``, ``'MICROSCOPIC DESCRIPTION'`` and ``'grades'``.

    Returns:
        A new DataFrame with one row per input row and three columns:
        ``id`` (lower-cased ``'SCM GUIDE'``), ``text`` (lower-cased
        diagnosis and microscopic description joined by a single space) and
        ``grades`` (list of digit strings found in the ``'grades'`` cell, or
        ``['0']`` when the cell contains no digits).
    """
    digits = re.compile(r"\d+")

    ids = []
    texts = []
    grades_per_row = []
    for _, row in data.iterrows():
        # Missing text cells are treated as empty strings instead of raising
        # on ``float + str`` / ``float.lower()``.
        ids.append(_text_or_empty(row['SCM GUIDE']).lower())
        diagnosis = _text_or_empty(row['DIAGNOSIS'])
        microscopic = _text_or_empty(row['MICROSCOPIC DESCRIPTION'])
        texts.append((diagnosis + " " + microscopic).lower())

        raw_grade = row['grades']
        # An integral float such as 2.0 would stringify to '2.0' and be read
        # as the two grades '2' and '0'; collapse it to the integer first.
        if isinstance(raw_grade, float) and raw_grade.is_integer():
            raw_grade = int(raw_grade)
        found = digits.findall(str(raw_grade))
        # If there are no grades assigned, it is 'Unknown' --> treat as '0'.
        grades_per_row.append(found or ['0'])

    return pd.DataFrame({'id': ids, 'text': texts, 'grades': grades_per_row})
|
|
44 |
|
|
|
45 |
|
|
|
46 |
# STEP 2 FUNCTION |
|
|
47 |
def find_matches(data):
    """Attach keyword-in-context snippets to every report.

    For each row, the ``'text'`` value is scanned for the words ``grade`` and
    ``differentiated``; every hit is stored together with up to 25 characters
    of context on each side.

    Args:
        data: DataFrame with a ``'text'`` column of strings.

    Returns:
        The same DataFrame object, with a new ``'matches'`` column holding
        the list of snippets found for each row (empty list when none).
    """
    # ``{0,25}`` instead of ``{25}``: a keyword closer than 25 characters to
    # the start or end of the text must still be captured, just with less
    # context. Compiled once, outside the per-row work.
    pattern = re.compile(
        r"[\s\S]{0,25}grade[\s\S]{0,25}|[\s\S]{0,25}differentiated[\s\S]{0,25}"
    )
    data["matches"] = [pattern.findall(text) for text in data["text"]]
    return data
|
|
55 |
|
|
|
56 |
|
|
|
57 |
# STEP 3 FUNCTION |
|
|
58 |
def determine_grade(data):
    """Derive grade labels from the context snippets of every report.

    Each snippet in a row's ``'matches'`` list is tested against the grade
    patterns below. A snippet that mentions ``dcis`` or ``dysplasia`` is
    skipped entirely. A snippet may contribute several labels, always in the
    order '1', '2', '3'.

    Args:
        data: DataFrame with a ``'matches'`` column of lists of strings.

    Returns:
        The same DataFrame object, with a new ``'determined'`` column holding
        the list of labels collected for each row, or ``['0']`` when no
        snippet produced a label.
    """
    # Edit patterns for the respective grades here.
    # The ``\b`` after the numeral forms keeps "grade i" from also matching
    # inside "grade ii" / "grade iii" (and "grade 1" inside "grade 10"),
    # which previously tagged a single "grade iii" as grades 1, 2 and 3.
    grade_patterns = (
        ('1', re.compile(r"low|grade\s1\b|grade\si\b|well")),
        ('2', re.compile(r"intermediate|grade\s2\b|grade\sii\b|moderately")),
        ('3', re.compile(r"high|grade\s3\b|grade\siii\b|poorly")),
    )
    pattern_false = re.compile(r"dcis|dysplasia")  # removed 'nuclear' for now.

    determined_list = []
    for snippets in data["matches"]:
        grades_list = []
        for match_text in snippets:
            if pattern_false.search(match_text):
                continue
            grades_list.extend(
                label
                for label, pattern in grade_patterns
                if pattern.search(match_text)
            )
        # If no snippet in this row resulted in a grade, record ['0'].
        determined_list.append(grades_list or ['0'])

    data["determined"] = determined_list
    return data
|
|
98 |
|
|
|
99 |
|
|
|
100 |
# STEP 4 FUNCTION |
|
|
101 |
def evaluate_accuracy(data):
    """Label every row as "Correct" or "Wrong" and compute the accuracy.

    A row is "Correct" only when its ``'determined'`` value equals its
    ``'grades'`` value exactly (same labels, same order).

    Args:
        data: DataFrame with ``'grades'`` and ``'determined'`` columns.

    Returns:
        A ``(data, score)`` tuple: the same DataFrame object with a new
        ``'result'`` column, and the fraction of rows labelled "Correct".
        The score is 0.0 for an empty DataFrame.
    """
    result_list = [
        "Correct" if determined == actual else "Wrong"
        for actual, determined in zip(data["grades"], data["determined"])
    ]
    data["result"] = result_list

    total = len(result_list)
    # Guard the division: an empty table has no accuracy to report, so it is
    # scored 0.0 instead of raising ZeroDivisionError.
    score = result_list.count("Correct") / total if total else 0.0

    return data, score
|
|
123 |
|
|
|
124 |
|
|
|
125 |
|
|
|
126 |
################################### |
|
|
127 |
##### MISCELLANEOUS FUNCTIONS ##### |
|
|
128 |
################################### |
|
|
129 |
|
|
|
130 |
def wrong_gradings(data):
    """Return a new DataFrame holding only the rows marked 'Wrong'.

    Args:
        data: DataFrame with the columns ``id``, ``text``, ``grades``,
            ``matches``, ``determined`` and ``result``.

    Returns:
        A freshly built DataFrame with those six columns, containing the
        rows whose ``result`` is ``'Wrong'`` in their original order.
    """
    columns = ('id', 'text', 'grades', 'matches', 'determined', 'result')
    # Pick the qualifying rows first, then pivot them into per-column lists.
    selected = [
        record for _, record in data.iterrows() if record['result'] == 'Wrong'
    ]
    return pd.DataFrame(
        {name: [record[name] for record in selected] for name in columns}
    )
|
|
147 |
|
|
|
148 |
|
|
|
149 |
def correct_gradings(data):
    """Return a new DataFrame holding only the rows marked 'Correct'.

    Args:
        data: DataFrame with the columns ``id``, ``text``, ``grades``,
            ``matches``, ``determined`` and ``result``.

    Returns:
        A freshly built DataFrame with those six columns, containing the
        rows whose ``result`` is ``'Correct'`` in their original order.
    """
    columns = ('id', 'text', 'grades', 'matches', 'determined', 'result')
    # Pick the qualifying rows first, then pivot them into per-column lists.
    selected = [
        record for _, record in data.iterrows() if record['result'] == 'Correct'
    ]
    return pd.DataFrame(
        {name: [record[name] for record in selected] for name in columns}
    )
|
|
166 |
|
|
|
167 |
|
|
|
168 |
def false_positives(data):
    """Return the rows that produced matches although no grade is expected.

    A row qualifies when its ``matches`` list is non-empty while its
    ``grades`` value equals ``['0']``.

    Args:
        data: DataFrame with the columns ``id``, ``text``, ``grades``,
            ``matches``, ``determined`` and ``result``.

    Returns:
        A freshly built DataFrame with those six columns, containing the
        qualifying rows in their original order.
    """
    columns = ('id', 'text', 'grades', 'matches', 'determined', 'result')
    selected = []
    for _, record in data.iterrows():
        has_matches = len(record['matches']) != 0
        # Matches were detected, yet the report carries no grade at all.
        if has_matches and record['grades'] == ['0']:
            selected.append(record)
    return pd.DataFrame(
        {name: [record[name] for record in selected] for name in columns}
    )
|
|
186 |
|
|
|
187 |
|
|
|
188 |
|
|
|
189 |
def false_negatives(data):
    """Return the rows that produced no matches although a grade is expected.

    A row qualifies when its ``matches`` list is empty while its ``grades``
    value differs from ``['0']``.

    Args:
        data: DataFrame with the columns ``id``, ``text``, ``grades``,
            ``matches``, ``determined`` and ``result``.

    Returns:
        A freshly built DataFrame with those six columns, containing the
        qualifying rows in their original order.
    """
    columns = ('id', 'text', 'grades', 'matches', 'determined', 'result')
    selected = []
    for _, record in data.iterrows():
        no_matches = len(record['matches']) == 0
        # Nothing was detected, yet the report does carry some grade(s).
        if no_matches and record['grades'] != ['0']:
            selected.append(record)
    return pd.DataFrame(
        {name: [record[name] for record in selected] for name in columns}
    )
|
|
207 |
|
|
|
208 |
''' |
|
|
209 |
# for flaskapp integration: don't need to run anything, will do in app.py |
|
|
210 |
|
|
|
211 |
|
|
|
212 |
# This section of the code utilises all the essential functions defined above. |
|
|
213 |
|
|
|
214 |
# The naming of variables will start with 'df' and then state the columns in the |
|
|
215 |
# corresponding dataframe, separated by a '_' (e.g. df_id_text_grades_matches |
|
|
216 |
# means that the dataframe has 4 columns: id, text, grades, matches) |
|
|
217 |
|
|
|
218 |
# The steps that the dataframe goes through are as such: |
|
|
219 |
|
|
|
220 |
# Step 1: Convert DF to show ID, TEXT and GRADES |
|
|
221 |
df = convert_df(df_initial) |
|
|
222 |
|
|
|
223 |
# Step 2: Find text matches to the word 'grade' and 'differentiated' and store in list (+ MATCHES) |
|
|
224 |
df = find_matches(df) |
|
|
225 |
|
|
|
226 |
# Step 3: Determine the list of grades from the list of matches (+ DETERMINED) |
|
|
227 |
df = determine_grade(df) |
|
|
228 |
|
|
|
229 |
# Step 4: Evaluate if determined grade is "Correct" or "Wrong" and calculate overall accuracy score (+ RESULT) |
|
|
230 |
df, accuracy_score = evaluate_accuracy(df) |
|
|
231 |
|
|
|
232 |
# RESULTANT DATAFRAME AFTER ALL STEPS (6 COLUMNS) |
|
|
233 |
print("The total number of REPORTS is: " + str(len(df))) |
|
|
234 |
print("\n") |
|
|
235 |
print("The accuracy of determine_grade function is: " + str(accuracy_score)) |
|
|
236 |
print("\n") |
|
|
237 |
df |
|
|
238 |
|
|
|
239 |
# dataframe for wrong gradings |
|
|
240 |
wrong_df = wrong_gradings(df) |
|
|
241 |
|
|
|
242 |
print("The number of wrongly graded rows is: " + str(len(wrong_df))) |
|
|
243 |
wrong_df |
|
|
244 |
|
|
|
245 |
# dataframe for correct gradings |
|
|
246 |
correct_df = correct_gradings(df) |
|
|
247 |
|
|
|
248 |
print("The number of correctly graded reports is: " + str(len(correct_df))) |
|
|
249 |
correct_df |
|
|
250 |
|
|
|
251 |
# dataframe for false positives |
|
|
252 |
false_positives_df = false_positives(df) |
|
|
253 |
|
|
|
254 |
print("The number of false_positives is: " + str(len(false_positives_df))) |
|
|
255 |
false_positives_df |
|
|
256 |
|
|
|
257 |
# dataframe for false negatives |
|
|
258 |
false_negatives_df = false_negatives(df) |
|
|
259 |
|
|
|
260 |
print("The number of false_negatives is: " + str(len(false_negatives_df))) |
|
|
261 |
false_negatives_df |
|
|
262 |
|
|
|
263 |
# exporting dataframes |
|
|
264 |
|
|
|
265 |
try: |
|
|
266 |
os.mkdir(os.path.join(path,r'csvfiles')) # create directory to store csv files |
|
|
267 |
except OSError as error: |
|
|
268 |
print(error) |
|
|
269 |
|
|
|
270 |
df.to_csv(os.path.join(path, r'csvfiles\df.csv')) # export to to csv file |
|
|
271 |
wrong_df.to_csv(os.path.join(path, r'csvfiles\wrong_df.csv')) # export to to csv file |
|
|
272 |
correct_df.to_csv(os.path.join(path, r'csvfiles\correct_df.csv')) # export to to csv file |
|
|
273 |
false_positives_df.to_csv(os.path.join(path, r'csvfiles\false_positives_df.csv')) # export to to csv file |
|
|
274 |
false_negatives_df.to_csv(os.path.join(path, r'csvfiles\false_negatives_df.csv')) # export to to csv file |
|
|
275 |
''' |