# Functions for extracting features from text
# Mostly taken from https://github.com/rachitjain2706/Auto-Text-sumarizer
import os
import sys
import shutil
import random
import json
from get_pubmed_nb_data import *
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import nltk
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'rouge_git')))
from rouge import FilesRouge, Rouge
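# Pipeline overview (descriptive summary of the code below):
#   1. choose_pubmed_files(): randomly split parsed articles into train/test lists.
#   2. get_pubmed_nb_data() (imported above): build per-sentence feature vectors.
#   3. A GaussianNB model is fit on training sentences labeled in/out of summary.
#   4. classify_nb(): rank test sentences by P(summary) and keep the top fraction.
#   5. Predicted summaries are scored against the abstracts with ROUGE-1/2/L.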


def choose_pubmed_files(PARSED_DIR, EXS_DIR, n_train):
    """Randomly selects training and test articles and records their names on disk."""
    # 80/20 training/test split (n_test is a quarter of n_train)
    n_test = int(round(n_train / 4))
    abs_names = os.listdir(os.path.join(PARSED_DIR, 'abstract'))
    n_total = n_train + n_test
    if len(abs_names) < n_total:
        message = 'Error: Desired number of training + test examples is ' + str(n_total)
        message += '; however, only ' + str(len(abs_names)) + ' files parsed this way currently exist.'
        sys.exit(message)
    # Select examples
    random.shuffle(abs_names)
    training_file_names = [name[:-4] for name in abs_names][:n_train]
    test_file_names = [name[:-4] for name in abs_names][n_train:n_train + n_test]
    # Write to file
    with open(os.path.join(EXS_DIR, 'training_files.txt'), 'w') as train:
        train.writelines('%s\n' % name for name in training_file_names)
    with open(os.path.join(EXS_DIR, 'test_files.txt'), 'w') as test:
        test.writelines('%s\n' % name for name in test_file_names)


def classify_nb(x, pct_sum, gnb):
    """Labels the top pct_sum fraction of sentences in x as summary sentences."""
    # Runs feature vectors through the fitted Gaussian Naive Bayes model
    if len(x) == 0:
        return [0]
    probs = gnb.predict_proba(x)
    # Classifies likeliest sentences as part of summary
    p_summary = [p[1] for p in probs]
    n_summary_sents = round(len(x) * pct_sum)
    if n_summary_sents == 0:
        n_summary_sents = 1
    thresh = sorted(p_summary)[len(x) - n_summary_sents]
    preds = [1 if i >= thresh else 0 for i in p_summary]
    return preds
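
# E.g., with 10 sentences and pct_sum = 0.3, the three sentences with the highest
# P(summary) are labeled 1; since the comparison is >=, ties at the threshold are
# all kept, so slightly more than the target fraction can be selected.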


def pubmed_naive_bayes(body_type=None, n_train=None):
    if not body_type:
        body_type = input('Train with articles\' whole body sections or just their body introductions?\n\t'
                          '[w=whole body, j=just intro, DEFAULT=just intro]: ')
    PARSED_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'pubmed', 'parsed_articles'))
    if not os.path.exists(PARSED_DIR):
        missing_path = 'Error: Directory of parsed files ' + PARSED_DIR + ' does not exist.'
        sys.exit(missing_path)
    if body_type.lower() == 'w':
        PARSED_DIR = os.path.join(PARSED_DIR, 'with_whole_bodies')
        whole_body = True
    elif body_type.lower() in ['j', '']:
        PARSED_DIR = os.path.join(PARSED_DIR, 'with_just_intros')
        whole_body = False
    else:
        sys.exit('Error: Must input \'w\' or \'j.\'')
    if not n_train:
        n_train = int(input('Number of examples for training set: '))
    exs_dir_name = str(n_train) + '_exs_body' if whole_body else str(n_train) + '_exs_intro'
    FILE_DIR = os.path.abspath(os.path.dirname(__file__))
    EXS_DIR = os.path.join(FILE_DIR, exs_dir_name)
    NB_DIR = os.path.join(EXS_DIR, 'nb')
    if not os.path.exists(NB_DIR):
        # Warns before the (potentially slow) feature-extraction step
        if n_train >= 1000 and not whole_body:
            verbose = True
        elif n_train >= 200 and whole_body:
            verbose = True
        else:
            verbose = False
        if verbose:
            message = 'To fit a Naive Bayes model on ' + str(n_train) + \
                ' training articles, feature vectors must be created from the data. ' \
                'This can take a long time when the number of files is large. Do you wish to proceed? [Default=Yes] '
            proceed = input(message)
            if proceed.lower() not in ['', 'y', 'yes']:
                sys.exit('Exiting.')
        if not os.path.exists(os.path.join(EXS_DIR, 'training_files.txt')) \
                or not os.path.exists(os.path.join(EXS_DIR, 'test_files.txt')):
            # Selects and writes training and test files
            if not os.path.exists(EXS_DIR):
                os.mkdir(EXS_DIR)
            choose_pubmed_files(PARSED_DIR, EXS_DIR, n_train)
        # Creates feature and output vectors for Pubmed articles
        # get_pubmed_nb_data(PARSED_DIR, NB_DIR, n_train, whole_body, verbose)
        get_pubmed_nb_data(PARSED_DIR, NB_DIR, n_train, whole_body)
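    # The feature-extraction step above is expected to produce NB_DIR/feature_vecs.json
    # (pooled training vectors and outputs) plus one JSON per test article under
    # NB_DIR/test_json/, which is what the loading code below reads.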
    # Loads training data
    with open(os.path.join(NB_DIR, 'feature_vecs.json'), 'r') as f:
        data = json.load(f)
    xtrain, ytrain = data['train_features'], data['train_outputs']
    # Fits model to data
    gnb = GaussianNB()
    gnb.fit(xtrain, ytrain)
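    # GaussianNB models each feature as a class-conditional normal distribution,
    # a reasonable fit for continuous sentence features; its predict_proba() gives
    # the per-sentence P(summary) estimates that classify_nb() thresholds.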
    # Calculates training accuracy
    pct_sum = sum(ytrain) / len(ytrain)
    p_guess_correct = (1 - pct_sum) ** 2 + pct_sum ** 2
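    # Baseline: a guesser that predicts class 1 with probability pct_sum agrees
    # with the true label with probability pct_sum**2 + (1 - pct_sum)**2.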
    print('\nTraining Accuracy of Random Guessing: {}%'.format(round(p_guess_correct * 100, 1)))
    train_preds = classify_nb(xtrain, pct_sum, gnb)
    print('Training Accuracy of Model: {}%'
          .format(round(metrics.accuracy_score(ytrain, train_preds) * 100, 1)))
    # Calculates average test accuracy
    ytest = data['test_outputs']
    pct_sum = sum(ytest) / len(ytest)
    p_guess_correct = (1 - pct_sum) ** 2 + pct_sum ** 2
    print('\nTest Accuracy of Random Guessing: {}%'.format(round(p_guess_correct * 100, 1)))
    test_accuracies = []
    os.makedirs(os.path.join(NB_DIR, 'test_summary'), exist_ok=True)
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for file in os.listdir(os.path.join(NB_DIR, 'test_json')):
        # Classifies sentences in each test file
        with open(os.path.join(NB_DIR, 'test_json', file), 'r') as f:
            test_data = json.load(f)
        xtest, ytest = test_data['features'], test_data['outputs']
        pct_sum = sum(ytest) / len(ytest)
        test_preds = classify_nb(xtest, pct_sum, gnb)
        test_accuracies.append(metrics.accuracy_score(ytest, test_preds))
        summary_path = os.path.join(NB_DIR, 'test_summary', file[:-5] + '.sum')
        if not os.path.exists(summary_path):
            # Gets text corresponding to summary classifications
            with open(os.path.join(PARSED_DIR, 'merged', file[:-5] + '.mgd'), 'r') as mgd:
                mgd_text = mgd.read().replace('\n', '. ').replace('..', '.')
            mgd_sents = tokenizer.tokenize(mgd_text)
            summary = ''
            for i in range(len(test_preds)):
                if test_preds[i] == 1:
                    summary += mgd_sents[i] + ' '
            # Writes to file
            with open(summary_path, 'w') as s:
                s.write(summary.strip() + '\n')
    avg_pct_acc = round(sum(test_accuracies) / len(test_accuracies) * 100, 1)
    print('Model Average Test Accuracy: {}%'.format(avg_pct_acc))
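    # ROUGE-1 and ROUGE-2 measure unigram and bigram overlap with the reference
    # abstract; ROUGE-L is based on the longest common subsequence. Each metric
    # is tracked as precision ('p'), recall ('r'), and F1 ('f').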
    rouge_path = os.path.join(NB_DIR, 'ROUGE.txt')
    if not os.path.exists(rouge_path):
        # Initializes running totals for each ROUGE metric
        rouge_scores = {'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                        'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                        'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}
        rouge_scores_random = {'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                               'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                               'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}
        sum_files = os.listdir(os.path.join(NB_DIR, 'test_summary'))
        files_rouge = FilesRouge()
        for sum_file in sum_files:
            sum_path = os.path.join(NB_DIR, 'test_summary', sum_file)
            tgt_path = os.path.join(PARSED_DIR, 'abstract', sum_file[:-4] + '.tgt')
            rand_sum_path = os.path.join(PARSED_DIR, 'random_summary', sum_file[:-4] + '.sum')
            # Calculates ROUGE scores of predicted and random summaries
            scores = files_rouge.get_scores(sum_path, tgt_path, avg=True)
            # Compares against a random summary of the same size
            random_scores = files_rouge.get_scores(rand_sum_path, tgt_path, avg=True)
            for i in rouge_scores.keys():
                for j in rouge_scores['rouge-1'].keys():
                    rouge_scores[i][j] += scores[i][j]
                    rouge_scores_random[i][j] += random_scores[i][j]
        # Calculates average ROUGE scores
        for i in rouge_scores.keys():
            for j in rouge_scores['rouge-1'].keys():
                rouge_scores[i][j] /= len(sum_files)
                rouge_scores_random[i][j] /= len(sum_files)
        with open(rouge_path, 'w+') as scores:
            scores.write('\t' * 4)
            scores.write('Model')
            scores.write('\t' * 2)
            scores.write('Random Guessing\n')
            for i in range(1, 4):
                rouge_text = 'ROUGE-' + str(i) + ' Average ' if i < 3 else 'ROUGE-L Average '
                rouge_key = 'rouge-' + str(i) if i < 3 else 'rouge-l'
                scores.write(rouge_text + 'Precision:\t' + str(round(rouge_scores[rouge_key]['p'], 3)) + '\t\t')
                scores.write(str(round(rouge_scores_random[rouge_key]['p'], 3)) + '\n')
                scores.write(rouge_text + 'Recall:\t\t' + str(round(rouge_scores[rouge_key]['r'], 3)) + '\t\t')
                scores.write(str(round(rouge_scores_random[rouge_key]['r'], 3)) + '\n')
                scores.write(rouge_text + 'F1 Score:\t' + str(round(rouge_scores[rouge_key]['f'], 3)) + '\t\t')
                scores.write(str(round(rouge_scores_random[rouge_key]['f'], 3)) + '\n')
                if i < 3:
                    scores.write('-' * 20 + '\n')
            scores.seek(0)
            print('\n' + scores.read())
    else:
        with open(rouge_path, 'r') as scores:
            print('\n' + scores.read())
    if os.path.exists('__pycache__'):
        shutil.rmtree('__pycache__')


if __name__ == '__main__':
    sys.setrecursionlimit(3000)
    pubmed_naive_bayes()
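    # Example of a non-interactive run (argument values here are illustrative):
    # pubmed_naive_bayes(body_type='w', n_train=200)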