|
a |
|
b/model.py |
|
|
1 |
from simpletransformers.classification import (ClassificationArgs, ClassificationModel) |
|
|
2 |
import pandas as pd |
|
|
3 |
import pickle |
|
|
4 |
|
|
|
5 |
# ### Read data from files |
|
|
6 |
|
|
|
7 |
# In[ ]: |
|
|
8 |
|
|
|
9 |
|
|
|
10 |
# Load the labelled training reports.
# NOTE(review): absolute, machine-specific path — the script only runs where
# this exact file exists.
train_df = pd.read_csv('/Users/aakansha/Desktop/NCCS NLP for Histology Reports/Datasets for Trials/train.csv')

# Merge the three free-text sections into a single model input.
#  * A missing section is treated as empty; with plain `+`, one NaN section
#    turned the whole combined text for that row into NaN.
#  * Sections are joined with a single space so the last word of one section
#    is not fused with the first word of the next.
# NOTE(review): 'Microscgopic Description' looks like a misspelling of
# 'Microscopic Description'; it is left untouched because it has to match the
# CSV header exactly — confirm against the file.
train_df['Combined Diagnosis'] = [
    ' '.join(part for part in sections if part)
    for sections in zip(
        train_df['Diagnosis'].fillna('').astype(str),
        train_df['Gross Description'].fillna('').astype(str),
        train_df['Microscgopic Description'].fillna('').astype(str),
    )
]

# Keep only the combined text and the target label, in that order.
train_df = train_df[['Combined Diagnosis', 'Cancerous?']]
|
|
14 |
|
|
|
15 |
# ### Pre-process data |
|
|
16 |
|
|
|
17 |
# In[ ]: |
|
|
18 |
|
|
|
19 |
|
|
|
20 |
# Normalise the target column: upper-case the values, then trim surrounding
# whitespace, discard every row whose label is missing and renumber the index
# from zero.
train_df = (
    train_df
    .assign(**{'Cancerous?': train_df['Cancerous?'].str.upper().str.strip()})
    .dropna(subset=['Cancerous?'])
    .reset_index(drop=True)
)
|
|
27 |
|
|
|
28 |
# ## Roberta-Large |
|
|
29 |
|
|
|
30 |
# In[ ]: |
|
|
31 |
|
|
|
32 |
# Build and train one "roberta-large" classifier per epoch count in the list
# (currently a single run with n = 1).
for n in [1]:

    # Label for this run; not referenced by the statements below.
    curr_epoch = "Epoch" + str(n)

    # Configure model args
    model_args = ClassificationArgs(num_train_epochs=n)

    # Evaluation / checkpointing switches.
    model_args.evaluate_during_training_steps = -1
    model_args.save_eval_checkpoints = False
    model_args.save_model_every_epoch = False

    # Optimisation settings; the fixed seed is presumably for reproducible
    # runs — confirm against the library documentation.
    model_args.learning_rate = 1e-5
    model_args.manual_seed = 4
    model_args.train_batch_size = 16
    model_args.gradient_accumulation_steps = 2

    # Input processing / caching settings.
    model_args.multiprocessing_chunksize = 5000
    model_args.no_cache = True
    model_args.reprocess_input_data = True
    model_args.use_multiprocessing = True

    model_args.overwrite_output_dir = True

    # String labels; presumably these must match the values of the
    # 'Cancerous?' column after it has been upper-cased — verify.
    model_args.labels_list = ['YES', 'NO']

    # model
    model_type = "roberta"
    model_name = "roberta-large"

    # Create Transformer Model (two labels, CPU only: use_cuda=False).
    # NOTE(review): the model is constructed before the __main__ guard below,
    # i.e. also when this module is merely imported — confirm that is intended.
    model = ClassificationModel(model_type, model_name, num_labels=2, use_cuda=False, args=model_args)

    # Training and saving only happen when the file is executed as a script.
    if __name__ == '__main__':
        # Train the model
        model.train_model(train_df)

        # Persist the model object. The context manager guarantees the file
        # handle is flushed and closed, even if pickling raises; the bare
        # open() call used before was never closed explicitly.
        with open('model.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)
|
|
63 |
|
|
|
64 |
|