[c09aa8]: / clusters / scripts / model.py

from __future__ import print_function
import os
import csv
import numpy as np
np.random.seed(1337)  # fix the random seed so shuffles and weight initialisation are reproducible
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
# Keras layers and Model API for the 1D convnet
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
def loadGloveEmbeddings():
    # Load GloVe, a pre-trained mapping from words to 100-dimensional vectors.
    # Returns a dictionary: word -> embedding vector.
    print('Indexing word vectors.')
    BASE_DIR = '/media/hdd0/unraiddisk1/student/newsgroup'  # directory containing the GloVe download
    GLOVE_DIR = BASE_DIR + '/glove.6B/'
    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]  # first token on each line is the word itself
            coefs = np.asarray(values[1:], dtype='float32')  # remaining tokens are its vector
            embeddings_index[word] = coefs
    return embeddings_index
embeddings_index = loadGloveEmbeddings()
print('Found %s word vectors.' % len(embeddings_index))
# embeddings_index maps each word to its embedding, e.g. 'cat' => array of 100 floats
def loadtrain():
    # Read the training CSV: column 6 holds the review body, column 11 the sentiment label.
    data = []
    labels = []
    with open("merged2.csv") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        for line in csvreader:
            review = line[6]
            if review != "body":  # skip the header row
                sentiment = line[11]
                labels.append(1 if sentiment == '1' else 2 if sentiment == '2' else 0)
                data.append(review)
    y = to_categorical(labels)  # one-hot encode the three classes
    return (data, y)
(train,y) = loadtrain()
def loadtest():
    # Read the tab-separated test set: column 0 is the id, column 1 the review text.
    data = []
    ids = []
    with open("testData.tsv") as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t")
        for line in tsvreader:
            id = line[0]
            if id != 'id':  # skip the header row
                review = line[1]
                data.append(review)
                ids.append(id)
    return (data, ids)
(test_text,test_ids) = loadtest()
corpi = [train, test_text]  # tokenize and pad the training and test corpora the same way
def create_embedding_matrix(EMBEDDING_DIM, MAX_NB_WORDS, word_index):
    print('Preparing embedding matrix.')
    # prepare embedding matrix: row i holds the GloVe vector for word index i
    nb_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > MAX_NB_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in the embedding index stay all-zeros
            embedding_matrix[i] = embedding_vector
    return (nb_words, embedding_matrix)
MAX_SEQUENCE_LENGTH = 1000
def create_tokenizer_and_embedding(MAX_SEQUENCE_LENGTH, train):
    MAX_NB_WORDS = 5000  # vocabulary cap for the tokenizer
    EMBEDDING_DIM = 100
    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(train)
    (nb_words, embedding_matrix) = create_embedding_matrix(EMBEDDING_DIM, MAX_NB_WORDS, tokenizer.word_index)
    # load the pre-trained word embeddings into an Embedding layer;
    # trainable=False keeps the GloVe vectors fixed during training
    embedding_layer = Embedding(nb_words + 1, EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return (tokenizer, embedding_layer)
(tokenizer, embedding_layer) = create_tokenizer_and_embedding(MAX_SEQUENCE_LENGTH, corpi[0])
def create_sequences(MAX_SEQUENCE_LENGTH, tokenizer, corpi):
    # Convert each corpus to integer sequences and pad/truncate them to a fixed length.
    padded_sequences = []
    for corpus in corpi:
        corpi_sequence = tokenizer.texts_to_sequences(corpus)
        padded_sequences.append(pad_sequences(corpi_sequence, maxlen=MAX_SEQUENCE_LENGTH))
    return padded_sequences
padded_sequences = create_sequences(MAX_SEQUENCE_LENGTH, tokenizer, corpi)
data = padded_sequences[0]
VALIDATION_SPLIT = 0.3  # hold out 30% of the labelled data for validation (70% train / 30% validation)
# shuffle the rows so the validation split is random
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = y[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# split the shuffled data and labels into training and validation sets
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
x_test = padded_sequences[1]  # padded test reviews from testData.tsv
print('Training model.')
# 1D convnet: three Conv1D blocks with max pooling, then dense layers and a 3-way softmax
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(2)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(2)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = Dropout(0.2)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(3, activation='softmax')(x)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['acc'])
# happy learning!
model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=15, batch_size=128)
from sklearn.metrics import confusion_matrix
# confusion matrix on the training set; argmax converts one-hot rows and softmax outputs to class indices
y_pred = model.predict(x_train)
cnf_matrix = confusion_matrix(np.argmax(y_train, axis=1), np.argmax(y_pred, axis=1))
print(cnf_matrix)
# predict on the unlabelled test set (predict instead of fit for the small test sample)
test_predictions = model.predict(x_test)
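# The ids loaded by loadtest() are not used above. A minimal sketch (not part of the
# original script) of how the softmax outputs could be written out next to their ids,
# assuming a hypothetical output file name "predictions.csv":
with open("predictions.csv", "w") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["id", "predicted_class"])
    for row_id, probs in zip(test_ids, test_predictions):
        writer.writerow([row_id, int(np.argmax(probs))])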
# save the trained weights and the model architecture for later reuse
model.save_weights("mymodel.h5")
model_json = model.to_json()
with open("mymodel.json", "w") as json_file:
    json_file.write(model_json)
import pickle
# save the fitted tokenizer so new text can be encoded the same way at prediction time
with open("tokenizer.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle)
#test_sequences = create_sequences(MAX_SEQUENCE_LENGTH, tokenizer, [test_text])
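
# A minimal sketch (not part of the original script) of reloading the saved artifacts
# for inference. It assumes the same Keras version, reuses MAX_SEQUENCE_LENGTH from
# above, and uses a hypothetical example review string.
from keras.models import model_from_json
with open("mymodel.json") as json_file:
    reloaded = model_from_json(json_file.read())
reloaded.load_weights("mymodel.h5")
with open("tokenizer.pickle", "rb") as handle:
    reloaded_tokenizer = pickle.load(handle)
example_reviews = ["this is a hypothetical example review"]
example_seqs = pad_sequences(reloaded_tokenizer.texts_to_sequences(example_reviews),
                             maxlen=MAX_SEQUENCE_LENGTH)
print(reloaded.predict(example_seqs))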