[c09aa8]: / clusters / scripts / model.py

from __future__ import print_function
import os
import csv
import numpy as np
np.random.seed(1337)  # fix the random seed so shuffles and weight initialisation are reproducible
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
# Keras layers and Model API for the 1D convnet
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
def loadGloveEmbeddings():
    # Load GloVe, a pre-trained mapping from words to 100-dimensional vectors.
    # Returns a dictionary: word -> embedding vector.
    print('Indexing word vectors.')
    BASE_DIR = '/media/hdd0/unraiddisk1/student/newsgroup'  # directory containing the GloVe download
    GLOVE_DIR = BASE_DIR + '/glove.6B/'
    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]  # first token on each line is the word itself
            coefs = np.asarray(values[1:], dtype='float32')  # remaining tokens are its vector
            embeddings_index[word] = coefs
    return embeddings_index
embeddings_index = loadGloveEmbeddings()
print('Found %s word vectors.' % len(embeddings_index))
# embeddings_index maps each word to its embedding, e.g. 'cat' => array of 100 floats
def loadtrain():
    # Read the training CSV: column 6 holds the review body, column 11 the sentiment label.
    data = []
    labels = []
    with open("merged2.csv") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        for line in csvreader:
            review = line[6]
            if review != "body":  # skip the header row
                sentiment = line[11]
                labels.append(1 if sentiment == '1' else 2 if sentiment == '2' else 0)
                data.append(review)
    y = to_categorical(labels)  # one-hot encode the three classes
    return (data, y)
(train,y) = loadtrain()
def loadtest():
    # Read the tab-separated test set: column 0 is the id, column 1 the review text.
    data = []
    ids = []
    with open("testData.tsv") as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t")
        for line in tsvreader:
            id = line[0]
            if id != 'id':  # skip the header row
                review = line[1]
                data.append(review)
                ids.append(id)
    return (data, ids)
(test_text,test_ids) = loadtest()
corpi = [train, test_text]  # tokenize and pad the training and test corpora the same way
def create_embedding_matrix(EMBEDDING_DIM, MAX_NB_WORDS, word_index):
    print('Preparing embedding matrix.')
    # prepare embedding matrix: row i holds the GloVe vector for word index i
    nb_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > MAX_NB_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in the embedding index stay all-zeros
            embedding_matrix[i] = embedding_vector
    return (nb_words, embedding_matrix)
MAX_SEQUENCE_LENGTH = 1000
def create_tokenizer_and_embedding(MAX_SEQUENCE_LENGTH, train):
    MAX_NB_WORDS = 5000  # vocabulary cap for the tokenizer
    EMBEDDING_DIM = 100
    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(train)
    (nb_words, embedding_matrix) = create_embedding_matrix(EMBEDDING_DIM, MAX_NB_WORDS, tokenizer.word_index)
    # load the pre-trained word embeddings into an Embedding layer;
    # trainable=False keeps the GloVe vectors fixed during training
    embedding_layer = Embedding(nb_words + 1, EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return (tokenizer, embedding_layer)
(tokenizer, embedding_layer) = create_tokenizer_and_embedding(MAX_SEQUENCE_LENGTH, corpi[0])
def create_sequences(MAX_SEQUENCE_LENGTH, tokenizer, corpi):
    # Convert each corpus to integer sequences and pad/truncate them to a fixed length.
    padded_sequences = []
    for corpus in corpi:
        corpi_sequence = tokenizer.texts_to_sequences(corpus)
        padded_sequences.append(pad_sequences(corpi_sequence, maxlen=MAX_SEQUENCE_LENGTH))
    return padded_sequences
padded_sequences = create_sequences(MAX_SEQUENCE_LENGTH, tokenizer, corpi)
data = padded_sequences[0]
VALIDATION_SPLIT = 0.3  # hold out 30% of the labelled data for validation (70% train / 30% validation)
# shuffle the rows so the validation split is random
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = y[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# split the shuffled data and labels into training and validation sets
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
x_test = padded_sequences[1]  # padded test reviews from testData.tsv
print('Training model.')
# 1D convnet: three Conv1D blocks with max pooling, then dense layers and a 3-way softmax
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(2)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(2)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = Dropout(0.2)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(3, activation='softmax')(x)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['acc'])
# happy learning!
model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=15, batch_size=128)
from sklearn.metrics import confusion_matrix
# confusion matrix on the training set; argmax converts one-hot rows and softmax outputs to class indices
y_pred = model.predict(x_train)
cnf_matrix = confusion_matrix(np.argmax(y_train, axis=1), np.argmax(y_pred, axis=1))
print(cnf_matrix)
# predict on the unlabelled test set (predict instead of fit for the small test sample)
test_predictions = model.predict(x_test)
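# The ids loaded by loadtest() are not used above. A minimal sketch (not part of the
# original script) of how the softmax outputs could be written out next to their ids,
# assuming a hypothetical output file name "predictions.csv":
with open("predictions.csv", "w") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["id", "predicted_class"])
    for row_id, probs in zip(test_ids, test_predictions):
        writer.writerow([row_id, int(np.argmax(probs))])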
# save the trained weights and the model architecture for later reuse
model.save_weights("mymodel.h5")
model_json = model.to_json()
with open("mymodel.json", "w") as json_file:
    json_file.write(model_json)
import pickle
# save the fitted tokenizer so new text can be encoded the same way at prediction time
with open("tokenizer.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle)
#test_sequences = create_sequences(MAX_SEQUENCE_LENGTH, tokenizer, [test_text])
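
# A minimal sketch (not part of the original script) of reloading the saved artifacts
# for inference. It assumes the same Keras version, reuses MAX_SEQUENCE_LENGTH from
# above, and uses a hypothetical example review string.
from keras.models import model_from_json
with open("mymodel.json") as json_file:
    reloaded = model_from_json(json_file.read())
reloaded.load_weights("mymodel.h5")
with open("tokenizer.pickle", "rb") as handle:
    reloaded_tokenizer = pickle.load(handle)
example_reviews = ["this is a hypothetical example review"]
example_seqs = pad_sequences(reloaded_tokenizer.texts_to_sequences(example_reviews),
                             maxlen=MAX_SEQUENCE_LENGTH)
print(reloaded.predict(example_seqs))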