from __future__ import print_function
from functools import reduce
import array
import io
import re
import tarfile
import pickle
import numpy as np
import h5py
from keras.preprocessing.sequence import pad_sequences
from glove import Glove
def check_repeated(name, repeated_list):
    '''Normalize a name and, if it appears in the repeated_list mapping,
    return its canonical form instead.'''
    name = name.lower().strip()
    return name if name not in repeated_list else repeated_list[name]
def process_title(word):
    '''Replace non-word characters with spaces, then trim and lowercase.'''
    return re.sub(r'\W+', ' ', word).strip().lower()
def tokenize(sent):
    '''Return the lowercased tokens of a sentence, including punctuation.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['bob', 'dropped', 'the', 'apple', '.', 'where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip().lower() for x in re.split(r'(\W+)', sent) if x.strip()]
def parse_stories(lines, only_supporting=False, repeated_list=None):
    '''Parse stories from the raw lines of a dataset file.

    If only_supporting is true, only the sentences that support the answer are kept.
    '''
data = []
story = []
for line in lines:
line = line.decode('utf-8').strip()
spl = line.split(' ', 1)
if len(spl) > 1:
nid, line = spl
else:
continue
        try:
            nid = int(nid)
        except ValueError:
            # Skip lines whose first token is not a numeric sentence ID
            continue
if nid == 0:
story = []
if '\t' in line:
supporting, a = line.split('\t')
            a = list(map(process_title, a.split(',')))
            # The first entry is the answer; any remaining entries are distractor options
            options = [] if len(a) == 1 else list(set(a[1:]))
            a = a[0]
substory = None
# Provide all the substories
if supporting:
story.append([tokenize(supporting) + [u'.']])
substory = [x for x in story if x]
            # TODO: the lowercasing should have been done in earlier processing steps
            if not substory:
                continue
            data.append((substory, a.lower(), [x.lower() for x in options]))
else:
sent = tokenize(line)
story.append([sent + [u'.']])
return data
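
# A minimal sketch of the line format this parser expects (inferred from the
# logic above; actual dataset files may differ):
#
#   0 first sentence of a new story
#   1 a supporting sentence<TAB>answer,option_b,option_c
#
# A line whose ID is 0 starts a new story.  A line containing a tab appends
# its left-hand text as a supporting sentence and takes the first
# comma-separated entry on the right as the answer, with any remaining
# entries kept as distractor options.
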
def get_stories(f, only_supporting=False, max_length=None, repeated_list=None, min_length=None):
    '''Given a file handle, read the file, retrieve the stories, and then
    convert the sentences into a single story.

    If max_length is supplied, any stories with more than max_length facts are discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting, repeated_list=repeated_list)
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [[flatten(reversed(story)), answer, options]
            for story, answer, options in data
            if not max_length or len(flatten(story)) < max_length]
    print(len(data))
    # Optionally keep only stories with more than min_length facts
    if min_length:
        data = [x for x in data if len(x[0]) > min_length]
        print(len(data))
return data
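
# A minimal usage sketch (the file name is an assumption; the file must be
# opened in binary mode because parse_stories decodes each line from UTF-8):
#
#   with open('dataset/train.txt', 'rb') as f:
#       train_data = get_stories(f, max_length=500, min_length=2)
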
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    '''Convert (story, query, answer) triples into padded index sequences
    and one-hot answer vectors.'''
X = []
Xq = []
Y = []
for story, query, answer in data:
x = [word_idx[w] for w in story]
xq = [word_idx[w] for w in query]
y = np.zeros(len(word_idx)) # let's not forget that index 0 is reserved
y[word_idx[answer]] = 1
X.append(x)
Xq.append(xq)
Y.append(y)
return (pad_sequences(X, maxlen=story_maxlen),
pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))
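
# A minimal sketch of driving vectorize_stories (vocab and triples are
# illustrative names, not defined in this module):
#
#   word_idx = dict((w, i + 1) for i, w in enumerate(vocab))  # 0 is the padding index
#   X, Xq, Y = vectorize_stories(triples, word_idx, story_maxlen, query_maxlen)
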
def get_spacy_vectors(data, answer_dict, story_maxlen, model):
    '''Embed each story with a spaCy model and one-hot encode the answers.'''
    X = []
    Y = []
    for story, answer in data:
        # Truncate overly long stories to story_maxlen tokens
        story = story[:story_maxlen]
        x = [model(w).vector for w in story]
        X.append(x)
        if answer_dict is not None:
            y = np.zeros(len(answer_dict))
            y[answer_dict[answer]] = 1
            Y.append(y)
    return (pad_sequences(X, maxlen=story_maxlen, dtype='float32'),
            np.array(Y))
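
# A minimal usage sketch (the model name is an assumption; any spaCy model
# that ships word vectors would work):
#
#   import spacy
#   nlp = spacy.load('en_core_web_md')
#   inputs, answers = get_spacy_vectors(pairs, answer_dict, 500, nlp)
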
def get_word_vectors(data, answer_dict, story_maxlen, model):
    '''Embed each story with a glove_python Glove model, skipping words
    missing from its dictionary, and one-hot encode the answers.'''
    X = []
    Y = []
    for story, answer in data:
        # Truncate overly long stories to story_maxlen tokens
        story = story[:story_maxlen]
        x = [model.word_vectors[model.dictionary[w]] for w in story
             if w in model.dictionary]
        X.append(x)
        if answer_dict is not None:
            y = np.zeros(len(answer_dict))
            y[answer_dict[answer]] = 1
            Y.append(y)
    return (pad_sequences(X, maxlen=story_maxlen, dtype='float32'),
            np.array(Y))
def create_vectors_dataset(input_files, vector_files, max_len=500):
    print('Creating word vectors file')
    training_set_file, test_set_file = input_files
    train_word_file, test_word_file = vector_files
    # Pickled files must be opened in binary mode
    with open(training_set_file, 'rb') as f:
        train_stories = pickle.load(f)
    with open(test_set_file, 'rb') as f:
        test_stories = pickle.load(f)
    # Flatten each story's facts into a single token sequence
    train_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q) for fact, q in train_stories]
    test_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q) for fact, q in test_stories]
vocab = sorted(reduce(lambda x, y: x | y, (set(story + [answer]) for story, answer in train_stories + test_stories)))
# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
story_maxlen = max(map(len, (x for x, _ in train_stories + test_stories)))
print('-')
print('Vocab size:', vocab_size, 'unique words')
print('Story max length:', story_maxlen, 'words')
print('Number of training stories:', len(train_stories))
print('Number of test stories:', len(test_stories))
print('-')
    print('Here\'s what a "story" tuple looks like (input, answer):')
print(train_stories[0])
print('-')
print('Vectorizing the word sequences...')
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
    answer_vocab = sorted(set(answer for _, answer in train_stories + test_stories))
    # Answers become one-hot targets, so index 0 does not need to be reserved here
    answer_dict = dict((word, i) for i, word in enumerate(answer_vocab))
print('Answers dict len: {0}'.format(len(answer_dict)))
    # TODO: check that this word-vectors file exists before loading it
word_vectors_dir = 'word_vectors/glove.42B.300d.txt'
word_vectors_model = Glove.load_stanford(word_vectors_dir)
inputs_train, answers_train = get_word_vectors(train_stories, answer_dict,
max_len, word_vectors_model)
inputs_test, answers_test = get_word_vectors(test_stories, answer_dict, max_len,
word_vectors_model)
with h5py.File(train_word_file,'w') as train_f:
_ = train_f.create_dataset('inputs',data=inputs_train)
_ = train_f.create_dataset('answers',data=answers_train)
with h5py.File(test_word_file,'w') as test_f:
_ = test_f.create_dataset('inputs',data=inputs_test)
_ = test_f.create_dataset('answers',data=answers_test)
return (inputs_train, answers_train),(inputs_test, answers_test)
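
# A minimal sketch of reading the saved datasets back (dataset names match
# those written above):
#
#   with h5py.File(train_word_file, 'r') as f:
#       inputs_train = f['inputs'][:]
#       answers_train = f['answers'][:]
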
def save_vectors_dict(input_files):
    # TODO: check that this word-vectors file exists before loading it
filename = 'word_vectors/glove.42B.300d.txt'
word_vectors_dict = 'word_vectors/glove_dict.hdf5'
dct = {}
vectors = array.array('d')
    # Read the GloVe text data: one word followed by its vector components per line
    with io.open(filename, 'r', encoding='utf-8') as infile:
        for i, line in enumerate(infile):
tokens = line.split(' ')
word = tokens[0]
entries = tokens[1:]
dct[word] = i
vectors.extend(float(x) for x in entries)
    print('Saving to hdf5 file')
    with h5py.File(word_vectors_dict, 'w') as vector_f:
        # h5py cannot store a Python dict directly: save the embedding matrix
        # and the word list (ordered by index) as separate datasets
        dims = len(vectors) // len(dct)
        _ = vector_f.create_dataset('vectors',
                                    data=np.asarray(vectors).reshape(len(dct), dims))
        words = sorted(dct, key=dct.get)
        _ = vector_f.create_dataset('words',
                                    data=np.array(words, dtype=h5py.special_dtype(vlen=str)))
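
# A minimal sketch of loading the saved vectors back (dataset names match
# those written above):
#
#   with h5py.File('word_vectors/glove_dict.hdf5', 'r') as f:
#       vectors = f['vectors'][:]
#       dct = dict((w, i) for i, w in enumerate(f['words'][:]))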