lstm_predict / Git / [c11b6d] /d2v.py

Models:

philipB/

lstm_predict

Downloads: 1

[c11b6d]: / d2v.py

History

Download this file

53 lines (41 with data), 1.4 kB

# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 08:51:35 2019

@author: aaq109
For queries Contact: awais.ashfaq@hh.se
"""
from __future__ import print_function
from gensim.models import doc2vec
from collections import namedtuple
import scipy.io as sio


# Input CSV. The first line is discarded as a header; for every other row the
# first comma-separated field is used as the admission id and the second as
# the diagnosis code.
label = 'sampledata_d2v.csv'

# Maps admission id -> list of diagnosis codes, in the order they appear in
# the file.
admDiagMap = {}

# `with` releases the file handle even if a row fails to parse.
with open(label, 'r') as infd:
    infd.readline()  # skip the header row
    for line in infd:
        line = line.strip()
        if not line:
            # Blank line (e.g. a trailing newline at end of file): there is
            # no second column, so indexing it would raise IndexError.
            continue
        tokens = line.split(',')
        admId = tokens[0]
        # Diagnosis codes may be wrapped in double quotes; drop the quotes.
        diagId = tokens[1].replace('"', '')
        admDiagMap.setdefault(admId, []).append(diagId)

# One list of diagnosis codes per admission. The order is whatever
# admDiagMap.values() yields (insertion order on Python 3.7+).
s1 = list(admDiagMap.values())

# Minimal record type carrying the `words` and `tags` attributes of each
# training document.
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')

# Each admission is tagged with its position in s1, so the tag of a document
# is also its index into s1.
docs = [
    analyzedDocument(codes, [position])
    for position, codes in enumerate(s1)
]

# Doc2Vec hyperparameters.
Emb_size = 185  # size of the visit vector K
window = 121  # Max length of codes in any visit
min_count = 0  # Consider all codes
ns = 20  # Negative sampling (passed to Doc2Vec as `negative`)
ns_exponent = -0.75  # Negative because we like to account for rare clinical events
dm = 0  # for PV-DBOW

# Train the model on the tagged documents.
# `vector_size` is the supported keyword for the embedding dimension; the
# older `size` spelling is only a deprecated alias in gensim 3.x and is
# rejected by gensim 4.
model = doc2vec.Doc2Vec(
    docs,
    vector_size=Emb_size,
    window=window,
    min_count=min_count,
    workers=4,
    negative=ns,
    ns_exponent=ns_exponent,
    dm=dm,
)

# Get the document vectors and save them.
# `docvecs.doctag_syn0` is a deprecated alias that newer gensim releases no
# longer provide, so read the array from whichever attribute this gensim
# version exposes: `model.dv.vectors` (gensim 4+) or
# `model.docvecs.vectors_docs` (gensim 3.x).
if hasattr(model, 'dv'):
    d2v = model.dv.vectors
else:
    d2v = model.docvecs.vectors_docs

# Stored in the .mat file under the variable name 'd2v_185'.
sio.savemat('d2v_185.mat', {'d2v_185': d2v})