EHRKit-2022 / Git / [2d4573] /mimic_icd9_coding/utils/feature

Models:
philipB/
EHRKit-2022
Downloads: 1
[2d4573]: / mimic_icd9_coding / utils / feature_extractor.py
History
Download this file
96 lines (78 with data), 3.4 kB

from __future__ import unicode_literals
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import unicodedata
import pandas as pd
import numpy as np
import string
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Module-level English Snowball stemmer instance.
stemmer = SnowballStemmer('english')

# NLTK's English stop-word list extended with a few extra words that this
# module also treats as stop words.
stpwrds = set(stopwords.words('english')) | {
    'admission', 'birth', 'date', 'discharge', 'service', 'sex',
}

# Every ASCII punctuation character except '-', plus a few extra quote and
# ellipsis tokens.
punct = (set(string.punctuation) - {'-'}) | {"``", "`", "..."}

def clean_text_simple(text, my_stopwords=stpwrds, punct=punct, remove_stopwords=True, stemming=False):
    """Lower-case ``text``, drop punctuation and return its filtered tokens.

    Parameters
    ----------
    text : str
        Raw text to clean.
    my_stopwords : collection of str
        Tokens removed from the result when ``remove_stopwords`` is true.
    punct : collection of str
        Characters deleted from ``text`` before tokenizing. Membership is
        tested one character at a time, so multi-character entries never
        match anything.
    remove_stopwords : bool
        Whether to drop tokens found in ``my_stopwords``.
    stemming : bool
        Whether to stem the surviving tokens with an English Snowball
        stemmer.

    Returns
    -------
    list of str
        Whitespace-separated tokens that are purely alphabetic and longer
        than two characters, optionally stop-word filtered and stemmed.
        Tokens that still contain a dash (or a digit) fail the alphabetic
        check and are therefore dropped.
    """
    lowered = text.lower()
    # Character-by-character punctuation removal.
    stripped = ''.join(ch for ch in lowered if ch not in punct)

    # A bare split() already collapses runs of whitespace and ignores
    # leading/trailing whitespace, so no separate squeeze/strip pass is needed.
    words = [
        word for word in stripped.split()
        if word.isalpha() and len(word) > 2
    ]

    if remove_stopwords:
        words = [word for word in words if word not in my_stopwords]

    if not stemming:
        return words

    snowball = SnowballStemmer('english')
    return [snowball.stem(word) for word in words]



def document_preprocessor(doc):
    """Return ``doc`` as plain ASCII text with accents stripped.

    The text is decomposed with Unicode NFD normalization, so an accented
    letter becomes a base letter followed by combining marks; every
    non-ASCII code point (including those marks) is then discarded.

    Parameters
    ----------
    doc : str or bytes
        Document text. ``bytes`` input is decoded as UTF-8 first.

    Returns
    -------
    str
        The ASCII-only version of ``doc``.

    Raises
    ------
    UnicodeDecodeError
        If ``doc`` is ``bytes`` that are not valid UTF-8.
    """
    # Explicit bytes check instead of probing for the Python 2 ``unicode``
    # builtin: byte strings are decoded, text passes through unchanged.
    if isinstance(doc, bytes):
        doc = doc.decode('utf-8')
    decomposed = unicodedata.normalize('NFD', doc)
    # encode(..., 'ignore') silently drops everything outside ASCII; the
    # remaining bytes are pure ASCII, so decoding them back cannot fail.
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    return str(ascii_only)

class FeatureExtractor(TfidfVectorizer):
    """Convert a collection of raw docs to a matrix of TF-IDF features.

    Each document is tokenized with ``clean_text_simple``, restricted to the
    tokens counted at least ``min_occur`` times during ``fit``, re-joined with
    single spaces and passed to an inner ``TfidfVectorizer`` (``self.tfidf``).

    NOTE: the class also subclasses ``TfidfVectorizer`` and configures the
    parent in ``__init__``, but ``fit``, ``fit_transform`` and ``transform``
    below only ever use ``self.tfidf``.
    """

    def __init__(self):
        # Minimum corpus-wide count a token needs in order to be kept.
        self.min_occur = 1
        # Vectorizer that actually learns and produces the TF-IDF features.
        self.tfidf = TfidfVectorizer(ngram_range=(1, 1))
        # Token -> count over the documents seen by the last call to fit().
        self.vocab = Counter()
        super(FeatureExtractor, self).__init__(
            analyzer='word', stop_words='english',
            preprocessor=document_preprocessor)

    @staticmethod
    def _tokenize(X_df):
        """Return a Series holding one token list per document in ``X_df``."""
        # Accept both input shapes the original methods used: a DataFrame
        # with a ``TEXT`` column, or a 1-D collection of strings.
        texts = X_df.TEXT if isinstance(X_df, pd.DataFrame) else X_df
        return pd.Series(texts).apply(clean_text_simple)

    def _join_kept_tokens(self, token_lists):
        """Return one space-joined string per document, keeping frequent tokens."""
        # A set makes each membership test O(1) instead of scanning a list
        # of the whole vocabulary for every token.
        kept = {token for token, count in self.vocab.items()
                if count >= self.min_occur}
        return [' '.join(word for word in words if word in kept)
                for words in token_lists]

    def fit(self, X_df, y=None):
        """Learn a vocabulary dictionary of all tokens in the raw documents.

        Parameters
        ----------
        X_df : pandas.DataFrame or 1-D collection of str
            Either a DataFrame, where the text data is stored in the
            ``TEXT`` column, or the documents themselves.
        y : ignored
            Accepted but not used.

        Returns
        -------
        FeatureExtractor
            ``self``, after rebuilding ``self.vocab`` and fitting
            ``self.tfidf``.
        """
        token_lists = self._tokenize(X_df)
        # Rebuild the counts from scratch so repeated fits do not accumulate.
        self.vocab = Counter()
        for words in token_lists:
            self.vocab.update(words)
        self.tfidf.fit(self._join_kept_tokens(token_lists))
        return self

    def fit_transform(self, X_df, y=None):
        """Fit on ``X_df`` and return its TF-IDF matrix.

        Takes the same input as ``fit`` and returns what ``transform``
        returns for that input.
        """
        return self.fit(X_df, y).transform(X_df)

    def transform(self, X_df):
        """Return the TF-IDF features of the documents in ``X_df``.

        Parameters
        ----------
        X_df : pandas.DataFrame or 1-D collection of str
            Same input forms as ``fit``.

        Returns
        -------
        The result of ``self.tfidf.transform`` on the cleaned documents.
        Tokens that did not reach ``min_occur`` in the fitted vocabulary
        are removed before vectorizing.
        """
        token_lists = self._tokenize(X_df)
        return self.tfidf.transform(self._join_kept_tokens(token_lists))