<a href="https://colab.research.google.com/github/bahrad/PTAB/blob/master/PTAB_Model_Responses_github.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization

## Imports

In [None]:
%tensorflow_version 2.x

%xmode Context
# Verbose

import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os
import itertools

from collections import Counter, defaultdict
import random
from pandas import DataFrame
import datetime
from datetime import datetime
import dateutil
from dateutil.parser import parse as dateparse
from tqdm.notebook import tqdm
import time

import xgboost as xgb

import sklearn as sk
from sklearn.preprocessing import MultiLabelBinarizer, QuantileTransformer, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
import sklearn.metrics
from sklearn.metrics import accuracy_score,classification_report, make_scorer, balanced_accuracy_score, f1_score, coverage_error, roc_auc_score, confusion_matrix, plot_confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.utils import resample, shuffle
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import TSNE
from sklearn.utils import class_weight

from sklearn.feature_extraction.text import TfidfVectorizer

from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, CondensedNearestNeighbour, AllKNN
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import make_pipeline,Pipeline

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import string
import re
# import unicodedata

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

!pip install lime
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from lime.explanation import Explanation

In [None]:
# Colab-only helpers. Mounting Google Drive stays disabled in the public version.
from google.colab import drive, files
# drive.mount('/content/drive')

# Path prefix prepended to every data / weights file name used below
# (pd.read_csv, pickle.load, model.save_weights, model.load_weights).
# It must be defined, otherwise those cells fail with NameError.
# The value is used by plain string concatenation, so keep the trailing slash.
FILELOC = "DATA/"

In [None]:
# Detect a TPU runtime and expose a distribution strategy to the later cells.
# `tpu_env` records whether a TPU was found; `tpu_strategy` is bound in both
# branches so that `with tpu_strategy.scope():` also works on CPU/GPU runtimes.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.TPUStrategy(tpu)
    tpu_env=True
except ValueError:
    print('Not connected to a TPU runtime.')
    # No TPU: fall back to the current (default) strategy so the name exists.
    tpu_strategy = tf.distribute.get_strategy()
    tpu_env=False

# Functions

## Define Models

In [None]:
def EmbedNN(Params):
    """Build the embedding + multi-width Conv1D classifier with attention pooling.

    Keys read from ``Params``: 'text_length', 'pretrained_embeddings',
    'vocab_size', 'embedding_dim', 'embeddings' (pretrained branch only),
    'min_filter_size', 'max_filter_size', 'num_filters', 'kernel_L2_reg',
    'dropout_after_convs', 'num_dense', 'kernel_constraint',
    'dropout_after_Dense', 'ifMulticlass', and then either 'initial_bias'
    (single sigmoid output) or 'nclasses' (softmax output).

    For every filter size in [min_filter_size, max_filter_size] one Conv1D
    branch is created; its attention weights come from a Softmax layer named
    ``'attention_<filter_size>'`` so they can be fetched with ``get_layer``.

    Returns:
        An uncompiled ``keras.Model`` mapping the integer token input to the
        final Dense output.
    """
    layers = keras.layers

    token_input = keras.Input(shape=(Params['text_length'],))

    if Params['pretrained_embeddings']:
        # Frozen embedding initialised from the supplied weight matrix.
        embed_layer = layers.Embedding(
            Params['vocab_size'],
            Params['embedding_dim'],
            weights=[Params['embeddings']],
            input_length=Params['text_length'],
            mask_zero=True,
            trainable=False,
        )
    else:
        # Embedding learned from scratch.
        embed_layer = layers.Embedding(
            Params['vocab_size'],
            Params['embedding_dim'],
            mask_zero=True,
            trainable=True,
            name='embedding',
        )
    embedded = embed_layer(token_input)

    branch_outputs = []
    for filter_size in range(Params['min_filter_size'], Params['max_filter_size'] + 1):
        conv_out = layers.Conv1D(
            filters=Params['num_filters'],
            kernel_size=filter_size,
            kernel_regularizer=keras.regularizers.l2(Params['kernel_L2_reg']),
            activation='relu',
        )(embedded)
        # Per-position hidden representation of the convolution output.
        h = layers.TimeDistributed(
            layers.Dense(Params['num_filters'], activation='tanh')
        )(conv_out)
        # One score per position, normalised over positions by the softmax.
        scores = layers.TimeDistributed(layers.Dense(1, activation='tanh'))(h)
        scores = layers.Flatten()(scores)
        weights = layers.Softmax(axis=1,
                                 name='attention_'+str(filter_size))(scores)
        # Repeat the weights for every filter and swap axes to match `h`.
        weights = layers.RepeatVector(Params['num_filters'])(weights)
        weights = layers.Permute([2, 1])(weights)
        weighted = layers.multiply([h, weights])
        # Attention-weighted sum over the position axis.
        branch_outputs.append(tf.math.reduce_sum(weighted, axis = 1))

    merged = layers.concatenate(branch_outputs, axis=1)
    merged = layers.Dropout(Params['dropout_after_convs'])(merged)

    hidden = layers.Dense(Params['num_dense'],
                          kernel_constraint=Params['kernel_constraint'],
                          activation = 'relu')(merged)
    hidden = layers.Dropout(Params['dropout_after_Dense'])(hidden)

    if Params['ifMulticlass']:
        finalOut = layers.Dense(Params['nclasses'], activation='softmax')(hidden)
    else:
        finalOut = layers.Dense(
            1, activation='sigmoid',
            bias_initializer=tf.keras.initializers.Constant(Params['initial_bias']),
        )(hidden)

    # The model runs from the token input to the classification output.
    return keras.Model(token_input, finalOut)

# Define Parameters

In [None]:
# Notebook-wide configuration dictionary; the cells below fill it in.
Params = dict()

In [None]:
# --- Optimisation -----------------------------------------------------------
Params.update({
    'num_epochs': 50,
    'learning_rate': 1e-4,
    # Currently the same batch size with and without a TPU.
    'batch_size': 48 if tpu_env else 48,
    'embedding_dim': 128,
})

# --- CNN and dense head -------------------------------------------------------
Params.update({
    'min_filter_size': 2,
    'max_filter_size': 12,
    'num_filters': 256,
    'dropout_after_convs': 0.4,
    'num_dense': 256,
    'dropout_after_Dense': 0.4,
})

# --- Transformer+Attention model parameters -----------------------------------
Params.update({
    'embdim': 2000,
    'mask_zero': True,
    'numheads': 8,
    'ffdim': 64,
    'trans_drop': 0.4,
    'Nt': 1,
    'ifPreCNN': False,
})
if Params['ifPreCNN']:
    # Only set when the pre-CNN stage is switched on.
    Params.update({'W': 500, 'Nc': 1, 'Nl': 1})
Params.update({
    'num_dense_embed': 64,
    'dropout_after_Dense_embed': 0.0,
})

# --- Regularisation -----------------------------------------------------------
Params.update({
    'kernel_constraint': keras.constraints.max_norm(1.0),
    'kernel_L2_reg': 0.1,
    'bias_L2_reg': 0.1,
    'activity_L2_reg': 0.1,
})

# --- Task definition ----------------------------------------------------------
Params.update({
    'ifMulticlass': False,
    'nclasses': 2,
    'sample_weighting': True,
    'loss': keras.losses.BinaryCrossentropy(from_logits=False),
})
if not tpu_env:
    # On a TPU the metrics have to be created inside the TPU environment instead.
    Params['metrics'] = [
        keras.metrics.BinaryAccuracy(name='acc'),
        keras.metrics.AUC(name='auc'),
    ]

# 'initial_bias' is intentionally not set here; a later cell computes it
# from the training labels.

# --- Early stopping -----------------------------------------------------------
# NOTE(review): 'val_auc' presumably requires validation data to be passed to
# fit(); confirm before enabling these callbacks.
Params.update({
    'ifEarlyStopping': True,
    'monitor': 'val_auc',
    'patience': 10,
})
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor=Params['monitor'],
    verbose=2,
    patience=Params['patience'],
    mode='auto',
    min_delta=0,
    restore_best_weights=True,
)
Params['callbacks'] = [early_stopping]

# Text Preprocessing & Tokenization

In [None]:
# Name of the document column this notebook models.
DOCTYPE = 'Responses'

Params.update({
    # Learn the embedding instead of loading pretrained vectors.
    'pretrained_embeddings': False,
    # Vocabulary cap handed to the tokenizer (None would keep every word).
    'num_words_to_use': 20000,
    # Number of tokens kept per document.
    'text_length': 8000,
    'text_start': 0,
})
Params['text_end'] = Params['text_start'] + Params['text_length']

In [None]:
# Read the tab-separated table of PTAB institution proceedings.
ptab = pd.read_csv(
    FILELOC + 'PTAB_Institution_Proceedings_to_20211231.tsv',
    delimiter='\t',
)
# print(len(ptab))
# ptab.drop_duplicates('Proceeding', inplace=True)
# print(len(ptab))
# ptab['date'] = ptab['Case Filing Date'].apply(dateparse)

# trainingvariable = 'Responses'

In [None]:
# with open(FILELOC + 'IPR_Proceeding_PartyNames_12312022.txt', 'r', encoding="ISO-8859-1") as f:
#     outfile = [line.rstrip('\n') for line in f]
# case = []; number = []
# for k in range(0,len(outfile),2):
#     case.append(outfile[k])
#     number.append(outfile[k+1])
# casedf = pd.DataFrame.from_dict({'case':case, 'proc':number})
# casedf.drop_duplicates('proc', inplace=True)
# casedf['proc'] = casedf['proc'].apply(lambda x: x.split('(')[0].strip())
# casedf['name'] = casedf['case'].apply(lambda x: x.strip("\""))
# casedf[casedf.name.str.contains('Petition')].to_csv('a.csv')

# common_names = set(['business', 'doing', 'company', 'corporation', 'formerly', 'et', 'al'])

# def f(x):
#     y = x
#     if 'Petition' in x:
#         if 'Covered' in x:
#             y = x.replace("Petition for Covered Business Method Patent Review by","")
#         elif 'Inter' in x:
#             y = x.replace("Petition for Inter Partes Review by", "")
#     y = y.translate(str.maketrans('', '', string.punctuation))
#     if 'v' in y:
#         y = y.replace("v", "")
#     y = [s.strip() for s in y.strip().split(' ') if s != "" and s not in STOPWORDS|common_names]
#     return y
# casedf['party_names'] = casedf['name'].apply(f)

# ptdf = pd.merge(ptab,casedf,left_on='Proceeding',right_on='proc',how='inner')
# print(len(ptab), len(casedf), len(ptdf))

In [None]:
# # Drop error messages and duplicates

# ptdf.drop(columns=list({'Petitions','Responses','Decisions'}-{DOCTYPE}), inplace=True)
# ptdf.drop(columns=['case','proc'], inplace=True)

# ptdf[DOCTYPE] = ptdf[DOCTYPE].fillna('NA')

# # clean up texts by removing (cid:##) which is likely an artifact of the PDF reading process
# cid_str = re.compile("\(cid:\d+\)")
# def f(x):
#     return re.sub(cid_str, "", x)
# ptdf[DOCTYPE] = ptdf[DOCTYPE].apply(f)

# def get_word_count(text):
#     return len(text.split())
# ptdf[f'{DOCTYPE}_Len'] = ptdf[DOCTYPE].apply(get_word_count)
# MIN_LENGTH = 50

# print(len(ptdf))
# ptdf.drop(ptdf[ptdf[f'{DOCTYPE}_Len'] < MIN_LENGTH].index, inplace=True)
# print(len(ptdf))
# ptdf.drop_duplicates(DOCTYPE, keep=False, inplace=True)
# print(len(ptdf))

# ptdf.reset_index(inplace=True)

In [None]:
# Text-cleaning switches. In this notebook they are only referenced by the
# commented-out tokenisation cells; the tokenised documents are loaded from a
# pickle instead.
Params.update({
    'remove_stop_words': True,
    'remove_alphanumeric': True,
    'remove_punctuation': True,
    'remove_shortword_size': 3,
    'remove_propernouns': True,
    'clean_all': True,
    'use_lowercase': True,
})

In [None]:
# remove_shortword_size = Params['remove_shortword_size']
# def process_docs(x):
#     doc = x.replace("‘", "\'").replace("’", "\'").replace("´", "\'").replace("“", "\"").replace("”", "\"")
#     t = nltk.tokenize.word_tokenize(doc)
#     PUNCT = set(string.punctuation + u"‘’´`“”–-§")
#     tags = nltk.tag.pos_tag(t)
#     propernouns = set([a for a,b in tags if b=='NNP'])
#     # shortwords = set([tt for tt in t if len(tt) <= remove_shortword_size])
#     noisewords = set([tt for tt in t if (len(tt) <= 2) and any(map(lambda x: x in PUNCT, tt))])
#     numwords = set([tt for tt in t if any(map(str.isdigit, tt))])
#     emailwords = set([tt for tt in t if '@' in tt])
#     dotwords = set([tt for tt in t if '.' in tt])
    
#     # reject_list = PUNCT|propernouns|STOPWORDS|shortwords|numwords|emailwords|dotwords
#     reject_list = PUNCT|propernouns|numwords|emailwords|dotwords|noisewords
#     proct = [tt for tt in t if tt not in reject_list]
#     return proct

# doclist = ptdf[DOCTYPE].tolist()
# # docmap = map(process_docs, doclist)
# # tokdocs = [doc for doc in tqdm(docmap)]
# tokdocs = [process_docs(doc) for doc in tqdm(doclist)]
# with open(FILELOC + 'Tokenized_Responses_20220212.pkl', 'wb') as f:
#     pickle.dump([ptdf, tokdocs], f)

In [None]:
# if Params['remove_propernouns'] or Params['clean_all']:
#     def f(x):
#         if 'v.' not in x:
#             return 'NO_PARTY'
#         else:
#             y = x.split('v.')
#             petitioner = y[0].split()[0].strip().replace(',', '')
#             patentowner = y[1].split()[0].strip().replace(',', '')
#             return [petitioner, patentowner]

# parties_first = casedf['name'].apply(f).values
# CASENAMES = set(itertools.chain.from_iterable(parties_first))

# docs = ptdf[DOCTYPE].values
# partyname_list = ptdf['party_names'].tolist()

In [None]:
# if Params['clean_all']:
#     remove_shortword_size = Params['remove_shortword_size']
#     def process_docs(x):
#         doc = x
#         t = nltk.tokenize.word_tokenize(doc)
#         PUNCT = string.punctuation + u"‘’´“”–-"
#         propernouns = set([a for a,b in nltk.tag.pos_tag(t) if b=='NNP'])
#         shortwords = set([tt for tt in t if len(tt) <= remove_shortword_size])
#         reject_list = set(PUNCT)|propernouns|STOPWORDS|shortwords

#         proct = [tt for tt in t if tt.isalpha() and tt not in reject_list]
#         return proct

# else:
#     remove_punct = Params['remove_punctuation']
#     remove_stopwords = Params['remove_stop_words']
#     remove_alphanumeric = Params['remove_alphanumeric']
#     # set to False or None if not used otherwise remove this length or less
#     remove_shortword_size = Params['remove_shortword_size']
#     remove_proper = Params['remove_propernouns']

#     def process_docs(x):
#         doc, partynames = x
#         t = nltk.tokenize.word_tokenize(doc)
#         PUNCT = string.punctuation + u"‘’´“”–-"
#         if remove_punct:
#             proct = [tt for tt in t if tt not in set(PUNCT)]
#         if remove_stopwords:
#             proct = [tt for tt in proct if tt not in STOPWORDS]
#         if remove_alphanumeric:
#             proct = [tt for tt in proct if tt.isalpha()]
#         if remove_shortword_size:
#             proct = [tt for tt in proct if len(tt) > remove_shortword_size]
#         propernouns = set([a for a,b in nltk.tag.pos_tag(proct) if b=='NNP'])
#         if Params['keep_case_names']:
#             propernouns = propernouns - (CASENAMES - set(partynames))
#         if remove_proper:
#             proct = [tt for tt in proct if tt not in propernouns]
#         return proct

# if Params['clean_all']:
#     tokdocs = ptdf[DOCTYPE].apply(process_docs)
# else:
#     tokdocs = [process_docs([docs[ind], partyname_list[ind]]) for ind in tqdm(ptdf.index)]

# # with open(FILELOC + 'Tokenized_Responses_20220131.pkl', 'wb') as f:
# #     pickle.dump([ptdf, tokdocs], f)
# # with open(FILELOC + 'Tokenized_Decisions_20220131.pkl', 'wb') as f:
# #     pickle.dump([ptdf, tokdocs], f)

In [None]:
# with open(FILELOC + 'Tokenized_Responses_20220131.pkl', 'rb') as f:
#     ptdf, tokdocs = pickle.load(f)

# with open(FILELOC + 'Tokenized_Responses_noproper_20220131.pkl', 'rb') as f:
#     ptdf, tokdocs = pickle.load(f)

# Load the cached pair (proceedings DataFrame, tokenised documents).
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles you produced yourself.
with open(FILELOC + 'Tokenized_Responses_20220212.pkl', mode='rb') as pkl_file:
    ptdf, tokdocs = pickle.load(pkl_file)

In [None]:
# Token2Int takes a list of tokenized documents (i.e. a list of lists) and
# derives an integer mapping (0 = not used / padding, 1 = out of vocabulary,
# 2+ = tokens) for all words (num_words=None) or the num_words most common ones.
# transform() generates a 2D array of truncated / padded document vectors of
# length vec_len. If lowercase is True all tokens are converted to lowercase.
# The out-of-vocabulary string is oov_str (default '<OOV>').

class Token2Int(BaseEstimator,TransformerMixin):
    """Map tokenized documents to fixed-length integer vectors.

    Args:
        vec_len: length of every output vector (documents are truncated or
            zero-padded at the end to this length).
        num_words: keep only this many most frequent tokens; None keeps all.
        oov_str: string stored in the vocabulary for out-of-vocabulary tokens.
        lowercase: lowercase every token before counting / mapping.

    Attributes set by ``fit``: ``word_count`` (Counter), ``vocab`` (list with
    ``oov_str`` first), ``vocab_size`` (len of ``vocab``; excludes the 0
    padding code), ``word_index`` (token -> int) and ``index_word``
    (int -> token).
    """

    def __init__(self, vec_len, num_words=None, oov_str='<OOV>', lowercase=True):
        self.vec_len = vec_len
        self.num_words = num_words
        self.oov_str = oov_str
        self.lowercase = lowercase

    def _as_documents(self, X):
        """Return X as a collection of token sequences, lowercased if configured.

        A single document (a flat sequence of tokens) is wrapped into a
        one-element list. Empty input is returned unchanged instead of raising
        IndexError on ``X[0]``.
        """
        if len(X) > 0 and not isinstance(X[0], (list, tuple, np.ndarray)):
            X = [X] # only a single document was passed
        if self.lowercase:
            X = [[d.lower() for d in doc] for doc in X]
        return X

    def fit(self, X, y=None):
        """Count tokens in X and build the token <-> integer mappings.

        Args:
            X: list of token lists, or a single token list.
            y: ignored; present for scikit-learn API compatibility.

        Returns:
            self
        """
        docs = self._as_documents(X)
        wc = Counter(itertools.chain.from_iterable(docs))
        self.word_count = wc
        vocab = [w for w,c in wc.most_common(self.num_words)]
        vocab.insert(0, self.oov_str)       # assign 1 to OOV
        self.vocab = vocab
        self.vocab_size = len(vocab)
        # Codes start at 1 because 0 is reserved for padding.
        wordmap = {n:m+1 for m,n in enumerate(vocab)}
        self.word_index = wordmap
        self.index_word = {n:m for m,n in wordmap.items()}
        return self

    def transform(self, X):
        """Encode documents as a (num_docs, vec_len) array of token codes.

        Tokens missing from the fitted vocabulary are encoded as 1; positions
        past the end of a document stay 0. The array uses NumPy's default
        float dtype; callers cast it as needed.
        """
        docs = self._as_documents(X)
        wordmap = self.word_index
        veclen = self.vec_len
        numdocs = len(docs)
        textpad = np.zeros((numdocs, veclen))
        for d in tqdm(range(numdocs)):
            doc = docs[d]
            doclen = min(len(doc), veclen)
            textpad[d,:doclen] = [wordmap.get(word, 1) for word in doc[:doclen]]
        return textpad

    def reverse(self, textpad):
        """Turn rows of token codes back into space-joined strings.

        Code 0 becomes an empty string; leading / trailing whitespace of each
        joined row is stripped.
        """
        texts = []
        for row in textpad:
            int2text = ['' if w==0 else self.index_word[w] for w in row]
            texts.append(' '.join(int2text).strip())
        return texts

In [None]:
# Decision label -> binary target. -1 marks outcomes that are excluded below.
map_outcome2unpat = {
    'Denied': 0,
    'Denied on Rehearing': -1,
    'Mixed': 0,
    'Granted': 1,
    'Granted on Rehearing': -1,
    'Indefinite': -1,
}
ptdf['Unpatentable'] = ptdf['Decision'].map(map_outcome2unpat)

# Index labels of the rows that are kept (taken before any row is dropped).
selind = ptdf.loc[ptdf['Unpatentable'] != -1].index

print(len(ptdf))
ptdf.drop(ptdf.loc[ptdf['Unpatentable'] == -1].index, inplace=True)
print(len(ptdf))
ptdf.reset_index(inplace=True)
# Keep the tokenised documents whose position is among the kept index labels.
tokdocs = [doc for pos, doc in enumerate(tokdocs) if pos in selind]

9283
9182


In [None]:
def f_pet(x):
    """Return the petitioner part of a case caption.

    Args:
        x: a pair ``(case, partyname)`` where ``case`` is the caption string
            and ``partyname`` is an iterable of name tokens used as fallback.

    Returns:
        The stripped text before the first ' v.' when the caption contains
        one, otherwise the fallback tokens joined with single spaces.
    """
    case, partyname = x
    if ' v.' in case:
        # partition() splits on the first ' v.' only, so a caption with more
        # than one ' v.' can no longer fail to unpack and leave `pet` unbound.
        pet, _, _ = case.partition(' v.')
        return pet.strip()
    return ' '.join(partyname)

def f_po(x):
    """Return the patent-owner part of a case caption.

    Args:
        x: the caption string.

    Returns:
        The stripped text after the first ' v.' when the caption contains
        one, otherwise the literal 'UNKNOWN'.
    """
    case = x
    if ' v.' in case:
        # Split on the first ' v.' only; further ' v.' occurrences stay in the
        # owner part instead of raising ValueError during unpacking.
        _, _, po = case.partition(' v.')
        return po.strip()
    return 'UNKNOWN'

# Raw party names parsed from the caption; the petitioner falls back to the
# joined 'party_names' tokens when the caption has no ' v.'.
ptdf['petitioner_raw'] = ptdf.loc[:, ['name', 'party_names']].apply(f_pet, axis=1)
ptdf['patent_owner_raw'] = ptdf['name'].map(f_po)

In [None]:
# Substrings (matched case-insensitively) that are removed from party names:
# company-form suffixes, jurisdiction tags and list numbering.
common_terms = ['corporation','corp.',' corp ', '& co','co.',' co ','company',
                'l.l.c.', 'llc', 'l.c', ' lc',
                'l.l.p.', 'llp', 'l.p.', ' lp',
                'incorporated', 'inc.', ' inc ',
                'limited', 'ltd',
                ' sa ', ' se ', ' ag ',
                'gmbh', 'a/s', 'bv', ' nv', 'n.v.',
                'et al',
                'n.a.', ' us ', ' usa ',
                '(us)', '(usa)', '(u.s.)', '(u.s.a.)',
                '(california)', '(delaware)', '(united states)',
                ' i,',  'ii', 'iii',
                '1)', '2)',
                ]
# Build one alternation pattern. Raw strings keep the backslashes literal;
# the non-raw '\s', '\.', ... spellings are invalid escape sequences.
replace_common_terms = ('|'.join(common_terms)
                        .replace('/', r'\/')
                        .replace(' ', r'\s')
                        .replace('.', r'\.')
                        .replace('(', r'\(')
                        .replace(')', r'\)'))
replace_common_terms += r'|\s\d+\s'   # also drop free-standing numbers
regexp_common_terms = re.compile(replace_common_terms, re.IGNORECASE)

# Markers that separate several names inside one party string.
split_terms = ['d/b/a/', 'd/b/a', 'doing business as', 'formerly known as', 'f/k/a/', 'f/k/a', ' and ']
split_terms_list = '|'.join(split_terms).replace('/', r'\/').replace(' ', r'\s')
regexp_split_terms = re.compile(split_terms_list, re.IGNORECASE)

def f_clean(x):
    """Normalise a raw party name.

    Removes the ``common_terms`` substrings (plus free-standing numbers) and,
    when a ``split_terms`` marker is present, splits the name on it and joins
    the non-empty, stripped parts with ';'.

    Args:
        x: raw party-name string.

    Returns:
        The cleaned name without leading / trailing whitespace.
    """
    # no cleaning up special characters
    # add a trailing whitespace to eliminate edge effects for "lp" and "inc"
    x += ' '
    # remove common terms
    if any(t in x.lower() for t in common_terms):
        # remove commas and periods associated with these terms
        x = regexp_common_terms.sub('', x)
        x = x.replace(', ',' ').replace('. ',' ')
        x = x.strip().strip(',').strip('.')

    if any(t in x.lower() for t in split_terms):
        parts = [part.strip() for part in regexp_split_terms.split(x)]
        x = ';'.join(part for part in parts if part)

    # Drop the helper whitespace added above on every path, not only when a
    # common term was found.
    return x.strip()

# Cleaned party names derived from the raw caption fields.
ptdf['petitioner'] = ptdf['petitioner_raw'].map(f_clean)
ptdf['patent_owner'] = ptdf['patent_owner_raw'].map(f_clean)

In [None]:
# Chronological split: the vocabulary is fitted on the training period only and
# then reused to encode the later test period.
trainindex = ptdf.loc[
    ptdf['date'].between(pd.Timestamp(2018,7,1), pd.Timestamp(2020,11,30))
].index
toktrain = [doc for pos, doc in enumerate(tokdocs) if pos in trainindex]

tok2int = Token2Int(Params['text_length'],
                    num_words=Params['num_words_to_use']).fit(toktrain)
Params['vocab_size'] = 1 + tok2int.vocab_size   # add the 0 token
print(Params['vocab_size'])

X_train = tok2int.transform(toktrain).astype(int)
Y_train = ptdf.loc[trainindex, 'Unpatentable'].values

testindex = ptdf.loc[
    ptdf['date'].between(pd.Timestamp(2020,12,1), pd.Timestamp(2021,3,31))
].index
toktest = [doc for pos, doc in enumerate(tokdocs) if pos in testindex]
X_test = tok2int.transform(toktest).astype(int)
Y_test = ptdf.loc[testindex, 'Unpatentable'].values

20002


  0%|          | 0/2631 [00:00<?, ?it/s]

# Fit Model

In [None]:
# Per-sample weights that balance the two classes, plus the initial bias of the
# sigmoid output layer.
if Params['sample_weighting']:
    class_wts = list(class_weight.compute_class_weight(class_weight='balanced',
                                                    classes=np.unique(Y_train), y=Y_train))
    print(class_wts)
    # Indexing class_wts by label relies on the labels being 0 and 1.
    Params['sample_weights'] = np.array([class_wts[yt] for yt in Y_train])

    num = len(Y_train)
    num0 = len(np.where(Y_train==0)[0]); num1 = len(np.where(Y_train==1)[0])
    if num0 > 0 and num1 > 0:
        # log(positives / negatives): negative when class 1 is the minority and
        # positive when it is the majority. The previous code swapped the ratio
        # in the latter case, which biased the output towards the minority class.
        Params['initial_bias'] = np.log(num1/num0)
    else:
        # One class is absent from the training labels; no usable prior.
        Params['initial_bias'] = 0

else:
    Params['initial_bias'] = 0

In [None]:
# Train five independent models on the training split, report test-set
# metrics for each and save each run's weights to its own file.
# Requires `tpu_strategy` to have been defined by the accelerator-detection cell.
for run in range(5):
    print(run)
    
    tf.keras.backend.clear_session()    # reset Tensorflow session

    X_t = X_train; Y_t = Y_train    

    # Loss, metrics, model and optimizer are created inside the strategy scope.
    with tpu_strategy.scope():
        Params['loss'] = keras.losses.BinaryCrossentropy(from_logits=False)
        Params['metrics'] = [keras.metrics.BinaryAccuracy(name='acc'),
                        keras.metrics.AUC(name='auc'),]
        model = EmbedNN(Params)
        model.compile(loss=Params['loss'],
                    optimizer=keras.optimizers.Adam(learning_rate=Params['learning_rate']),
                    metrics=Params['metrics'],
                    steps_per_execution = 100,)

        if Params['sample_weighting']:
            # Datasets yield (features, label, sample_weight) triples.
            train_dataset = tf.data.Dataset.from_tensor_slices((X_t, Y_t, Params['sample_weights']))
            Params['val_sample_weights'] = np.array([class_wts[yt] for yt in Y_test])
            val_dataset = tf.data.Dataset.from_tensor_slices((X_test, Y_test, Params['val_sample_weights']))
        else:
            train_dataset = tf.data.Dataset.from_tensor_slices((X_t, Y_t))
            val_dataset = tf.data.Dataset.from_tensor_slices((X_test, Y_test))
        # val_dataset is built above but not passed to fit(): the validation_data
        # and callbacks arguments are commented out, so every run trains for the
        # full Params['num_epochs'] without early stopping.
        history = model.fit(train_dataset.batch(Params['batch_size']),
                            epochs = Params['num_epochs'], verbose = 1,)
                            # validation_data = val_dataset.batch(Params['batch_size']),
                                # callbacks=Params['callbacks'])

    print("Results for Testing Data:")
    test_predict = model.predict(X_test)
    # Round the predicted values to the nearest integer to get class labels.
    test_predict_bool = np.round(test_predict)
    TestPredict = test_predict_bool
    ClassRep = classification_report(Y_test, test_predict_bool)
    ConfMatrix = confusion_matrix(Y_test, test_predict_bool)
    print(ClassRep)
    print(ConfMatrix)

    # One weights file per run: "<FILELOC>responses_<run>_wts.h5".
    model.save_weights(FILELOC+"responses_"+str(run)+"_wts.h5", save_format='h5', overwrite=True)

# Interpret Results

In [None]:
# Recompute the weighting / bias parameters so this section can run without
# the training cell, rebuild the model, load trained weights and extract the
# attention weights of every convolution branch for the test documents.
if Params['sample_weighting']:
    class_wts = list(class_weight.compute_class_weight(class_weight='balanced',
                                                    classes=np.unique(Y_train), y=Y_train))
    print(class_wts)
    Params['sample_weights'] = np.array([class_wts[yt] for yt in Y_train])

    num = len(Y_train)
    num0 = len(np.where(Y_train==0)[0]); num1 = len(np.where(Y_train==1)[0])
    if num0 > 0 and num1 > 0:
        # log(positives / negatives); the sign follows the class balance.
        Params['initial_bias'] = np.log(num1/num0)
    else:
        Params['initial_bias'] = 0

else:
    Params['initial_bias'] = 0

tf.keras.backend.clear_session()
with tpu_strategy.scope():
    model = EmbedNN(Params)
    # NOTE(review): the training loop writes "responses_<run>_wts.h5", while
    # this cell reads "responses_wts.h5"; confirm which run's file is meant.
    model.load_weights(FILELOC+"responses_wts.h5")
    pred_test = model.predict(X_test, verbose=False)

# att[k] holds the output of the 'attention_<k>' softmax layer for X_test.
# The filter sizes come from the same Params keys EmbedNN uses to name the
# layers, so the loop stays in sync with the model definition.
att = {}
for n in range(Params['min_filter_size'], Params['max_filter_size']+1):
    get_attention_model = keras.Model(inputs=model.input,outputs=model.get_layer(f'attention_{n}').output)
    get_attention_model.compile()
    # Was `xtest`, a name that is not defined in this notebook.
    att[n] = get_attention_model.predict(X_test, verbose=1)

## Attention Visualization

In [None]:
from IPython.display import HTML
def hlstr(string, color='white'):
    """
    Return HTML markup highlighting text with the desired color.

    The text is HTML-escaped first, so tokens containing markup characters
    (for example the '<OOV>' placeholder) are displayed literally instead of
    being interpreted as tags.
    """
    import html  # local import keeps the cell self-contained
    return f"<mark style=background-color:{color}>{html.escape(str(string))} </mark>"

def colorize(attrs, cmap='PiYG'):
    """
    Compute hex colors based on the attributions for a single instance.
    Uses a diverging colorscale by default and normalizes and scales
    the colormap so that colors are consistent with the attributions.

    Args:
        attrs: 1-D sequence / array of attribution values.
        cmap: colormap name, or a colormap object to use directly.

    Returns:
        A list with one '#rrggbb' string per attribution; an empty list for
        empty input.
    """
    if np.size(attrs) == 0:
        return []

    import matplotlib as mpl
    import matplotlib.colors  # make sure the submodule is loaded

    cmap_bound = np.abs(attrs).max()
    if cmap_bound == 0:
        # All attributions are zero: use a unit range so they map to the
        # neutral midpoint of the colormap rather than to its low extreme.
        cmap_bound = 1.0
    norm = mpl.colors.Normalize(vmin=-cmap_bound, vmax=cmap_bound)

    if isinstance(cmap, str):
        try:
            cmap = mpl.colormaps[cmap]      # registry in newer matplotlib
        except AttributeError:
            import matplotlib.cm            # older matplotlib without the registry
            cmap = mpl.cm.get_cmap(cmap)

    # now compute hex values of colors
    return [mpl.colors.rgb2hex(cmap(norm(value))) for value in attrs]

In [None]:
N = 8   # filter size whose attention layer ('attention_<N>') is shown
n = 7   # document index
# Predicted value for this document; pred_test comes from the model-loading
# cell (the former name `pred` is not defined in this notebook).
print(np.ravel(pred_test)[n])
# Number of real tokens. Token codes are >= 1 and padding (0) only follows the
# document, so counting non-zeros gives the document length and also works
# for a document that fills the whole vector and has no padding.
xlen = int(np.count_nonzero(X_test[n]))
attvec = att[N][n][:xlen]
xvec = tok2int.reverse([X_test[n][:xlen]])[0]
# Center the attention on its median so the diverging colormap separates
# above- and below-median positions.
THRESH = np.median(attvec)
colors = colorize(attvec - THRESH)

HTML("".join(list(map(hlstr, xvec.split(), colors))))

0.31546426


Highest attention words in sample

In [None]:
# Rank the distinct tokens of document n by their mean attention weight.
# Uses X_test (the former name `xtest` is not defined in this notebook).
# NOTE(review): this reads att[2] while the visualization cell above uses
# att[N]; confirm which filter size is intended here.
uniquetokens = np.unique(X_test[n][:xlen])
print(len(uniquetokens))
tokpos = [np.where(X_test[n][:xlen] == tok)[0] for tok in uniquetokens]
meanatt = np.array([np.mean(att[2][n][np.array(tpos)]) for tpos in tokpos])
tokens_sorted_by_meanatt = uniquetokens[np.argsort(-meanatt)]

605


In [None]:
# Table of the 20 tokens with the highest mean attention.
pd.DataFrame({'Attention': [tok2int.index_word[t] for t in tokens_sorted_by_meanatt[:20]]})

Unnamed: 0,Attention
0,processor
1,entirety
2,patents
3,foregoing
4,cryptographic
5,cure
6,makes
7,amounts
8,keys
9,briefs


## LIME Analysis

In [None]:
def lean_wrapper(texts):
    """Prediction function handed to LIME.

    Encodes the texts, pads / truncates them at the end to
    ``Params['text_length']`` and returns an array whose two columns are
    ``1 - p`` and ``p``, where ``p`` is the model output.

    NOTE(review): `DTP` is not defined anywhere in the visible notebook; it is
    presumably a fitted text tokenizer left over from another session. Confirm
    and define it before running this section.
    """
    x = pad_sequences(DTP.texts_to_sequences(texts),
                      maxlen = Params['text_length'],
                      padding='post',
                      truncating='post')
    # Run the model once and reuse the result for both columns.
    proba = model.predict(x)
    return np.hstack((1-proba, proba))

In [None]:
# Explain one document with LIME.
# NOTE(review): `DTP` and `X_data` are not defined anywhere in the visible
# notebook; confirm where they come from before running this cell.
n = 2774
xvec = DTP.sequences_to_texts([X_data[n]])[0]

# class_names is a list ordered like the columns returned by lean_wrapper
# (column 0 = 'Denied', column 1 = 'Granted'). A dict would be iterated by its
# keys and show "0"/"1" instead of the labels.
exp = LimeTextExplainer(class_names=['Denied', 'Granted'])
exp_doc = exp.explain_instance(xvec, lean_wrapper, num_features=50)
# explist = exp_doc.as_list()
exp_doc.show_in_notebook()