{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "p-aVXISlSKYH" }, "source": [ "#Initialization" ] }, { "cell_type": "markdown", "metadata": { "id": "wOFk8JfaSMSV" }, "source": [ "##Imports" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ESf61HRPSERn" }, "outputs": [], "source": [ "%tensorflow_version 2.x\n", "\n", "%xmode Context\n", "# Verbose\n", "\n", "import tensorflow as tf\n", "from tensorflow import keras\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import pickle\n", "import os\n", "import itertools\n", "\n", "from collections import Counter, defaultdict\n", "import random\n", "from pandas import DataFrame\n", "import datetime\n", "from datetime import datetime\n", "import dateutil\n", "from dateutil.parser import parse as dateparse\n", "from tqdm.notebook import tqdm\n", "import time\n", "\n", "import xgboost as xgb\n", "\n", "import sklearn as sk\n", "from sklearn.preprocessing import MultiLabelBinarizer, QuantileTransformer, OneHotEncoder, StandardScaler\n", "from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV\n", "from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit\n", "from sklearn.linear_model import LogisticRegression\n", "import sklearn.metrics\n", "from sklearn.metrics import accuracy_score,classification_report, make_scorer, balanced_accuracy_score, f1_score, coverage_error, roc_auc_score, confusion_matrix, plot_confusion_matrix\n", "from sklearn.cluster import KMeans\n", "from sklearn.decomposition import PCA\n", "from sklearn.utils import resample, shuffle\n", "from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin\n", "from sklearn.neighbors import NearestNeighbors\n", "from sklearn.manifold import TSNE\n", "from sklearn.utils import class_weight\n", "\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, BorderlineSMOTE\n", "from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, CondensedNearestNeighbour, AllKNN\n", "from imblearn.combine import SMOTEENN, SMOTETomek\n", "from imblearn.pipeline import make_pipeline,Pipeline\n", "\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "\n", "import string\n", "import re\n", "# import unicodedata\n", "\n", "import nltk\n", "nltk.download('stopwords')\n", "from nltk.corpus import stopwords\n", "STOPWORDS = set(stopwords.words('english'))\n", "\n", "nltk.download('averaged_perceptron_tagger')\n", "nltk.download('wordnet')\n", "nltk.download('punkt')\n", "\n", "!pip install lime\n", "import lime\n", "from lime import lime_text\n", "from lime.lime_text import LimeTextExplainer\n", "from lime.explanation import Explanation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8KTO164A7t1G" }, "outputs": [], "source": [ "# COMMENT OUT FOR PUBLIC CODE\n", "from google.colab import drive, files\n", "# drive.mount('/content/drive')\n", "\n", "# FILELOC = \"DATA/\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Xvw4VK92r4-b" }, "outputs": [], "source": [ "try:\n", " tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection\n", " print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])\n", " tf.config.experimental_connect_to_cluster(tpu)\n", " tf.tpu.experimental.initialize_tpu_system(tpu)\n", " tpu_strategy = tf.distribute.TPUStrategy(tpu)\n", " tpu_env=True\n", "except ValueError:\n", " print('Not connected to a TPU runtime.')\n", " tpu_env=False" ] }, { "cell_type": "markdown", "metadata": { "id": "oSiHsBoY4_bv" }, "source": [ "#Functions" ] }, { "cell_type": "markdown", "metadata": { "id": "E3qcPlglEzjC" }, "source": [ "##Define Models" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Va27pqfoegI_" }, "outputs": [], "source": [ "def EmbedNN(Params):\n", "\n", " inpTensor = keras.Input(shape=(Params['text_length'],))\n", " \n", "\n", " \n", " if Params['pretrained_embeddings']:\n", " embedding = keras.layers.Embedding(Params['vocab_size'],\n", " Params['embedding_dim'],\n", " weights=[Params['embeddings']],\n", " input_length=Params['text_length'],\n", " mask_zero=True,\n", " trainable=False,\n", " ) \n", " else:\n", " embedding = keras.layers.Embedding(Params['vocab_size'],\n", " Params['embedding_dim'],\n", " mask_zero=True,\n", " trainable=True,\n", " name='embedding',\n", " )\n", " x = embedding(inpTensor)\n", "\n", " convs = []\n", " filter_sizes = list(range(Params['min_filter_size'],Params['max_filter_size']+1))\n", " for filter_size in filter_sizes:\n", " l_conv = keras.layers.Conv1D(filters=Params['num_filters'], \n", " kernel_size=filter_size,\n", " kernel_regularizer=keras.regularizers.l2(Params['kernel_L2_reg']),\n", " activation='relu')(x)\n", " h = keras.layers.TimeDistributed(keras.layers.Dense(Params['num_filters'],\n", " activation='tanh'))(l_conv)\n", " attention = keras.layers.TimeDistributed(keras.layers.Dense(1, activation='tanh'))(h)\n", " attention = keras.layers.Flatten()(attention) \n", " attention = keras.layers.Softmax(axis=1,\n", " name='attention_'+str(filter_size))(attention)\n", " attention = keras.layers.RepeatVector(Params['num_filters'])(attention)\n", " attention = keras.layers.Permute([2, 1])(attention)\n", " representation = keras.layers.multiply([h, attention])\n", " representation = tf.math.reduce_sum(representation, axis = 1)\n", " convs.append(representation)\n", " # l_pool = keras.layers.GlobalMaxPooling1D()(l_conv)\n", " # convs.append(l_pool)\n", " l_merge = keras.layers.concatenate(convs, axis=1)\n", " \n", " x = keras.layers.Dropout(Params['dropout_after_convs'])(l_merge) \n", "\n", " dense1 = keras.layers.Dense(Params['num_dense'],\n", " kernel_constraint=Params['kernel_constraint'],\n", " activation = 'relu')(x)\n", " x = dense1\n", " dropout1 = keras.layers.Dropout(Params['dropout_after_Dense'])(x)\n", " x = dropout1\n", "\n", " if not Params['ifMulticlass']:\n", " finalOut = keras.layers.Dense(1, activation='sigmoid',\n", " bias_initializer=tf.keras.initializers.Constant(Params['initial_bias'])\n", " )(x)\n", " else:\n", " finalOut = keras.layers.Dense(Params['nclasses'], activation='softmax')(x)\n", "\n", " # define the model's start and end points \n", " model = keras.Model(inpTensor,finalOut)\n", "\n", " return model" ] }, { "cell_type": "markdown", "metadata": { "id": "xAfa4FkoIhU6" }, "source": [ "#Define Parameters" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fDoDihkSvq7h" }, "outputs": [], "source": [ "Params = {}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ndFDWyjpIhU8" }, "outputs": [], "source": [ "Params['num_epochs'] = 50\n", "\n", "Params['learning_rate'] = 1e-4\n", "if tpu_env:\n", " Params['batch_size'] = 48\n", "else:\n", " Params['batch_size'] = 48\n", "\n", "Params['embedding_dim'] = 128 # 128\n", "\n", "# CNN parameters\n", "Params['min_filter_size'] = 2\n", "Params['max_filter_size'] = 12 # 12\n", "Params['num_filters'] = 256 # 256\n", "Params['dropout_after_convs'] = 0.4 # 0.4\n", "# Dense Layer Parameters\n", "Params['num_dense'] = 256 # 256\n", "Params['dropout_after_Dense'] = 0.4\n", "\n", "# Transformer+Attention Model parameters\n", "Params['embdim'] = 2000\n", "Params['mask_zero'] = True\n", "Params['numheads'] = 8\n", "Params['ffdim'] = 64\n", "Params['trans_drop'] = 0.4\n", "Params['Nt'] = 1\n", "Params['ifPreCNN'] = False\n", "if Params['ifPreCNN']:\n", " Params['W'] = 500\n", " Params['Nc'] = 1\n", " Params['Nl'] = 1\n", "Params['num_dense_embed'] = 64 # 256\n", "Params['dropout_after_Dense_embed'] = 0.0 #0.2\n", "\n", "Params['kernel_constraint'] = keras.constraints.max_norm(1.0)\n", "Params['kernel_L2_reg'] = 0.1\n", "Params['bias_L2_reg'] = 0.1\n", "Params['activity_L2_reg'] = 0.1\n", "\n", "Params['ifMulticlass'] = False\n", "Params['nclasses'] = 2\n", "\n", "Params['sample_weighting'] = True\n", "\n", "Params['loss'] = keras.losses.BinaryCrossentropy(from_logits=False)\n", "if not tpu_env:\n", " # otherwise have to define in the TPU environment\n", " Params['metrics'] = [\n", " # keras.metrics.TruePositives(name='tp'),\n", " # keras.metrics.FalsePositives(name='fp'),\n", " # keras.metrics.TrueNegatives(name='tn'),\n", " # keras.metrics.FalseNegatives(name='fn'),\n", " keras.metrics.BinaryAccuracy(name='acc'),\n", " # keras.metrics.PrecisionAtRecall(0.5, name='par50'),\n", " # keras.metrics.Precision(name='prec'),\n", " # keras.metrics.Recall(name='rec'),\n", " keras.metrics.AUC(name='auc'),\n", " ]\n", "\n", "# Params['initial_bias'] = np.log(num1/num0)\n", "# Params['initial_bias'] = np.log(2) # default\n", "# Params['initial_bias'] = None\n", "\n", "Params['ifEarlyStopping'] = True\n", "# Params['ifEarlyStopping'] = False\n", "# Params['monitor'] = 'loss'\n", "Params['monitor'] = 'val_auc'\n", "Params['patience'] = 10\n", "early_stopping = tf.keras.callbacks.EarlyStopping(\n", " monitor = Params['monitor'],\n", " verbose = 2,\n", " patience = Params['patience'],\n", " mode = 'auto',\n", " min_delta = 0,\n", " restore_best_weights = True\n", " )\n", "Params['callbacks'] = [early_stopping]" ] }, { "cell_type": "markdown", "metadata": { "id": "wsYToQkTO5iF" }, "source": [ "#Text Preprocessing & Tokenization" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "tDd45DbnVWDp" }, "outputs": [], "source": [ "DOCTYPE = 'Responses'\n", "\n", "Params['pretrained_embeddings'] = False\n", "\n", "# Params['num_words_to_use'] = None\n", "Params['num_words_to_use'] = 20000\n", "# Params['num_words_to_use'] = 1000\n", "\n", "\n", "# Params['text_length'] = 4000\n", "Params['text_length'] = 8000\n", "Params['text_start'] = 0 # 100\n", "Params['text_end'] = Params['text_start'] + Params['text_length']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "uor3Kbin2lfZ" }, "outputs": [], "source": [ "ptab = pd.read_csv(FILELOC + 'PTAB_Institution_Proceedings_to_20211231.tsv', sep='\\t')\n", "# print(len(ptab))\n", "# ptab.drop_duplicates('Proceeding', inplace=True)\n", "# print(len(ptab))\n", "# ptab['date'] = ptab['Case Filing Date'].apply(dateparse)\n", "\n", "# trainingvariable = 'Responses'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Qx6RjOg82lcU" }, "outputs": [], "source": [ "# with open(FILELOC + 'IPR_Proceeding_PartyNames_12312022.txt', 'r', encoding=\"ISO-8859-1\") as f:\n", "# outfile = [line.rstrip('\\n') for line in f]\n", "# case = []; number = []\n", "# for k in range(0,len(outfile),2):\n", "# case.append(outfile[k])\n", "# number.append(outfile[k+1])\n", "# casedf = pd.DataFrame.from_dict({'case':case, 'proc':number})\n", "# casedf.drop_duplicates('proc', inplace=True)\n", "# casedf['proc'] = casedf['proc'].apply(lambda x: x.split('(')[0].strip())\n", "# casedf['name'] = casedf['case'].apply(lambda x: x.strip(\"\\\"\"))\n", "# casedf[casedf.name.str.contains('Petition')].to_csv('a.csv')\n", "\n", "# common_names = set(['business', 'doing', 'company', 'corporation', 'formerly', 'et', 'al'])\n", "\n", "# def f(x):\n", "# y = x\n", "# if 'Petition' in x:\n", "# if 'Covered' in x:\n", "# y = x.replace(\"Petition for Covered Business Method Patent Review by\",\"\")\n", "# elif 'Inter' in x:\n", "# y = x.replace(\"Petition for Inter Partes Review by\", \"\")\n", "# y = y.translate(str.maketrans('', '', string.punctuation))\n", "# if 'v' in y:\n", "# y = y.replace(\"v\", \"\")\n", "# y = [s.strip() for s in y.strip().split(' ') if s != \"\" and s not in STOPWORDS|common_names]\n", "# return y\n", "# casedf['party_names'] = casedf['name'].apply(f)\n", "\n", "# ptdf = pd.merge(ptab,casedf,left_on='Proceeding',right_on='proc',how='inner')\n", "# print(len(ptab), len(casedf), len(ptdf))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YxdJ9pvjXJSk" }, "outputs": [], "source": [ "# # Drop error messages and duplicates\n", "\n", "# ptdf.drop(columns=list({'Petitions','Responses','Decisions'}-{DOCTYPE}), inplace=True)\n", "# ptdf.drop(columns=['case','proc'], inplace=True)\n", "\n", "# ptdf[DOCTYPE] = ptdf[DOCTYPE].fillna('NA')\n", "\n", "# # clean up texts by removing (cid:##) which is likely an artifact of the PDF reading process\n", "# cid_str = re.compile(\"\\(cid:\\d+\\)\")\n", "# def f(x):\n", "# return re.sub(cid_str, \"\", x)\n", "# ptdf[DOCTYPE] = ptdf[DOCTYPE].apply(f)\n", "\n", "# def get_word_count(text):\n", "# return len(text.split())\n", "# ptdf[f'{DOCTYPE}_Len'] = ptdf[DOCTYPE].apply(get_word_count)\n", "# MIN_LENGTH = 50\n", "\n", "# print(len(ptdf))\n", "# ptdf.drop(ptdf[ptdf[f'{DOCTYPE}_Len'] < MIN_LENGTH].index, inplace=True)\n", "# print(len(ptdf))\n", "# ptdf.drop_duplicates(DOCTYPE, keep=False, inplace=True)\n", "# print(len(ptdf))\n", "\n", "# ptdf.reset_index(inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pUFUMGsMiLmt" }, "outputs": [], "source": [ "Params['remove_stop_words'] = True\n", "Params['remove_alphanumeric'] = True\n", "Params['remove_punctuation'] = True\n", "Params['remove_shortword_size'] = 3\n", "Params['remove_propernouns'] = True\n", "\n", "Params['clean_all'] = True\n", "Params['remove_shortword_size'] = 3\n", "\n", "Params['use_lowercase'] = True" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "WMKf8CVm39nn" }, "outputs": [], "source": [ "# remove_shortword_size = Params['remove_shortword_size']\n", "# def process_docs(x):\n", "# doc = x.replace(\"‘\", \"\\'\").replace(\"’\", \"\\'\").replace(\"´\", \"\\'\").replace(\"“\", \"\\\"\").replace(\"”\", \"\\\"\")\n", "# t = nltk.tokenize.word_tokenize(doc)\n", "# PUNCT = set(string.punctuation + u\"‘’´`“”–-§\")\n", "# tags = nltk.tag.pos_tag(t)\n", "# propernouns = set([a for a,b in tags if b=='NNP'])\n", "# # shortwords = set([tt for tt in t if len(tt) <= remove_shortword_size])\n", "# noisewords = set([tt for tt in t if (len(tt) <= 2) and any(map(lambda x: x in PUNCT, tt))])\n", "# numwords = set([tt for tt in t if any(map(str.isdigit, tt))])\n", "# emailwords = set([tt for tt in t if '@' in tt])\n", "# dotwords = set([tt for tt in t if '.' in tt])\n", " \n", "# # reject_list = PUNCT|propernouns|STOPWORDS|shortwords|numwords|emailwords|dotwords\n", "# reject_list = PUNCT|propernouns|numwords|emailwords|dotwords|noisewords\n", "# proct = [tt for tt in t if tt not in reject_list]\n", "# return proct\n", "\n", "# doclist = ptdf[DOCTYPE].tolist()\n", "# # docmap = map(process_docs, doclist)\n", "# # tokdocs = [doc for doc in tqdm(docmap)]\n", "# tokdocs = [process_docs(doc) for doc in tqdm(doclist)]\n", "# with open(FILELOC + 'Tokenized_Responses_20220212.pkl', 'wb') as f:\n", "# pickle.dump([ptdf, tokdocs], f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "LkiSq11eoXag" }, "outputs": [], "source": [ "# if Params['remove_propernouns'] or Params['clean_all']:\n", "# def f(x):\n", "# if 'v.' not in x:\n", "# return 'NO_PARTY'\n", "# else:\n", "# y = x.split('v.')\n", "# petitioner = y[0].split()[0].strip().replace(',', '')\n", "# patentowner = y[1].split()[0].strip().replace(',', '')\n", "# return [petitioner, patentowner]\n", "\n", "# parties_first = casedf['name'].apply(f).values\n", "# CASENAMES = set(itertools.chain.from_iterable(parties_first))\n", "\n", "# docs = ptdf[DOCTYPE].values\n", "# partyname_list = ptdf['party_names'].tolist()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_7WJ3Cqxbb5j" }, "outputs": [], "source": [ "# if Params['clean_all']:\n", "# remove_shortword_size = Params['remove_shortword_size']\n", "# def process_docs(x):\n", "# doc = x\n", "# t = nltk.tokenize.word_tokenize(doc)\n", "# PUNCT = string.punctuation + u\"‘’´“”–-\"\n", "# propernouns = set([a for a,b in nltk.tag.pos_tag(t) if b=='NNP'])\n", "# shortwords = set([tt for tt in t if len(tt) <= remove_shortword_size])\n", "# reject_list = set(PUNCT)|propernouns|STOPWORDS|shortwords\n", "\n", "# proct = [tt for tt in t if tt.isalpha() and tt not in reject_list]\n", "# return proct\n", "\n", "# else:\n", "# remove_punct = Params['remove_punctuation']\n", "# remove_stopwords = Params['remove_stop_words']\n", "# remove_alphanumeric = Params['remove_alphanumeric']\n", "# # set to False or None if not used otherwise remove this length or less\n", "# remove_shortword_size = Params['remove_shortword_size']\n", "# remove_proper = Params['remove_propernouns']\n", "\n", "# def process_docs(x):\n", "# doc, partynames = x\n", "# t = nltk.tokenize.word_tokenize(doc)\n", "# PUNCT = string.punctuation + u\"‘’´“”–-\"\n", "# if remove_punct:\n", "# proct = [tt for tt in t if tt not in set(PUNCT)]\n", "# if remove_stopwords:\n", "# proct = [tt for tt in proct if tt not in STOPWORDS]\n", "# if remove_alphanumeric:\n", "# proct = [tt for tt in proct if tt.isalpha()]\n", "# if remove_shortword_size:\n", "# proct = [tt for tt in proct if len(tt) > remove_shortword_size]\n", "# propernouns = set([a for a,b in nltk.tag.pos_tag(proct) if b=='NNP'])\n", "# if Params['keep_case_names']:\n", "# propernouns = propernouns - (CASENAMES - set(partynames))\n", "# if remove_proper:\n", "# proct = [tt for tt in proct if tt not in propernouns]\n", "# return proct\n", "\n", "# if Params['clean_all']:\n", "# tokdocs = ptdf[DOCTYPE].apply(process_docs)\n", "# else:\n", "# tokdocs = [process_docs([docs[ind], partyname_list[ind]]) for ind in tqdm(ptdf.index)]\n", "\n", "# # with open(FILELOC + 'Tokenized_Responses_20220131.pkl', 'wb') as f:\n", "# # pickle.dump([ptdf, tokdocs], f)\n", "# # with open(FILELOC + 'Tokenized_Decisions_20220131.pkl', 'wb') as f:\n", "# # pickle.dump([ptdf, tokdocs], f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "panQ-y442lY6" }, "outputs": [], "source": [ "# with open(FILELOC + 'Tokenized_Responses_20220131.pkl', 'rb') as f:\n", "# ptdf, tokdocs = pickle.load(f)\n", "\n", "# with open(FILELOC + 'Tokenized_Responses_noproper_20220131.pkl', 'rb') as f:\n", "# ptdf, tokdocs = pickle.load(f)\n", "\n", "with open(FILELOC + 'Tokenized_Responses_20220212.pkl', 'rb') as f:\n", " ptdf, tokdocs = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "yXE3Y-_523jO" }, "outputs": [], "source": [ "# take a list of tokenized documents (i.e. list of lists) and derive an integer\n", "# mapping dictionary (0 = not used, 1 = out of vocabular, 2+ are tokens) for the\n", "# all (if num_words=None) or num_words most common words\n", "# It will generate a 2D array of truncated / padded document vectors (vec_len)\n", "# If lowercase set to True then converts all tokens to lowercase\n", "# Out of vocabulary string is \"oov_str\" (default '')\n", "\n", "class Token2Int(BaseEstimator,TransformerMixin):\n", " def __init__(self, vec_len, num_words=None, oov_str='', lowercase=True):\n", " self.vec_len = vec_len\n", " self.num_words = num_words\n", " self.oov_str = oov_str\n", " self.lowercase = lowercase\n", "\n", " def fit(self, X, y=None):\n", " if type(X[0]) is not list:\n", " X = [X] # only a single document was passed\n", " if self.lowercase:\n", " X = [[d.lower() for d in doc] for doc in X]\n", " wc = Counter(itertools.chain.from_iterable(X))\n", " self.word_count = wc\n", " vocab = [w for w,c in wc.most_common(self.num_words)]\n", " vocab.insert(0, self.oov_str) # assign 1 to OOV\n", " self.vocab = vocab\n", " self.vocab_size = len(vocab)\n", " wordmap = {n:m+1 for m,n in enumerate(vocab)}\n", " self.word_index = wordmap\n", " self.index_word = {n:m for m,n in wordmap.items()}\n", " return self\n", "\n", " def transform(self, X):\n", " if type(X[0]) is not list:\n", " X = [X] # only a single document was passed\n", " # X = np.array(list(itertools.zip_longest(*X, fillvalue=0))).T\n", " if self.lowercase:\n", " # X = np.vectorize(str.lower)(X)\n", " X = [[d.lower() for d in doc] for doc in X]\n", " wordmap = self.word_index\n", " vocab = self.vocab\n", " veclen = self.vec_len\n", " numdocs = len(X)\n", " # wordmap['0'] = 0\n", " # # textpad = np.array([t[:veclen] if len(t) >= veclen else t + ['0']*(veclen-len(t)) for t in X]).astype(str)\n", " # X = [[wordmap.get(x, 1) for x in t] for t in X]\n", " # return pad_sequences(X, maxlen=veclen, padding='post', truncating='post')\n", " textpad = np.zeros((numdocs, veclen))\n", " for d in tqdm(range(numdocs)):\n", " doc = X[d]\n", " doclen = min(len(doc), veclen)\n", " textpad[d,:doclen] = [wordmap.get(word, 1) for word in doc[:doclen]]\n", " # textpad[d,:doclen] = [wordmap[word] if word in vocab else 1 for word in doc[:doclen]]\n", " return textpad\n", "\n", " def reverse(self, textpad):\n", " texts = []\n", " for row in textpad:\n", " int2text = ['' if w==0 else self.index_word[w] for w in row]\n", " texts.append(' '.join(int2text).strip())\n", " return texts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "t1TbHH0S23f9", "outputId": "73a6aa85-4ead-4027-aac6-0188633a8e57" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "9283\n", "9182\n" ] } ], "source": [ "map_outcome2unpat = {'Denied': 0,\n", " 'Denied on Rehearing': -1,\n", " 'Mixed': 0,\n", " 'Granted': 1,\n", " 'Granted on Rehearing': -1,\n", " 'Indefinite': -1,\n", " }\n", "ptdf['Unpatentable'] = ptdf['Decision'].map(map_outcome2unpat)\n", "\n", "selind = ptdf[ptdf['Unpatentable'] != -1].index\n", "\n", "print(len(ptdf))\n", "ptdf.drop(ptdf[ptdf['Unpatentable'] == -1].index, inplace=True)\n", "print(len(ptdf))\n", "ptdf.reset_index(inplace=True)\n", "tokdocs = [tokdocs[ind] for ind in range(len(tokdocs)) if ind in selind]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_jbe0xu14bkn" }, "outputs": [], "source": [ "def f_pet(x):\n", " case, partyname = x\n", " if ' v.' in case:\n", " try:\n", " pet,po = case.split(' v.')\n", " except:\n", " print(case)\n", " return pet.strip()\n", " else:\n", " return ' '.join(partyname)\n", "\n", "def f_po(x):\n", " case = x\n", " if ' v.' in case:\n", " pet,po = case.split(' v.')\n", " return po.strip()\n", " else:\n", " return 'UNKNOWN'\n", "\n", "ptdf['petitioner_raw'] = ptdf[['name', 'party_names']].apply(f_pet, axis=1)\n", "ptdf['patent_owner_raw'] = ptdf['name'].apply(f_po)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "JhZI2iUptJbt" }, "outputs": [], "source": [ "common_terms = ['corporation','corp.',' corp ', '& co','co.',' co ','company',\n", " 'l.l.c.', 'llc', 'l.c', ' lc',\n", " 'l.l.p.', 'llp', 'l.p.', ' lp',\n", " 'incorporated', 'inc.', ' inc ',\n", " 'limited', 'ltd',\n", " ' sa ', ' se ', ' ag ',\n", " 'gmbh', 'a/s', 'bv', ' nv', 'n.v.',\n", " 'et al',\n", " 'n.a.', ' us ', ' usa ',\n", " '(us)', '(usa)', '(u.s.)', '(u.s.a.)',\n", " '(california)', '(delaware)', '(united states)',\n", " ' i,', 'ii', 'iii',\n", " '1)', '2)',\n", " ]\n", "replace_common_terms = '|'.join(common_terms).replace('/','\\/').replace(' ','\\s').replace('.','\\.').replace('(','\\(').replace(')','\\)')\n", "replace_common_terms += '|\\s\\d+\\s'\n", "regexp_common_terms = re.compile(replace_common_terms, re.IGNORECASE)\n", "\n", "split_terms = ['d/b/a/', 'd/b/a', 'doing business as', 'formerly known as', 'f/k/a/', 'f/k/a', ' and ']\n", "split_terms_list = '|'.join(split_terms).replace('/','\\/').replace(' ','\\s')\n", "regexp_split_terms = re.compile(split_terms_list, re.IGNORECASE)\n", "\n", "def f_clean(x):\n", " # no cleaning up special characters\n", " # add a trailing whitepace to eliminate edge effects for \"lp\" and \"inc\"\n", " x += ' '\n", " # remove common terms\n", " if any([t in x.lower() for t in common_terms]):\n", " # remove commas and periods associated with these terms\n", " x = re.sub(regexp_common_terms, '', x)\n", " x = x.replace(', ',' '); x = x.replace('. ',' ')\n", " x = x.strip().strip(',').strip('.')\n", "\n", " if any([t in x.lower() for t in split_terms]):\n", " x = re.split(regexp_split_terms, x)\n", " x = ';'.join(x)\n", "\n", " return x\n", "\n", "ptdf['petitioner'] = ptdf['petitioner_raw'].apply(f_clean)\n", "ptdf['patent_owner'] = ptdf['patent_owner_raw'].apply(f_clean)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 66, "referenced_widgets": [ "1b33de2a3a4a470aa6f07253885fe1bc", "f63479552d154f019a083d60b5f7262e", "64723b9f5a9742df9ba5425820b8674d", "a30de5d77e1b46bd83cf8ecbeed63171", "237488437a3a452493d875733ed29275", "7c0035c0c2f842448c19f92eb1ac54d9", "acfb2bfc634642cd86aad9009938c455", "4ef7df9baa4444f5979dabf24975ffeb", "46c770d287cb402a91ccbf7feece5753", "84be481bd1844259a25de6207ce09998", "3dbc67b9c8c646bd934013cb45df9907" ] }, "id": "CY0JTAKe92st", "outputId": "cb0088e3-0858-4fca-e8c1-ac44dab4ce2a" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "20002\n" ] }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1b33de2a3a4a470aa6f07253885fe1bc", "version_minor": 0, "version_major": 2 }, "text/plain": [ " 0%| | 0/2631 [00:00{string} \"\n", "\n", "def colorize(attrs, cmap='PiYG'):\n", " \"\"\"\n", " Compute hex colors based on the attributions for a single instance.\n", " Uses a diverging colorscale by default and normalizes and scales\n", " the colormap so that colors are consistent with the attributions.\n", " \"\"\"\n", " import matplotlib as mpl\n", " cmap_bound = np.abs(attrs).max()\n", " norm = mpl.colors.Normalize(vmin=-cmap_bound, vmax=cmap_bound)\n", " cmap = mpl.cm.get_cmap(cmap)\n", "\n", " # now compute hex values of colors\n", " colors = list(map(lambda x: mpl.colors.rgb2hex(cmap(norm(x))), attrs))\n", " return colors" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "3STTYnjNgrf3", "outputId": "7f92b225-4db7-4d13-c67a-5bef8c33aa6a" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.31546426\n" ] }, { "output_type": "execute_result", "data": { "text/html": [ "states filed on behalf of by i a of and to and in of merits and other circumstances because establish the level of in the not of independent storing the caller information in association with the voice mailbox wherein the caller information is stored in association with a voice message left by the calling party for the called party in the voice mailbox ii automatically dialing the telephone number at a request of the called party while the called party is listening to the voice message the of and not of independent automatically dialing the telephone number at a request of the called party while the called party is listening to the voice message the of and not and independent conclusion passim fn scripting b -iii- et et of of the patents et to boost and for states and https last visited and et -iv- et et et et of of et of of of of and ed once a friendly a https discussing the historical role of the operator and -v- i respectfully submits this preliminary to or for or of the patent the should exercise its discretionary power to deny institution under a because two separate litigations will be decided by a jury before any final written decision would issue on the first eleven months before any and the second seven months before any at precedential denying institution where trial in parallel proceeding was scheduled to begin two months before final written decision in the first litigation has proffered an invalidity report from the same expert challenging the same claims based on the same prior art presented in the see and deposition in support of his report in that litigation has already been conducted stay has been requested during the year that case has been pending by the statutory date for decision on institution of the on claim construction fact discovery expert discovery and the pretrial conference will be completed in the first litigation instituting trial on this will not serve as an effective and efficient alternative to litigation frustrating a primary objective of the at precedential institution should also be denied because the has failed to demonstrate a reasonable likelihood that any claim of the patent is unpatentable because each ground presented in the fails to disclose or suggest key limitations of the challenged claims for these reasons institution should be denied requests that the deny institution of the with respect to all challenged claims and all asserted grounds full statement of the reasons for the relief requested is set forth in and of this a the should exercise its discretion to deny the because every factor favors the claims references combinations of references and invalidity grounds presented by this are exactly the same as those presented by in parallel district court proceedings involving the same by setting out and applying the six factors as detailed below while is not a named party in the or litigations by all evidence it is directing and funding the defense against claims in those proceedings has therefore placed itself in substantially the same posture as if it was a named defendant the time an institution decision is issued claim construction fact discovery and expert discovery will already be completed in the related of litigation involving the patent is scheduled for trial on months before the statutory date for a in this proceeding second litigation in the western of involving the patent is scheduled for trial on approximately seven months before the statutory date for a in this proceeding the timing of the parallel district court trials is a direct result of delay in filing the the was filed eight months after the litigation began and five months after its counsel entered an appearance in that litigation the is likely to fail on the merits as each unpatentability ground presented fails to disclose or suggest key limitations of the challenged claims of discretionary denial arguments are premised on the assertion that it is not a party to any litigation regarding the invalidity of the based on this assertion urges the to ignore the and litigations because deserves to be heard regarding the invalidity of the however there is no reasonable doubt that is directing and controlling defense of those proceedings the same counsel representing before the also represents eight defendants in the and litigations and has identified those eight defendants as real parties-in-interest for purposes of this pp with and commissioned the same expert relied upon here to prepare a report challenging the validity of the patent in the litigation and that report was served on by the defendants has been retained by on behalf of s counsel also submitted invalidity contentions in the parallel litigations on behalf of six of those defendants there can be no doubt that through its counsel at is funding directing and controlling defense of the litigations at a bare minimum there is a more than significant relationship between and the defendants as it pertains to the validity of the patent and the challenges thereto in the litigation scripting at precedential denying institution where there was a significant relationship between and with respect to the challenged patent that weighed in favor of discretionary indeed the notably the report states that the expert was retained by but to knowledge there is no such entity and the use of instead of is a typographical error in the report avoids any representation that and its attorneys are not directing and controlling the litigations merely stating that is not a party carefully worded statement that if is instituted will not pursue in any with any ground raised or that could have been reasonably raised in an is meaningless the eight real parties-in-interest to this six of whom are defendants have not agreed to such any such stipulation and are currently relying on an expert report commissioned by that asserts the exact grounds presented in this is pursuing every unpatentability ground raised in the in the parallel litigation through its direction and control of the defense of six defendants as evidenced by the expert report itself commissioned to support those defendants and the common counsel between the two proceedings stipulation is therefore if the were instituted would be free to continue pursuing the same unpatentability arguments through the real-parties-in-interest moreover has not made the broad stipulation recommended by the precedential and which require a stipulation that would not pursue in the litigation any ground raised or that could have been reasonably raised in the pp fn in short all evidence indicates that is directing funding and controlling the invalidity challenges including commissioning and submitting an expert report that matches the grounds of this exactly and has thereby voluntarily placed itself in the shoes of the eight and defendants that are real parties-in-interest to the while is free to proceed with its chosen approach respectfully submits that the in considering whether efficiency fairness and the merits support the exercise of authority to deny institution of the should analyze the factors in view of the fact that is currently challenging the claims of the patent in the litigation on the same unpatentability grounds presented in the viewed in that light factor weighs heavily in favor of discretionary denial moreover factor would weigh in favor of denial even if were completely in the litigation instructs that e ven when a petitioner is unrelated to a defendant however if the issues are the same as or substantially similar to those already or about to be litigated or other circumstances weigh against redoing the work of another tribunal the may nonetheless exercise the authority to deny institution at submission in the litigation of an expert report that exactly matches the grounds raised in the weigh against redoing the work of the court assertion that this is the only vehicle by which it can challenge patentability of the patent is simply untrue and to and in as noted above the parallel litigations are at an advanced stage the litigation is scheduled to be tried to a jury eleven months before any would issue from the decision on institution for this is expected on see b discovery concluded more than a month before the filing date of this on and infringement expert discovery has closed and expert depositions of and have already been conducted before the decision on institution the pretrial conference will have occurred and the parties will be preparing for the trial several recent decisions found this factor to weigh in favor of denial when the scheduled trial date would precede the by less than the eleven-month difference here five months at seven months at ten months each of those decisions was issued before the recent acceleration in vaccine progress which resolve concerns that not enough of the population would be vaccinated by to conduct a jury trial the trial which also involves validity challenges to the patent will also be tried to a jury seven months before any here the presiding judge in the litigation has already resumed holding jury trials including a recent patent trial which concluded on even if trial were slightly delayed there is no evidence to suggest that such potential delay would extend more than eleven months the generally takes a court trial schedule at face value absent strong contradictory evidence which does not exist here at see also because the exact issues presented by the will be tried to at least one jury well before any would issue on the and because two different courts multiple other real-party-in-interest defendants and will have invested enormous resources into the parallel litigations before the decision on institution factors and weigh heavily in favor of discretionary denial each and every unpatentability challenge presented in the is also being pursued by in the copending litigation commissioned an invalidity report from the same expert challenging thesame claims based on the same prior art presented in the and that expert report is being relied upon by the defendants at the submitted in the litigation by counsel on behalf of and challenge claims and of the patent based on the exact same combinations presented in in the and with the does not allege otherwise and indeed does not address issue overlap in its analysis of the factors the exact same unpatentability issues presented by the exact same parties and will therefore be decided by a jury eleven months before any would issue from the this factor weighs heavily in favor of discretionary denial and should be should not waste its finite resources evaluating the exact same issues that will be decided in parallel district court proceedings by the same parties nearly a year before any would issue of this factor weighs heavily in favor of denial none of the defendants in the copending litigations including the eight defendants that counsel is actively defending in those litigations have requested a stay even though the earliest case against those defendants was filed over eleven months ago on and later-filed stay will almost certainly be denied due to the advanced stage of the litigations decision on institution for this is expected on see b before that date claim construction fact and expert discovery and the pretrial conference will already be completed in the litigations and the parties will be preparing for trial even if the trial were somewhat delayed it is a near certainty that the invalidity issues raised by the will be decided by a jury well before any on this would issue in because there is a possibility that the district court cases will be stayed pending resolution of this this factor weighs heavily in favor of denial merits and other circumstances as detailed below the does not disclose or suggest key limitations of each of the challenged claims and therefore does not establish a reasonable likelihood that the would prevail with respect to at least one of the claims challenged in the moreover any timing issues of by are of its own making delayed filing this for seven months after the first district court complaint was filed and five months after counsel entered an appearance in the litigation to begin defense of the defendants identified as real-parties-in-interest to the the should exercise its discretionary power to deny the because establish as detailed below the fails to demonstrate a reasonable likelihood that the would prevail with respect to any claim of the patent the challenges independent claim and dependent claims and based on three as detailed herein each proposed fails to disclose key limitations of each challenged claim should not be instituted the the was filed on and issued on the is titled dialing the pertains to a telephone and voice mail system that allows a user at the called user extension to call back an individual that left a voice mail messaging by using caller information that is stored with the voice mail message figure depicts an example of how caller information and voice mail messages are stored in some embodiments as shown the discloses the system receiving caller information with an incoming phone call to an extension if the intended recipient extension number of the call does not answer the voice mail system will answer the call allowing the caller to leave a voice mail message id at the system will also store caller information if available in association with the voice mail message id at figures and depict a process for a caller using caller information according to some embodiments id at in some embodiments the user at the extension number called can press a key while listening to the voice mail message which retrieves the caller information that is associated with the voice mail message the caller information is then used to dial the phone number of the caller that left the voice mail message for the user extension id at level of in the for the purposes of this only does not dispute the level of skill of a person of ordinary skill in the art identified in the the does not affirmatively offer a construction of any term of the patent but does present arguments as to the proper interpretation of certain terms pp does not necessarily agree with proposed interpretations or their rationales and reserves the right to contest those interpretations in the future however because the fails to raise a reasonable likelihood that any claim of the patent is unpatentable even under proposed interpretations this applies term interpretations to minimize the disputes to be resolved by the at this preliminary stage not of the does not render obvious any of independent claim or dependent claims and independent does not render obvious claim at least because it does not teach or suggest storing the caller information in association with the voice mailbox wherein the caller information is stored in association with a voice message left by the calling party for the called party in the voice mailbox or automatically dialing the telephone number at a request of the called party while the called party is listening to the voice message i storing the caller information in association with the voice mailbox wherein the caller information is stored in association with a voice message left by the calling party for the called party in the voice mailbox recites storing the caller information in association with the voice mailbox wherein the caller information is stored in association with a voice message left by the calling party for the called party in the voice mailbox claim emphasis added does not teach or suggest such a feature the does not allege that expressly discloses storing the caller information in association with the voice mailbox or the caller information is stored in association with a voice message left by the calling party for the called party in the voice mailbox pp failing to provide a citation to expressly disclosing these limitations discloses ing a call to an organization internal voice mail extension and entering the extension number of the called party to tell the voice mail system which voice mailbox to use however does not disclose or suggest storing caller information in association with the called party voice mailbox or in association with a voice message as required by claim instead a would understand that the system stores caller information including the name phone number of the calling party and length of the call in a separate call log the call log includes a data field referred to as which designates among other things whether the call was an outgoing call a missed call or was sent to voice mail at best discloses that when a user selects the button on the the system accesses the call log and presents a list of the call log information for those calls in the call log that were designated after that call log information is presented the user may go to the voice mail system to hear his messages a would understand that the voice mail system of which is where the voice mailbox stores voice messages for users is accessed only after the user has selected the call from the separate call log information and does not teach or suggest storing caller information in association with the called party voice mailbox or in association with a voice message as required by claim ii automatically dialing the telephone number at a request of the called party while the called party is listening to the voice message further recites automatically dialing the telephone number at a request of the called party while the called party is listening to the voice message does not clearly disclose that a call can be returned while the called party is listening to the voice message the and cite a passage of that states the following when a voice mail message is selected to be heard by the double-click of a mouse or otherwise the call management computer creates a voice pathway to the user telephone instrument if one is not already present which it then uses to play back the selected voice mail messages from the call management database or other storage during playback many new capabilities are provided the user including returning the calls with the click of a mouse citing reviewing this passage would not understand to clearly disclose that returning the calls with the click of the mouse could occur during playback of any particular message instead a would interpret this section as disclosing only that calls can be returned by the click of a mouse at some point in the playback of a group of messages by the user selecting from the call log and not the voice mailbox the number of the person whose call the user desired to return citing this call log the user may simply double-click to return missed calls with the system automatically them emphasis added would not interpret as clearly disclosing that during playback of a particular message a user can automatically return the call of the person that left that same message does not disclose storing the telephone number received as part of the caller information i in the manner required by claim therefore for at least the reasons detailed with respect to this claim element does not disclose or suggest that the telephone number number that is received as part of the caller information and stor ed in association with the voice mailbox and in association with a voice message left by the calling party for the called party in the voice mailbox can be automatically dialed at a request of the called party while the called party is listening to the voice message accordingly a would understand that does not teach or suggest all limitations of claim and depend from and add limitations to claim and fails to teach or suggest those claims for at least the same reasons and are therefore patentable over the of and not of the combination of and the references does not render obvious any of claims or independent the combination of and the references does not render obvious claim at least because neither or taken alone or in combination teach or suggest connecting the incoming call to a voice mailbox i connecting the incoming call to a voice mailbox recites connecting the incoming call to a voice mailbox claim for background claim also requires both receiving an incoming call and then connecting the incoming call to a voice mailbox this two-step series of events makes sense in the context of the patent which describes a centralized system for receiving and routing incoming calls for a plurality of user extensions with each extension having a separate mailbox on the system all stored on a hard disk the centralized system is connected to electronic key telephones associated with user extensions incoming calls are received from a through and the calls are received by the system using a caller modem that is coupled to digital crosspoint matrix and after the system receives the incoming call the system must make a further connection the system can process the call and connect it to the appropriate user if that user does not answer the system performs steps to connect the incoming call to the specific voice mailbox in hard disk via bus which is connected to a microprocessor that controls access to specific therefore the method of claim requires both receiving the incoming call and then a further connection step in order to access a particular user voice mailbox in contrast both and disclose single phone units analogous to the of the patent that themselves receive calls and then record voice messages on that same phone unit claim versatile telephone unit comprising… claim telephone answering device coupled to a telephone line for automatically answering incoming telephone calls and storing and retrieving information from the incoming telephone calls comprising… once the call is received and connected to the phone units of and no further connection to a voice mailbox is required and no such connection is disclosed because the phone itself functions as the voice mailbox the points to of which discloses a step named citing but step of is part of the answering routine and merely describes connecting a telephone circuit to the telephone unit receiving the call if the user does not pick up the handset telephone unit automatically plays an outgoing message requesting a voicemail and stores that voicemail all of these steps are handled by and any voice message is stored in see also the second response control means automatically connects the line to the remote telephone automatically transmits a preset response message to the remote telephone automatically stores in the memory medium an incoming message from a caller after transmission of the preset response message is completed and then automatically disconnects the line from the remote telephone emphasis added teaches a similar method wherein after an incoming call is received at the phone no further connection is made to a voice mailbox because the phone itself is the voice mailbox ii automatically dialing the telephone number at a request of the called party while the called party is listening to the voice message further recites automatically dialing the telephone number at a request of the called party while the called party is listening to the voice message neither nor taken alone or in combination teach or suggest such features is directed to a versatile telephone unit subjected to a caller telephone number reception service emphasis added repeatedly points to a that handles the various dialing tasks id at would understand that the phone of is to be used with a traditional with that handle dialing as a in would have read disclosures of a reception service combined with a to be referring to a manually operated service as operated by a traditional operator defining as an assembly of switch panels that is manually operated at a telephone exchange where all subscriber circuits and other exchanges terminate so that operators establish communication links between two subscribers onto the same or different exchanges defining an operator as a person who controls a machine or system such as a computer or telephone discussing the historical role of the operator and the petition citation to for claim d indicates that a caller telephone number is not automatically dialed during playback of a voice message in the normal meaning of automatically dialed to a citing instead merely describes that a user is able to transmit the caller telephone number corresponding to the incoming message to the to call a corresponding remote telephone would understand that any telephone number received at the would merely cause a at the to dial the phone number and therefore would not be automatically dialed as required by claim d the citations to other portions of that say the push of a button on the phone dials a telephone number appears to be shorthand used in drafting the specification discloses only one method of making telephone calls through its system and that is through manual does not remedy the deficiencies of specifically does not disclose or suggest automatically dialing the telephone number at a request of the called party while the called party is listening to the voice message as required by claim d merely discloses that a data record matching the caller who left the voice mail can be displayed while the message is being played back then in a next step discloses that a user can press something referred to as to cause the system to dial a phone number of the calling party therefore does not disclose that the functionality is available while the called party is listening to the voice message in fact a viewing would understand that as shown in of the voice message is fully played back and then after that step is completed a user can select a number to return the call of the calling party in addition a would not have sought to combine the manual teachings of with the automated system of the provides no rationale for how the alleged automatic dialing feature of could be implemented into the manual system of and neither the nor appears to have recognized that describes a phone for use with a manual reception service would not have been motivated to make such a modification and would not have a " ], "text/plain": [ "" ] }, "metadata": {}, "execution_count": 58 } ], "source": [ "N = 8\n", "n = 7 # document index\n", "print(pred [n])\n", "xlen = np.where(X_test[n]==0)[0][0]\n", "attvec = att[N][n][:xlen]\n", "xvec = tok2int.reverse([X_test[n][:xlen]])[0]\n", "strlen = len(attvec)\n", "THRESH = np.median(attvec)\n", "colors = colorize(attvec - THRESH)\n", "\n", "HTML(\"\".join(list(map(hlstr, xvec.split(), colors))))" ] }, { "cell_type": "markdown", "source": [ "Highest attention words in sample" ], "metadata": { "id": "HSMCjnRAOSm1" } }, { "cell_type": "code", "source": [ "uniquetokens = np.unique(xtest[n][:xlen])\n", "print(len(uniquetokens))\n", "tokpos = [np.where(xtest[n][:xlen] == tok)[0] for tok in uniquetokens]\n", "meanatt = np.array([np.mean(att[2][n][np.array(tpos)]) for tpos in tokpos])\n", "tokens_sorted_by_meanatt = uniquetokens[np.argsort(-meanatt)]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "m8U2xDZ_L7Nu", "outputId": "9aef17aa-cddb-467d-f8c6-e43786812cc0" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "605\n" ] } ] }, { "cell_type": "code", "source": [ "pd.DataFrame.from_dict({'Attention':[tok2int.index_word[t] for t in tokens_sorted_by_meanatt[:20]]})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 677 }, "id": "UJQzk6d8n7ZL", "outputId": "76ef312a-d2bc-4991-81b2-c6fa8cf0597d" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Attention
0processor
1entirety
2patents
3foregoing
4cryptographic
5cure
6makes
7amounts
8keys
9briefs
10certificate
11another
12requires
13unsupported
14except
15situated
16mentioned
17fully
18unpatentable
19device
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ], "text/plain": [ " Attention\n", "0 processor\n", "1 entirety\n", "2 patents\n", "3 foregoing\n", "4 cryptographic\n", "5 cure\n", "6 makes\n", "7 amounts\n", "8 keys\n", "9 briefs\n", "10 certificate\n", "11 another\n", "12 requires\n", "13 unsupported\n", "14 except\n", "15 situated\n", "16 mentioned\n", "17 fully\n", "18 unpatentable\n", "19 device" ] }, "metadata": {}, "execution_count": 37 } ] }, { "cell_type": "markdown", "source": [ "##LIME Analysis" ], "metadata": { "id": "PLCP7ucjOGGx" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "44AaMHd3wGA5" }, "outputs": [], "source": [ "def lean_wrapper(texts):\n", " x = pad_sequences(DTP.texts_to_sequences(texts),\n", " maxlen = Params['text_length'],\n", " padding='post',\n", " truncating='post')\n", " return np.hstack((1-model.predict(x), model.predict(x)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "sEeA4AEzzRBZ" }, "outputs": [], "source": [ "n = 2774\n", "xvec = DTP.sequences_to_texts([X_data[n]])[0]\n", "\n", "exp = LimeTextExplainer(class_names={0:'Denied',1:'Granted'})\n", "exp_doc = exp.explain_instance(xvec, lean_wrapper, num_features=50)\n", "# explist = exp_doc.as_list()\n", "exp_doc.show_in_notebook()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8mKglEF9buWx" }, "outputs": [], "source": [ "" ] } ], "metadata": { "accelerator": "TPU", "colab": { "collapsed_sections": [], "name": "PTAB_Model_Responses_github.ipynb", "provenance": [], "toc_visible": true, "mount_file_id": "1X-M2SntuvoGIIjwFAtPNNIHXyKqSTUsK", "authorship_tag": "ABX9TyOFmMroyymx7756xgTlcntK", "include_colab_link": true }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "1b33de2a3a4a470aa6f07253885fe1bc": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_f63479552d154f019a083d60b5f7262e", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_64723b9f5a9742df9ba5425820b8674d", "IPY_MODEL_a30de5d77e1b46bd83cf8ecbeed63171", "IPY_MODEL_237488437a3a452493d875733ed29275" ] } }, "f63479552d154f019a083d60b5f7262e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "64723b9f5a9742df9ba5425820b8674d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_7c0035c0c2f842448c19f92eb1ac54d9", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": "100%", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_acfb2bfc634642cd86aad9009938c455" } }, "a30de5d77e1b46bd83cf8ecbeed63171": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_4ef7df9baa4444f5979dabf24975ffeb", "_dom_classes": [], "description": "", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 2631, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 2631, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_46c770d287cb402a91ccbf7feece5753" } }, "237488437a3a452493d875733ed29275": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_84be481bd1844259a25de6207ce09998", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 2631/2631 [00:05<00:00, 477.85it/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_3dbc67b9c8c646bd934013cb45df9907" } }, "7c0035c0c2f842448c19f92eb1ac54d9": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "acfb2bfc634642cd86aad9009938c455": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "4ef7df9baa4444f5979dabf24975ffeb": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "46c770d287cb402a91ccbf7feece5753": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "84be481bd1844259a25de6207ce09998": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "3dbc67b9c8c646bd934013cb45df9907": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } } } } }, "nbformat": 4, "nbformat_minor": 0 }