{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/bahrad/PTAB/blob/master/PTAB_Model_Decisions_github.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "p-aVXISlSKYH"
      },
      "source": [
        "#Initialization"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wOFk8JfaSMSV"
      },
      "source": [
        "##Imports"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ESf61HRPSERn"
      },
      "outputs": [],
      "source": [
        "%tensorflow_version 2.x\n",
        "\n",
        "%xmode Context\n",
        "# Verbose\n",
        "\n",
        "import tensorflow as tf\n",
        "from tensorflow import keras\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "import pickle\n",
        "import os\n",
        "import itertools\n",
        "\n",
        "from collections import Counter, defaultdict\n",
        "import random\n",
        "from pandas import DataFrame\n",
        "import datetime\n",
        "from datetime import datetime\n",
        "import dateutil\n",
        "from dateutil.parser import parse as dateparse\n",
        "from tqdm.notebook import tqdm\n",
        "import time\n",
        "\n",
        "import xgboost as xgb\n",
        "\n",
        "import sklearn as sk\n",
        "from sklearn.preprocessing import MultiLabelBinarizer, QuantileTransformer, OneHotEncoder, StandardScaler\n",
        "from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV\n",
        "from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "import sklearn.metrics\n",
        "from sklearn.metrics import accuracy_score,classification_report, make_scorer, balanced_accuracy_score, f1_score, coverage_error, roc_auc_score, confusion_matrix, plot_confusion_matrix\n",
        "from sklearn.cluster import KMeans\n",
        "from sklearn.decomposition import PCA\n",
        "from sklearn.utils import resample, shuffle\n",
        "from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin\n",
        "from sklearn.neighbors import NearestNeighbors\n",
        "from sklearn.manifold import TSNE\n",
        "from sklearn.utils import class_weight\n",
        "\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "\n",
        "from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, BorderlineSMOTE\n",
        "from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, CondensedNearestNeighbour, AllKNN\n",
        "from imblearn.combine import SMOTEENN, SMOTETomek\n",
        "from imblearn.pipeline import make_pipeline,Pipeline\n",
        "\n",
        "from tensorflow.keras.preprocessing.text import Tokenizer\n",
        "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
        "\n",
        "import string\n",
        "import re\n",
        "# import unicodedata\n",
        "\n",
        "import nltk\n",
        "nltk.download('stopwords')\n",
        "from nltk.corpus import stopwords\n",
        "STOPWORDS = set(stopwords.words('english'))\n",
        "\n",
        "nltk.download('averaged_perceptron_tagger')\n",
        "nltk.download('wordnet')\n",
        "nltk.download('punkt')\n",
        "\n",
        "!pip install lime\n",
        "import lime\n",
        "from lime import lime_text\n",
        "from lime.lime_text import LimeTextExplainer\n",
        "from lime.explanation import Explanation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "8KTO164A7t1G"
      },
      "outputs": [],
      "source": [
        "# COMMENT OUT FOR PUBLIC CODE\n",
        "from google.colab import drive, files\n",
        "# drive.mount('/content/drive')\n",
        "\n",
        "# FILELOC = \"DATA/\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Xvw4VK92r4-b"
      },
      "outputs": [],
      "source": [
        "try:\n",
        "    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection\n",
        "    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])\n",
        "    tf.config.experimental_connect_to_cluster(tpu)\n",
        "    tf.tpu.experimental.initialize_tpu_system(tpu)\n",
        "    tpu_strategy = tf.distribute.TPUStrategy(tpu)\n",
        "    tpu_env=True\n",
        "except ValueError:\n",
        "    print('Not connected to a TPU runtime.')\n",
        "    tpu_env=False"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "oSiHsBoY4_bv"
      },
      "source": [
        "#Functions"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "E3qcPlglEzjC"
      },
      "source": [
        "##Define Models"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Va27pqfoegI_"
      },
      "outputs": [],
      "source": [
        "def EmbedNN(Params):\n",
        "\n",
        "    inpTensor = keras.Input(shape=(Params['text_length'],))\n",
        "    \n",
        "\n",
        "    \n",
        "    if Params['pretrained_embeddings']:\n",
        "        embedding = keras.layers.Embedding(Params['vocab_size'],\n",
        "                                           Params['embedding_dim'],\n",
        "                                           weights=[Params['embeddings']],\n",
        "                                           input_length=Params['text_length'],\n",
        "                                           mask_zero=True,\n",
        "                                           trainable=False,\n",
        "                                           )        \n",
        "    else:\n",
        "        embedding = keras.layers.Embedding(Params['vocab_size'],\n",
        "                                           Params['embedding_dim'],\n",
        "                                           mask_zero=True,\n",
        "                                           trainable=True,\n",
        "                                           name='embedding',\n",
        "                                           )\n",
        "    x = embedding(inpTensor)\n",
        "\n",
        "    convs = []\n",
        "    filter_sizes = list(range(Params['min_filter_size'],Params['max_filter_size']+1))\n",
        "    for filter_size in filter_sizes:\n",
        "        l_conv = keras.layers.Conv1D(filters=Params['num_filters'], \n",
        "                        kernel_size=filter_size,\n",
        "                        kernel_regularizer=keras.regularizers.l2(Params['kernel_L2_reg']),\n",
        "                        activation='relu')(x)\n",
        "        h = keras.layers.TimeDistributed(keras.layers.Dense(Params['num_filters'],\n",
        "                                                            activation='tanh'))(l_conv)\n",
        "        attention = keras.layers.TimeDistributed(keras.layers.Dense(1, activation='tanh'))(h)\n",
        "        attention = keras.layers.Flatten()(attention)  \n",
        "        attention = keras.layers.Softmax(axis=1,\n",
        "                                         name='attention_'+str(filter_size))(attention)\n",
        "        attention = keras.layers.RepeatVector(Params['num_filters'])(attention)\n",
        "        attention = keras.layers.Permute([2, 1])(attention)\n",
        "        representation = keras.layers.multiply([h, attention])\n",
        "        representation = tf.math.reduce_sum(representation, axis = 1)\n",
        "        convs.append(representation)\n",
        "        # l_pool = keras.layers.GlobalMaxPooling1D()(l_conv)\n",
        "        # convs.append(l_pool)\n",
        "    l_merge = keras.layers.concatenate(convs, axis=1)\n",
        "    \n",
        "    x = keras.layers.Dropout(Params['dropout_after_convs'])(l_merge) \n",
        "\n",
        "    dense1 = keras.layers.Dense(Params['num_dense'],\n",
        "                                kernel_constraint=Params['kernel_constraint'],\n",
        "                                activation = 'relu')(x)\n",
        "    x = dense1\n",
        "    dropout1 = keras.layers.Dropout(Params['dropout_after_Dense'])(x)\n",
        "    x = dropout1\n",
        "\n",
        "    if not Params['ifMulticlass']:\n",
        "        finalOut = keras.layers.Dense(1, activation='sigmoid',\n",
        "                                    bias_initializer=tf.keras.initializers.Constant(Params['initial_bias'])\n",
        "                                    )(x)\n",
        "    else:\n",
        "        finalOut = keras.layers.Dense(Params['nclasses'], activation='softmax')(x)\n",
        "\n",
        "    # define the model's start and end points    \n",
        "    model = keras.Model(inpTensor,finalOut)\n",
        "\n",
        "    return model"
      ]
    },
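    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "A small numpy sketch (illustrative shapes only, not part of the training pipeline) of the attention pooling used inside `EmbedNN`: the `RepeatVector`/`Permute`/`multiply`/`reduce_sum` sequence is just an attention-weighted sum of the per-timestep features in each convolution branch."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Numpy sketch (made-up shapes) of the attention pooling in EmbedNN:\n",
        "# softmax weights over timesteps, broadcast across filters, summed over time.\n",
        "rng = np.random.default_rng(0)\n",
        "L, F = 5, 3                                 # timesteps x filters (tiny, illustrative)\n",
        "h = rng.normal(size=(L, F))                 # per-timestep features (TimeDistributed Dense output)\n",
        "scores = rng.normal(size=(L,))              # per-timestep attention scores (pre-softmax)\n",
        "w = np.exp(scores) / np.exp(scores).sum()   # Softmax over the time axis\n",
        "rep = np.tile(w[:, None], (1, F))           # RepeatVector + Permute -> shape (L, F)\n",
        "weighted_sum = (h * rep).sum(axis=0)        # multiply + reduce_sum over time\n",
        "assert np.allclose(weighted_sum, h.T @ w)   # identical to a plain weighted sum\n",
        "print(weighted_sum)"
      ]
    },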
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xAfa4FkoIhU6"
      },
      "source": [
        "#Define Parameters"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fDoDihkSvq7h"
      },
      "outputs": [],
      "source": [
        "Params = {}"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ndFDWyjpIhU8"
      },
      "outputs": [],
      "source": [
        "Params['num_epochs'] = 50\n",
        "\n",
        "Params['learning_rate'] = 1e-4\n",
        "if tpu_env:\n",
        "    Params['batch_size'] = 48\n",
        "else:\n",
        "    Params['batch_size'] = 48\n",
        "\n",
        "Params['embedding_dim'] = 128 # 128\n",
        "\n",
        "# CNN parameters\n",
        "Params['min_filter_size'] = 2\n",
        "Params['max_filter_size'] = 12 # 12\n",
        "Params['num_filters'] = 256 # 256\n",
        "Params['dropout_after_convs'] = 0.4 # 0.4\n",
        "# Dense Layer Parameters\n",
        "Params['num_dense'] = 256 # 256\n",
        "Params['dropout_after_Dense'] = 0.4\n",
        "\n",
        "# Transformer+Attention Model parameters\n",
        "Params['embdim'] = 2000\n",
        "Params['mask_zero'] = True\n",
        "Params['numheads'] = 8\n",
        "Params['ffdim'] = 64\n",
        "Params['trans_drop'] = 0.4\n",
        "Params['Nt'] = 1\n",
        "Params['ifPreCNN'] = False\n",
        "if Params['ifPreCNN']:\n",
        "    Params['W'] = 500\n",
        "    Params['Nc'] = 1\n",
        "    Params['Nl'] = 1\n",
        "Params['num_dense_embed'] = 64 # 256\n",
        "Params['dropout_after_Dense_embed'] = 0.0 #0.2\n",
        "\n",
        "Params['kernel_constraint'] = keras.constraints.max_norm(1.0)\n",
        "Params['kernel_L2_reg'] = 0.1\n",
        "Params['bias_L2_reg'] = 0.1\n",
        "Params['activity_L2_reg'] = 0.1\n",
        "\n",
        "Params['ifMulticlass'] = False\n",
        "Params['nclasses'] = 2\n",
        "\n",
        "Params['sample_weighting'] = True\n",
        "\n",
        "Params['loss'] = keras.losses.BinaryCrossentropy(from_logits=False)\n",
        "if not tpu_env:\n",
        "    # otherwise have to define in the TPU environment\n",
        "    Params['metrics'] = [\n",
        "                        #  keras.metrics.TruePositives(name='tp'),\n",
        "                        #  keras.metrics.FalsePositives(name='fp'),\n",
        "                        #  keras.metrics.TrueNegatives(name='tn'),\n",
        "                        #  keras.metrics.FalseNegatives(name='fn'),\n",
        "                        keras.metrics.BinaryAccuracy(name='acc'),\n",
        "                        # keras.metrics.PrecisionAtRecall(0.5, name='par50'),\n",
        "                        #  keras.metrics.Precision(name='prec'),\n",
        "                        #  keras.metrics.Recall(name='rec'),\n",
        "                        keras.metrics.AUC(name='auc'),\n",
        "                        ]\n",
        "\n",
        "# Params['initial_bias'] = np.log(num1/num0)\n",
        "# Params['initial_bias'] = np.log(2) # default\n",
        "# Params['initial_bias'] = None\n",
        "\n",
        "Params['ifEarlyStopping'] = True\n",
        "# Params['ifEarlyStopping'] = False\n",
        "# Params['monitor'] = 'loss'\n",
        "Params['monitor'] = 'val_auc'\n",
        "Params['patience'] = 10\n",
        "early_stopping = tf.keras.callbacks.EarlyStopping(\n",
        "    monitor = Params['monitor'],\n",
        "    verbose = 2,\n",
        "    patience = Params['patience'],\n",
        "    mode = 'auto',\n",
        "    min_delta = 0,\n",
        "    restore_best_weights = True\n",
        "    )\n",
        "Params['callbacks'] = [early_stopping]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wsYToQkTO5iF"
      },
      "source": [
        "#Text Preprocessing & Tokenization"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "tDd45DbnVWDp"
      },
      "outputs": [],
      "source": [
        "DOCTYPE = 'Responses'\n",
        "\n",
        "Params['pretrained_embeddings'] = False\n",
        "\n",
        "# Params['num_words_to_use'] = None\n",
        "Params['num_words_to_use'] = 20000\n",
        "# Params['num_words_to_use'] = 1000\n",
        "\n",
        "\n",
        "# Params['text_length'] = 4000\n",
        "Params['text_length'] = 8000\n",
        "Params['text_start'] = 0 # 100\n",
        "Params['text_end'] = Params['text_start'] + Params['text_length']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "uor3Kbin2lfZ"
      },
      "outputs": [],
      "source": [
        "ptab = pd.read_csv(FILELOC + 'PTAB_Institution_Proceedings_to_20211231.tsv', sep='\\t')\n",
        "# print(len(ptab))\n",
        "# ptab.drop_duplicates('Proceeding', inplace=True)\n",
        "# print(len(ptab))\n",
        "# ptab['date'] = ptab['Case Filing Date'].apply(dateparse)\n",
        "\n",
        "# trainingvariable = 'Responses'"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Qx6RjOg82lcU"
      },
      "outputs": [],
      "source": [
        "# with open(FILELOC + 'IPR_Proceeding_PartyNames_12312022.txt', 'r', encoding=\"ISO-8859-1\") as f:\n",
        "#     outfile = [line.rstrip('\\n') for line in f]\n",
        "# case = []; number = []\n",
        "# for k in range(0,len(outfile),2):\n",
        "#     case.append(outfile[k])\n",
        "#     number.append(outfile[k+1])\n",
        "# casedf = pd.DataFrame.from_dict({'case':case, 'proc':number})\n",
        "# casedf.drop_duplicates('proc', inplace=True)\n",
        "# casedf['proc'] = casedf['proc'].apply(lambda x: x.split('(')[0].strip())\n",
        "# casedf['name'] = casedf['case'].apply(lambda x: x.strip(\"\\\"\"))\n",
        "# casedf[casedf.name.str.contains('Petition')].to_csv('a.csv')\n",
        "\n",
        "# common_names = set(['business', 'doing', 'company', 'corporation', 'formerly', 'et', 'al'])\n",
        "\n",
        "# def f(x):\n",
        "#     y = x\n",
        "#     if 'Petition' in x:\n",
        "#         if 'Covered' in x:\n",
        "#             y = x.replace(\"Petition for Covered Business Method Patent Review by\",\"\")\n",
        "#         elif 'Inter' in x:\n",
        "#             y = x.replace(\"Petition for Inter Partes Review by\", \"\")\n",
        "#     y = y.translate(str.maketrans('', '', string.punctuation))\n",
        "#     if 'v' in y:\n",
        "#         y = y.replace(\"v\", \"\")\n",
        "#     y = [s.strip() for s in y.strip().split(' ') if s != \"\" and s not in STOPWORDS|common_names]\n",
        "#     return y\n",
        "# casedf['party_names'] = casedf['name'].apply(f)\n",
        "\n",
        "# ptdf = pd.merge(ptab,casedf,left_on='Proceeding',right_on='proc',how='inner')\n",
        "# print(len(ptab), len(casedf), len(ptdf))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "YxdJ9pvjXJSk"
      },
      "outputs": [],
      "source": [
        "# # Drop error messages and duplicates\n",
        "\n",
        "# ptdf.drop(columns=list({'Petitions','Responses','Decisions'}-{DOCTYPE}), inplace=True)\n",
        "# ptdf.drop(columns=['case','proc'], inplace=True)\n",
        "\n",
        "# ptdf[DOCTYPE] = ptdf[DOCTYPE].fillna('NA')\n",
        "\n",
        "# # clean up texts by removing (cid:##) which is likely an artifact of the PDF reading process\n",
        "# cid_str = re.compile(\"\\(cid:\\d+\\)\")\n",
        "# def f(x):\n",
        "#     return re.sub(cid_str, \"\", x)\n",
        "# ptdf[DOCTYPE] = ptdf[DOCTYPE].apply(f)\n",
        "\n",
        "# def get_word_count(text):\n",
        "#     return len(text.split())\n",
        "# ptdf[f'{DOCTYPE}_Len'] = ptdf[DOCTYPE].apply(get_word_count)\n",
        "# MIN_LENGTH = 50\n",
        "\n",
        "# print(len(ptdf))\n",
        "# ptdf.drop(ptdf[ptdf[f'{DOCTYPE}_Len'] < MIN_LENGTH].index, inplace=True)\n",
        "# print(len(ptdf))\n",
        "# ptdf.drop_duplicates(DOCTYPE, keep=False, inplace=True)\n",
        "# print(len(ptdf))\n",
        "\n",
        "# ptdf.reset_index(inplace=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "pUFUMGsMiLmt"
      },
      "outputs": [],
      "source": [
        "Params['remove_stop_words'] = True\n",
        "Params['remove_alphanumeric'] = True\n",
        "Params['remove_punctuation'] = True\n",
        "Params['remove_shortword_size'] = 3\n",
        "Params['remove_propernouns'] = True\n",
        "\n",
        "Params['clean_all'] = True\n",
        "Params['remove_shortword_size'] = 3\n",
        "\n",
        "Params['use_lowercase'] = True"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "WMKf8CVm39nn"
      },
      "outputs": [],
      "source": [
        "# remove_shortword_size = Params['remove_shortword_size']\n",
        "# def process_docs(x):\n",
        "#     doc = x.replace(\"‘\", \"\\'\").replace(\"’\", \"\\'\").replace(\"´\", \"\\'\").replace(\"“\", \"\\\"\").replace(\"”\", \"\\\"\")\n",
        "#     t = nltk.tokenize.word_tokenize(doc)\n",
        "#     PUNCT = set(string.punctuation + u\"‘’´`“”–-§\")\n",
        "#     tags = nltk.tag.pos_tag(t)\n",
        "#     propernouns = set([a for a,b in tags if b=='NNP'])\n",
        "#     # shortwords = set([tt for tt in t if len(tt) <= remove_shortword_size])\n",
        "#     noisewords = set([tt for tt in t if (len(tt) <= 2) and any(map(lambda x: x in PUNCT, tt))])\n",
        "#     numwords = set([tt for tt in t if any(map(str.isdigit, tt))])\n",
        "#     emailwords = set([tt for tt in t if '@' in tt])\n",
        "#     dotwords = set([tt for tt in t if '.' in tt])\n",
        "    \n",
        "#     # reject_list = PUNCT|propernouns|STOPWORDS|shortwords|numwords|emailwords|dotwords\n",
        "#     reject_list = PUNCT|propernouns|numwords|emailwords|dotwords|noisewords\n",
        "#     proct = [tt for tt in t if tt not in reject_list]\n",
        "#     return proct\n",
        "\n",
        "# doclist = ptdf[DOCTYPE].tolist()\n",
        "# # docmap = map(process_docs, doclist)\n",
        "# # tokdocs = [doc for doc in tqdm(docmap)]\n",
        "# tokdocs = [process_docs(doc) for doc in tqdm(doclist)]\n",
        "# with open(FILELOC + 'Tokenized_Responses_20220212.pkl', 'wb') as f:\n",
        "#     pickle.dump([ptdf, tokdocs], f)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "LkiSq11eoXag"
      },
      "outputs": [],
      "source": [
        "# if Params['remove_propernouns'] or Params['clean_all']:\n",
        "#     def f(x):\n",
        "#         if 'v.' not in x:\n",
        "#             return 'NO_PARTY'\n",
        "#         else:\n",
        "#             y = x.split('v.')\n",
        "#             petitioner = y[0].split()[0].strip().replace(',', '')\n",
        "#             patentowner = y[1].split()[0].strip().replace(',', '')\n",
        "#             return [petitioner, patentowner]\n",
        "\n",
        "# parties_first = casedf['name'].apply(f).values\n",
        "# CASENAMES = set(itertools.chain.from_iterable(parties_first))\n",
        "\n",
        "# docs = ptdf[DOCTYPE].values\n",
        "# partyname_list = ptdf['party_names'].tolist()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_7WJ3Cqxbb5j"
      },
      "outputs": [],
      "source": [
        "# if Params['clean_all']:\n",
        "#     remove_shortword_size = Params['remove_shortword_size']\n",
        "#     def process_docs(x):\n",
        "#         doc = x\n",
        "#         t = nltk.tokenize.word_tokenize(doc)\n",
        "#         PUNCT = string.punctuation + u\"‘’´“”–-\"\n",
        "#         propernouns = set([a for a,b in nltk.tag.pos_tag(t) if b=='NNP'])\n",
        "#         shortwords = set([tt for tt in t if len(tt) <= remove_shortword_size])\n",
        "#         reject_list = set(PUNCT)|propernouns|STOPWORDS|shortwords\n",
        "\n",
        "#         proct = [tt for tt in t if tt.isalpha() and tt not in reject_list]\n",
        "#         return proct\n",
        "\n",
        "# else:\n",
        "#     remove_punct = Params['remove_punctuation']\n",
        "#     remove_stopwords = Params['remove_stop_words']\n",
        "#     remove_alphanumeric = Params['remove_alphanumeric']\n",
        "#     # set to False or None if not used otherwise remove this length or less\n",
        "#     remove_shortword_size = Params['remove_shortword_size']\n",
        "#     remove_proper = Params['remove_propernouns']\n",
        "\n",
        "#     def process_docs(x):\n",
        "#         doc, partynames = x\n",
        "#         t = nltk.tokenize.word_tokenize(doc)\n",
        "#         PUNCT = string.punctuation + u\"‘’´“”–-\"\n",
        "#         if remove_punct:\n",
        "#             proct = [tt for tt in t if tt not in set(PUNCT)]\n",
        "#         if remove_stopwords:\n",
        "#             proct = [tt for tt in proct if tt not in STOPWORDS]\n",
        "#         if remove_alphanumeric:\n",
        "#             proct = [tt for tt in proct if tt.isalpha()]\n",
        "#         if remove_shortword_size:\n",
        "#             proct = [tt for tt in proct if len(tt) > remove_shortword_size]\n",
        "#         propernouns = set([a for a,b in nltk.tag.pos_tag(proct) if b=='NNP'])\n",
        "#         if Params['keep_case_names']:\n",
        "#             propernouns = propernouns - (CASENAMES - set(partynames))\n",
        "#         if remove_proper:\n",
        "#             proct = [tt for tt in proct if tt not in propernouns]\n",
        "#         return proct\n",
        "\n",
        "# if Params['clean_all']:\n",
        "#     tokdocs = ptdf[DOCTYPE].apply(process_docs)\n",
        "# else:\n",
        "#     tokdocs = [process_docs([docs[ind], partyname_list[ind]]) for ind in tqdm(ptdf.index)]\n",
        "\n",
        "# # with open(FILELOC + 'Tokenized_Responses_20220131.pkl', 'wb') as f:\n",
        "# #     pickle.dump([ptdf, tokdocs], f)\n",
        "# # with open(FILELOC + 'Tokenized_Decisions_20220131.pkl', 'wb') as f:\n",
        "# #     pickle.dump([ptdf, tokdocs], f)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "panQ-y442lY6"
      },
      "outputs": [],
      "source": [
        "# with open(FILELOC + 'Tokenized_Responses_20220131.pkl', 'rb') as f:\n",
        "#     ptdf, tokdocs = pickle.load(f)\n",
        "\n",
        "# with open(FILELOC + 'Tokenized_Responses_noproper_20220131.pkl', 'rb') as f:\n",
        "#     ptdf, tokdocs = pickle.load(f)\n",
        "\n",
        "with open(FILELOC + 'Tokenized_Responses_20220212.pkl', 'rb') as f:\n",
        "    ptdf, tokdocs = pickle.load(f)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "yXE3Y-_523jO"
      },
      "outputs": [],
      "source": [
        "# take a list of tokenized documents (i.e. list of lists) and derive an integer\n",
        "# mapping dictionary (0 = not used, 1 = out of vocabular, 2+ are tokens) for the\n",
        "# all (if num_words=None) or num_words most common words\n",
        "# It will generate a 2D array of truncated / padded document vectors (vec_len)\n",
        "# If lowercase set to True then converts all tokens to lowercase\n",
        "# Out of vocabulary string is \"oov_str\" (default '<OOV>')\n",
        "\n",
        "class Token2Int(BaseEstimator,TransformerMixin):\n",
        "    def __init__(self, vec_len, num_words=None, oov_str='<OOV>', lowercase=True):\n",
        "        self.vec_len = vec_len\n",
        "        self.num_words = num_words\n",
        "        self.oov_str = oov_str\n",
        "        self.lowercase = lowercase\n",
        "\n",
        "    def fit(self, X, y=None):\n",
        "        if type(X[0]) is not list:\n",
        "            X = [X] # only a single document was passed\n",
        "        if self.lowercase:\n",
        "            X = [[d.lower() for d in doc] for doc in X]\n",
        "        wc = Counter(itertools.chain.from_iterable(X))\n",
        "        self.word_count = wc\n",
        "        vocab = [w for w,c in wc.most_common(self.num_words)]\n",
        "        vocab.insert(0, self.oov_str)       # assign 1 to OOV\n",
        "        self.vocab = vocab\n",
        "        self.vocab_size = len(vocab)\n",
        "        wordmap = {n:m+1 for m,n in enumerate(vocab)}\n",
        "        self.word_index = wordmap\n",
        "        self.index_word = {n:m for m,n in wordmap.items()}\n",
        "        return self\n",
        "\n",
        "    def transform(self, X):\n",
        "        if type(X[0]) is not list:\n",
        "            X = [X] # only a single document was passed\n",
        "        # X = np.array(list(itertools.zip_longest(*X, fillvalue=0))).T\n",
        "        if self.lowercase:\n",
        "            # X = np.vectorize(str.lower)(X)\n",
        "            X = [[d.lower() for d in doc] for doc in X]\n",
        "        wordmap = self.word_index\n",
        "        vocab = self.vocab\n",
        "        veclen = self.vec_len\n",
        "        numdocs = len(X)\n",
        "        # wordmap['0'] = 0\n",
        "        # # textpad = np.array([t[:veclen] if len(t) >= veclen else t + ['0']*(veclen-len(t)) for t in X]).astype(str)\n",
        "        # X = [[wordmap.get(x, 1) for x in t] for t in X]\n",
        "        # return pad_sequences(X, maxlen=veclen, padding='post', truncating='post')\n",
        "        textpad = np.zeros((numdocs, veclen))\n",
        "        for d in tqdm(range(numdocs)):\n",
        "            doc = X[d]\n",
        "            doclen = min(len(doc), veclen)\n",
        "            textpad[d,:doclen] = [wordmap.get(word, 1) for word in doc[:doclen]]\n",
        "            # textpad[d,:doclen] = [wordmap[word] if word in vocab else 1 for word in doc[:doclen]]\n",
        "        return textpad\n",
        "\n",
        "    def reverse(self, textpad):\n",
        "        texts = []\n",
        "        for row in textpad:\n",
        "            int2text = ['' if w==0 else self.index_word[w] for w in row]\n",
        "            texts.append(' '.join(int2text).strip())\n",
        "        return texts"
      ]
    },
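    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "A quick self-contained check of `Token2Int` on a tiny made-up corpus (not PTAB data), showing the fitted word index, the padded integer vectors produced by `transform`, and the round trip through `reverse` (out-of-vocabulary tokens come back as `<OOV>`)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Illustrative check of Token2Int on made-up tokens (not PTAB data):\n",
        "# index 0 = padding, 1 = out-of-vocabulary, 2+ = vocabulary tokens.\n",
        "toy_docs = [['claim', 'is', 'obvious'],\n",
        "            ['claim', 'is', 'anticipated', 'by', 'prior', 'art']]\n",
        "toy_t2i = Token2Int(vec_len=8, num_words=5)\n",
        "toy_t2i.fit(toy_docs)\n",
        "print(toy_t2i.word_index)                    # token -> integer mapping\n",
        "toy_X = toy_t2i.transform(toy_docs).astype(int)\n",
        "print(toy_X)                                 # padded / truncated integer vectors\n",
        "print(toy_t2i.reverse(toy_X))                # OOV tokens come back as '<OOV>'"
      ]
    },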
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "t1TbHH0S23f9",
        "outputId": "73a6aa85-4ead-4027-aac6-0188633a8e57"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "9283\n",
            "9182\n"
          ]
        }
      ],
      "source": [
        "map_outcome2unpat = {'Denied': 0,\n",
        "                     'Denied on Rehearing': -1,\n",
        "                     'Mixed': 0,\n",
        "                     'Granted': 1,\n",
        "                     'Granted on Rehearing': -1,\n",
        "                     'Indefinite': -1,\n",
        "                     }\n",
        "ptdf['Unpatentable'] = ptdf['Decision'].map(map_outcome2unpat)\n",
        "\n",
        "selind = ptdf[ptdf['Unpatentable'] != -1].index\n",
        "\n",
        "print(len(ptdf))\n",
        "ptdf.drop(ptdf[ptdf['Unpatentable'] == -1].index, inplace=True)\n",
        "print(len(ptdf))\n",
        "ptdf.reset_index(inplace=True)\n",
        "tokdocs = [tokdocs[ind] for ind in range(len(tokdocs)) if ind in selind]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_jbe0xu14bkn"
      },
      "outputs": [],
      "source": [
        "def f_pet(x):\n",
        "    case, partyname = x\n",
        "    if ' v.' in case:\n",
        "        try:\n",
        "            pet,po = case.split(' v.')\n",
        "        except:\n",
        "            print(case)\n",
        "        return pet.strip()\n",
        "    else:\n",
        "        return ' '.join(partyname)\n",
        "\n",
        "def f_po(x):\n",
        "    case = x\n",
        "    if ' v.' in case:\n",
        "        pet,po = case.split(' v.')\n",
        "        return po.strip()\n",
        "    else:\n",
        "        return 'UNKNOWN'\n",
        "\n",
        "ptdf['petitioner_raw'] = ptdf[['name', 'party_names']].apply(f_pet, axis=1)\n",
        "ptdf['patent_owner_raw'] = ptdf['name'].apply(f_po)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "JhZI2iUptJbt"
      },
      "outputs": [],
      "source": [
        "common_terms = ['corporation','corp.',' corp ', '& co','co.',' co ','company',\n",
        "                'l.l.c.', 'llc', 'l.c', ' lc',\n",
        "                'l.l.p.', 'llp', 'l.p.', ' lp',\n",
        "                'incorporated', 'inc.', ' inc ',\n",
        "                'limited', 'ltd',\n",
        "                ' sa ', ' se ', ' ag ',\n",
        "                'gmbh', 'a/s', 'bv', ' nv', 'n.v.',\n",
        "                'et al',\n",
        "                'n.a.', ' us ', ' usa ',\n",
        "                '(us)', '(usa)', '(u.s.)', '(u.s.a.)',\n",
        "                '(california)', '(delaware)', '(united states)',\n",
        "                ' i,',  'ii', 'iii',\n",
        "                '1)', '2)',\n",
        "                ]\n",
        "replace_common_terms = '|'.join(common_terms).replace('/','\\/').replace(' ','\\s').replace('.','\\.').replace('(','\\(').replace(')','\\)')\n",
        "replace_common_terms += '|\\s\\d+\\s'\n",
        "regexp_common_terms = re.compile(replace_common_terms, re.IGNORECASE)\n",
        "\n",
        "split_terms = ['d/b/a/', 'd/b/a', 'doing business as', 'formerly known as', 'f/k/a/', 'f/k/a', ' and ']\n",
        "split_terms_list = '|'.join(split_terms).replace('/','\\/').replace(' ','\\s')\n",
        "regexp_split_terms = re.compile(split_terms_list, re.IGNORECASE)\n",
        "\n",
        "def f_clean(x):\n",
        "    # no cleaning up special characters\n",
        "    # add a trailing whitepace to eliminate edge effects for \"lp\" and \"inc\"\n",
        "    x += ' '\n",
        "    # remove common terms\n",
        "    if any([t in x.lower() for t in common_terms]):\n",
        "        # remove commas and periods associated with these terms\n",
        "        x = re.sub(regexp_common_terms, '', x)\n",
        "        x = x.replace(', ',' '); x = x.replace('. ',' ')\n",
        "        x = x.strip().strip(',').strip('.')\n",
        "\n",
        "    if any([t in x.lower() for t in split_terms]):\n",
        "        x = re.split(regexp_split_terms, x)\n",
        "        x = ';'.join(x)\n",
        "\n",
        "    return x\n",
        "\n",
        "ptdf['petitioner'] = ptdf['petitioner_raw'].apply(f_clean)\n",
        "ptdf['patent_owner'] = ptdf['patent_owner_raw'].apply(f_clean)"
      ]
    },
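    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Quick sanity check of `f_clean` on a made-up party name (purely illustrative, not drawn from the PTAB data): corporate suffixes are stripped and alias phrases such as d/b/a split the name on ';'."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Illustrative only: a made-up party name, not taken from the dataset.\n",
        "example_name = 'Example Widgets, Inc. d/b/a Example Labs LLC'\n",
        "print(f_clean(example_name))"
      ]
    },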
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 66,
          "referenced_widgets": [
            "1b33de2a3a4a470aa6f07253885fe1bc",
            "f63479552d154f019a083d60b5f7262e",
            "64723b9f5a9742df9ba5425820b8674d",
            "a30de5d77e1b46bd83cf8ecbeed63171",
            "237488437a3a452493d875733ed29275",
            "7c0035c0c2f842448c19f92eb1ac54d9",
            "acfb2bfc634642cd86aad9009938c455",
            "4ef7df9baa4444f5979dabf24975ffeb",
            "46c770d287cb402a91ccbf7feece5753",
            "84be481bd1844259a25de6207ce09998",
            "3dbc67b9c8c646bd934013cb45df9907"
          ]
        },
        "id": "CY0JTAKe92st",
        "outputId": "cb0088e3-0858-4fca-e8c1-ac44dab4ce2a"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "20002\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "1b33de2a3a4a470aa6f07253885fe1bc",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "  0%|          | 0/2631 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {}
        }
      ],
      "source": [
        "trainindex = ptdf[ptdf.date < pd.Timestamp(2017,12,31)].index\n",
        "testindex = ptdf[ptdf.date > pd.Timestamp(2019,1,1)].index\n",
        "\n",
        "toktrain = [tokdocs[ind] for ind in range(len(tokdocs)) if ind in trainindex]\n",
        "\n",
        "tok2int = Token2Int(Params['text_length'], num_words = Params['num_words_to_use'])\n",
        "tok2int.fit(toktrain)\n",
        "Params['vocab_size'] = tok2int.vocab_size + 1   # add the 0 token\n",
        "print(Params['vocab_size'])\n",
        "\n",
        "X_train = tok2int.transform(toktrain).astype(int)\n",
        "Y_train = ptdf.loc[trainindex, 'Unpatentable'].values\n",
        "\n",
        "toktest = [tokdocs[ind] for ind in range(len(tokdocs)) if ind in testindex]\n",
        "X_test = tok2int.transform(toktest).astype(int)\n",
        "Y_test = ptdf.loc[testindex, 'Unpatentable'].values"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8P09Tr17Zx1o"
      },
      "source": [
        "#Fit Model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dldKFWGv-Jct"
      },
      "outputs": [],
      "source": [
        "if Params['sample_weighting']:\n",
        "    class_wts = list(class_weight.compute_class_weight(class_weight='balanced',\n",
        "                                                    classes=np.unique(Y_train), y=Y_train))\n",
        "    print(class_wts)\n",
        "    Params['sample_weights'] = np.array([class_wts[yt] for yt in Y_train])\n",
        "\n",
        "    num = len(Y_train)\n",
        "    num0 = len(np.where(Y_train==0)[0]); num1 = len(np.where(Y_train==1)[0])\n",
        "    if num1 < num0:\n",
        "        Params['initial_bias'] = np.log(num1/num0)\n",
        "    else:\n",
        "        Params['initial_bias'] = np.log(num0/num1)\n",
        "\n",
        "else:\n",
        "    Params['initial_bias'] = 0"
      ]
    },
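    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "A short numeric check (made-up class counts, not the actual training split) of why the output-layer bias is initialized to log(num1/num0) when the positive class is the minority: with that bias, the untrained sigmoid output equals the positive-class prior."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Illustrative check with made-up class counts (not the actual training data):\n",
        "# sigmoid(log(num1/num0)) equals the positive-class prior num1 / (num0 + num1).\n",
        "num0_demo, num1_demo = 3000, 1000\n",
        "b0 = np.log(num1_demo / num0_demo)\n",
        "prior = num1_demo / (num0_demo + num1_demo)\n",
        "print(b0, prior, 1.0 / (1.0 + np.exp(-b0)))   # last two values match"
      ]
    },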
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "u6XCIyvAZxqD"
      },
      "outputs": [],
      "source": [
        "for run in range(5):\n",
        "    print(run)\n",
        "    \n",
        "    tf.keras.backend.clear_session()    # reset Tensorflow session\n",
        "\n",
        "    X_t = X_train; Y_t = Y_train    \n",
        "\n",
        "    with tpu_strategy.scope():\n",
        "        Params['loss'] = keras.losses.BinaryCrossentropy(from_logits=False)\n",
        "        Params['metrics'] = [keras.metrics.BinaryAccuracy(name='acc'),\n",
        "                        keras.metrics.AUC(name='auc'),]\n",
        "        model = EmbedNN(Params)\n",
        "        model.compile(loss=Params['loss'],\n",
        "                    optimizer=keras.optimizers.Adam(learning_rate=Params['learning_rate']),\n",
        "                    metrics=Params['metrics'],\n",
        "                    steps_per_execution = 100,)\n",
        "\n",
        "        if Params['sample_weighting']:\n",
        "            train_dataset = tf.data.Dataset.from_tensor_slices((X_t, Y_t, Params['sample_weights']))\n",
        "            Params['val_sample_weights'] = np.array([class_wts[yt] for yt in Y_test])\n",
        "            val_dataset = tf.data.Dataset.from_tensor_slices((X_test, Y_test, Params['val_sample_weights']))\n",
        "        else:\n",
        "            train_dataset = tf.data.Dataset.from_tensor_slices((X_t, Y_t))\n",
        "            val_dataset = tf.data.Dataset.from_tensor_slices((X_test, Y_test))\n",
        "        history = model.fit(train_dataset.batch(Params['batch_size']),\n",
        "                            epochs = Params['num_epochs'], verbose = 1,)\n",
        "                            # validation_data = val_dataset.batch(Params['batch_size']),\n",
        "                                # callbacks=Params['callbacks'])\n",
        "\n",
        "    print(\"Results for Testing Data:\")\n",
        "    test_predict = model.predict(X_test)\n",
        "    test_predict_bool = np.round(test_predict)\n",
        "    TestPredict = test_predict_bool\n",
        "    ClassRep = classification_report(Y_test, test_predict_bool)\n",
        "    ConfMatrix = confusion_matrix(Y_test, test_predict_bool)\n",
        "    print(ClassRep)\n",
        "    print(ConfMatrix)\n",
        "\n",
        "    model.save_weights(FILELOC+\"responses_\"+str(run)+\"_wts.h5\", save_format='h5', overwrite=True)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "vyNgcoOWZ2EQ"
      },
      "source": [
        "#Interpret Results"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "if Params['sample_weighting']:\n",
        "    class_wts = list(class_weight.compute_class_weight(class_weight='balanced',\n",
        "                                                    classes=np.unique(Y_train), y=Y_train))\n",
        "    print(class_wts)\n",
        "    Params['sample_weights'] = np.array([class_wts[yt] for yt in Y_train])\n",
        "\n",
        "    num = len(Y_train)\n",
        "    num0 = len(np.where(Y_train==0)[0]); num1 = len(np.where(Y_train==1)[0])\n",
        "    if num1 < num0:\n",
        "        Params['initial_bias'] = np.log(num1/num0)\n",
        "    else:\n",
        "        Params['initial_bias'] = np.log(num0/num1)\n",
        "\n",
        "else:\n",
        "    Params['initial_bias'] = 0\n",
        "\n",
        "tf.keras.backend.clear_session()\n",
        "with tpu_strategy.scope():\n",
        "    # try:\n",
        "    model = EmbedNN(Params)\n",
        "    model.load_weights(FILELOC+\"responses_wts.h5\")\n",
        "    pred_test = model.predict(X_test, verbose=False)\n",
        "\n",
        "att = {}\n",
        "for n in range(2,12+1):\n",
        "    get_attention_model = keras.Model(inputs=model.input,outputs=model.get_layer(f'attention_{n}').output)\n",
        "    get_attention_model.compile()\n",
        "    att[n] = get_attention_model.predict(xtest, verbose=1)"
      ],
      "metadata": {
        "id": "LSAzh3t1FeLn"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "##Attention Visualization"
      ],
      "metadata": {
        "id": "q10V2RD5OZ4A"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "stL5tUYCWqUa"
      },
      "outputs": [],
      "source": [
        "from IPython.display import HTML\n",
        "def  hlstr(string, color='white'):\n",
        "    \"\"\"\n",
        "    Return HTML markup highlighting text with the desired color.\n",
        "    \"\"\"\n",
        "    return f\"<mark style=background-color:{color}>{string} </mark>\"\n",
        "\n",
        "def colorize(attrs, cmap='PiYG'):\n",
        "    \"\"\"\n",
        "    Compute hex colors based on the attributions for a single instance.\n",
        "    Uses a diverging colorscale by default and normalizes and scales\n",
        "    the colormap so that colors are consistent with the attributions.\n",
        "    \"\"\"\n",
        "    import matplotlib as mpl\n",
        "    cmap_bound = np.abs(attrs).max()\n",
        "    norm = mpl.colors.Normalize(vmin=-cmap_bound, vmax=cmap_bound)\n",
        "    cmap = mpl.cm.get_cmap(cmap)\n",
        "\n",
        "    # now compute hex values of colors\n",
        "    colors = list(map(lambda x: mpl.colors.rgb2hex(cmap(norm(x))), attrs))\n",
        "    return colors"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3STTYnjNgrf3"
      },
      "outputs": [],
      "source": [
        "N = 8\n",
        "n = 7   # document index\n",
        "print(pred                                                                                                                                                                                                                               [n])\n",
        "xlen = np.where(X_test[n]==0)[0][0]\n",
        "attvec = att[N][n][:xlen]\n",
        "xvec = tok2int.reverse([X_test[n][:xlen]])[0]\n",
        "strlen = len(attvec)\n",
        "THRESH = np.median(attvec)\n",
        "colors = colorize(attvec - THRESH)\n",
        "\n",
        "HTML(\"\".join(list(map(hlstr, xvec.split(), colors))))"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Highest attention words in sample"
      ],
      "metadata": {
        "id": "HSMCjnRAOSm1"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "uniquetokens = np.unique(xtest[n][:xlen])\n",
        "print(len(uniquetokens))\n",
        "tokpos = [np.where(xtest[n][:xlen] == tok)[0] for tok in uniquetokens]\n",
        "meanatt = np.array([np.mean(att[2][n][np.array(tpos)]) for tpos in tokpos])\n",
        "tokens_sorted_by_meanatt = uniquetokens[np.argsort(-meanatt)]"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "m8U2xDZ_L7Nu",
        "outputId": "9aef17aa-cddb-467d-f8c6-e43786812cc0"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "605\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "pd.DataFrame.from_dict({'Attention':[tok2int.index_word[t] for t in tokens_sorted_by_meanatt[:20]]})"
      ],
      "metadata": {
        "id": "UJQzk6d8n7ZL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "##LIME Analysis"
      ],
      "metadata": {
        "id": "PLCP7ucjOGGx"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "44AaMHd3wGA5"
      },
      "outputs": [],
      "source": [
        "def lean_wrapper(texts):\n",
        "    x = pad_sequences(DTP.texts_to_sequences(texts),\n",
        "                      maxlen = Params['text_length'],\n",
        "                      padding='post',\n",
        "                      truncating='post')\n",
        "    return np.hstack((1-model.predict(x), model.predict(x)))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sEeA4AEzzRBZ"
      },
      "outputs": [],
      "source": [
        "n = 2774\n",
        "xvec = DTP.sequences_to_texts([X_data[n]])[0]\n",
        "\n",
        "exp = LimeTextExplainer(class_names={0:'Denied',1:'Granted'})\n",
        "exp_doc = exp.explain_instance(xvec, lean_wrapper, num_features=50)\n",
        "# explist = exp_doc.as_list()\n",
        "exp_doc.show_in_notebook()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "8mKglEF9buWx"
      },
      "outputs": [],
      "source": [
        ""
      ]
    }
  ],
  "metadata": {
    "accelerator": "TPU",
    "colab": {
      "collapsed_sections": [],
      "name": "PTAB_Model_Decisions_github.ipynb",
      "provenance": [],
      "toc_visible": true,
      "mount_file_id": "1X-M2SntuvoGIIjwFAtPNNIHXyKqSTUsK",
      "authorship_tag": "ABX9TyMnadSiryOYGloyXckhl2DZ",
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "1b33de2a3a4a470aa6f07253885fe1bc": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_f63479552d154f019a083d60b5f7262e",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_64723b9f5a9742df9ba5425820b8674d",
              "IPY_MODEL_a30de5d77e1b46bd83cf8ecbeed63171",
              "IPY_MODEL_237488437a3a452493d875733ed29275"
            ]
          }
        },
        "f63479552d154f019a083d60b5f7262e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "64723b9f5a9742df9ba5425820b8674d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_7c0035c0c2f842448c19f92eb1ac54d9",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": "100%",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_acfb2bfc634642cd86aad9009938c455"
          }
        },
        "a30de5d77e1b46bd83cf8ecbeed63171": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_4ef7df9baa4444f5979dabf24975ffeb",
            "_dom_classes": [],
            "description": "",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 2631,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 2631,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_46c770d287cb402a91ccbf7feece5753"
          }
        },
        "237488437a3a452493d875733ed29275": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_84be481bd1844259a25de6207ce09998",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 2631/2631 [00:05&lt;00:00, 477.85it/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_3dbc67b9c8c646bd934013cb45df9907"
          }
        },
        "7c0035c0c2f842448c19f92eb1ac54d9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "acfb2bfc634642cd86aad9009938c455": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "4ef7df9baa4444f5979dabf24975ffeb": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "46c770d287cb402a91ccbf7feece5753": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "84be481bd1844259a25de6207ce09998": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "3dbc67b9c8c646bd934013cb45df9907": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        }
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}