--- a
+++ b/PTAB_xgboost_github.ipynb
@@ -0,0 +1,1533 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/bahrad/PTAB/blob/master/PTAB_xgboost_github.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "KxKMLWEKr-KV"
+      },
+      "source": [
+        "# Initialization"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "SO6irSSWsgw0"
+      },
+      "outputs": [],
+      "source": [
+        "%tensorflow_version 2.x\n",
+        "\n",
+        "%xmode Context\n",
+        "# Verbose\n",
+        "\n",
+        "import tensorflow as tf\n",
+        "from tensorflow import keras\n",
+        "import numpy as np\n",
+        "!pip install xlrd==2.0.1\n",
+        "import pandas as pd\n",
+        "import matplotlib.pyplot as plt\n",
+        "import pickle\n",
+        "import os\n",
+        "import itertools\n",
+        "\n",
+        "from collections import Counter, defaultdict\n",
+        "import random\n",
+        "from pandas import DataFrame\n",
+        "import datetime\n",
+        "from datetime import datetime\n",
+        "import dateutil\n",
+        "from dateutil.parser import parse as dateparse\n",
+        "from tqdm.notebook import tqdm\n",
+        "import time\n",
+        "\n",
+        "import sklearn as sk\n",
+        "from sklearn.preprocessing import MultiLabelBinarizer, QuantileTransformer, OneHotEncoder, StandardScaler, OrdinalEncoder\n",
+        "from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV\n",
+        "from sklearn.model_selection import KFold, StratifiedShuffleSplit, ShuffleSplit\n",
+        "from sklearn.linear_model import LogisticRegression\n",
+        "import sklearn.metrics\n",
+        "from sklearn.metrics import accuracy_score,classification_report, make_scorer, balanced_accuracy_score, f1_score, coverage_error, roc_auc_score, confusion_matrix, plot_confusion_matrix\n",
+        "from sklearn.cluster import KMeans\n",
+        "from sklearn.decomposition import PCA\n",
+        "from sklearn.utils import resample, shuffle\n",
+        "from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin\n",
+        "from sklearn.neighbors import NearestNeighbors\n",
+        "from sklearn.manifold import TSNE\n",
+        "from sklearn.utils import class_weight\n",
+        "\n",
+        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+        "import xgboost as xgb\n",
+        "from sklearn.model_selection import RandomizedSearchCV\n",
+        "\n",
+        "from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, BorderlineSMOTE\n",
+        "from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, CondensedNearestNeighbour, AllKNN\n",
+        "from imblearn.combine import SMOTEENN, SMOTETomek\n",
+        "from imblearn.pipeline import make_pipeline,Pipeline\n",
+        "\n",
+        "from tensorflow.keras.preprocessing.text import Tokenizer\n",
+        "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
+        "\n",
+        "import string\n",
+        "import re\n",
+        "import ast\n",
+        "# import unicodedata\n",
+        "\n",
+        "import nltk\n",
+        "nltk.download('stopwords')\n",
+        "from nltk.corpus import stopwords\n",
+        "STOPWORDS = set(stopwords.words('english'))\n",
+        "\n",
+        "nltk.download('averaged_perceptron_tagger')\n",
+        "nltk.download('wordnet')\n",
+        "nltk.download('punkt')\n",
+        "\n",
+        "!pip install lime\n",
+        "import lime\n",
+        "from lime import lime_text\n",
+        "from lime.lime_text import LimeTextExplainer\n",
+        "from lime.explanation import Explanation\n",
+        "\n",
+        "!pip install shap\n",
+        "import shap"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "8KTO164A7t1G"
+      },
+      "outputs": [],
+      "source": [
+        "# COMMENT OUT FOR PUBLIC CODE\n",
+        "from google.colab import drive, files\n",
+        "# drive.mount('/content/drive')\n",
+        "\n",
+        "FILELOC = \"DATA/\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "meLd62MmSiZn"
+      },
+      "source": [
+        "# Pre-Processing"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "m55MrI8lPFhv"
+      },
+      "outputs": [],
+      "source": [
+        "# Load the PTAB institution proceedings table (tab-separated)\n",
+        "ptab_path = FILELOC + 'PTAB_Institution_Proceedings_to_20211231.tsv'\n",
+        "ptab = pd.read_csv(ptab_path, sep='\\t')\n",
+        "print(len(ptab))\n",
+        "# Keep only the first row for each proceeding number\n",
+        "ptab.drop_duplicates(subset='Proceeding', inplace=True)\n",
+        "print(len(ptab))\n",
+        "# Parse the filing-date strings into datetime values\n",
+        "ptab['date'] = ptab['Case Filing Date'].map(dateparse)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "9RkTxlgRPFhy"
+      },
+      "outputs": [],
+      "source": [
+        "# The party-name file alternates lines: a case caption followed by its proceeding number\n",
+        "with open(FILELOC + 'IPR_Proceeding_PartyNames_12312022.txt', 'r', encoding=\"ISO-8859-1\") as infile:\n",
+        "    outfile = [line.rstrip('\\n') for line in infile]\n",
+        "# Pair each caption with the line after it (an unpaired trailing line is ignored by zip)\n",
+        "case = []; number = []\n",
+        "for caption, proc_line in zip(outfile[0::2], outfile[1::2]):\n",
+        "    case.append(caption)\n",
+        "    number.append(proc_line)\n",
+        "casedf = pd.DataFrame.from_dict({'case':case, 'proc':number})\n",
+        "# Normalize the proceeding number (drop any '(...)' suffix) BEFORE de-duplicating, so two raw\n",
+        "# entries for the same proceeding cannot both survive and duplicate rows in the merge below\n",
+        "casedf['proc'] = casedf['proc'].apply(lambda x: x.split('(')[0].strip())\n",
+        "casedf.drop_duplicates('proc', inplace=True)\n",
+        "casedf['name'] = casedf['case'].apply(lambda x: x.strip(\"\\\"\"))\n",
+        "# Write the captions that contain 'Petition' to a.csv\n",
+        "casedf[casedf.name.str.contains('Petition')].to_csv('a.csv')\n",
+        "\n",
+        "common_names = set(['business', 'doing', 'company', 'corporation', 'formerly', 'et', 'al'])\n",
+        "# Words never kept as party-name tokens; 'v' is what remains of 'v.' once punctuation is removed\n",
+        "party_reject_words = STOPWORDS | common_names | {'v'}\n",
+        "\n",
+        "def f(x):\n",
+        "    \"\"\"Split a case caption into a list of party-name tokens.\n",
+        "\n",
+        "    The 'Petition for ... Review by' boilerplate and all ASCII punctuation are\n",
+        "    removed, then stop words, generic company words and the stand-alone 'v'\n",
+        "    separator are dropped. Only the whole token 'v' is removed, so names that\n",
+        "    merely contain the letter (e.g. 'Silver') are left intact.\n",
+        "    \"\"\"\n",
+        "    y = x\n",
+        "    if 'Petition' in x:\n",
+        "        if 'Covered' in x:\n",
+        "            y = x.replace(\"Petition for Covered Business Method Patent Review by\",\"\")\n",
+        "        elif 'Inter' in x:\n",
+        "            y = x.replace(\"Petition for Inter Partes Review by\", \"\")\n",
+        "    y = y.translate(str.maketrans('', '', string.punctuation))\n",
+        "    y = [s.strip() for s in y.strip().split(' ') if s != \"\" and s not in party_reject_words]\n",
+        "    return y\n",
+        "casedf['party_names'] = casedf['name'].apply(f)\n",
+        "\n",
+        "# Keep only proceedings present in both the PTAB table and the party-name table\n",
+        "ptdf = pd.merge(ptab,casedf,left_on='Proceeding',right_on='proc',how='inner')\n",
+        "print(len(ptab), len(casedf), len(ptdf))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "YxdJ9pvjXJSk"
+      },
+      "outputs": [],
+      "source": [
+        "# Document type to analyse: one of 'Responses', 'Decisions', 'Petitions'\n",
+        "DOCTYPE = 'Decisions'\n",
+        "\n",
+        "# Remove the text columns of the other document types, then the merge helper columns\n",
+        "other_doc_columns = [col for col in ('Petitions', 'Responses', 'Decisions') if col != DOCTYPE]\n",
+        "ptdf.drop(columns=other_doc_columns, inplace=True)\n",
+        "ptdf.drop(columns=['case', 'proc'], inplace=True)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "o4QcepJBau6N",
+        "outputId": "60cac29e-999f-4be5-ae39-5c118eb34448"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "10462\n",
+            "9296\n",
+            "9079\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Drop error messages and duplicates\n",
+        "\n",
+        "# Missing documents become the one-word text 'NA', which the length filter below removes\n",
+        "ptdf[DOCTYPE] = ptdf[DOCTYPE].fillna('NA')\n",
+        "\n",
+        "# clean up texts by removing (cid:##) which is likely an artifact of the PDF reading process\n",
+        "# (raw string: '\\(' and '\\d' are regex escapes, not Python string escapes)\n",
+        "cid_str = re.compile(r\"\\(cid:\\d+\\)\")\n",
+        "def f(x):\n",
+        "    \"\"\"Return the text x with every '(cid:<digits>)' marker removed.\"\"\"\n",
+        "    return cid_str.sub(\"\", x)\n",
+        "ptdf[DOCTYPE] = ptdf[DOCTYPE].apply(f)\n",
+        "\n",
+        "def get_word_count(text):\n",
+        "    \"\"\"Return the number of whitespace-separated words in text.\"\"\"\n",
+        "    return len(text.split())\n",
+        "ptdf[f'{DOCTYPE}_Len'] = ptdf[DOCTYPE].apply(get_word_count)\n",
+        "# Documents with fewer words than this are discarded\n",
+        "MIN_LENGTH = 50\n",
+        "\n",
+        "print(len(ptdf))\n",
+        "ptdf.drop(ptdf[ptdf[f'{DOCTYPE}_Len'] < MIN_LENGTH].index, inplace=True)\n",
+        "print(len(ptdf))\n",
+        "# keep=False removes every copy of a text that occurs more than once\n",
+        "ptdf.drop_duplicates(DOCTYPE, keep=False, inplace=True)\n",
+        "print(len(ptdf))\n",
+        "\n",
+        "# Renumber the rows 0..n-1; the previous index is kept as a column\n",
+        "ptdf.reset_index(inplace=True)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Outcome Statistics"
+      ],
+      "metadata": {
+        "id": "wMmkiANFUsQ6"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "Years = [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]\n",
+        "\n",
+        "def count_by_year(decision):\n",
+        "    \"\"\"Return an array with the number of ptdf rows having this Decision, one entry per year in Years.\"\"\"\n",
+        "    has_decision = ptdf.Decision == decision\n",
+        "    filing_year = ptdf.date.dt.year\n",
+        "    return np.array([len(ptdf[has_decision & (filing_year == yr)]) for yr in Years])\n",
+        "\n",
+        "granted = count_by_year('Granted')\n",
+        "denied = count_by_year('Denied')\n",
+        "mixed = count_by_year('Mixed')\n",
+        "print(granted, denied, mixed)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "E1TQELftdKIn",
+        "outputId": "8edda5a1-bb30-4559-8889-99cd48297555"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "[ 39 212 439 442 468 553 708 542 612 237] [ 11 153 395 420 430 481 461 381 389 183] [ 50 281 401 290 225 181   1   0   0   0]\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Fraction of each year's petitions that were granted\n",
+        "granted / (granted + mixed + denied)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Pesoo-b4sFGM",
+        "outputId": "fc026ec6-b79a-4b73-e9c2-7e6b62e046ba"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "array([0.39      , 0.32817337, 0.35546559, 0.38368056, 0.41674087,\n",
+              "       0.45514403, 0.60512821, 0.5872156 , 0.61138861, 0.56428571])"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 13
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "nbars = len(Years)\n",
+        "x_pos = np.linspace(0,nbars-1,nbars)\n",
+        "width = 0.5\n",
+        "\n",
+        "fig, ax = plt.subplots()\n",
+        "fig.set_size_inches(10,6)\n",
+        "\n",
+        "# Stacked bars: each series is drawn on top of the running total of the previous ones\n",
+        "stack_base = None\n",
+        "for counts, label, color in ((granted, 'Granted', 'blue'),\n",
+        "                             (mixed, 'Mixed', 'brown'),\n",
+        "                             (denied, 'Denied', 'orange')):\n",
+        "    ax.bar(x_pos, counts, bottom=stack_base, label=label, color=color)\n",
+        "    stack_base = counts if stack_base is None else stack_base + counts\n",
+        "\n",
+        "# Axis ticks and labels\n",
+        "ax.set_xticks(x_pos)\n",
+        "ax.set_xticklabels(Years, rotation=45, ha='right', fontsize=14, fontweight='bold')\n",
+        "plt.yticks(fontsize=14)\n",
+        "ax.set_ylabel('Number of Petitions', fontsize=18, fontweight='bold')\n",
+        "\n",
+        "# Horizontal grid lines only, no top/right frame\n",
+        "ax.yaxis.grid(True)\n",
+        "for side in ('top', 'right'):\n",
+        "    ax.spines[side].set_visible(False)\n",
+        "\n",
+        "ax.legend(framealpha=1.0, fontsize=14, loc='upper right', bbox_to_anchor=(1.1, 1))\n",
+        "plt.tight_layout()\n",
+        "plt.show()"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 441
+        },
+        "id": "Mpc64FQkdVWK",
+        "outputId": "7cd04609-c0ee-42fd-c354-1927e8492517"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "\n",
+            "text/plain": [
+              "<Figure size 720x432 with 1 Axes>"
+            ]
+          },
+          "metadata": {
+            "needs_background": "light"
+          }
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Tokenize"
+      ],
+      "metadata": {
+        "id": "csv_PV5kUi4R"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "HaMewizibi7j"
+      },
+      "outputs": [],
+      "source": [
+        "# Settings dictionary filled in by the cells below\n",
+        "Params = dict()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "pUFUMGsMiLmt"
+      },
+      "outputs": [],
+      "source": [
+        "# Token clean-up options read by the tokenization cells\n",
+        "Params.update({\n",
+        "    'remove_stop_words': True,\n",
+        "    'remove_alphanumeric': True,\n",
+        "    'remove_punctuation': True,\n",
+        "    'remove_shortword_size': 3,\n",
+        "    'remove_propernouns': True,\n",
+        "    'keep_case_names': False,\n",
+        "    'clean_all': True,\n",
+        "    'use_lowercase': True,\n",
+        "})"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "LkiSq11eoXag"
+      },
+      "outputs": [],
+      "source": [
+        "# Defined unconditionally: the two statements at the bottom of this cell always call f,\n",
+        "# whatever the values of Params['remove_propernouns'] and Params['clean_all']\n",
+        "def f(x):\n",
+        "    \"\"\"Return the first word of each side of an 'A v. B' case caption.\n",
+        "\n",
+        "    Returns [petitioner, patentowner] with commas removed. A caption without\n",
+        "    'v.', or with nothing on one side of it, returns ['NO_PARTY']. The result\n",
+        "    is always a list so that chaining the results yields whole names rather\n",
+        "    than the individual characters of a string.\n",
+        "    \"\"\"\n",
+        "    if 'v.' not in x:\n",
+        "        return ['NO_PARTY']\n",
+        "    y = x.split('v.')\n",
+        "    petitioner_words = y[0].split()\n",
+        "    patentowner_words = y[1].split()\n",
+        "    if not petitioner_words or not patentowner_words:\n",
+        "        return ['NO_PARTY']\n",
+        "    petitioner = petitioner_words[0].replace(',', '')\n",
+        "    patentowner = patentowner_words[0].replace(',', '')\n",
+        "    return [petitioner, patentowner]\n",
+        "\n",
+        "parties_first = casedf['name'].apply(f).values\n",
+        "# Set of every first-word party name seen in any caption\n",
+        "CASENAMES = set(itertools.chain.from_iterable(parties_first))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "6eR0nIlGtQvs"
+      },
+      "outputs": [],
+      "source": [
+        "# Document texts as an array and the matching party-name token lists as a plain list\n",
+        "docs = ptdf[DOCTYPE].to_numpy()\n",
+        "partyname_list = list(ptdf['party_names'])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "_7WJ3Cqxbb5j"
+      },
+      "outputs": [],
+      "source": [
+        "# Single-character punctuation tokens to discard: ASCII punctuation plus typographic quotes/dashes.\n",
+        "# Built once as a set instead of once per token.\n",
+        "PUNCT_TOKENS = set(string.punctuation + u\"‘’´“”–-\")\n",
+        "\n",
+        "if Params['clean_all']:\n",
+        "    remove_shortword_size = Params['remove_shortword_size']\n",
+        "    def process_docs(x):\n",
+        "        \"\"\"Tokenize the text x and keep only alphabetic tokens that are not\n",
+        "        punctuation, NNP-tagged proper nouns, stop words or short words\n",
+        "        (length <= remove_shortword_size).\"\"\"\n",
+        "        doc = x\n",
+        "        t = nltk.tokenize.word_tokenize(doc)\n",
+        "        propernouns = set([a for a,b in nltk.tag.pos_tag(t) if b=='NNP'])\n",
+        "        shortwords = set([tt for tt in t if len(tt) <= remove_shortword_size])\n",
+        "        reject_list = PUNCT_TOKENS|propernouns|STOPWORDS|shortwords\n",
+        "\n",
+        "        proct = [tt for tt in t if tt.isalpha() and tt not in reject_list]\n",
+        "        return proct\n",
+        "\n",
+        "else:\n",
+        "    remove_punct = Params['remove_punctuation']\n",
+        "    remove_stopwords = Params['remove_stop_words']\n",
+        "    remove_alphanumeric = Params['remove_alphanumeric']\n",
+        "    # set to False or None if not used otherwise remove this length or less\n",
+        "    remove_shortword_size = Params['remove_shortword_size']\n",
+        "    remove_proper = Params['remove_propernouns']\n",
+        "\n",
+        "    def process_docs(x):\n",
+        "        \"\"\"Tokenize one (text, party_names) pair and apply each clean-up step\n",
+        "        that is enabled in Params.\"\"\"\n",
+        "        doc, partynames = x\n",
+        "        # Start from the raw tokens so every later step has an input even when\n",
+        "        # punctuation removal is switched off\n",
+        "        proct = nltk.tokenize.word_tokenize(doc)\n",
+        "        if remove_punct:\n",
+        "            proct = [tt for tt in proct if tt not in PUNCT_TOKENS]\n",
+        "        if remove_stopwords:\n",
+        "            proct = [tt for tt in proct if tt not in STOPWORDS]\n",
+        "        if remove_alphanumeric:\n",
+        "            proct = [tt for tt in proct if tt.isalpha()]\n",
+        "        if remove_shortword_size:\n",
+        "            proct = [tt for tt in proct if len(tt) > remove_shortword_size]\n",
+        "        propernouns = set([a for a,b in nltk.tag.pos_tag(proct) if b=='NNP'])\n",
+        "        if Params['keep_case_names']:\n",
+        "            # Do not treat other cases' party names as removable proper nouns\n",
+        "            propernouns = propernouns - (CASENAMES - set(partynames))\n",
+        "        if remove_proper:\n",
+        "            proct = [tt for tt in proct if tt not in propernouns]\n",
+        "        return proct\n",
+        "\n",
+        "if Params['clean_all']:\n",
+        "    tokdocs = ptdf[DOCTYPE].apply(process_docs)\n",
+        "else:\n",
+        "    tokdocs = [process_docs([docs[ind], partyname_list[ind]]) for ind in tqdm(ptdf.index)]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "S7dmQLyRM1hd"
+      },
+      "source": [
+        "## Missing words in pre-trained embeddings"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "lYh-xEb9ehz1"
+      },
+      "outputs": [],
+      "source": [
+        "# Load the token lists of all three document types; the first element of each pickle is ignored here\n",
+        "token_files = ['Tokenized_Responses_noproper_20220131.pkl',\n",
+        "               'Tokenized_Decisions_noproper_20220131.pkl',\n",
+        "               'Tokenized_Petitions_noproper_20220131.pkl']\n",
+        "loaded_tokens = []\n",
+        "for token_file in token_files:\n",
+        "    with open(FILELOC + token_file, 'rb') as fh:\n",
+        "        _, file_tokens = pickle.load(fh)\n",
+        "    loaded_tokens.append(list(file_tokens))\n",
+        "t1, t2, t3 = loaded_tokens\n",
+        "\n",
+        "# One combined corpus: responses, then decisions, then petitions\n",
+        "tokdocs = t1 + t2 + t3"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Na9ZyOf3mkf1"
+      },
+      "outputs": [],
+      "source": [
+        "# Case-insensitive frequency of every token in the corpus\n",
+        "wordcount = Counter(tok.lower() for doc in tokdocs for tok in doc)\n",
+        "vocabsize = len(wordcount)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 66,
+          "referenced_widgets": [
+            "e25001286f6a472f9eeb81781bf096f4",
+            "f6424274b9d34ac6a53c73f4a817dab1",
+            "2c6b85e1b64e42e9ad63f1f1e07fe62a",
+            "c02a86138788487bad802b41687d1cc4",
+            "feec8e5c4f2c4b4291d79c5914789a35",
+            "2a0eba30a22f4378aafec0be7ce3810c",
+            "add4245050924296b45c05f1da684446",
+            "afd6311ea23b403ebc36aafc243ceda1",
+            "0156ba14f4f44fc884c622344cd5a943",
+            "83a8e24763764e22ace8482ff5f582a0",
+            "8d8a1e80d5d74866b80a7b995dc36c3b"
+          ]
+        },
+        "id": "SiI6wu4_C5-P",
+        "outputId": "2a00267e-c6dc-40dd-f71c-84dba4a54631"
+      },
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "e25001286f6a472f9eeb81781bf096f4",
+              "version_minor": 0,
+              "version_major": 2
+            },
+            "text/plain": [
+              "0it [00:00, ?it/s]"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Found 400000 word vectors.\n"
+          ]
+        }
+      ],
+      "source": [
+        "embedding_dim = 200\n",
+        "PRETRAINED_EMBEDDINGS_FILE = FILELOC + f\"glove.6B.{embedding_dim}d.txt\"\n",
+        "# embedding_dim = 200\n",
+        "# PRETRAINED_EMBEDDINGS_FILE = FILELOC + f\"Law2Vec.{embedding_dim}d.txt\"\n",
+        "\n",
+        "# Map each word of the embedding file to its vector\n",
+        "embedindex = {}\n",
+        "# Open as UTF-8 explicitly so reading does not depend on the platform's default encoding\n",
+        "# (assumes the embedding file is UTF-8 text, as the GloVe files are)\n",
+        "with open(PRETRAINED_EMBEDDINGS_FILE, encoding=\"utf-8\") as f:\n",
+        "    for line in tqdm(f):\n",
+        "        # Each line is: word, then the space-separated vector components\n",
+        "        word, coefs = line.split(maxsplit=1)\n",
+        "        coefs = np.fromstring(coefs, \"f\", sep=\" \")\n",
+        "        embedindex[word] = coefs\n",
+        "\n",
+        "print(\"Found %s word vectors.\" % len(embedindex))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "GtNqwafscmeQ",
+        "outputId": "3a0c596b-3a2b-4693-dec4-897bc0736e75"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Converted 40479 words (119248 misses)\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Count how many corpus words do / do not have a pre-trained embedding vector\n",
+        "hits = 0\n",
+        "misses = 0\n",
+        "for word in wordcount:\n",
+        "    if word in embedindex:\n",
+        "        hits += 1\n",
+        "    else:\n",
+        "        misses += 1\n",
+        "print(\"Converted %d words (%d misses)\" % (hits, misses))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "60bSujwLDnhd"
+      },
+      "outputs": [],
+      "source": [
+        "# Corpus words without an embedding vector, with their corpus frequency\n",
+        "missingwords = list(set(wordcount) - set(embedindex))\n",
+        "df = pd.DataFrame({'words': list(missingwords),\n",
+        "                   'count': [wordcount[w] for w in missingwords]})"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "20qaC2xOnY30"
+      },
+      "outputs": [],
+      "source": [
+        "# Converted 24723 words (27096 misses)\n",
+        "# NOTE(review): df reflects whichever embedding file was loaded above - confirm it was Law2Vec\n",
+        "law2vec_missing_df = df.copy()\n",
+        "law2vec_missing_df.sort_values('count', ascending=False).head(n=50)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "V4vGFUXsnSJO"
+      },
+      "outputs": [],
+      "source": [
+        "# Converted 29951 words (21868 misses)\n",
+        "# NOTE(review): df reflects whichever embedding file was loaded above - confirm it was GloVe\n",
+        "glove_missing_df = df.copy()\n",
+        "glove_missing_df.sort_values('count', ascending=False).head(n=50)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "s_fdzeRaPFh3",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "211653ae-b3c1-4081-b648-7a14ec3e3a76"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "9079\n",
+            "8985\n"
+          ]
+        }
+      ],
+      "source": [
+        ""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Oh33URDEUto3"
+      },
+      "outputs": [],
+      "source": [
+        ""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "1O5NNIHB5GCp"
+      },
+      "source": [
+        "# XGBoost"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ySPIrSCX3YzA"
+      },
+      "source": [
+        "## Data Preparation"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "VU6Ium3c4YKX"
+      },
+      "outputs": [],
+      "source": [
+        "# Take a list of tokenized documents (i.e. a list of lists of strings) and derive an\n",
+        "# integer mapping (0 = padding / not used, 1 = out of vocabulary, 2+ = tokens) for\n",
+        "# all words (if num_words=None) or for the num_words most common words.\n",
+        "# transform() generates a 2D array of truncated / zero-padded document vectors (vec_len).\n",
+        "# If lowercase is set to True then all tokens are converted to lowercase.\n",
+        "# The out-of-vocabulary string is oov_str (default '<OOV>').\n",
+        "\n",
+        "class Token2Int(BaseEstimator,TransformerMixin):\n",
+        "    \"\"\"Convert tokenized documents into fixed-length rows of integer word ids.\n",
+        "\n",
+        "    Parameters\n",
+        "    ----------\n",
+        "    vec_len : int\n",
+        "        Number of columns of the transformed array; longer documents are\n",
+        "        truncated and shorter ones are padded with 0 at the end.\n",
+        "    num_words : int or None\n",
+        "        Keep only this many most frequent words (None keeps all).\n",
+        "    oov_str : str\n",
+        "        Vocabulary entry that stands for out-of-vocabulary words (id 1).\n",
+        "    lowercase : bool\n",
+        "        Lowercase every token before counting / mapping.\n",
+        "\n",
+        "    fit() sets word_count, vocab, vocab_size, word_index and index_word.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(self, vec_len, num_words=None, oov_str='<OOV>', lowercase=True):\n",
+        "        self.vec_len = vec_len\n",
+        "        self.num_words = num_words\n",
+        "        self.oov_str = oov_str\n",
+        "        self.lowercase = lowercase\n",
+        "\n",
+        "    def _as_documents(self, X):\n",
+        "        \"\"\"Return X as a list of token lists (lowercased if configured).\"\"\"\n",
+        "        docs = list(X)\n",
+        "        # A flat sequence of strings is a single document, not a collection;\n",
+        "        # an empty input stays an empty collection\n",
+        "        if docs and isinstance(docs[0], str):\n",
+        "            docs = [docs]\n",
+        "        if self.lowercase:\n",
+        "            docs = [[tok.lower() for tok in doc] for doc in docs]\n",
+        "        return docs\n",
+        "\n",
+        "    def fit(self, X, y=None):\n",
+        "        \"\"\"Count the tokens of X and build the word <-> id dictionaries. Returns self.\"\"\"\n",
+        "        docs = self._as_documents(X)\n",
+        "        wc = Counter(itertools.chain.from_iterable(docs))\n",
+        "        self.word_count = wc\n",
+        "        # The OOV marker always takes the first vocabulary slot (id 1); a corpus\n",
+        "        # token equal to it is skipped so that the marker is not listed twice\n",
+        "        vocab = [self.oov_str]\n",
+        "        vocab.extend(w for w, _ in wc.most_common(self.num_words) if w != self.oov_str)\n",
+        "        self.vocab = vocab\n",
+        "        self.vocab_size = len(vocab)\n",
+        "        # ids start at 1 because 0 is reserved for padding\n",
+        "        wordmap = {word: pos + 1 for pos, word in enumerate(vocab)}\n",
+        "        self.word_index = wordmap\n",
+        "        self.index_word = {idx: word for word, idx in wordmap.items()}\n",
+        "        return self\n",
+        "\n",
+        "    def transform(self, X):\n",
+        "        \"\"\"Return a (number of documents, vec_len) array of word ids.\n",
+        "\n",
+        "        Unknown words get id 1 and unused trailing positions stay 0. The array\n",
+        "        is created with np.zeros and therefore has numpy's default float dtype.\n",
+        "        \"\"\"\n",
+        "        docs = self._as_documents(X)\n",
+        "        wordmap = self.word_index\n",
+        "        veclen = self.vec_len\n",
+        "        textpad = np.zeros((len(docs), veclen))\n",
+        "        for d, doc in enumerate(tqdm(docs)):\n",
+        "            doclen = min(len(doc), veclen)\n",
+        "            textpad[d,:doclen] = [wordmap.get(word, 1) for word in doc[:doclen]]\n",
+        "        return textpad\n",
+        "\n",
+        "    def reverse(self, textpad):\n",
+        "        \"\"\"Turn rows of word ids back into space-joined strings (id 0 becomes an empty string).\"\"\"\n",
+        "        texts = []\n",
+        "        for row in textpad:\n",
+        "            # int() because transform() stores the ids as floats\n",
+        "            int2text = ['' if w==0 else self.index_word[int(w)] for w in row]\n",
+        "            texts.append(' '.join(int2text).strip())\n",
+        "        return texts"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "UxxwuzHA3Yon"
+      },
+      "outputs": [],
+      "source": [
+        "# Pickle holding the (ptdf, tokdocs) pair to model. Other files used with this notebook:\n",
+        "#   'Tokenized_Decisions_noproper_20220131.pkl'\n",
+        "#   'Tokenized_Petitions_noproper_20220131.pkl'\n",
+        "#   'Tokenized_Responses_20220212.pkl'\n",
+        "tokenized_file = 'Tokenized_Responses_noproper_20220131.pkl'\n",
+        "\n",
+        "with open(FILELOC + tokenized_file, 'rb') as fh:\n",
+        "    ptdf, tokdocs = pickle.load(fh)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "j0mgqy-c5-kP",
+        "outputId": "73927061-1523-44f2-a628-b112dd4ff572"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "9283\n",
+            "7843\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Binary label per decision: 1 = granted, 0 = denied, -1 = excluded from the analysis\n",
+        "map_outcome2unpat = {'Denied': 0,\n",
+        "                     'Denied on Rehearing': -1,\n",
+        "                    #  'Mixed': 0,\n",
+        "                     'Mixed': -1,\n",
+        "                     'Granted': 1,\n",
+        "                     'Granted on Rehearing': -1,\n",
+        "                     'Indefinite': -1,\n",
+        "                     }\n",
+        "# Decision values that are not keys of the map become NaN and are NOT excluded below\n",
+        "ptdf['Unpatentable'] = ptdf['Decision'].map(map_outcome2unpat)\n",
+        "\n",
+        "# Positional True/False flag per row: True = keep\n",
+        "keep_row = (ptdf['Unpatentable'] != -1).to_numpy()\n",
+        "selind = ptdf[keep_row].index\n",
+        "\n",
+        "print(len(ptdf))\n",
+        "# tokdocs is a plain sequence aligned with the rows of ptdf by position, so filter it with the\n",
+        "# positional flags rather than by comparing list positions with index labels\n",
+        "tokdocs = [doc for doc, keep in zip(tokdocs, keep_row) if keep]\n",
+        "ptdf.drop(ptdf[ptdf['Unpatentable'] == -1].index, inplace=True)\n",
+        "print(len(ptdf))\n",
+        "# Renumber the rows 0..n-1; the previous index is kept as a column\n",
+        "ptdf.reset_index(inplace=True)"
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "STcP1HIJ6GPL"
+      },
+      "outputs": [],
+      "source": [
+        "# Train on proceedings filed before the end of 2017 and test on those filed after 1 Jan 2019\n",
+        "train_before = pd.Timestamp(2017,12,31)\n",
+        "test_after = pd.Timestamp(2019,1,1)\n",
+        "\n",
+        "trainindex = ptdf.loc[ptdf.date < train_before].index\n",
+        "testindex = ptdf.loc[ptdf.date > test_after].index\n",
+        "\n",
+        "print(len(trainindex), len(testindex))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Label arrays for the two splits\n",
+        "ytrain = ptdf['Unpatentable'].loc[trainindex].to_numpy()\n",
+        "ytest = ptdf['Unpatentable'].loc[testindex].to_numpy()"
+      ],
+      "metadata": {
+        "id": "FMTI0cnZLYef"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Model-input settings (this replaces the Params dictionary used for tokenization)\n",
+        "Params = {'num_words_to_use': 20000, 'text_length': 8000}\n",
+        "\n",
+        "# Train on filings from 1 Jul 2018 through 30 Nov 2020, test on filings from 1 Dec 2020 onwards\n",
+        "train_window = (pd.Timestamp(2018,7,1), pd.Timestamp(2020,11,30))\n",
+        "trainindex = ptdf.loc[ptdf.date.between(*train_window)].index\n",
+        "testindex = ptdf.loc[ptdf.date >= pd.Timestamp(2020,12,1)].index\n",
+        "\n",
+        "ytrain = ptdf['Unpatentable'].loc[trainindex].to_numpy()\n",
+        "ytest = ptdf['Unpatentable'].loc[testindex].to_numpy()\n",
+        "\n",
+        "# Truncate every document to text_length tokens and lowercase the tokens\n",
+        "max_tokens = Params['text_length']\n",
+        "tdocs = [[tok.lower() for tok in doc[:max_tokens]] for doc in tokdocs]\n",
+        "\n",
+        "# Select the documents whose position appears in the train / test index\n",
+        "toktrain = [doc for pos, doc in enumerate(tdocs) if pos in trainindex]\n",
+        "toktest = [doc for pos, doc in enumerate(tdocs) if pos in testindex]"
+      ],
+      "metadata": {
+        "id": "10kXEym3HSiS"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Disabled alternative to the cell above: no vocabulary cap and a 4000-token limit.\n",
+        "# Params = {}\n",
+        "# Params['num_words_to_use'] = None\n",
+        "# Params['text_length'] = 4000\n",
+        "\n",
+        "# trainindex = ptdf[ptdf.date.between(pd.Timestamp(2018,7,1), pd.Timestamp(2020,11,30))].index\n",
+        "# testindex = ptdf[ptdf.date >= pd.Timestamp(2020,12,1)].index\n",
+        "\n",
+        "# ytrain = ptdf.loc[trainindex, 'Unpatentable'].values\n",
+        "# ytest = ptdf.loc[testindex, 'Unpatentable'].values\n",
+        "\n",
+        "# tdocs = [t if len(t) < Params['text_length'] else t[:Params['text_length']] for t in tokdocs]\n",
+        "# tdocs = [[t.lower() for t in doc] for doc in tdocs]\n",
+        "\n",
+        "# toktrain = [tdocs[ind] for ind in range(len(tdocs)) if ind in trainindex]\n",
+        "# toktest = [tdocs[ind] for ind in range(len(tdocs)) if ind in testindex]"
+      ],
+      "metadata": {
+        "id": "g3Ba4ja8k63p"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "uDabtFQy3WZp"
+      },
+      "source": [
+        "## Model Fitting"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "kFokRMfE5V7-"
+      },
+      "outputs": [],
+      "source": [
+        "def set_model(random_state=2856):\n",
+        "    \"\"\"Create an unfitted XGBoost classifier with this notebook's fixed hyperparameters.\n",
+        "\n",
+        "    Args:\n",
+        "        random_state: seed passed straight through to ``xgb.XGBClassifier``.\n",
+        "\n",
+        "    Returns:\n",
+        "        A new ``xgb.XGBClassifier`` (binary logistic objective, DART booster).\n",
+        "    \"\"\"\n",
+        "    # rate_drop, skip_drop and sample_type are DART-booster options.\n",
+        "    dart_options = dict(booster='dart', rate_drop=0.1, skip_drop=0.5, sample_type='weighted')\n",
+        "    tree_options = dict(\n",
+        "        max_depth=15,\n",
+        "        learning_rate=0.1,\n",
+        "        n_estimators=1200,\n",
+        "        gamma=0,               # larger values make the model more conservative\n",
+        "        reg_lambda=2.0,        # L2 regularization, default 1 (reg_alpha, default 0, is L1)\n",
+        "        subsample=0.8,         # typically 0.5-1, lower values = more conservative\n",
+        "        colsample_bytree=0.8,  # typically 0.5-1, fraction of columns sampled per tree\n",
+        "    )\n",
+        "    # Swap in \"cpu_predictor\" here when no GPU is available.\n",
+        "    runtime_options = dict(n_jobs=-1, eval_metric='auc', predictor=\"gpu_predictor\")\n",
+        "    return xgb.XGBClassifier(\n",
+        "        objective=\"binary:logistic\",\n",
+        "        random_state=random_state,\n",
+        "        **dart_options,\n",
+        "        **tree_options,\n",
+        "        **runtime_options,\n",
+        "    )"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "lMxwBxUiFXDZ"
+      },
+      "outputs": [],
+      "source": [
+        "def dummy_fun(doc):\n",
+        "    \"\"\"Identity function: hand ``doc`` back unchanged.\n",
+        "\n",
+        "    Used by ``return_tfidf`` as both the tokenizer and the preprocessor.\n",
+        "    \"\"\"\n",
+        "    return doc\n",
+        "\n",
+        "\n",
+        "def return_tfidf(max_features, vocabulary=None):\n",
+        "    \"\"\"Create an unfitted ``TfidfVectorizer`` whose tokenizer and preprocessor are no-ops.\n",
+        "\n",
+        "    Args:\n",
+        "        max_features: forwarded to ``TfidfVectorizer(max_features=...)``.\n",
+        "        vocabulary: forwarded to ``TfidfVectorizer(vocabulary=...)``; defaults to None.\n",
+        "\n",
+        "    Returns:\n",
+        "        The configured, unfitted vectorizer.\n",
+        "    \"\"\"\n",
+        "    return TfidfVectorizer(\n",
+        "        analyzer='word',\n",
+        "        tokenizer=dummy_fun,\n",
+        "        preprocessor=dummy_fun,\n",
+        "        token_pattern=None,\n",
+        "        max_features=max_features,\n",
+        "        vocabulary=vocabulary,\n",
+        "    )"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "3d6aCY9xAk_I"
+      },
+      "outputs": [],
+      "source": [
+        "vectorizer = return_tfidf(max_features=Params['num_words_to_use'])\n",
+        "\n",
+        "# Fit on the training documents only and vectorize them in the same pass\n",
+        "# (fit followed by transform would process the training set twice).\n",
+        "xtrain = vectorizer.fit_transform(toktrain)\n",
+        "# The test documents are transformed with the already-fitted vectorizer.\n",
+        "xtest = vectorizer.transform(toktest)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "eCoVPE4nAk8F",
+        "outputId": "ec07dcec-34f5-4d72-9313-a983017106cf"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "[1.0693121693121692, 0.9391263940520446]\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Unique labels plus, for every training row, the position of its label in that array.\n",
+        "label_classes, label_positions = np.unique(ytrain, return_inverse=True)\n",
+        "class_weights = list(class_weight.compute_class_weight(class_weight='balanced',\n",
+        "                                                    classes=label_classes, y=ytrain))\n",
+        "# Pick each row's weight by label position instead of indexing the list with the raw\n",
+        "# label value, so this no longer requires labels to be the integers 0..k-1.\n",
+        "sample_weights = np.asarray(class_weights)[label_positions]\n",
+        "print(class_weights)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "cKpI0KcBJkSr"
+      },
+      "source": [
+        "### Responses - Outcomes"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "w0xcDlxfIkrn"
+      },
+      "outputs": [],
+      "source": [
+        "# NOTE(review): this cell repeats the vectorizer and class-weight steps that already\n",
+        "# appear under the Model Fitting heading.\n",
+        "vectorizer = return_tfidf(max_features=Params['num_words_to_use'])\n",
+        "# Fit on the training documents only and vectorize them in the same pass\n",
+        "# (fit followed by transform would process the training set twice).\n",
+        "xtrain = vectorizer.fit_transform(toktrain)\n",
+        "xtest = vectorizer.transform(toktest)\n",
+        "\n",
+        "# Unique labels plus, for every training row, the position of its label in that array.\n",
+        "label_classes, label_positions = np.unique(ytrain, return_inverse=True)\n",
+        "class_weights = list(class_weight.compute_class_weight(class_weight='balanced',\n",
+        "                                                    classes=label_classes, y=ytrain))\n",
+        "# Pick each row's weight by label position instead of indexing the list with the raw\n",
+        "# label value, so this no longer requires labels to be the integers 0..k-1.\n",
+        "sample_weights = np.asarray(class_weights)[label_positions]\n",
+        "print(class_weights)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "RANDSTATE_LIST = [10666, 559, 1024, 6, 39842]\n",
+        "\n",
+        "# NOTE(review): these result dicts are created here but nothing in this cell fills them.\n",
+        "results_test = {}\n",
+        "results_val = {}\n",
+        "results_paper = {}\n",
+        "results_cleantest = {}\n",
+        "results_cleanval = {}\n",
+        "results_cleanpaper = {}\n",
+        "\n",
+        "# Refit the same configuration once per seed. `model_tfidf` is rebound on every pass,\n",
+        "# so only the model from the last seed is left after the loop.\n",
+        "for RANDSTATE in RANDSTATE_LIST:\n",
+        "    print(\"training...\")\n",
+        "    model_tfidf = set_model(RANDSTATE)\n",
+        "    model_tfidf.fit(xtrain, ytrain, sample_weight=sample_weights, verbose=False)\n",
+        "\n",
+        "    # NOTE(review): pred_train is not read again in this cell.\n",
+        "    pred_train = model_tfidf.predict(xtrain)\n",
+        "    pred_test = model_tfidf.predict(xtest)\n",
+        "\n",
+        "    print(confusion_matrix(ytest, np.round(pred_test)))"
+      ],
+      "metadata": {
+        "id": "o2-AgstVErZM"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "model_tfidf_booster = model_tfidf.get_booster()\n",
+        "feature_names = vectorizer.get_feature_names_out()\n",
+        "\n",
+        "\n",
+        "def f(x):\n",
+        "    \"\"\"Return the vocabulary term for a booster feature key.\n",
+        "\n",
+        "    Drops the first character of ``x`` and uses the remainder as an integer\n",
+        "    position into ``feature_names`` (keys presumably look like 'f123' -- confirm).\n",
+        "    \"\"\"\n",
+        "    column = int(x[1:])\n",
+        "    return feature_names[column]\n",
+        "\n",
+        "\n",
+        "# Per-feature scores reported by the booster, highest score first.\n",
+        "fs_dict = model_tfidf_booster.get_fscore()\n",
+        "fsdf = pd.DataFrame({'feature': fs_dict.keys(), 'score': fs_dict.values()})\n",
+        "fsdf = fsdf.sort_values(by='score', ascending=False)\n",
+        "fsdf['feature_name'] = fsdf['feature'].apply(f)"
+      ],
+      "metadata": {
+        "id": "yIFdT1-kkV0i"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Per-feature prediction contributions for every test row (pred_contribs=True).\n",
+        "shap_pred_raw = model_tfidf_booster.predict(xgb.DMatrix(xtest, label=ytest),\n",
+        "                                            pred_contribs=True)\n",
+        "# Drop the last column before plotting (presumably the bias term -- confirm).\n",
+        "shap_pred = shap_pred_raw[:, :-1]\n",
+        "shap.summary_plot(shap_pred, xtest, feature_names=feature_names)"
+      ],
+      "metadata": {
+        "id": "VQYhw5NYExQq",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 585
+        },
+        "outputId": "a45c93a9-e2f8-4110-c5cd-876bf1ce7d9b"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "\n",
+            "text/plain": [
+              "<Figure size 576x684 with 2 Axes>"
+            ]
+          },
+          "metadata": {
+            "needs_background": "light"
+          }
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "-4nryQZTJr-0"
+      },
+      "source": [
+        "### Decisions"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "d5ANtstDZuFW"
+      },
+      "outputs": [],
+      "source": [
+        "RANDSTATE_LIST = [10935, 432, 978]\n",
+        "\n",
+        "# Refit the same configuration once per seed and report test-set metrics each time.\n",
+        "for RANDSTATE in RANDSTATE_LIST:\n",
+        "    print(\"training...\")\n",
+        "    model_tfidf = set_model(RANDSTATE)\n",
+        "    model_tfidf.fit(xtrain, ytrain, sample_weight=sample_weights, verbose=False)\n",
+        "\n",
+        "    # NOTE(review): this calls predict(), not predict_proba(); if predict() already\n",
+        "    # returns class labels the rounding below changes nothing -- confirm.\n",
+        "    pred_test_cont = model_tfidf.predict(xtest)\n",
+        "    pred_test = np.round(pred_test_cont)\n",
+        "\n",
+        "    print(\"Classification Report: \")\n",
+        "    print(classification_report(np.array(ytest), pred_test))\n",
+        "    print(confusion_matrix(ytest, pred_test))"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "machine_shape": "hm",
+      "name": "PTAB_xgboost_github.ipynb",
+      "provenance": [],
+      "toc_visible": true,
+      "mount_file_id": "1fElvudNKO69ttwo8qNl20HLGxLqUZr3w",
+      "authorship_tag": "ABX9TyOG/jvmCFI+MWmp9koFMFFW",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "widgets": {
+      "application/vnd.jupyter.widget-state+json": {
+        "e25001286f6a472f9eeb81781bf096f4": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HBoxView",
+            "_dom_classes": [],
+            "_model_name": "HBoxModel",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "box_style": "",
+            "layout": "IPY_MODEL_f6424274b9d34ac6a53c73f4a817dab1",
+            "_model_module": "@jupyter-widgets/controls",
+            "children": [
+              "IPY_MODEL_2c6b85e1b64e42e9ad63f1f1e07fe62a",
+              "IPY_MODEL_c02a86138788487bad802b41687d1cc4",
+              "IPY_MODEL_feec8e5c4f2c4b4291d79c5914789a35"
+            ]
+          }
+        },
+        "f6424274b9d34ac6a53c73f4a817dab1": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "2c6b85e1b64e42e9ad63f1f1e07fe62a": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_2a0eba30a22f4378aafec0be7ce3810c",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": "",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_add4245050924296b45c05f1da684446"
+          }
+        },
+        "c02a86138788487bad802b41687d1cc4": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "ProgressView",
+            "style": "IPY_MODEL_afd6311ea23b403ebc36aafc243ceda1",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "FloatProgressModel",
+            "bar_style": "success",
+            "max": 1,
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": 1,
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "orientation": "horizontal",
+            "min": 0,
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_0156ba14f4f44fc884c622344cd5a943"
+          }
+        },
+        "feec8e5c4f2c4b4291d79c5914789a35": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_83a8e24763764e22ace8482ff5f582a0",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": " 400000/? [00:15&lt;00:00, 28509.58it/s]",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_8d8a1e80d5d74866b80a7b995dc36c3b"
+          }
+        },
+        "2a0eba30a22f4378aafec0be7ce3810c": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "add4245050924296b45c05f1da684446": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "afd6311ea23b403ebc36aafc243ceda1": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "ProgressStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "bar_color": null,
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "0156ba14f4f44fc884c622344cd5a943": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": "20px",
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "83a8e24763764e22ace8482ff5f582a0": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "8d8a1e80d5d74866b80a7b995dc36c3b": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        }
+      }
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file