[404218]: / Code / LangChain / RAPTOR / A100_gpt4o_RAPTOR, kkawchak.ipynb

Download this file

1183 lines (1182 with data), 104.8 kB

{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "id": "3058e9ca-07c3-4eef-b98c-bc2f2dbb9cc6",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "id": "3058e9ca-07c3-4eef-b98c-bc2f2dbb9cc6",
        "outputId": "2f96d9a0-1395-44a8-86bb-8a1671ce2d37"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting langchain\n",
            "  Downloading langchain-0.1.20-py3-none-any.whl (1.0 MB)\n",
            "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/1.0 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K     \u001b[91m━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.4/1.0 MB\u001b[0m \u001b[31m11.5 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m17.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting umap-learn\n",
            "  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)\n",
            "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/85.7 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.7/85.7 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.2.2)\n",
            "Collecting scikit-learn\n",
            "  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.1/12.1 MB\u001b[0m \u001b[31m63.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting langchain_community\n",
            "  Downloading langchain_community-0.0.38-py3-none-any.whl (2.0 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m43.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting tiktoken\n",
            "  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m40.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting langchain-openai\n",
            "  Downloading langchain_openai-0.1.7-py3-none-any.whl (34 kB)\n",
            "Collecting langchainhub\n",
            "  Downloading langchainhub-0.1.15-py3-none-any.whl (4.6 kB)\n",
            "Collecting chromadb\n",
            "  Downloading chromadb-0.5.0-py3-none-any.whl (526 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m526.8/526.8 kB\u001b[0m \u001b[31m33.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting langchain-anthropic\n",
            "  Downloading langchain_anthropic-0.1.12-py3-none-any.whl (16 kB)\n",
            "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.1)\n",
            "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.30)\n",
            "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.9.5)\n",
            "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n",
            "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)\n",
            "  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)\n",
            "Collecting langchain-core<0.2.0,>=0.1.52 (from langchain)\n",
            "  Downloading langchain_core-0.1.52-py3-none-any.whl (302 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.9/302.9 kB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)\n",
            "  Downloading langchain_text_splitters-0.0.2-py3-none-any.whl (23 kB)\n",
            "Collecting langsmith<0.2.0,>=0.1.17 (from langchain)\n",
            "  Downloading langsmith-0.1.59-py3-none-any.whl (121 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.2/121.2 kB\u001b[0m \u001b[31m12.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.25.2)\n",
            "Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.7.1)\n",
            "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.31.0)\n",
            "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.3.0)\n",
            "Requirement already satisfied: scipy>=1.3.1 in /usr/local/lib/python3.10/dist-packages (from umap-learn) (1.11.4)\n",
            "Requirement already satisfied: numba>=0.51.2 in /usr/local/lib/python3.10/dist-packages (from umap-learn) (0.58.1)\n",
            "Collecting pynndescent>=0.5 (from umap-learn)\n",
            "  Downloading pynndescent-0.5.12-py3-none-any.whl (56 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.8/56.8 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from umap-learn) (4.66.4)\n",
            "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.4.2)\n",
            "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (3.5.0)\n",
            "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken) (2023.12.25)\n",
            "Collecting openai<2.0.0,>=1.24.0 (from langchain-openai)\n",
            "  Downloading openai-1.30.1-py3-none-any.whl (320 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m320.6/320.6 kB\u001b[0m \u001b[31m25.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting types-requests<3.0.0.0,>=2.31.0.2 (from langchainhub)\n",
            "  Downloading types_requests-2.31.0.20240406-py3-none-any.whl (15 kB)\n",
            "Requirement already satisfied: build>=1.0.3 in /usr/local/lib/python3.10/dist-packages (from chromadb) (1.2.1)\n",
            "Collecting chroma-hnswlib==0.7.3 (from chromadb)\n",
            "  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m54.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting fastapi>=0.95.2 (from chromadb)\n",
            "  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.0/92.0 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)\n",
            "  Downloading uvicorn-0.29.0-py3-none-any.whl (60 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.8/60.8 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting posthog>=2.4.0 (from chromadb)\n",
            "  Downloading posthog-3.5.0-py2.py3-none-any.whl (41 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.3/41.3 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from chromadb) (4.11.0)\n",
            "Collecting onnxruntime>=1.14.1 (from chromadb)\n",
            "  Downloading onnxruntime-1.17.3-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m55.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting opentelemetry-api>=1.2.0 (from chromadb)\n",
            "  Downloading opentelemetry_api-1.24.0-py3-none-any.whl (60 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.1/60.1 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)\n",
            "  Downloading opentelemetry_exporter_otlp_proto_grpc-1.24.0-py3-none-any.whl (18 kB)\n",
            "Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)\n",
            "  Downloading opentelemetry_instrumentation_fastapi-0.45b0-py3-none-any.whl (11 kB)\n",
            "Collecting opentelemetry-sdk>=1.2.0 (from chromadb)\n",
            "  Downloading opentelemetry_sdk-1.24.0-py3-none-any.whl (106 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m106.1/106.1 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: tokenizers>=0.13.2 in /usr/local/lib/python3.10/dist-packages (from chromadb) (0.19.1)\n",
            "Collecting pypika>=0.48.9 (from chromadb)\n",
            "  Downloading PyPika-0.48.9.tar.gz (67 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h  Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
            "  Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
            "Collecting overrides>=7.3.1 (from chromadb)\n",
            "  Downloading overrides-7.7.0-py3-none-any.whl (17 kB)\n",
            "Requirement already satisfied: importlib-resources in /usr/local/lib/python3.10/dist-packages (from chromadb) (6.4.0)\n",
            "Requirement already satisfied: grpcio>=1.58.0 in /usr/local/lib/python3.10/dist-packages (from chromadb) (1.63.0)\n",
            "Collecting bcrypt>=4.0.1 (from chromadb)\n",
            "  Downloading bcrypt-4.1.3-cp39-abi3-manylinux_2_28_x86_64.whl (283 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m283.7/283.7 kB\u001b[0m \u001b[31m22.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: typer>=0.9.0 in /usr/local/lib/python3.10/dist-packages (from chromadb) (0.9.4)\n",
            "Collecting kubernetes>=28.1.0 (from chromadb)\n",
            "  Downloading kubernetes-29.0.0-py2.py3-none-any.whl (1.6 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m43.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting mmh3>=4.0.1 (from chromadb)\n",
            "  Downloading mmh3-4.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (67 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.6/67.6 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting orjson>=3.9.12 (from chromadb)\n",
            "  Downloading orjson-3.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (142 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m142.5/142.5 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting anthropic<1,>=0.23.0 (from langchain-anthropic)\n",
            "  Downloading anthropic-0.25.9-py3-none-any.whl (871 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m871.1/871.1 kB\u001b[0m \u001b[31m40.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: defusedxml<0.8.0,>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from langchain-anthropic) (0.7.1)\n",
            "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n",
            "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n",
            "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n",
            "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n",
            "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n",
            "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from anthropic<1,>=0.23.0->langchain-anthropic) (3.7.1)\n",
            "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from anthropic<1,>=0.23.0->langchain-anthropic) (1.7.0)\n",
            "Collecting httpx<1,>=0.23.0 (from anthropic<1,>=0.23.0->langchain-anthropic)\n",
            "  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from anthropic<1,>=0.23.0->langchain-anthropic) (1.3.1)\n",
            "Requirement already satisfied: packaging>=19.1 in /usr/local/lib/python3.10/dist-packages (from build>=1.0.3->chromadb) (24.0)\n",
            "Requirement already satisfied: pyproject_hooks in /usr/local/lib/python3.10/dist-packages (from build>=1.0.3->chromadb) (1.1.0)\n",
            "Requirement already satisfied: tomli>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from build>=1.0.3->chromadb) (2.0.1)\n",
            "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n",
            "  Downloading marshmallow-3.21.2-py3-none-any.whl (49 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n",
            "  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n",
            "Collecting starlette<0.38.0,>=0.37.2 (from fastapi>=0.95.2->chromadb)\n",
            "  Downloading starlette-0.37.2-py3-none-any.whl (71 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.9/71.9 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting fastapi-cli>=0.0.2 (from fastapi>=0.95.2->chromadb)\n",
            "  Downloading fastapi_cli-0.0.3-py3-none-any.whl (9.2 kB)\n",
            "Requirement already satisfied: jinja2>=2.11.2 in /usr/local/lib/python3.10/dist-packages (from fastapi>=0.95.2->chromadb) (3.1.4)\n",
            "Collecting python-multipart>=0.0.7 (from fastapi>=0.95.2->chromadb)\n",
            "  Downloading python_multipart-0.0.9-py3-none-any.whl (22 kB)\n",
            "Collecting ujson!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,>=4.0.1 (from fastapi>=0.95.2->chromadb)\n",
            "  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.6/53.6 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting email_validator>=2.0.0 (from fastapi>=0.95.2->chromadb)\n",
            "  Downloading email_validator-2.1.1-py3-none-any.whl (30 kB)\n",
            "Requirement already satisfied: certifi>=14.05.14 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (2024.2.2)\n",
            "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (1.16.0)\n",
            "Requirement already satisfied: python-dateutil>=2.5.3 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (2.8.2)\n",
            "Requirement already satisfied: google-auth>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (2.27.0)\n",
            "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (1.8.0)\n",
            "Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (1.3.1)\n",
            "Requirement already satisfied: oauthlib>=3.2.2 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (3.2.2)\n",
            "Requirement already satisfied: urllib3>=1.24.2 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (2.0.7)\n",
            "Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.2.0,>=0.1.52->langchain)\n",
            "  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n",
            "Collecting packaging>=19.1 (from build>=1.0.3->chromadb)\n",
            "  Downloading packaging-23.2-py3-none-any.whl (53 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.0/53.0 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.2->umap-learn) (0.41.1)\n",
            "Collecting coloredlogs (from onnxruntime>=1.14.1->chromadb)\n",
            "  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: flatbuffers in /usr/local/lib/python3.10/dist-packages (from onnxruntime>=1.14.1->chromadb) (24.3.25)\n",
            "Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (from onnxruntime>=1.14.1->chromadb) (3.20.3)\n",
            "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from onnxruntime>=1.14.1->chromadb) (1.12)\n",
            "Collecting deprecated>=1.2.6 (from opentelemetry-api>=1.2.0->chromadb)\n",
            "  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)\n",
            "Collecting importlib-metadata<=7.0,>=6.0 (from opentelemetry-api>=1.2.0->chromadb)\n",
            "  Downloading importlib_metadata-7.0.0-py3-none-any.whl (23 kB)\n",
            "Requirement already satisfied: googleapis-common-protos~=1.52 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.63.0)\n",
            "Collecting opentelemetry-exporter-otlp-proto-common==1.24.0 (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb)\n",
            "  Downloading opentelemetry_exporter_otlp_proto_common-1.24.0-py3-none-any.whl (17 kB)\n",
            "Collecting opentelemetry-proto==1.24.0 (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb)\n",
            "  Downloading opentelemetry_proto-1.24.0-py3-none-any.whl (50 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.8/50.8 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting opentelemetry-instrumentation-asgi==0.45b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
            "  Downloading opentelemetry_instrumentation_asgi-0.45b0-py3-none-any.whl (14 kB)\n",
            "Collecting opentelemetry-instrumentation==0.45b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
            "  Downloading opentelemetry_instrumentation-0.45b0-py3-none-any.whl (28 kB)\n",
            "Collecting opentelemetry-semantic-conventions==0.45b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
            "  Downloading opentelemetry_semantic_conventions-0.45b0-py3-none-any.whl (36 kB)\n",
            "Collecting opentelemetry-util-http==0.45b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
            "  Downloading opentelemetry_util_http-0.45b0-py3-none-any.whl (6.9 kB)\n",
            "Requirement already satisfied: setuptools>=16.0 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-instrumentation==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (67.7.2)\n",
            "Requirement already satisfied: wrapt<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-instrumentation==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (1.14.1)\n",
            "Collecting asgiref~=3.0 (from opentelemetry-instrumentation-asgi==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
            "  Downloading asgiref-3.8.1-py3-none-any.whl (23 kB)\n",
            "Collecting monotonic>=1.5 (from posthog>=2.4.0->chromadb)\n",
            "  Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n",
            "Collecting backoff>=1.10.0 (from posthog>=2.4.0->chromadb)\n",
            "  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
            "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (0.6.0)\n",
            "Requirement already satisfied: pydantic-core==2.18.2 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (2.18.2)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.3.2)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.7)\n",
            "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n",
            "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from tokenizers>=0.13.2->chromadb) (0.20.3)\n",
            "Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.10/dist-packages (from typer>=0.9.0->chromadb) (8.1.7)\n",
            "Collecting h11>=0.8 (from uvicorn[standard]>=0.18.3->chromadb)\n",
            "  Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting httptools>=0.5.0 (from uvicorn[standard]>=0.18.3->chromadb)\n",
            "  Downloading httptools-0.6.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (341 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.4/341.4 kB\u001b[0m \u001b[31m28.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting python-dotenv>=0.13 (from uvicorn[standard]>=0.18.3->chromadb)\n",
            "  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n",
            "Collecting uvloop!=0.15.0,!=0.15.1,>=0.14.0 (from uvicorn[standard]>=0.18.3->chromadb)\n",
            "  Downloading uvloop-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m56.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting watchfiles>=0.13 (from uvicorn[standard]>=0.18.3->chromadb)\n",
            "  Downloading watchfiles-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m45.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting websockets>=10.4 (from uvicorn[standard]>=0.18.3->chromadb)\n",
            "  Downloading websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m16.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->anthropic<1,>=0.23.0->langchain-anthropic) (1.2.1)\n",
            "Collecting dnspython>=2.0.0 (from email_validator>=2.0.0->fastapi>=0.95.2->chromadb)\n",
            "  Downloading dnspython-2.6.1-py3-none-any.whl (307 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m307.7/307.7 kB\u001b[0m \u001b[31m32.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting typer>=0.9.0 (from chromadb)\n",
            "  Downloading typer-0.12.3-py3-none-any.whl (47 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.2/47.2 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting shellingham>=1.3.0 (from typer>=0.9.0->chromadb)\n",
            "  Downloading shellingham-1.5.4-py2.py3-none-any.whl (9.8 kB)\n",
            "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.10/dist-packages (from typer>=0.9.0->chromadb) (13.7.1)\n",
            "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (5.3.3)\n",
            "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.4.0)\n",
            "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (4.9)\n",
            "Collecting httpcore==1.* (from httpx<1,>=0.23.0->anthropic<1,>=0.23.0->langchain-anthropic)\n",
            "  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (3.14.0)\n",
            "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (2023.6.0)\n",
            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata<=7.0,>=6.0->opentelemetry-api>=1.2.0->chromadb) (3.18.1)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2>=2.11.2->fastapi>=0.95.2->chromadb) (2.1.5)\n",
            "Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain-core<0.2.0,>=0.1.52->langchain)\n",
            "  Downloading jsonpointer-2.4-py2.py3-none-any.whl (7.8 kB)\n",
            "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer>=0.9.0->chromadb) (3.0.0)\n",
            "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer>=0.9.0->chromadb) (2.16.1)\n",
            "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain)\n",
            "  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n",
            "Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime>=1.14.1->chromadb)\n",
            "  Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->onnxruntime>=1.14.1->chromadb) (1.3.0)\n",
            "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer>=0.9.0->chromadb) (0.1.2)\n",
            "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.6.0)\n",
            "Building wheels for collected packages: pypika\n",
            "  Building wheel for pypika (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for pypika: filename=PyPika-0.48.9-py2.py3-none-any.whl size=53724 sha256=3ff203a1b31c897ebf0fac3374e42a5b4c75f0583cac7bec82bf586fd63566bd\n",
            "  Stored in directory: /root/.cache/pip/wheels/e1/26/51/d0bffb3d2fd82256676d7ad3003faea3bd6dddc9577af665f4\n",
            "Successfully built pypika\n",
            "Installing collected packages: pypika, monotonic, mmh3, websockets, uvloop, ujson, types-requests, shellingham, python-multipart, python-dotenv, packaging, overrides, orjson, opentelemetry-util-http, opentelemetry-semantic-conventions, opentelemetry-proto, mypy-extensions, jsonpointer, importlib-metadata, humanfriendly, httptools, h11, dnspython, deprecated, chroma-hnswlib, bcrypt, backoff, asgiref, watchfiles, uvicorn, typing-inspect, tiktoken, starlette, scikit-learn, posthog, opentelemetry-exporter-otlp-proto-common, opentelemetry-api, marshmallow, langchainhub, jsonpatch, httpcore, email_validator, coloredlogs, typer, pynndescent, opentelemetry-sdk, opentelemetry-instrumentation, onnxruntime, langsmith, kubernetes, httpx, dataclasses-json, umap-learn, opentelemetry-instrumentation-asgi, opentelemetry-exporter-otlp-proto-grpc, openai, langchain-core, anthropic, opentelemetry-instrumentation-fastapi, langchain-text-splitters, langchain-openai, langchain_community, langchain-anthropic, langchain, fastapi-cli, fastapi, chromadb\n",
            "  Attempting uninstall: packaging\n",
            "    Found existing installation: packaging 24.0\n",
            "    Uninstalling packaging-24.0:\n",
            "      Successfully uninstalled packaging-24.0\n",
            "  Attempting uninstall: importlib-metadata\n",
            "    Found existing installation: importlib_metadata 7.1.0\n",
            "    Uninstalling importlib_metadata-7.1.0:\n",
            "      Successfully uninstalled importlib_metadata-7.1.0\n",
            "  Attempting uninstall: scikit-learn\n",
            "    Found existing installation: scikit-learn 1.2.2\n",
            "    Uninstalling scikit-learn-1.2.2:\n",
            "      Successfully uninstalled scikit-learn-1.2.2\n",
            "  Attempting uninstall: typer\n",
            "    Found existing installation: typer 0.9.4\n",
            "    Uninstalling typer-0.9.4:\n",
            "      Successfully uninstalled typer-0.9.4\n",
            "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
            "spacy 3.7.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.3 which is incompatible.\n",
            "weasel 0.3.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.3 which is incompatible.\u001b[0m\u001b[31m\n",
            "\u001b[0mSuccessfully installed anthropic-0.25.9 asgiref-3.8.1 backoff-2.2.1 bcrypt-4.1.3 chroma-hnswlib-0.7.3 chromadb-0.5.0 coloredlogs-15.0.1 dataclasses-json-0.6.6 deprecated-1.2.14 dnspython-2.6.1 email_validator-2.1.1 fastapi-0.111.0 fastapi-cli-0.0.3 h11-0.14.0 httpcore-1.0.5 httptools-0.6.1 httpx-0.27.0 humanfriendly-10.0 importlib-metadata-7.0.0 jsonpatch-1.33 jsonpointer-2.4 kubernetes-29.0.0 langchain-0.1.20 langchain-anthropic-0.1.12 langchain-core-0.1.52 langchain-openai-0.1.7 langchain-text-splitters-0.0.2 langchain_community-0.0.38 langchainhub-0.1.15 langsmith-0.1.59 marshmallow-3.21.2 mmh3-4.1.0 monotonic-1.6 mypy-extensions-1.0.0 onnxruntime-1.17.3 openai-1.30.1 opentelemetry-api-1.24.0 opentelemetry-exporter-otlp-proto-common-1.24.0 opentelemetry-exporter-otlp-proto-grpc-1.24.0 opentelemetry-instrumentation-0.45b0 opentelemetry-instrumentation-asgi-0.45b0 opentelemetry-instrumentation-fastapi-0.45b0 opentelemetry-proto-1.24.0 opentelemetry-sdk-1.24.0 opentelemetry-semantic-conventions-0.45b0 opentelemetry-util-http-0.45b0 orjson-3.10.3 overrides-7.7.0 packaging-23.2 posthog-3.5.0 pynndescent-0.5.12 pypika-0.48.9 python-dotenv-1.0.1 python-multipart-0.0.9 scikit-learn-1.4.2 shellingham-1.5.4 starlette-0.37.2 tiktoken-0.7.0 typer-0.12.3 types-requests-2.31.0.20240406 typing-inspect-0.9.0 ujson-5.10.0 umap-learn-0.5.6 uvicorn-0.29.0 uvloop-0.19.0 watchfiles-0.21.0 websockets-12.0\n"
          ]
        }
      ],
      "source": [
        "pip install -U langchain umap-learn scikit-learn langchain_community tiktoken langchain-openai langchainhub chromadb langchain-anthropic"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "ea54c848-0df6-474e-b266-218a2acf67d3",
      "metadata": {
        "id": "ea54c848-0df6-474e-b266-218a2acf67d3"
      },
      "source": [
        "# RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval\n",
        "\n",
        "The [RAPTOR](https://arxiv.org/pdf/2401.18059.pdf) paper presents an interesting approach to indexing and retrieval of documents:\n",
        "\n",
        "* The `leafs` are a set of starting documents\n",
        "* Leafs are embedded and clustered\n",
        "* Clusters are then summarized into higher level (more abstract) consolidations of information across similar documents\n",
        "\n",
        "This process is done recursively, resulting in a \"tree\" going from raw docs (`leafs`) to more abstract summaries.\n",
        "\n",
        "We can apply this at varying scales; `leafs` can be:\n",
        "\n",
        "* Text chunks from a single doc (as shown in the paper)\n",
        "* Full docs (as we show below)\n",
        "\n",
        "With longer-context LLMs, it's possible to perform this over full documents.\n",
        "\n",
        "![Screenshot 2024-03-04 at 12.45.25 PM.png](attachment:72039e0c-e8c4-4b17-8780-04ad9fc584f3.png)"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "# Optional, add tracing in LangSmith\n",
        "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
        "os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'\n",
        "os.environ[\"LANGCHAIN_PROJECT\"] = \"RAPTOR\"\n",
        "os.environ['LANGCHAIN_API_KEY'] = ''"
      ],
      "metadata": {
        "id": "lk-aKD_W1kwq"
      },
      "id": "lk-aKD_W1kwq",
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "id": "083dd961-b401-4fc6-867c-8f8950059b02",
      "metadata": {
        "id": "083dd961-b401-4fc6-867c-8f8950059b02"
      },
      "source": [
        "### Docs\n",
        "\n",
        "Let's apply this to LangChain's LCEL documentation.\n",
        "\n",
        "In this case, each `doc` is a unique web page of the LCEL docs.\n",
        "\n",
        "The context length ranges from under 2k tokens to over 10k tokens."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "id": "b17c1331-373f-491d-8b53-ccf634e68c8e",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 733
        },
        "id": "b17c1331-373f-491d-8b53-ccf634e68c8e",
        "outputId": "fc439f9b-fe49-4e1a-9016-cc6ff65a1fae"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<function matplotlib.pyplot.show(close=None, block=None)>"
            ],
            "text/html": [
              "<div style=\"max-width:800px; border: 1px solid var(--colab-border-color);\"><style>\n",
              "      pre.function-repr-contents {\n",
              "        overflow-x: auto;\n",
              "        padding: 8px 12px;\n",
              "        max-height: 500px;\n",
              "      }\n",
              "\n",
              "      pre.function-repr-contents.function-repr-contents-collapsed {\n",
              "        cursor: pointer;\n",
              "        max-height: 100px;\n",
              "      }\n",
              "    </style>\n",
              "    <pre style=\"white-space: initial; background:\n",
              "         var(--colab-secondary-surface-color); padding: 8px 12px;\n",
              "         border-bottom: 1px solid var(--colab-border-color);\"><b>matplotlib.pyplot.show</b><br/>def show(*args, **kwargs)</pre><pre class=\"function-repr-contents function-repr-contents-collapsed\" style=\"\"><a class=\"filepath\" style=\"display:none\" href=\"#\">/usr/local/lib/python3.10/dist-packages/matplotlib/pyplot.py</a>Display all open figures.\n",
              "\n",
              "Parameters\n",
              "----------\n",
              "block : bool, optional\n",
              "    Whether to wait for all figures to be closed before returning.\n",
              "\n",
              "    If `True` block and run the GUI main loop until all figure windows\n",
              "    are closed.\n",
              "\n",
              "    If `False` ensure that all figure windows are displayed and return\n",
              "    immediately.  In this case, you are responsible for ensuring\n",
              "    that the event loop is running to have responsive figures.\n",
              "\n",
              "    Defaults to True in non-interactive mode and to False in interactive\n",
              "    mode (see `.pyplot.isinteractive`).\n",
              "\n",
              "See Also\n",
              "--------\n",
              "ion : Enable interactive mode, which shows / updates the figure after\n",
              "      every plotting command, so that calling ``show()`` is not necessary.\n",
              "ioff : Disable interactive mode.\n",
              "savefig : Save the figure to an image file instead of showing it on screen.\n",
              "\n",
              "Notes\n",
              "-----\n",
              "**Saving figures to file and showing a window at the same time**\n",
              "\n",
              "If you want an image file as well as a user interface window, use\n",
              "`.pyplot.savefig` before `.pyplot.show`. At the end of (a blocking)\n",
              "``show()`` the figure is closed and thus unregistered from pyplot. Calling\n",
              "`.pyplot.savefig` afterwards would save a new and thus empty figure. This\n",
              "limitation of command order does not apply if the show is non-blocking or\n",
              "if you keep a reference to the figure and use `.Figure.savefig`.\n",
              "\n",
              "**Auto-show in jupyter notebooks**\n",
              "\n",
              "The jupyter backends (activated via ``%matplotlib inline``,\n",
              "``%matplotlib notebook``, or ``%matplotlib widget``), call ``show()`` at\n",
              "the end of every cell by default. Thus, you usually don&#x27;t have to call it\n",
              "explicitly there.</pre>\n",
              "      <script>\n",
              "      if (google.colab.kernel.accessAllowed && google.colab.files && google.colab.files.view) {\n",
              "        for (const element of document.querySelectorAll('.filepath')) {\n",
              "          element.style.display = 'block'\n",
              "          element.onclick = (event) => {\n",
              "            event.preventDefault();\n",
              "            event.stopPropagation();\n",
              "            google.colab.files.view(element.textContent, 401);\n",
              "          };\n",
              "        }\n",
              "      }\n",
              "      for (const element of document.querySelectorAll('.function-repr-contents')) {\n",
              "        element.onclick = (event) => {\n",
              "          event.preventDefault();\n",
              "          event.stopPropagation();\n",
              "          element.classList.toggle('function-repr-contents-collapsed');\n",
              "        };\n",
              "      }\n",
              "      </script>\n",
              "      </div>"
            ]
          },
          "metadata": {},
          "execution_count": 3
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<Figure size 1000x600 with 1 Axes>"
            ],
            "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1cAAAIjCAYAAADvBuGTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABR8UlEQVR4nO3deXyM5/7/8fdMRlaSICRSWyyl1hSlKS0qFUsVPW3RxVLVjZYT1ZaeIuU0uthaWnVawjmKo6elv1JFbFUpDVJFrQ3aSmKNSESYzP37w8N8OxIkccckvJ6PxzyOue7Pfd3XNe723O/e91xjMQzDEAAAAADguljdPQAAAAAAuBkQrgAAAADABIQrAAAAADAB4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAKCEqlmzpvr37+/uYdz03nvvPdWqVUseHh4KDw8v1mOtXbtWFotFX3zxRbEeBwDgHoQrALgB4uLiZLFYlJiYmO/2du3aqVGjRtd9nGXLlmns2LHX3c+tYsWKFXr11VfVunVrzZ49W2+//XaemkuBqCCv0ujcuXOaPHmyWrVqpYCAAHl7e+v222/XkCFDtHfvXncPT5K0ceNGjR07Vunp6e4eCgBclc3dAwAA5G/Pnj2yWgv338CWLVum6dOnE7AKaPXq1bJarfrss8/k6emZb80dd9yhf//73y5tI0eOVNmyZfXGG2/ciGEWm+PHj6tTp07asmWLHnzwQT3++OMqW7as9uzZowULFmjmzJk6f/68u4epjRs3KiYmRv3791dgYKC7hwMAV0S4AoASysvLy91DKLSsrCz5+fm5exgFdvToUfn4+FwxWElScHCwnnzySZe2CRMmKCgoKE97adO/f39t27ZNX3zxhf72t7+5bBs3blypD48AcKPxWCAAlFCXf+fqwoULiomJUd26deXt7a2KFSuqTZs2WrlypaSLF8rTp0+XpHwfVcvKytLw4cNVrVo1eXl5qV69enr//fdlGIbLcbOzs/Xyyy8rKChI5cqV00MPPaQ///xTFovF5Y7Y2LFjZbFYtGvXLj3++OMqX7682rRpI0navn27+vfvr1q1asnb21shISF6+umndeLECZdjXepj7969evLJJxUQEKBKlSrpzTfflGEY+v3339W9e3f5+/srJCREEydOLNBnZ7fbNW7cONWuXVteXl6qWbOmRo0apZycHGeNxWLR7NmzlZWV5fys4uLiCtR/fn777Tc9+uijqlChgnx9fXX33Xdr6dKl19wvJydHDz74oAICArRx40ZJksPh0JQpU9SwYUN5e3srODhYzz33nE6dOuWyb82aNfXggw9qw4YNatmypby9vVWrVi3NnTv3msfdtGmTli5dqoEDB+YJVtLFcP/++++7tK1evVr33nuv/Pz8FBgYqO7du+vXX391qenfv79q1qyZp79Lf9d/ZbFYNGTIEC1evFiNGjWSl5eXGjZsqOXLl7vsN2LECElSWFiY8+/q4MGDkqSVK1eqTZs2CgwMVNmyZVWvXj2NGjXqmvMHgOLAnSsAuIFOnz6t48eP52m/cOHCNfcdO3asYmNj9cwzz6hly5bKyMhQYmKitm7dqgceeEDPPfecjhw5opUrV+Z5jM0wDD300ENas2aNBg4cqPDwcH333XcaMWKE/vzzT02ePNlZ279/f/33v//VU089pbvvvlvr1q1T165drziuRx99VHXr1tXbb7/tDGorV67Ub7/9pgEDBigkJEQ7d+7UzJkztXPnTv344495LrJ79eqlO+64QxMmTNDSpUs1fvx4VahQQZ988onuv/9+vfPOO5o3b55eeeUV3XXXXbrvvvuu+lk988wzmjNnjh555BENHz5cmzZtUmxsrH799Vd99dVXkqR///vfmjlzpjZv3qxPP/1UknTPPfdc8+8hP2lpabrnnnt09uxZvfzyy6pYsa
LmzJmjhx56SF988YV69uyZ737Z2dnq3r27EhMTtWrVKt11112SpOeee05xcXEaMGCAXn75ZSUnJ2vatGnatm2bfvjhB5UpU8bZx/79+/XII49o4MCB6tevn2bNmqX+/furefPmatiw4RXH/PXXX0uSnnrqqQLNcdWqVercubNq1aqlsWPHKjs7Wx9++KFat26trVu35huoCmLDhg368ssv9eKLL6pcuXL64IMP9Le//U2HDx9WxYoV9fDDD2vv3r2aP3++Jk+erKCgIElSpUqVtHPnTj344INq0qSJ3nrrLXl5eWn//v364YcfijQWALhuBgCg2M2ePduQdNVXw4YNXfapUaOG0a9fP+f7pk2bGl27dr3qcQYPHmzk96/2xYsXG5KM8ePHu7Q/8sgjhsViMfbv328YhmFs2bLFkGQMGzbMpa5///6GJGPMmDHOtjFjxhiSjD59+uQ53tmzZ/O0zZ8/35BkrF+/Pk8fzz77rLPNbrcbVatWNSwWizFhwgRn+6lTpwwfHx+XzyQ/SUlJhiTjmWeecWl/5ZVXDEnG6tWrnW39+vUz/Pz8rtpffho2bGi0bdvW+X7YsGGGJOP77793tp05c8YICwszatasaeTm5hqGYRhr1qwxJBmLFi0yzpw5Y7Rt29YICgoytm3b5tzv+++/NyQZ8+bNcznm8uXL87TXqFEjz2d69OhRw8vLyxg+fPhV59CzZ09DknHq1KkCzTk8PNyoXLmyceLECWfbzz//bFitVqNv377Otn79+hk1atTIs/+lv+u/kmR4eno6z79LfUoyPvzwQ2fbe++9Z0gykpOTXfafPHmyIck4duxYgeYAAMWNxwIB4AaaPn26Vq5cmefVpEmTa+4bGBionTt3at++fYU+7rJly+Th4aGXX37ZpX348OEyDEPffvutJDkfx3rxxRdd6l566aUr9v3888/nafPx8XH++dy5czp+/LjuvvtuSdLWrVvz1D/zzDPOP3t4eKhFixYyDEMDBw50tgcGBqpevXr67bffrjgW6eJcJSk6Otqlffjw4ZJUoEf1CmvZsmVq2bKl87FISSpbtqyeffZZHTx4ULt27XKpP336tDp27Kjdu3dr7dq1LkvAL1q0SAEBAXrggQd0/Phx56t58+YqW7as1qxZ49JXgwYNdO+99zrfV6pUqUCfU0ZGhiSpXLly15xfSkqKkpKS1L9/f1WoUMHZ3qRJEz3wwAPOz7woIiMjVbt2bZc+/f39rzl+Sc7FLZYsWSKHw1HkMQCAWQhXAHADtWzZUpGRkXle5cuXv+a+b731ltLT03X77bercePGGjFihLZv316g4x46dEihoaF5LqTvuOMO5/ZL/2u1WhUWFuZSV6dOnSv2fXmtJJ08eVJDhw5VcHCwfHx8VKlSJWfd6dOn89RXr17d5f2lJcEvPQL21/bLv3d0uUtzuHzMISEhCgwMdM7VTIcOHVK9evXytF/++V4ybNgw/fTTT1q1alWeR/f27dun06dPq3LlyqpUqZLLKzMzU0ePHnWpv/yzk6Ty5ctf83Py9/eXJJ05c6ZA85N0xTkeP35cWVlZ1+wnP0Udv3TxcdLWrVvrmWeeUXBwsHr37q3//ve/BC0AbsN3rgCglLjvvvt04MABLVmyRCtWrNCnn36qyZMna8aMGS53fm60v96luuSxxx7Txo0bNWLECIWHh6ts2bJyOBzq1KlTvhe+Hh4eBWqTlGcBjispyb871b17dy1YsEATJkzQ3LlzXZbcdzgcqly5subNm5fvvpUqVXJ5X9TPqX79+pKkX375xeXO1/W60ueem5ubb/v1/D37+Pho/fr1WrNmjZYuXarly5dr4cKFuv/++7VixYor9g0AxYU7VwBQilSoUEEDBgzQ/Pnz9fvvv6tJkyYuK/hd6cK2Ro0aOnLkSJ67FLt373Zuv/S/DodDycnJLnX79+8v8BhPnTql+Ph4vf7664qJiVHPnj31wAMPqFatWgXu43pcmsPlj0
+mpaUpPT3dOVezj7lnz5487Zd/vpf06NFDs2bN0ueff67Bgwe7bKtdu7ZOnDih1q1b53uXs2nTpqaMuVu3bpKk//znP9esvTT+K80xKCjIuQR/+fLl8/2x3+u5Y3i1oGy1WtWhQwdNmjRJu3bt0j//+U+tXr06z+OTAHAjEK4AoJS4fBnzsmXLqk6dOi7Li1+6wL384rZLly7Kzc3VtGnTXNonT54si8Wizp07S5KioqIkSR999JFL3YcffljgcV66W3D5nYcpU6YUuI/r0aVLl3yPN2nSJEm66sqH13PMzZs3KyEhwdmWlZWlmTNnqmbNmmrQoEGeffr27asPPvhAM2bM0GuvveZsf+yxx5Sbm6tx48bl2cdut+cbXIoiIiJCnTp10qeffqrFixfn2X7+/Hm98sorkqQqVaooPDxcc+bMcTn+jh07tGLFCudnLl0Mh6dPn3Z5ZDUlJcW5SmNRXOm8PnnyZJ7aS99f++s/FwBwo/BYIACUEg0aNFC7du3UvHlzVahQQYmJifriiy80ZMgQZ03z5s0lSS+//LKioqLk4eGh3r17q1u3bmrfvr3eeOMNHTx4UE2bNtWKFSu0ZMkSDRs2zLmgQPPmzfW3v/1NU6ZM0YkTJ5xLse/du1dSwR618/f313333ad3331XFy5c0G233aYVK1bkuRtWXJo2bap+/fpp5syZSk9PV9u2bbV582bNmTNHPXr0UPv27U0/5uuvv6758+erc+fOevnll1WhQgXNmTNHycnJ+t///ufy2N9fDRkyRBkZGXrjjTcUEBCgUaNGqW3btnruuecUGxurpKQkdezYUWXKlNG+ffu0aNEiTZ06VY888ogp4547d646duyohx9+WN26dVOHDh3k5+enffv2acGCBUpJSXH+1tV7772nzp07KyIiQgMHDnQuxR4QEOBy97R379567bXX1LNnT7388ss6e/asPv74Y91+++35LmZSEJfO6zfeeEO9e/dWmTJl1K1bN7311ltav369unbtqho1aujo0aP66KOPVLVqVZfFRQDghnHnUoUAcKu4tBT7Tz/9lO/2tm3bXnMp9vHjxxstW7Y0AgMDDR8fH6N+/frGP//5T+P8+fPOGrvdbrz00ktGpUqVDIvF4rL09ZkzZ4y///3vRmhoqFGmTBmjbt26xnvvvWc4HA6X42ZlZRmDBw82KlSoYJQtW9bo0aOHsWfPHkOSy9Lol5bWzm8Z7D/++MPo2bOnERgYaAQEBBiPPvqoceTIkSsu5355H1daIj2/zyk/Fy5cMGJiYoywsDCjTJkyRrVq1YyRI0ca586dK9BxruXypdgNwzAOHDhgPPLII0ZgYKDh7e1ttGzZ0vjmm29cav66FPtfvfrqq4YkY9q0ac62mTNnGs2bNzd8fHyMcuXKGY0bNzZeffVV48iRI86aGjVq5Ls8f9u2bfOM70rOnj1rvP/++8Zdd91llC1b1vD09DTq1q1rvPTSSy5LpBuGYaxatcpo3bq14ePjY/j7+xvdunUzdu3alafPFStWGI0aNTI8PT2NevXqGf/5z3+uuBT74MGD8+x/+blvGIYxbtw447bbbjOsVqtzWfb4+Hije/fuRmhoqOHp6WmEhoYaffr0Mfbu3VuguQOA2SyGUcBvBgMAbllJSUm688479Z///EdPPPGEu4cDAECJxHeuAAAusrOz87RNmTJFVqtV9913nxtGBABA6cB3rgAALt59911t2bJF7du3l81m07fffqtvv/1Wzz77rKpVq+bu4QEAUGLxWCAAwMXKlSsVExOjXbt2KTMzU9WrV9dTTz2lN954QzYb/00OAIArIVwBAAAAgAn4zhUAAAAAmIBwBQAAAAAm4OH5fDgcDh05ckTlypUr0A9mAgAAALg5GYahM2fOKDQ09Io/Cn8J4SofR44cYUUsAAAAAE6///67qlatetUawlU+ypUrJ+niB+jv7+/m0QAAAABwl4yMDFWrVs2ZEa6GcJWPS48C+v
v7E64AAAAAFOjrQixoAQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJiAcAUAAAAAJiBcAQAAAIAJCFcAAAAAYALCFQAAAACYgHAFAAAAACYgXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAreGq9jYWN11110qV66cKleurB49emjPnj3X3G/RokWqX7++vL291bhxYy1btsxlu2EYGj16tKpUqSIfHx9FRkZq3759xTUNAAAAAHBvuFq3bp0GDx6sH3/8UStXrtSFCxfUsWNHZWVlXXGfjRs3qk+fPho4cKC2bdumHj16qEePHtqxY4ez5t1339UHH3ygGTNmaNOmTfLz81NUVJTOnTt3I6YFAAAA4BZkMQzDcPcgLjl27JgqV66sdevW6b777su3plevXsrKytI333zjbLv77rsVHh6uGTNmyDAMhYaGavjw4XrllVckSadPn1ZwcLDi4uLUu3fva44jIyNDAQEBOn36tPz9/c2ZHAAAAIBSpzDZwHaDxlQgp0+fliRVqFDhijUJCQmKjo52aYuKitLixYslScnJyUpNTVVkZKRze0BAgFq1aqWEhIR8w1VOTo5ycnKc7zMyMiRJdrtddru9yPMBAAAAblbHjx/XmTNniq3/cuXKKSgoqNj6L6jC5IESE64cDoeGDRum1q1bq1GjRlesS01NVXBwsEtbcHCwUlNTndsvtV2p5nKxsbGKiYnJ056YmCg/P79CzQMAAAC42Z0/f167du3VhQuOYjtGmTJWNWhwuzw9PYvtGAVxta8sXa7EhKvBgwdrx44d2rBhww0/9siRI13uhmVkZKhatWpq0aIFjwUCAAAAl0lOTtZrr02Vl9dQ+fhUNb3/7Ow/lJMzVfPm3a+wsDDT+y+MS0+1FUSJCFdDhgzRN998o/Xr16tq1av/5YSEhCgtLc2lLS0tTSEhIc7tl9qqVKniUhMeHp5vn15eXvLy8srTbrPZZLOViI8IAAAAKDGsVqvs9lyVLVtdXl61Te/fbrcqKytXVqvV7dfjhTm+W1cLNAxDQ4YM0VdffaXVq1cXKJVGREQoPj7epW3lypWKiIiQJIWFhSkkJMSlJiMjQ5s2bXLWAAAAAIDZ3BoDBw8erM8//1xLlixRuXLlnN+JCggIkI+PjySpb9++uu222xQbGytJGjp0qNq2bauJEyeqa9euWrBggRITEzVz5kxJksVi0bBhwzR+/HjVrVtXYWFhevPNNxUaGqoePXq4ZZ4AAAAAbn5uDVcff/yxJKldu3Yu7bNnz1b//v0lSYcPH5bV+n832O655x59/vnn+sc//qFRo0apbt26Wrx4scsiGK+++qqysrL07LPPKj09XW3atNHy5cvl7e1d7HMCAAAAcGtya7gqyE9srV27Nk/bo48+qkcfffSK+1gsFr311lt66623rmd4AAAAAFBgbv3OFQAAAADcLAhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJiAcAUAAAAAJiBcAQAAAIAJCFcAAAAAYALCFQAAAACYgHAFAAAAACYgXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJiAcAUAAAAAJiBcAQAAAIAJCFcAAAAAYALCFQAAAACYgHAFAAAAACYgXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJiAcAUAAAAAJiBcAQAAAIAJCFcAAAAAYALCFQAAAACYgHAFAAAAACYgXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJiAcAUAAAAAJnBruFq/fr26deum0NBQWSwWLV68+Kr1/fv3l8ViyfNq2LChs2bs2LF5ttevX7+YZwIAAADgVufWcJ
WVlaWmTZtq+vTpBaqfOnWqUlJSnK/ff/9dFSpU0KOPPupS17BhQ5e6DRs2FMfwAQAAAMDJ5s6Dd+7cWZ07dy5wfUBAgAICApzvFy9erFOnTmnAgAEudTabTSEhIaaNEwAAAACuxa3h6np99tlnioyMVI0aNVza9+3bp9DQUHl7eysiIkKxsbGqXr36FfvJyclRTk6O831GRoYkyW63y263F8/gAQAAgFLK4XDIZvOQzeaQh4f518s228X+HQ6H26/HC3P8Uhuujhw5om+//Vaff/65S3urVq0UFxenevXqKSUlRTExMbr33nu1Y8cOlStXLt++YmNjFRMTk6c9MTFRfn5+xTJ+AAAAoLTKzs7W449HyWY7JA+Po6b3n5ubLbs9SocOHdLRo+b3XxhZWVkFrrUYhmEU41gKzGKx6KuvvlKPHj0KVB8bG6uJEyfqyJEj8vT0vGJdenq6atSooUmTJmngwIH51uR356patWo6ceKE/P39CzUPAAAA4GaXnJysJ54YocDA9+TrG2Z6/2fPJis9fYTmzXtPYWHm918YGRkZqlixok6fPn3NbFAq71wZhqFZs2bpqaeeumqwkqTAwEDdfvvt2r9//xVrvLy85OXllafdZrPJZiuVHxEAAABQbKxWq+z2XNntVuXmmn+9bLdf7N9qtbr9erwwxy+Vv3O1bt067d+//4p3ov4qMzNTBw4cUJUqVW7AyAAAAADcqtwarjIzM5WUlKSkpCRJF28vJiUl6fDhw5KkkSNHqm/fvnn2++yzz9SqVSs1atQoz7ZXXnlF69at08GDB7Vx40b17NlTHh4e6tOnT7HOBQAAAMCtza332BITE9W+fXvn++joaElSv379FBcXp5SUFGfQuuT06dP63//+p6lTp+bb5x9//KE+ffroxIkTqlSpktq0aaMff/xRlSpVKr6JAAAAALjluTVctWvXTldbTyMuLi5PW0BAgM6ePXvFfRYsWGDG0AAAAACgUErld64AAAAAoKQhXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJiAcAUAAAAAJiBcAQAAAIAJCFcAAAAAYALCFQAAAACYgHAFAAAAACYgXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJiAcAUAAAAAJiBcAQAAAIAJCFcAAAAAYALCFQAAAACYgHAFAAAAACYgXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJiAcAUAAAAAJiBcAQAAAIAJCFcAAAAAYALCFQAAAACYgHAFAAAAACYgXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJjAreFq/fr16tatm0JDQ2WxWLR48eKr1q9du1YWiyXPKzU11aVu+vTpqlmzpry9vdWqVStt3ry5GGcBAAAAAG4OV1lZWWratKmmT59eqP327NmjlJQU56ty5crObQsXLlR0dLTGjBmjrVu3qmnTpoqKitLRo0fNHj4AAAAAONncefDOnTurc+fOhd6vcuXKCgwMzHfbpEmTNGjQIA0YMECSNGPGDC1dulSzZs3S66+/fj3DBQAAAIArcmu4Kqrw8HDl5OSoUaNGGjt2rFq3bi1JOn/+vLZs2aKRI0c6a61WqyIjI5WQkHDF/nJycpSTk+N8n5GRIUmy2+2y2+3FNAsAAACgdHI4HLLZPGSzOeThYf71ss12sX+Hw+H26/HCHL9UhasqVapoxowZatGihXJycvTpp5+qXbt22rRpk5o1a6bjx48rNzdXwcHBLvsFBwdr9+7dV+w3NjZWMTExedoTExPl5+dn+jwAAACA0iw7O1uPPx4lm+2QPDzM//pNbm627PYoHTp0yO1f78nKyipwbakKV/Xq1VO9ev
Wc7++55x4dOHBAkydP1r///e8i9zty5EhFR0c732dkZKhatWpq0aKF/P39r2vMAAAAwM0mOTlZo0ZNU2BgpHx9w0zv/+zZZKWnT9O8eZEKCzO//8K49FRbQZSqcJWfli1basOGDZKkoKAgeXh4KC0tzaUmLS1NISEhV+zDy8tLXl5eedptNptstlL/EQEAAACmslqtsttzZbdblZtr/vWy3X6xf6vV6vbr8cIcv9T/zlVSUpKqVKkiSfL09FTz5s0VHx/v3O5wOBQfH6+IiAh3DREAAADALcCtMTAzM1P79+93vk9OTlZSUpIqVKig6tWra+TIkfrzzz81d+5cSdKUKVMUFhamhg0b6ty5c/r000+1evVqrVixwtlHdHS0+vXrpxYtWqhly5aaMmWKsrKynKsHAgAAAEBxcGu4SkxMVPv27Z3vL33vqV+/foqLi1NKSooOHz7s3H7+/HkNHz5cf/75p3x9fdWkSROtWrXKpY9evXrp2LFjGj16tFJTUxUeHq7ly5fnWeQCAAAAAMxkMQzDcPcgSpqMjAwFBATo9OnTLGgBAAAAXObAgQN69NFhCgycIj+/2qb3n5V1QOnpw7Ro0RTVrm1+/4VRmGxQ6r9zBQAAAAAlAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABMQrgAAAADABIQrAAAAADAB4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABMQrgAAAADABIQrAAAAADAB4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABMQrgAAAADABIQrAAAAADAB4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABMQrgAAAADABG4NV+vXr1e3bt0UGhoqi8WixYsXX7X+yy+/1AMPPKBKlSrJ399fERER+u6771xqxo4dK4vF4vKqX79+Mc4CAAAAANwcrrKystS0aVNNnz69QPXr16/XAw88oGXLlmnLli1q3769unXrpm3btrnUNWzYUCkpKc7Xhg0bimP4AAAAAOBkc+fBO3furM6dOxe4fsqUKS7v3377bS1ZskT/7//9P915553OdpvNppCQELOGCQAAAADX5NZwdb0cDofOnDmjChUquLTv27dPoaGh8vb2VkREhGJjY1W9evUr9pOTk6OcnBzn+4yMDEmS3W6X3W4vnsEDAAAApZTD4ZDN5iGbzSEPD/Ovl222i/07HA63X48X5vilOly9//77yszM1GOPPeZsa9WqleLi4lSvXj2lpKQoJiZG9957r3bs2KFy5crl209sbKxiYmLytCcmJsrPz6/Yxg8AAACURtnZ2Xr88SjZbIfk4XHU9P5zc7Nlt0fp0KFDOnrU/P4LIysrq8C1FsMwjGIcS4FZLBZ99dVX6tGjR4HqP//8cw0aNEhLlixRZGTkFevS09NVo0YNTZo0SQMHDsy3Jr87V9WqVdOJEyfk7+9fqHkAAAAAN7vk5GQ98cQIBQa+J1/fMNP7P3s2WenpIzRv3nsKCzO//8LIyMhQxYoVdfr06Wtmg1J552rBggV65plntGjRoqsGK0kKDAzU7bffrv3791+xxsvLS15eXnnabTabbLZS+REBAAAAxcZqtcpuz5XdblVurvnXy3b7xf6tVqvbr8cLc/xS9ztX8+fP14ABAzR//nx17dr1mvWZmZk6cOCAqlSpcgNGBwAAAOBW5dYYmJmZ6XJHKTk5WUlJSapQoYKqV6+ukSNH6s8//9TcuXMlXXwUsF+/fpo6dapatWql1NRUSZKPj48CAg
IkSa+88oq6deumGjVq6MiRIxozZow8PDzUp0+fGz9BAAAAALcMt965SkxM1J133ulcRj06Olp33nmnRo8eLUlKSUnR4cOHnfUzZ86U3W7X4MGDVaVKFedr6NChzpo//vhDffr0Ub169fTYY4+pYsWK+vHHH1WpUqUbOzkAAAAAtxS33rlq166drraeRlxcnMv7tWvXXrPPBQsWXOeoAAAAAKDwSt13rgAAAACgJCJcAQAAAIAJCFcAAAAAYALCFQAAAACYoEjh6rfffjN7HAAAAABQqhUpXNWpU0ft27fXf/7zH507d87sMQEAAABAqVOkcLV161Y1adJE0dHRCgkJ0XPPPafNmzebPTYAAAAAKDWKFK7Cw8M1depUHTlyRLNmzVJKSoratGmjRo0aadKkSTp27JjZ4wQAAACAEu26FrSw2Wx6+OGHtWjRIr3zzjvav3+/XnnlFVWrVk19+/ZVSkqKWeMEAAAAgBLtusJVYmKiXnzxRVWpUkWTJk3SK6+8ogMHDmjlypU6cuSIunfvbtY4AQAAAKBEsxVlp0mTJmn27Nnas2ePunTporlz56pLly6yWi9mtbCwMMXFxalmzZpmjhUAAAAASqwihauPP/5YTz/9tPr3768qVarkW1O5cmV99tln1zU4AAAAACgtihSu9u3bd80aT09P9evXryjdAwAAAECpU6TvXM2ePVuLFi3K075o0SLNmTPnugcFAAAAAKVNkcJVbGysgoKC8rRXrlxZb7/99nUPCgAAAABKmyKFq8OHDyssLCxPe40aNXT48OHrHhQAAAAAlDZFCleVK1fW9u3b87T//PPPqlix4nUPCgAAAABKmyKFqz59+ujll1/WmjVrlJubq9zcXK1evVpDhw5V7969zR4jAAAAAJR4RVotcNy4cTp48KA6dOggm+1iFw6HQ3379uU7VwAAAABuSUUKV56enlq4cKHGjRunn3/+WT4+PmrcuLFq1Khh9vgAAAAAoFQoUri65Pbbb9ftt99u1lgAAAAAoNQqUrjKzc1VXFyc4uPjdfToUTkcDpftq1evNmVwAAAAAFBaFClcDR06VHFxceratasaNWoki8Vi9rgAAAAAoFQpUrhasGCB/vvf/6pLly5mjwcAAAAASqUiLcXu6empOnXqmD0WAAAAACi1ihSuhg8frqlTp8owDLPHAwAAAAClUpEeC9ywYYPWrFmjb7/9Vg0bNlSZMmVctn/55ZemDA4AAAAASosihavAwED17NnT7LEAAAAAQKlVpHA1e/Zss8cBAAAAAKVakb5zJUl2u12rVq3SJ598ojNnzkiSjhw5oszMTNMGBwAAAAClRZHuXB06dEidOnXS4cOHlZOTowceeEDlypXTO++8o5ycHM2YMcPscQIAAABAiVakO1dDhw5VixYtdOrUKfn4+Djbe/bsqfj4eNMGBwAAAAClRZHuXH3//ffauHGjPD09Xdpr1qypP//805SBAQAAAEBpUqQ7Vw6HQ7m5uXna//jjD5UrV+66BwUAAAAApU2RwlXHjh01ZcoU53uLxaLMzEyNGTNGXbp0MWtsAAAAAFBqFOmxwIkTJyoqKkoNGjTQuXPn9Pjjj2vfvn0KCgrS/PnzzR4jAAAAAJR4RQpXVatW1c8//6wFCxZo+/btyszM1MCBA/XEE0+4LHABAAAAALeKIoUrSbLZbHryySfNHAsAAAAAlFpFCldz58696va+ffsWaTAAAAAAUFoVKVwNHTrU5f2FCxd09uxZeXp6ytfXl3AFAAAA4JZTpNUCT5065fLKzMzUnj171KZNGxa0AAAAAHBLKlK4yk/dunU1YcKEPHe1AAAAAOBWYFq4ki4ucnHkyBEzuwQAAACAUqFI37n6+uuvXd4bhqGUlBRNmzZNrVu3NmVgAAAAAFCaFOnOVY8ePVxeDz/8sMaOHasmTZpo1qxZBe5n/fr16tatm0JDQ2WxWLR48eJr7rN27Vo1a9ZMXl5eqlOnjuLi4vLUTJ
8+XTVr1pS3t7datWqlzZs3F2J2AAAAAFB4RQpXDofD5ZWbm6vU1FR9/vnnqlKlSoH7ycrKUtOmTTV9+vQC1ScnJ6tr165q3769kpKSNGzYMD3zzDP67rvvnDULFy5UdHS0xowZo61bt6pp06aKiorS0aNHCz1PAAAAACioIv+IsBk6d+6szp07F7h+xowZCgsL08SJEyVJd9xxhzZs2KDJkycrKipKkjRp0iQNGjRIAwYMcO6zdOlSzZo1S6+//rr5kwAAAAAAFTFcRUdHF7h20qRJRTlEvhISEhQZGenSFhUVpWHDhkmSzp8/ry1btmjkyJHO7VarVZGRkUpISLhivzk5OcrJyXG+z8jIkCTZ7XbZ7XbTxl9Ux48f15kzZ4qt/3LlyikoKKjY+gcAAMDNxeFwyGbzkM3mkIeH+dfLNtvF/h0Oh9uvxwtz/CKFq23btmnbtm26cOGC6tWrJ0nau3evPDw81KxZM2edxWIpSvdXlJqaquDgYJe24OBgZWRkKDs7W6dOnVJubm6+Nbt3775iv7GxsYqJicnTnpiYKD8/P3MGX0Tnz5/Xrl17deGCo9iOUaaMVQ0a3C5PT89iOwYAAABuHtnZ2Xr88SjZbIfk4WH+129yc7Nlt0fp0KFDbv96T1ZWVoFrixSuunXrpnLlymnOnDkqX768pIs/LDxgwADde++9Gj58eFG6dZuRI0e63I3LyMhQtWrV1KJFC/n7+7txZBe/Z/baa1Pl5TVUPj5VTe8/O/sP5eRM1bx59yssLMz0/gEAAHDzSU5O1qhR0xQYGClfX/OvIc+eTVZ6+jTNmxfp9mvUS0+1FUSRwtXEiRO1YsUKZ7CSpPLly2v8+PHq2LFjsYWrkJAQpaWlubSlpaXJ399fPj4+8vDwkIeHR741ISEhV+zXy8tLXl5eedptNptsNrd+LU1Wq1V2e67Klq0uL6/apvdvt1uVlZUrq9Xq9rkCAACgdLh0jWq3W5Wba/41pN1+sf+ScI1amOMXabXAjIwMHTt2LE/7sWPHivW7QREREYqPj3dpW7lypSIiIiRJnp6eat68uUuNw+FQfHy8swYAAAAAikORwlXPnj01YMAAffnll/rjjz/0xx9/6H//+58GDhyohx9+uMD9ZGZmKikpSUlJSZIu3l5MSkrS4cOHJV18XK9v377O+ueff16//fabXn31Ve3evVsfffSR/vvf/+rvf/+7syY6Olr/+te/NGfOHP3666964YUXlJWV5Vw9EAAAAACKQ5Husc2YMUOvvPKKHn/8cV24cOFiRzabBg4cqPfee6/A/SQmJqp9+/bO95e+99SvXz/FxcUpJSXFGbQkKSwsTEuXLtXf//53TZ06VVWrVtWnn37qXIZdknr16qVjx45p9OjRSk1NVXh4uJYvX55nkQsAAAAAMFORwpWvr68++ugjvffeezpw4IAkqXbt2oVeWa9du3YyDOOK2+Pi4vLdZ9u2bVftd8iQIRoyZEihxgIAAAAA16NIjwVekpKSopSUFNWtW1d+fn5XDUoAAAAAcDMrUrg6ceKEOnTooNtvv11dunRRSkqKJGngwIGlbhl2AAAAADBDkcLV3//+d5UpU0aHDx+Wr6+vs71Xr15avny5aYMDAAAAgNKiSN+5WrFihb777jtVrer6o7Z169bVoUOHTBkYAAAAAJQmRbpzlZWV5XLH6pKTJ0/m+2O8AAAAAHCzK1K4uvfeezV37lzne4vFIofDoXfffddlaXUAAAAAuFUU6bHAd999Vx06dFBiYqLOnz+vV199VTt37tTJkyf1ww8/mD1GAAAAACjxinTnqlGjRtq7d6/atGmj7t27KysrSw8//LC2bdum2rVrmz1GAAAAACjxCn3n6sKFC+rUqZNmzJihN954ozjGBAAAAAClTqHvXJUpU0bbt28vjrEAAAAAQKlVpMcCn3zySX322WdmjwUAAAAASq0iLWhht9s1a9YsrVq1Ss2bN5efn5/L9kmTJpkyOA
AAAAAoLQoVrn777TfVrFlTO3bsULNmzSRJe/fudamxWCzmjQ4AAAAASolChau6desqJSVFa9askST16tVLH3zwgYKDg4tlcAAAAABQWhTqO1eGYbi8//bbb5WVlWXqgAAAAACgNCrSghaXXB62AAAAAOBWVahwZbFY8nyniu9YAQAAAEAhv3NlGIb69+8vLy8vSdK5c+f0/PPP51kt8MsvvzRvhAAAAABQChQqXPXr18/l/ZNPPmnqYAAAAACgtCpUuJo9e3ZxjQMAAAAASrXrWtACAAAAAHAR4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABMQrgAAAADABIQrAAAAADAB4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABMQrgAAAADABCUiXE2fPl01a9aUt7e3WrVqpc2bN1+xtl27drJYLHleXbt2ddb0798/z/ZOnTrdiKkAAAAAuEXZ3D2AhQsXKjo6WjNmzFCrVq00ZcoURUVFac+ePapcuXKe+i+//FLnz593vj9x4oSaNm2qRx991KWuU6dOmj17tvO9l5dX8U0CAAAAwC3P7XeuJk2apEGDBmnAgAFq0KCBZsyYIV9fX82aNSvf+goVKigkJMT5WrlypXx9ffOEKy8vL5e68uXL34jpAAAAALhFufXO1fnz57VlyxaNHDnS2Wa1WhUZGamEhIQC9fHZZ5+pd+/e8vPzc2lfu3atKleurPLly+v+++/X+PHjVbFixXz7yMnJUU5OjvN9RkaGJMlut8tutxd2WqZyOByy2Txksznk4WH+WGy2i/07HA63zxUAAAClw610jVqY47s1XB0/fly5ubkKDg52aQ8ODtbu3buvuf/mzZu1Y8cOffbZZy7tnTp10sMPP6ywsDAdOHBAo0aNUufOnZWQkCAPD488/cTGxiomJiZPe2JiYp7QdqNlZ2fr8cejZLMdkofHUdP7z83Nlt0epUOHDunoUfP7BwAAwM3nVrpGzcrKKnCt279zdT0+++wzNW7cWC1btnRp7927t/PPjRs3VpMmTVS7dm2tXbtWHTp0yNPPyJEjFR0d7XyfkZGhatWqqUWLFvL39y++CRRAcnKyRo2apsDASPn6hpne/9mzyUpPn6Z58yIVFmZ+/wAAALj53ErXqJeeaisIt4aroKAgeXh4KC0tzaU9LS1NISEhV903KytLCxYs0FtvvXXN49SqVUtBQUHav39/vuHKy8sr3wUvbDabbDb35k+r1Sq7PVd2u1W5ueaPxW6/2L/VanX7XAEAAFA63ErXqIU5vlsXtPD09FTz5s0VHx/vbHM4HIqPj1dERMRV9120aJFycnL05JNPXvM4f/zxh06cOKEqVapc95gBAAAAID9uXy0wOjpa//rXvzRnzhz9+uuveuGFF5SVlaUBAwZIkvr27euy4MUln332mXr06JFnkYrMzEyNGDFCP/74ow4ePKj4+Hh1795dderUUVRU1A2ZEwAAAIBbj9ufA+vVq5eOHTum0aNHKzU1VeHh4Vq+fLlzkYvDhw/LanXNgHv27NGGDRu0YsWKPP15eHho+/btmjNnjtLT0xUaGqqOHTtq3Lhx/NYVAAAAgGLj9nAlSUOGDNGQIUPy3bZ27do8bfXq1ZNhGPnW+/j46LvvvjNzeAAAAABwTW5/LBAAAAAAbgaEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABMQrgAAAADABIQrAAAAADAB4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAAB
MQrgAAAADABIQrAAAAADAB4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABMQrgAAAADABIQrAAAAADAB4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABOUiHA1ffp01axZU97e3mrVqpU2b958xdq4uDhZLBaXl7e3t0uNYRgaPXq0qlSpIh8fH0VGRmrfvn3FPQ0AAAAAtzC3h6uFCxcqOjpaY8aM0datW9W0aVNFRUXp6NGjV9zH399fKSkpztehQ4dctr/77rv64IMPNGPGDG3atEl+fn6KiorSuXPnins6AAAAAG5Rbg9XkyZN0qBBgzRgwAA1aNBAM2bMkK+vr2bNmnXFfSwWi0JCQpyv4OBg5zbDMDRlyhT94x//UPfu3dWkSRPNnTtXR44c0eLFi2/AjAAAAADcimzuPPj58+e1ZcsWjRw50tlmtVoVGRmphISEK+6XmZmpGjVqyOFwqFmzZnr77bfVsGFDSVJycrJSU1MVGRnprA8ICFCrVq2UkJCg3r175+kvJydHOTk5zvcZGRmSJLvdLrvdft3zvB4Oh0M2m4dsNoc8PMwfi812sX+Hw+H2uQIAAKB0uJWuUQtzfLeGq+PHjys3N9flzpMkBQcHa/fu3fnuU69ePc2aNUtNmjTR6dOn9f777+uee+7Rzp07VbVqVaWmpjr7uLzPS9suFxsbq5iYmDztiYmJ8vPzK8rUTJOdna3HH4+SzXZIHh5XflSyqHJzs2W3R+nQoUNXfRQTAAAAuORWukbNysoqcK1bw1VRREREKCIiwvn+nnvu0R133KFPPvlE48aNK1KfI0eOVHR0tPN9RkaGqlWrphYtWsjf3/+6x3w9kpOTNWrUNAUGRsrXN8z0/s+eTVZ6+jTNmxepsDDz+wcAAMDN51a6Rr30VFtBuDVcBQUFycPDQ2lpaS7taWlpCgkJKVAfZcqU0Z133qn9+/dLknO/tLQ0ValSxaXP8PDwfPvw8vKSl5dXnnabzSabzb3502q1ym7Pld1uVW6u+WOx2y/2b7Va3T5XAAAAlA630jVqYY7v1gUtPD091bx5c8XHxzvbHA6H4uPjXe5OXU1ubq5++eUXZ5AKCwtTSEiIS58ZGRnatGlTgfsEAAAAgMJy+62K6Oho9evXTy1atFDLli01ZcoUZWVlacCAAZKkvn376rbbblNsbKwk6a233tLdd9+tOnXqKD09Xe+9954OHTqkZ555RtLFlQSHDRum8ePHq27dugoLC9Obb76p0NBQ9ejRw13TBAAAAHCTc3u46tWrl44dO6bRo0crNTVV4eHhWr58uXNBisOHD8tq/b8bbKdOndKgQYOUmpqq8uXLq3nz5tq4caMaNGjgrHn11VeVlZWlZ599Vunp6WrTpo2WL1+e58eGAQAAAMAsbg9XkjRkyBANGTIk321r1651eT958mRNnjz5qv1ZLBa99dZbeuutt8waIgAAAABcldt/RBgAAAAAbgaEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABMQrgAAAADABIQrAAAAADAB4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABMQrgAAAADABIQrAAAAADAB4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwAAAAAwAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABMQrgAAAADABIQrAAAAADAB4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwAAAAAwAe
EKAAAAAExAuAIAAAAAExCuAAAAAMAEhCsAAAAAMAHhCgAAAABMQLgCAAAAABOUiHA1ffp01axZU97e3mrVqpU2b958xdp//etfuvfee1W+fHmVL19ekZGReer79+8vi8Xi8urUqVNxTwMAAADALczt4WrhwoWKjo7WmDFjtHXrVjVt2lRRUVE6evRovvVr165Vnz59tGbNGiUkJKhatWrq2LGj/vzzT5e6Tp06KSUlxfmaP3/+jZgOAAAAgFuU28PVpEmTNGjQIA0YMEANGjTQjBkz5Ovrq1mzZuVbP2/ePL344osKDw9X/fr19emnn8rhcCg+Pt6lzsvLSyEhIc5X+fLlb8R0AAAAANyibO48+Pnz57VlyxaNHDnS2Wa1WhUZGamEhIQC9XH27FlduHBBFSpUcGlfu3atKleurPLly+v+++/X+PHjVbFixXz7yMnJUU5OjvN9RkaGJMlut8tutxd2WqZyOByy2Txksznk4WH+WGy2i/07HA63zxUAAAClw610jVqY47s1XB0/fly5ubkKDg52aQ8ODtbu3bsL1Mdrr72m0NBQRUZGOts6deqkhx9+WGFhYTpw4IBGjRqlzp07KyEhQR4eHnn6iI2NVUxMTJ72xMRE+fn5FXJW5srOztbjj0fJZjskD4/8H5W8Hrm52bLbo3To0KErPooJAAAA/NWtdI2alZVV4Fq3hqvrNWHCBC1YsEBr166Vt7e3s713797OPzdu3FhNmjRR7dq1tXbtWnXo0CFPPyNHjlR0dLTzfUZGhqpVq6YWLVrI39+/eCdxDcnJyRo1apoCAyPl6xtmev9nzyYrPX2a5s2LVFiY+f0DAADg5nMrXaNeeqqtINwaroKCguTh4aG0tDSX9rS0NIWEhFx13/fff18TJkzQqlWr1KRJk6vW1qpVS0FBQdq/f3++4crLy0teXl552m02m2w29+ZPq9Uquz1XdrtVubnmj8Vuv9i/1Wp1+1wBAABQOtxK16iFOb5bF7Tw9PRU8+bNXRajuLQ4RURExBX3e/fddzVu3DgtX75cLVq0uOZx/vjjD504cUJVqlQxZdwAAAAAcDm3rxYYHR2tf/3rX5ozZ45+/fVXvfDCC8rKytKAAQMkSX379nVZ8OKdd97Rm2++qVmzZqlmzZpKTU1VamqqMjMzJUmZmZkaMWKEfvzxRx08eFDx8fHq3r276tSpo6ioKLfMEQAAAMDNz+3PgfXq1UvHjh3T6NGjlZqaqvDwcC1fvty5yMXhw4dltf5fBvz44491/vx5PfLIIy79jBkzRmPHjpWHh4e2b9+uOXPmKD09XaGhoerYsaPGjRuX76N/AAAAAGAGt4crSRoyZIiGDBmS77a1a9e6vD948OBV+/Lx8dF3331n0sgAAAAAoGDc/lggAAAAANwMCFcAAAAAYALCFQAAAACYgHAFAAAAACYgXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJiAcAUAAAAAJiBcAQAAAIAJCFcAAAAAYALCFQAAAACYgHAFAAAAACYgXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJiAcAUAAAAAJiBcAQAAAIAJCFcAAAAAYALCFQAAAACYgHAFAAAAACYgXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJiAcAUAAAAAJiBcAQAAAIAJCFcAAAAAYALCFQAAAACYgHAFAAAAACYgXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmKBHhavr06apZs6a8vb3VqlUrbd68+ar1ixYtUv369eXt7a3GjRtr2bJlLtsNw9Do0aNVpUoV+fj4KDIyUvv27SvOKQAAAAC4xbk9XC1cuFDR0dEaM2aMtm7dqqZNmyoqKkpHjx7Nt37jxo3q06
ePBg4cqG3btqlHjx7q0aOHduzY4ax599139cEHH2jGjBnatGmT/Pz8FBUVpXPnzt2oaQEAAAC4xbg9XE2aNEmDBg3SgAED1KBBA82YMUO+vr6aNWtWvvVTp05Vp06dNGLECN1xxx0aN26cmjVrpmnTpkm6eNdqypQp+sc//qHu3burSZMmmjt3ro4cOaLFixffwJkBAAAAuJXY3Hnw8+fPa8uWLRo5cqSzzWq1KjIyUgkJCfnuk5CQoOjoaJe2qKgoZ3BKTk5WamqqIiMjndsDAgLUqlUrJSQkqHfv3nn6zMnJUU5OjvP96dOnJUknT56U3W4v8vzMkJGRIYvFoezsXyVlmN5/dvafcjhytHPnTmVkmN8/AAAAbj6///67HI4LxXqNarE4lJGRoZMnT5ref2FcukY2DOOatW4NV8ePH1dubq6Cg4Nd2oODg7V79+5890lNTc23PjU11bn9UtuVai4XGxurmJiYPO1hYWEFm8gNsezaJdehe/eVxdo/AAAAbkbfFWvvzZoV7zVwYZw5c0YBAQFXrXFruCopRo4c6XI3zOFw6OTJk6pYsaIsFouki4m1WrVq+v333+Xv7++uoeIWwfmGG4nzDTcS5xtuJM43mMEwDJ05c0ahoaHXrHVruAoKCpKHh4fS0tJc2tPS0hQSEpLvPiEhIVetv/S/aWlpqlKliktNeHh4vn16eXnJy8vLpS0wMDDfWn9/f/7hxA3D+YYbifMNNxLnG24kzjdcr2vdsbrErQtaeHp6qnnz5oqPj3e2ORwOxcfHKyIiIt99IiIiXOolaeXKlc76sLAwhYSEuNRkZGRo06ZNV+wTAAAAAK6X2x8LjI6OVr9+/dSiRQu1bNlSU6ZMUVZWlgYMGCBJ6tu3r2677TbFxsZKkoYOHaq2bdtq4sSJ6tq1qxYsWKDExETNnDlTkmSxWDRs2DCNHz9edevWVVhYmN58802FhoaqR48e7pomAAAAgJuc28NVr169dOzYMY0ePVqpqakKDw/X8uXLnQtSHD58WFbr/91gu+eee/T555/rH//4h0aNGqW6detq8eLFatSokbPm1VdfVVZWlp599lmlp6erTZs2Wr58uby9vYs8Ti8vL40ZMybP44NAceB8w43E+YYbifMNNxLnG240i1GQNQUBAAAAAFfl9h8RBgAAAICbAeEKAAAAAExAuAIAAAAAExCuAAAAAMAEhKsCmj59umrWrClvb2+1atVKmzdvdveQUMKtX79e3bp1U2hoqCwWixYvXuyy3TAMjR49WlWqVJGPj48iIyO1b98+l5qTJ0/qiSeekL+/vwIDAzVw4EBlZma61Gzfvl333nuvvL29Va1aNb377rvFPTWUQLGxsbrrrrtUrlw5Va5cWT169NCePXtcas6dO6fBgwerYsWKKlu2rP72t7/l+VH2w4cPq2vXrvL19VXlypU1YsQI2e12l5q1a9eqWbNm8vLyUp06dRQXF1fc00MJ8/HHH6tJkybOH2aNiIjQt99+69zOuYbiMmHCBOfP7lzC+YYSxcA1LViwwPD09DRmzZpl7Ny50xg0aJARGBhopKWluXtoKMGWLVtmvPHGG8aXX35pSDK++uorl+0TJkwwAgICjMWLFxs///yz8dBDDxlhYWFGdna2s6ZTp05G06ZNjR9//NH4/vvvjTp16hh9+vRxbj99+rQRHBxsPPHEE8aOHTuM+fPnGz4+PsYnn3xyo6aJEiIqKsqYPXu2sWPHDiMpKcno0qWLUb16dSMzM9NZ8/zzzxvVqlUz4uPjjcTEROPuu+827rnnHud2u91uNGrUyIiMjDS2bdtmLFu2zAgKCjJGjhzprPntt98MX19fIzo62ti1a5fx4YcfGh4eHsby5ctv6HzhXl9//bWxdOlSY+/evcaePXuMUaNGGWXKlDF27NhhGAbnGorH5s2bjZo1axpNmjQxhg4d6mznfENJQrgqgJYtWxqDBw92vs/NzTVCQ0ON2NhYN44Kpcnl4c
rhcBghISHGe++952xLT083vLy8jPnz5xuGYRi7du0yJBk//fSTs+bbb781LBaL8eeffxqGYRgfffSRUb58eSMnJ8dZ89prrxn16tUr5hmhpDt69KghyVi3bp1hGBfPrzJlyhiLFi1y1vz666+GJCMhIcEwjIv/QcBqtRqpqanOmo8//tjw9/d3nmOvvvqq0bBhQ5dj9erVy4iKiiruKaGEK1++vPHpp59yrqFYnDlzxqhbt66xcuVKo23bts5wxfmGkobHAq/h/Pnz2rJliyIjI51tVqtVkZGRSkhIcOPIUJolJycrNTXV5bwKCAhQq1atnOdVQkKCAgMD1aJFC2dNZGSkrFarNm3a5Ky577775Onp6ayJiorSnj17dOrUqRs0G5REp0+fliRVqFBBkrRlyxZduHDB5ZyrX7++qlev7nLONW7c2Pkj7tLF8ykjI0M7d+501vy1j0s1/Pvw1pWbm6sFCxYoKytLERERnGsoFoMHD1bXrl3znBOcbyhpbO4eQEl3/Phx5ebmuvwDKUnBwcHavXu3m0aF0i41NVWS8j2vLm1LTU1V5cqVXbbbbDZVqFDBpSYsLCxPH5e2lS9fvljGj5LN4XBo2LBhat26tRo1aiTp4vng6empwMBAl9rLz7n8zslL265Wk5GRoezsbPn4+BTHlFAC/fLLL4qIiNC5c+dUtmxZffXVV2rQoIGSkpI412CqBQsWaOvWrfrpp5/ybOPfbShpCFcAcJMZPHiwduzYoQ0bNrh7KLiJ1atXT0lJSTp9+rS++OIL9evXT+vWrXP3sHCT+f333zV06FCtXLlS3t7e7h4OcE08FngNQUFB8vDwyLPqTFpamkJCQtw0KpR2l86dq51XISEhOnr0qMt2u92ukydPutTk18dfj4Fby5AhQ/TNN99ozZo1qlq1qrM9JCRE58+fV3p6ukv95efctc6nK9X4+/vzX3ZvMZ6enqpTp46aN2+u2NhYNW3aVFOnTuVcg6m2bNmio0ePqlmzZrLZbLLZbFq3bp0++OAD2Ww2BQcHc76hRCFcXYOnp6eaN2+u+Ph4Z5vD4VB8fLwiIiLcODKUZmFhYQoJCXE5rzIyMrRp0ybneRUREaH09HRt2bLFWbN69Wo5HA61atXKWbN+/XpduHDBWbNy5UrVq1ePRwJvMYZhaMiQIfrqq6+0evXqPI+LNm/eXGXKlHE55/bs2aPDhw+7nHO//PKLS6hfuXKl/P391aBBA2fNX/u4VMO/D+FwOJSTk8O5BlN16NBBv/zyi5KSkpyvFi1a6IknnnD+mfMNJYq7V9QoDRYsWGB4eXkZcXFxxq5du4xnn33WCAwMdFl1BrjcmTNnjG3bthnbtm0zJBmTJk0ytm3bZhw6dMgwjItLsQcGBhpLliwxtm/fbnTv3j3fpdjvvPNOY9OmTcaGDRuMunXruizFnp6ebgQHBxtPPfWUsWPHDmPBggWGr68vS7Hfgl544QUjICDAWLt2rZGSkuJ8nT171lnz/PPPG9WrVzdWr15tJCYmGhEREUZERIRz+6Xlijt27GgkJSUZy5cvNypVqpTvcsUjRowwfv31V2P69OksV3wLev31141169YZycnJxvbt243XX3/dsFgsxooVKwzD4FxD8frraoGGwfmGkoVwVUAffvihUb16dcPT09No2bKl8eOPP7p7SCjh1qxZY0jK8+rXr59hGBeXY3/zzTeN4OBgw8vLy+jQoYOxZ88elz5OnDhh9OnTxyhbtqzh7+9vDBgwwDhz5oxLzc8//2y0adPG8PLyMm677TZjwoQJN2qKKEHyO9ckGbNnz3bWZGdnGy+++KJRvnx5w9fX1+jZs6eRkpLi0s/BgweNzp07Gz4+PkZQUJAxfPhw48KFCy41a9asMcLDww1PT0+jVq1aLsfAreHpp582atSoYXh6ehqVKlUyOnTo4AxWhsG5huJ1ebjifENJYjEMw3DPPTMAAAAAuHnwnSsAAAAAMAHhCgAAAABMQL
gCAAAAABMQrgAAAADABIQrAAAAADAB4QoAAAAATEC4AgAAAAATEK4AAAAAwASEKwBAqXLw4EFZLBYlJSW5eygAALggXAEAbjiLxXLV19ixY909xHzt379fAwYMUNWqVeXl5aWwsDD16dNHiYmJN3QcBEwAKJls7h4AAODWk5KS4vzzwoULNXr0aO3Zs8fZVrZsWXcM66oSExPVoUMHNWrUSJ988onq16+vM2fOaMmSJRo+fLjWrVvn7iECANyMO1cAgBsuJCTE+QoICJDFYnG+r1y5siZNmuS8OxQeHq7ly5dfsa/c3Fw9/fTTql+/vg4fPixJWrJkiZo1ayZvb2/VqlVLMTExstvtzn0sFos+/fRT9ezZU76+vqpbt66+/vrrKx7DMAz1799fdevW1ffff6+uXbuqdu3aCg8P15gxY7RkyRJn7S+//KL7779fPj4+qlixop599lllZmY6t7dr107Dhg1z6b9Hjx7q37+/833NmjX19ttv6+mnn1a5cuVUvXp1zZw507k9LCxMknTnnXfKYrGoXbt2V/28AQA3BuEKAFCiTJ06VRMnTtT777+v7du3KyoqSg899JD27duXpzYnJ0ePPvqokpKS9P3336t69er6/vvv1bdvXw0dOlS7du3SJ598ori4OP3zn/902TcmJkaPPfaYtm/fri5duuiJJ57QyZMn8x1TUlKSdu7cqeHDh8tqzft/nYGBgZKkrKwsRUVFqXz58vrpp5+0aNEirVq1SkOGDCn05zBx4kS1aNFC27Zt04svvqgXXnjBeXdv8+bNkqRVq1YpJSVFX375ZaH7BwCYj3AFAChR3n//fb322mvq3bu36tWrp3feeUfh4eGaMmWKS11mZqa6du2qY8eOac2aNapUqZKki6Hp9ddfV79+/VSrVi098MADGjdunD755BOX/fv3768+ffqoTp06evvtt5WZmekMLZe7FOzq169/1bF//vnnOnfunObOnatGjRrp/vvv17Rp0/Tvf/9baWlphfocunTpohdffFF16tTRa6+9pqCgIK1Zs0aSnHOtWLGiQkJCVKFChUL1DQAoHnznCgBQYmRkZOjIkSNq3bq1S3vr1q31888/u7T16dNHVatW1erVq+Xj4+Ns//nnn/XDDz+43KnKzc3VuXPndPbsWfn6+kqSmjRp4tzu5+cnf39/HT16NN9xGYZRoPH/+uuvatq0qfz8/FzG7nA4tGfPHgUHBxeon8vHd+mxySuNDwBQMnDnCgBQKnXp0kXbt29XQkKCS3tmZqZiYmKUlJTkfP3yyy/at2+fvL29nXVlypRx2c9iscjhcOR7rNtvv12StHv37uset9VqzRPWLly4kKeuMOMDAJQMhCsAQInh7++v0NBQ/fDDDy7tP/zwgxo0aODS9sILL2jChAl66KGHXFbqa9asmfbs2aM6derkeeX3famCCA8PV4MGDTRx4sR8A056erok6Y477tDPP/+srKwsl7FbrVbVq1dP0sVH+v66WmJubq527NhRqPF4eno69wUAlByEKwBAiTJixAi98847Wrhwofbs2aPXX39dSUlJGjp0aJ7al156SePHj9eDDz6oDRs2SJJGjx6tuXPnKiYmRjt37tSvv/6qBQsW6B//+EeRx2SxWDR79mzt3btX9957r5YtW6bffvtN27dv1z//+U91795dkvTEE0/I29tb/fr1044dO7RmzRq99NJLeuqpp5yPBN5///1aunSpli5dqt27d+uFF15whrOCqly5snx8fLR8+XKlpaXp9OnTRZ4bAMA8hCsAQIny8ssvKzo6WsOHD1fjxo21fPlyff3116pbt26+9cOGDVNMTIy6dOmijRs3KioqSt98841WrFihu+66S3fffbcmT56sGjVqXNe4WrZsqcTERNWpU0eDBg3SHXfcoYceekg7d+50Lrbh6+ur7777TidPntRdd92lRx55RB06dNC0adOc/Tz99NPq16+f+vbtq7Zt26pWrVpq37
59ocZis9n0wQcf6JNPPlFoaKgz3AEA3MtiFPRbugAAAACAK+LOFQAAAACYgHAFAAAAACYgXAEAAACACQhXAAAAAGACwhUAAAAAmIBwBQAAAAAmIFwBAAAAgAkIVwAAAABgAsIVAAAAAJiAcAUAAAAAJiBcAQAAAIAJ/j8q2Y+J8ctKiQAAAABJRU5ErkJggg==\n"
          },
          "metadata": {}
        }
      ],
      "source": [
        "import matplotlib.pyplot as plt\n",
        "import os\n",
        "import tiktoken\n",
        "from bs4 import BeautifulSoup as Soup\n",
        "from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader\n",
        "\n",
        "\n",
        "def num_tokens_from_string(string: str, encoding_name: str) -> int:\n",
        "    \"\"\"Returns the number of tokens in a text string.\"\"\"\n",
        "    encoding = tiktoken.get_encoding(encoding_name)\n",
        "    num_tokens = len(encoding.encode(string))\n",
        "    return num_tokens\n",
        "\n",
        "\n",
        "# LCEL docs\n",
        "url = \"https://www.sciencedirect.com/science/article/pii/S135964462400117X\"\n",
        "loader = RecursiveUrlLoader(\n",
        "    url=url, max_depth=1, extractor=lambda x: Soup(x, \"html.parser\").text\n",
        ")\n",
        "docs = loader.load()\n",
        "\n",
        "# LCEL w/ PydanticOutputParser (outside the primary LCEL docs)\n",
        "url = \"https://www.chemicalqdevice.com/cancer-drug-discovery-innovation\"\n",
        "loader = RecursiveUrlLoader(\n",
        "    url=url, max_depth=1, extractor=lambda x: Soup(x, \"html.parser\").text\n",
        ")\n",
        "docs_pydantic = loader.load()\n",
        "\n",
        "# LCEL w/ Self Query (outside the primary LCEL docs)\n",
        "url = \"https://www.chemicalqdevice.com/cancer-drug-discovery-ai\"\n",
        "loader = RecursiveUrlLoader(\n",
        "    url=url, max_depth=1, extractor=lambda x: Soup(x, \"html.parser\").text\n",
        ")\n",
        "docs_sq = loader.load()\n",
        "\n",
        "# Doc texts\n",
        "docs.extend([*docs_pydantic, *docs_sq])\n",
        "docs_texts = [d.page_content for d in docs]\n",
        "\n",
        "# Calculate the number of tokens for each document\n",
        "counts = [num_tokens_from_string(d, \"cl100k_base\") for d in docs_texts]\n",
        "\n",
        "# Plotting the histogram of token counts\n",
        "plt.figure(figsize=(10, 6))\n",
        "plt.hist(counts, bins=30, color=\"blue\", edgecolor=\"black\", alpha=0.7)\n",
        "plt.title(\"Histogram of Token Counts\")\n",
        "plt.xlabel(\"Token Count\")\n",
        "plt.ylabel(\"Frequency\")\n",
        "plt.grid(axis=\"y\", alpha=0.75)\n",
        "\n",
        "# Display the histogram\n",
        "plt.show"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "id": "70750603-ec82-4439-9b32-d22014b5ff2c",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "id": "70750603-ec82-4439-9b32-d22014b5ff2c",
        "outputId": "d838e9cb-446f-4855-d499-075d83fe4739"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Num tokens in all context: 9515\n"
          ]
        }
      ],
      "source": [
        "# Doc texts concat\n",
        "d_sorted = sorted(docs, key=lambda x: x.metadata[\"source\"])\n",
        "d_reversed = list(reversed(d_sorted))\n",
        "concatenated_content = \"\\n\\n\\n --- \\n\\n\\n\".join(\n",
        "    [doc.page_content for doc in d_reversed]\n",
        ")\n",
        "print(\n",
        "    \"Num tokens in all context: %s\"\n",
        "    % num_tokens_from_string(concatenated_content, \"cl100k_base\")\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "id": "25ca3cf2-0f6b-40f9-a2ff-285a8dcb33dc",
      "metadata": {
        "id": "25ca3cf2-0f6b-40f9-a2ff-285a8dcb33dc"
      },
      "outputs": [],
      "source": [
        "# Doc texts split\n",
        "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
        "\n",
        "chunk_size_tok = 2000\n",
        "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n",
        "    chunk_size=chunk_size_tok, chunk_overlap=0\n",
        ")\n",
        "texts_split = text_splitter.split_text(concatenated_content)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "797a5469-0942-45a5-adb6-f12e05d76798",
      "metadata": {
        "id": "797a5469-0942-45a5-adb6-f12e05d76798"
      },
      "source": [
        "## Models\n",
        "\n",
        "We can test various models, including the new [Claude3](https://www.anthropic.com/news/claude-3-family) family.\n",
        "\n",
        "Be sure to set the relevant API keys:\n",
        "\n",
        "* `ANTHROPIC_API_KEY`\n",
        "* `OPENAI_API_KEY`"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "id": "033e71d3-5dc8-42a3-a0b7-4df116048c14",
      "metadata": {
        "id": "033e71d3-5dc8-42a3-a0b7-4df116048c14"
      },
      "outputs": [],
      "source": [
        "os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
        "\n",
        "from langchain_openai import OpenAIEmbeddings\n",
        "\n",
        "embd = OpenAIEmbeddings()\n",
        "\n",
        "from langchain_openai import ChatOpenAI\n",
        "\n",
        "model = ChatOpenAI(temperature=0, model=\"gpt-4o-2024-05-13\")\n",
        "\n",
        "# from langchain_anthropic import ChatAnthropic\n",
        "\n",
        "# model = ChatAnthropic(temperature=0, model=\"claude-3-opus-20240229\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "5c63db01-cf95-4c17-ae5d-8dc7267ad58a",
      "metadata": {
        "id": "5c63db01-cf95-4c17-ae5d-8dc7267ad58a"
      },
      "source": [
        "### Tree Constrution\n",
        "\n",
        "The clustering approach in tree construction includes a few interesting ideas.\n",
        "\n",
        "**GMM (Gaussian Mixture Model)**\n",
        "\n",
        "- Model the distribution of data points across different clusters\n",
        "- Optimal number of clusters by evaluating the model's Bayesian Information Criterion (BIC)\n",
        "\n",
        "**UMAP (Uniform Manifold Approximation and Projection)**\n",
        "\n",
        "- Supports clustering\n",
        "- Reduces the dimensionality of high-dimensional data\n",
        "- UMAP helps to highlight the natural grouping of data points based on their similarities\n",
        "\n",
        "**Local and Global Clustering**\n",
        "\n",
        "- Used to analyze data at different scales\n",
        "- Both fine-grained and broader patterns within the data are captured effectively\n",
        "\n",
        "**Thresholding**\n",
        "\n",
        "- Apply in the context of GMM to determine cluster membership\n",
        "- Based on the probability distribution (assignment of data points to ≥ 1 cluster)\n",
        "---\n",
        "\n",
        "Code for GMM and thresholding is from Sarthi et al, as noted in the below two sources:\n",
        "\n",
        "* [Origional repo](https://github.com/parthsarthi03/raptor/blob/master/raptor/cluster_tree_builder.py)\n",
        "* [Minor tweaks](https://github.com/run-llama/llama_index/blob/main/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/clustering.py)\n",
        "\n",
        "Full credit to both authors."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "id": "a849980c-27d4-48e0-87a0-c2a5143cb8c0",
      "metadata": {
        "id": "a849980c-27d4-48e0-87a0-c2a5143cb8c0"
      },
      "outputs": [],
      "source": [
        "from typing import Dict, List, Optional, Tuple\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import umap\n",
        "from langchain.prompts import ChatPromptTemplate\n",
        "from langchain_core.output_parsers import StrOutputParser\n",
        "from sklearn.mixture import GaussianMixture\n",
        "\n",
        "RANDOM_SEED = 224  # Fixed seed for reproducibility\n",
        "\n",
        "### --- Code from citations referenced above (added comments and docstrings) --- ###\n",
        "\n",
        "\n",
        "def global_cluster_embeddings(\n",
        "    embeddings: np.ndarray,\n",
        "    dim: int,\n",
        "    n_neighbors: Optional[int] = None,\n",
        "    metric: str = \"cosine\",\n",
        ") -> np.ndarray:\n",
        "    \"\"\"\n",
        "    Perform global dimensionality reduction on the embeddings using UMAP.\n",
        "\n",
        "    Parameters:\n",
        "    - embeddings: The input embeddings as a numpy array.\n",
        "    - dim: The target dimensionality for the reduced space.\n",
        "    - n_neighbors: Optional; the number of neighbors to consider for each point.\n",
        "                   If not provided, it defaults to the square root of the number of embeddings.\n",
        "    - metric: The distance metric to use for UMAP.\n",
        "\n",
        "    Returns:\n",
        "    - A numpy array of the embeddings reduced to the specified dimensionality.\n",
        "    \"\"\"\n",
        "    if n_neighbors is None:\n",
        "        n_neighbors = int((len(embeddings) - 1) ** 0.5)\n",
        "    return umap.UMAP(\n",
        "        n_neighbors=n_neighbors, n_components=dim, metric=metric\n",
        "    ).fit_transform(embeddings)\n",
        "\n",
        "\n",
        "def local_cluster_embeddings(\n",
        "    embeddings: np.ndarray, dim: int, num_neighbors: int = 10, metric: str = \"cosine\"\n",
        ") -> np.ndarray:\n",
        "    \"\"\"\n",
        "    Perform local dimensionality reduction on the embeddings using UMAP, typically after global clustering.\n",
        "\n",
        "    Parameters:\n",
        "    - embeddings: The input embeddings as a numpy array.\n",
        "    - dim: The target dimensionality for the reduced space.\n",
        "    - num_neighbors: The number of neighbors to consider for each point.\n",
        "    - metric: The distance metric to use for UMAP.\n",
        "\n",
        "    Returns:\n",
        "    - A numpy array of the embeddings reduced to the specified dimensionality.\n",
        "    \"\"\"\n",
        "    return umap.UMAP(\n",
        "        n_neighbors=num_neighbors, n_components=dim, metric=metric\n",
        "    ).fit_transform(embeddings)\n",
        "\n",
        "\n",
        "def get_optimal_clusters(\n",
        "    embeddings: np.ndarray, max_clusters: int = 50, random_state: int = RANDOM_SEED\n",
        ") -> int:\n",
        "    \"\"\"\n",
        "    Determine the optimal number of clusters using the Bayesian Information Criterion (BIC) with a Gaussian Mixture Model.\n",
        "\n",
        "    Parameters:\n",
        "    - embeddings: The input embeddings as a numpy array.\n",
        "    - max_clusters: The maximum number of clusters to consider.\n",
        "    - random_state: Seed for reproducibility.\n",
        "\n",
        "    Returns:\n",
        "    - An integer representing the optimal number of clusters found.\n",
        "    \"\"\"\n",
        "    max_clusters = min(max_clusters, len(embeddings))\n",
        "    n_clusters = np.arange(1, max_clusters)\n",
        "    bics = []\n",
        "    for n in n_clusters:\n",
        "        gm = GaussianMixture(n_components=n, random_state=random_state)\n",
        "        gm.fit(embeddings)\n",
        "        bics.append(gm.bic(embeddings))\n",
        "    return n_clusters[np.argmin(bics)]\n",
        "\n",
        "\n",
        "def GMM_cluster(embeddings: np.ndarray, threshold: float, random_state: int = 0):\n",
        "    \"\"\"\n",
        "    Cluster embeddings using a Gaussian Mixture Model (GMM) based on a probability threshold.\n",
        "\n",
        "    Parameters:\n",
        "    - embeddings: The input embeddings as a numpy array.\n",
        "    - threshold: The probability threshold for assigning an embedding to a cluster.\n",
        "    - random_state: Seed for reproducibility.\n",
        "\n",
        "    Returns:\n",
        "    - A tuple containing the cluster labels and the number of clusters determined.\n",
        "    \"\"\"\n",
        "    n_clusters = get_optimal_clusters(embeddings)\n",
        "    gm = GaussianMixture(n_components=n_clusters, random_state=random_state)\n",
        "    gm.fit(embeddings)\n",
        "    probs = gm.predict_proba(embeddings)\n",
        "    labels = [np.where(prob > threshold)[0] for prob in probs]\n",
        "    return labels, n_clusters\n",
        "\n",
        "\n",
        "def perform_clustering(\n",
        "    embeddings: np.ndarray,\n",
        "    dim: int,\n",
        "    threshold: float,\n",
        ") -> List[np.ndarray]:\n",
        "    \"\"\"\n",
        "    Perform clustering on the embeddings by first reducing their dimensionality globally, then clustering\n",
        "    using a Gaussian Mixture Model, and finally performing local clustering within each global cluster.\n",
        "\n",
        "    Parameters:\n",
        "    - embeddings: The input embeddings as a numpy array.\n",
        "    - dim: The target dimensionality for UMAP reduction.\n",
        "    - threshold: The probability threshold for assigning an embedding to a cluster in GMM.\n",
        "\n",
        "    Returns:\n",
        "    - A list of numpy arrays, where each array contains the cluster IDs for each embedding.\n",
        "    \"\"\"\n",
        "    if len(embeddings) <= dim + 1:\n",
        "        # Avoid clustering when there's insufficient data\n",
        "        return [np.array([0]) for _ in range(len(embeddings))]\n",
        "\n",
        "    # Global dimensionality reduction\n",
        "    reduced_embeddings_global = global_cluster_embeddings(embeddings, dim)\n",
        "    # Global clustering\n",
        "    global_clusters, n_global_clusters = GMM_cluster(\n",
        "        reduced_embeddings_global, threshold\n",
        "    )\n",
        "\n",
        "    all_local_clusters = [np.array([]) for _ in range(len(embeddings))]\n",
        "    total_clusters = 0\n",
        "\n",
        "    # Iterate through each global cluster to perform local clustering\n",
        "    for i in range(n_global_clusters):\n",
        "        # Extract embeddings belonging to the current global cluster\n",
        "        global_cluster_embeddings_ = embeddings[\n",
        "            np.array([i in gc for gc in global_clusters])\n",
        "        ]\n",
        "\n",
        "        if len(global_cluster_embeddings_) == 0:\n",
        "            continue\n",
        "        if len(global_cluster_embeddings_) <= dim + 1:\n",
        "            # Handle small clusters with direct assignment\n",
        "            local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]\n",
        "            n_local_clusters = 1\n",
        "        else:\n",
        "            # Local dimensionality reduction and clustering\n",
        "            reduced_embeddings_local = local_cluster_embeddings(\n",
        "                global_cluster_embeddings_, dim\n",
        "            )\n",
        "            local_clusters, n_local_clusters = GMM_cluster(\n",
        "                reduced_embeddings_local, threshold\n",
        "            )\n",
        "\n",
        "        # Assign local cluster IDs, adjusting for total clusters already processed\n",
        "        for j in range(n_local_clusters):\n",
        "            local_cluster_embeddings_ = global_cluster_embeddings_[\n",
        "                np.array([j in lc for lc in local_clusters])\n",
        "            ]\n",
        "            indices = np.where(\n",
        "                (embeddings == local_cluster_embeddings_[:, None]).all(-1)\n",
        "            )[1]\n",
        "            for idx in indices:\n",
        "                all_local_clusters[idx] = np.append(\n",
        "                    all_local_clusters[idx], j + total_clusters\n",
        "                )\n",
        "\n",
        "        total_clusters += n_local_clusters\n",
        "\n",
        "    return all_local_clusters\n",
        "\n",
        "\n",
        "### --- Our code below --- ###\n",
        "\n",
        "\n",
        "def embed(texts):\n",
        "    \"\"\"\n",
        "    Generate embeddings for a list of text documents.\n",
        "\n",
        "    This function assumes the existence of an `embd` object with a method `embed_documents`\n",
        "    that takes a list of texts and returns their embeddings.\n",
        "\n",
        "    Parameters:\n",
        "    - texts: List[str], a list of text documents to be embedded.\n",
        "\n",
        "    Returns:\n",
        "    - numpy.ndarray: An array of embeddings for the given text documents.\n",
        "    \"\"\"\n",
        "    text_embeddings = embd.embed_documents(texts)\n",
        "    text_embeddings_np = np.array(text_embeddings)\n",
        "    return text_embeddings_np\n",
        "\n",
        "\n",
        "def embed_cluster_texts(texts):\n",
        "    \"\"\"\n",
        "    Embeds a list of texts and clusters them, returning a DataFrame with texts, their embeddings, and cluster labels.\n",
        "\n",
        "    This function combines embedding generation and clustering into a single step. It assumes the existence\n",
        "    of a previously defined `perform_clustering` function that performs clustering on the embeddings.\n",
        "\n",
        "    Parameters:\n",
        "    - texts: List[str], a list of text documents to be processed.\n",
        "\n",
        "    Returns:\n",
        "    - pandas.DataFrame: A DataFrame containing the original texts, their embeddings, and the assigned cluster labels.\n",
        "    \"\"\"\n",
        "    text_embeddings_np = embed(texts)  # Generate embeddings\n",
        "    cluster_labels = perform_clustering(\n",
        "        text_embeddings_np, 10, 0.1\n",
        "    )  # Perform clustering on the embeddings\n",
        "    df = pd.DataFrame()  # Initialize a DataFrame to store the results\n",
        "    df[\"text\"] = texts  # Store original texts\n",
        "    df[\"embd\"] = list(text_embeddings_np)  # Store embeddings as a list in the DataFrame\n",
        "    df[\"cluster\"] = cluster_labels  # Store cluster labels\n",
        "    return df\n",
        "\n",
        "\n",
        "def fmt_txt(df: pd.DataFrame) -> str:\n",
        "    \"\"\"\n",
        "    Formats the text documents in a DataFrame into a single string.\n",
        "\n",
        "    Parameters:\n",
        "    - df: DataFrame containing the 'text' column with text documents to format.\n",
        "\n",
        "    Returns:\n",
        "    - A single string where all text documents are joined by a specific delimiter.\n",
        "    \"\"\"\n",
        "    unique_txt = df[\"text\"].tolist()\n",
        "    return \"--- --- \\n --- --- \".join(unique_txt)\n",
        "\n",
        "\n",
        "def embed_cluster_summarize_texts(\n",
        "    texts: List[str], level: int\n",
        ") -> Tuple[pd.DataFrame, pd.DataFrame]:\n",
        "    \"\"\"\n",
        "    Embeds, clusters, and summarizes a list of texts. This function first generates embeddings for the texts,\n",
        "    clusters them based on similarity, expands the cluster assignments for easier processing, and then summarizes\n",
        "    the content within each cluster.\n",
        "\n",
        "    Parameters:\n",
        "    - texts: A list of text documents to be processed.\n",
        "    - level: An integer parameter that could define the depth or detail of processing.\n",
        "\n",
        "    Returns:\n",
        "    - Tuple containing two DataFrames:\n",
        "      1. The first DataFrame (`df_clusters`) includes the original texts, their embeddings, and cluster assignments.\n",
        "      2. The second DataFrame (`df_summary`) contains summaries for each cluster, the specified level of detail,\n",
        "         and the cluster identifiers.\n",
        "    \"\"\"\n",
        "\n",
        "    # Embed and cluster the texts, resulting in a DataFrame with 'text', 'embd', and 'cluster' columns\n",
        "    df_clusters = embed_cluster_texts(texts)\n",
        "\n",
        "    # Prepare to expand the DataFrame for easier manipulation of clusters\n",
        "    expanded_list = []\n",
        "\n",
        "    # Expand DataFrame entries to document-cluster pairings for straightforward processing\n",
        "    for index, row in df_clusters.iterrows():\n",
        "        for cluster in row[\"cluster\"]:\n",
        "            expanded_list.append(\n",
        "                {\"text\": row[\"text\"], \"embd\": row[\"embd\"], \"cluster\": cluster}\n",
        "            )\n",
        "\n",
        "    # Create a new DataFrame from the expanded list\n",
        "    expanded_df = pd.DataFrame(expanded_list)\n",
        "\n",
        "    # Retrieve unique cluster identifiers for processing\n",
        "    all_clusters = expanded_df[\"cluster\"].unique()\n",
        "\n",
        "    print(f\"--Generated {len(all_clusters)} clusters--\")\n",
        "\n",
        "    # Summarization\n",
        "    template = \"\"\"Here is a sub-set of Generative AI Drug Discovery doc.\n",
        "\n",
        "    Generative AI Drug Discovery provides a way to improve Drug Discovery.\n",
        "\n",
        "    Give a detailed summary of the documentation provided.\n",
        "\n",
        "    Documentation:\n",
        "    {context}\n",
        "    \"\"\"\n",
        "    prompt = ChatPromptTemplate.from_template(template)\n",
        "    chain = prompt | model | StrOutputParser()\n",
        "\n",
        "    # Format text within each cluster for summarization\n",
        "    summaries = []\n",
        "    for i in all_clusters:\n",
        "        df_cluster = expanded_df[expanded_df[\"cluster\"] == i]\n",
        "        formatted_txt = fmt_txt(df_cluster)\n",
        "        summaries.append(chain.invoke({\"context\": formatted_txt}))\n",
        "\n",
        "    # Create a DataFrame to store summaries with their corresponding cluster and level\n",
        "    df_summary = pd.DataFrame(\n",
        "        {\n",
        "            \"summaries\": summaries,\n",
        "            \"level\": [level] * len(summaries),\n",
        "            \"cluster\": list(all_clusters),\n",
        "        }\n",
        "    )\n",
        "\n",
        "    return df_clusters, df_summary\n",
        "\n",
        "\n",
        "def recursive_embed_cluster_summarize(\n",
        "    texts: List[str], level: int = 1, n_levels: int = 3\n",
        ") -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:\n",
        "    \"\"\"\n",
        "    Recursively embeds, clusters, and summarizes texts up to a specified level or until\n",
        "    the number of unique clusters becomes 1, storing the results at each level.\n",
        "\n",
        "    Parameters:\n",
        "    - texts: List[str], texts to be processed.\n",
        "    - level: int, current recursion level (starts at 1).\n",
        "    - n_levels: int, maximum depth of recursion.\n",
        "\n",
        "    Returns:\n",
        "    - Dict[int, Tuple[pd.DataFrame, pd.DataFrame]], a dictionary where keys are the recursion\n",
        "      levels and values are tuples containing the clusters DataFrame and summaries DataFrame at that level.\n",
        "    \"\"\"\n",
        "    results = {}  # Dictionary to store results at each level\n",
        "\n",
        "    # Perform embedding, clustering, and summarization for the current level\n",
        "    df_clusters, df_summary = embed_cluster_summarize_texts(texts, level)\n",
        "\n",
        "    # Store the results of the current level\n",
        "    results[level] = (df_clusters, df_summary)\n",
        "\n",
        "    # Determine if further recursion is possible and meaningful\n",
        "    unique_clusters = df_summary[\"cluster\"].nunique()\n",
        "    if level < n_levels and unique_clusters > 1:\n",
        "        # Use summaries as the input texts for the next level of recursion\n",
        "        new_texts = df_summary[\"summaries\"].tolist()\n",
        "        next_level_results = recursive_embed_cluster_summarize(\n",
        "            new_texts, level + 1, n_levels\n",
        "        )\n",
        "\n",
        "        # Merge the results from the next level into the current results dictionary\n",
        "        results.update(next_level_results)\n",
        "\n",
        "    return results"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "id": "f0d8cd3e-cd49-484d-9617-1b9811cc08b3",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "id": "f0d8cd3e-cd49-484d-9617-1b9811cc08b3",
        "outputId": "f5d3038b-609f-4266-8380-40611ac2983d"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--Generated 1 clusters--\n"
          ]
        }
      ],
      "source": [
        "# Build tree\n",
        "leaf_texts = docs_texts\n",
        "results = recursive_embed_cluster_summarize(leaf_texts, level=1, n_levels=3)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "e80d7098-5d16-4fa6-837c-968e5c9f118d",
      "metadata": {
        "id": "e80d7098-5d16-4fa6-837c-968e5c9f118d"
      },
      "source": [
        "The paper reports best performance from `collapsed tree retrieval`.\n",
        "\n",
        "This involves flattening the tree structure into a single layer and then applying a k-nearest neighbors (kNN) search across all nodes simultaneously.\n",
        "\n",
        "We simply do this below."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "id": "d28ba9e6-9124-41a8-b4fd-55a6ef4ac062",
      "metadata": {
        "id": "d28ba9e6-9124-41a8-b4fd-55a6ef4ac062"
      },
      "outputs": [],
      "source": [
        "from langchain_community.vectorstores import Chroma\n",
        "\n",
        "# Initialize all_texts with leaf_texts\n",
        "all_texts = leaf_texts.copy()\n",
        "\n",
        "# Iterate through the results to extract summaries from each level and add them to all_texts\n",
        "for level in sorted(results.keys()):\n",
        "    # Extract summaries from the current level's DataFrame\n",
        "    summaries = results[level][1][\"summaries\"].tolist()\n",
        "    # Extend all_texts with the summaries from the current level\n",
        "    all_texts.extend(summaries)\n",
        "\n",
        "# Now, use all_texts to build the vectorstore with Chroma\n",
        "vectorstore = Chroma.from_texts(texts=all_texts, embedding=embd)\n",
        "retriever = vectorstore.as_retriever()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "0d497627-44c6-41f7-bb63-1d858d3f188f",
      "metadata": {
        "id": "0d497627-44c6-41f7-bb63-1d858d3f188f"
      },
      "source": [
        "Now we can use our flattened, indexed tree in a RAG chain."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "id": "9d6c894b-b3a3-4a01-b779-3e98ea382ff5",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 72
        },
        "id": "9d6c894b-b3a3-4a01-b779-3e98ea382ff5",
        "outputId": "2a3ac72b-43f6-4551-a9e5-5e528c26c0fd"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'Cancer is being addressed in Generative AI Drug Discovery through the development of unique workflows that incorporate various methodologies. A specific example is the use of Meta Llama 3 for fine-tuning and retrieval-augmented generation (RAG) in cancer drug discovery. This approach leverages advanced AI models like GPT and BERT to generate de novo proteins and identify potential drug candidates.'"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 10
        }
      ],
      "source": [
        "from langchain import hub\n",
        "from langchain_core.runnables import RunnablePassthrough\n",
        "\n",
        "# Prompt\n",
        "prompt = hub.pull(\"rlm/rag-prompt\")\n",
        "\n",
        "\n",
        "# Post-processing\n",
        "def format_docs(docs):\n",
        "    return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
        "\n",
        "\n",
        "# Chain\n",
        "rag_chain = (\n",
        "    {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
        "    | prompt\n",
        "    | model\n",
        "    | StrOutputParser()\n",
        ")\n",
        "\n",
        "# Question\n",
        "rag_chain.invoke(\"How is cancer being addressed in Generative AI Drug Discovery? Give me a specific example.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "0c585b37-ad83-4069-8f5d-4a6a3e15128d",
      "metadata": {
        "id": "0c585b37-ad83-4069-8f5d-4a6a3e15128d"
      },
      "source": [
        "Trace:\n",
        "\n",
        "https://smith.langchain.com/public/1dabf475-1675-4494-b16c-928fbf079851/r"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.16"
    },
    "colab": {
      "provenance": [],
      "machine_shape": "hm",
      "gpuType": "A100"
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 5
}