--- a +++ b/Code/LangChain/RAPTOR/gpt4o_RAPTOR, kkawchak.ipynb @@ -0,0 +1,1182 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3058e9ca-07c3-4eef-b98c-bc2f2dbb9cc6", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "3058e9ca-07c3-4eef-b98c-bc2f2dbb9cc6", + "outputId": "911fd243-c023-4d63-f559-7e60e8fd78cb" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting langchain\n", + " Downloading langchain-0.1.20-py3-none-any.whl (1.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m18.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting umap-learn\n", + " Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.7/85.7 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", + "Collecting scikit-learn\n", + " Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.1/12.1 MB\u001b[0m \u001b[31m84.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting langchain_community\n", + " Downloading langchain_community-0.0.38-py3-none-any.whl (2.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m65.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting tiktoken\n", + " Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m55.7 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting langchain-openai\n", + " Downloading langchain_openai-0.1.7-py3-none-any.whl (34 kB)\n", + "Collecting langchainhub\n", + " Downloading langchainhub-0.1.15-py3-none-any.whl (4.6 kB)\n", + "Collecting chromadb\n", + " Downloading chromadb-0.5.0-py3-none-any.whl (526 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m526.8/526.8 kB\u001b[0m \u001b[31m47.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting langchain-anthropic\n", + " Downloading langchain_anthropic-0.1.12-py3-none-any.whl (16 kB)\n", + "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.1)\n", + "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.30)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.9.5)\n", + "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n", + "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)\n", + " Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)\n", + "Collecting langchain-core<0.2.0,>=0.1.52 (from langchain)\n", + " Downloading langchain_core-0.1.52-py3-none-any.whl (302 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.9/302.9 kB\u001b[0m \u001b[31m27.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)\n", + " Downloading langchain_text_splitters-0.0.2-py3-none-any.whl (23 kB)\n", + "Collecting langsmith<0.2.0,>=0.1.17 (from langchain)\n", + " Downloading langsmith-0.1.59-py3-none-any.whl (121 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.2/121.2 kB\u001b[0m \u001b[31m13.0 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.25.2)\n", + "Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.7.1)\n", + "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.31.0)\n", + "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.3.0)\n", + "Requirement already satisfied: scipy>=1.3.1 in /usr/local/lib/python3.10/dist-packages (from umap-learn) (1.11.4)\n", + "Requirement already satisfied: numba>=0.51.2 in /usr/local/lib/python3.10/dist-packages (from umap-learn) (0.58.1)\n", + "Collecting pynndescent>=0.5 (from umap-learn)\n", + " Downloading pynndescent-0.5.12-py3-none-any.whl (56 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.8/56.8 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from umap-learn) (4.66.4)\n", + "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.4.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (3.5.0)\n", + "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken) (2023.12.25)\n", + "Collecting openai<2.0.0,>=1.24.0 (from langchain-openai)\n", + " Downloading openai-1.30.1-py3-none-any.whl (320 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m320.6/320.6 kB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting types-requests<3.0.0.0,>=2.31.0.2 (from langchainhub)\n", + " Downloading 
types_requests-2.31.0.20240406-py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: build>=1.0.3 in /usr/local/lib/python3.10/dist-packages (from chromadb) (1.2.1)\n", + "Collecting chroma-hnswlib==0.7.3 (from chromadb)\n", + " Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m88.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting fastapi>=0.95.2 (from chromadb)\n", + " Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.0/92.0 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)\n", + " Downloading uvicorn-0.29.0-py3-none-any.whl (60 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.8/60.8 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting posthog>=2.4.0 (from chromadb)\n", + " Downloading posthog-3.5.0-py2.py3-none-any.whl (41 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.3/41.3 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from chromadb) (4.11.0)\n", + "Collecting onnxruntime>=1.14.1 (from chromadb)\n", + " Downloading onnxruntime-1.17.3-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m100.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting opentelemetry-api>=1.2.0 (from chromadb)\n", + " Downloading opentelemetry_api-1.24.0-py3-none-any.whl (60 
kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.1/60.1 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)\n", + " Downloading opentelemetry_exporter_otlp_proto_grpc-1.24.0-py3-none-any.whl (18 kB)\n", + "Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)\n", + " Downloading opentelemetry_instrumentation_fastapi-0.45b0-py3-none-any.whl (11 kB)\n", + "Collecting opentelemetry-sdk>=1.2.0 (from chromadb)\n", + " Downloading opentelemetry_sdk-1.24.0-py3-none-any.whl (106 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m106.1/106.1 kB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: tokenizers>=0.13.2 in /usr/local/lib/python3.10/dist-packages (from chromadb) (0.19.1)\n", + "Collecting pypika>=0.48.9 (from chromadb)\n", + " Downloading PyPika-0.48.9.tar.gz (67 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "Collecting overrides>=7.3.1 (from chromadb)\n", + " Downloading overrides-7.7.0-py3-none-any.whl (17 kB)\n", + "Requirement already satisfied: importlib-resources in /usr/local/lib/python3.10/dist-packages (from chromadb) (6.4.0)\n", + "Requirement already satisfied: grpcio>=1.58.0 in /usr/local/lib/python3.10/dist-packages (from chromadb) (1.63.0)\n", + "Collecting bcrypt>=4.0.1 (from chromadb)\n", + " Downloading bcrypt-4.1.3-cp39-abi3-manylinux_2_28_x86_64.whl (283 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m283.7/283.7 kB\u001b[0m \u001b[31m29.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: typer>=0.9.0 in /usr/local/lib/python3.10/dist-packages (from chromadb) (0.9.4)\n", + "Collecting kubernetes>=28.1.0 (from chromadb)\n", + " Downloading kubernetes-29.0.0-py2.py3-none-any.whl (1.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m79.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting mmh3>=4.0.1 (from chromadb)\n", + " Downloading mmh3-4.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (67 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.6/67.6 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting orjson>=3.9.12 (from chromadb)\n", + " Downloading orjson-3.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (142 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m142.5/142.5 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting anthropic<1,>=0.23.0 (from langchain-anthropic)\n", + " Downloading anthropic-0.25.9-py3-none-any.whl (871 kB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m871.1/871.1 kB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: defusedxml<0.8.0,>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from langchain-anthropic) (0.7.1)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n", + "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from anthropic<1,>=0.23.0->langchain-anthropic) (3.7.1)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from anthropic<1,>=0.23.0->langchain-anthropic) (1.7.0)\n", + "Collecting httpx<1,>=0.23.0 (from anthropic<1,>=0.23.0->langchain-anthropic)\n", + " Downloading httpx-0.27.0-py3-none-any.whl (75 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m8.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from anthropic<1,>=0.23.0->langchain-anthropic) (1.3.1)\n", + "Requirement already satisfied: packaging>=19.1 in /usr/local/lib/python3.10/dist-packages (from build>=1.0.3->chromadb) (24.0)\n", + "Requirement already satisfied: pyproject_hooks in 
/usr/local/lib/python3.10/dist-packages (from build>=1.0.3->chromadb) (1.1.0)\n", + "Requirement already satisfied: tomli>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from build>=1.0.3->chromadb) (2.0.1)\n", + "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading marshmallow-3.21.2-py3-none-any.whl (49 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", + "Collecting starlette<0.38.0,>=0.37.2 (from fastapi>=0.95.2->chromadb)\n", + " Downloading starlette-0.37.2-py3-none-any.whl (71 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.9/71.9 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting fastapi-cli>=0.0.2 (from fastapi>=0.95.2->chromadb)\n", + " Downloading fastapi_cli-0.0.3-py3-none-any.whl (9.2 kB)\n", + "Requirement already satisfied: jinja2>=2.11.2 in /usr/local/lib/python3.10/dist-packages (from fastapi>=0.95.2->chromadb) (3.1.4)\n", + "Collecting python-multipart>=0.0.7 (from fastapi>=0.95.2->chromadb)\n", + " Downloading python_multipart-0.0.9-py3-none-any.whl (22 kB)\n", + "Collecting ujson!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,>=4.0.1 (from fastapi>=0.95.2->chromadb)\n", + " Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.6/53.6 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting email_validator>=2.0.0 (from fastapi>=0.95.2->chromadb)\n", + " Downloading email_validator-2.1.1-py3-none-any.whl (30 kB)\n", + "Requirement already 
satisfied: certifi>=14.05.14 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (2024.2.2)\n", + "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (1.16.0)\n", + "Requirement already satisfied: python-dateutil>=2.5.3 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (2.8.2)\n", + "Requirement already satisfied: google-auth>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (2.27.0)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (1.8.0)\n", + "Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (1.3.1)\n", + "Requirement already satisfied: oauthlib>=3.2.2 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (3.2.2)\n", + "Requirement already satisfied: urllib3>=1.24.2 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (2.0.7)\n", + "Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.2.0,>=0.1.52->langchain)\n", + " Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n", + "Collecting packaging>=19.1 (from build>=1.0.3->chromadb)\n", + " Downloading packaging-23.2-py3-none-any.whl (53 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.0/53.0 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.2->umap-learn) (0.41.1)\n", + "Collecting coloredlogs (from onnxruntime>=1.14.1->chromadb)\n", + " Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m 
\u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: flatbuffers in /usr/local/lib/python3.10/dist-packages (from onnxruntime>=1.14.1->chromadb) (24.3.25)\n", + "Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (from onnxruntime>=1.14.1->chromadb) (3.20.3)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from onnxruntime>=1.14.1->chromadb) (1.12)\n", + "Collecting deprecated>=1.2.6 (from opentelemetry-api>=1.2.0->chromadb)\n", + " Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)\n", + "Collecting importlib-metadata<=7.0,>=6.0 (from opentelemetry-api>=1.2.0->chromadb)\n", + " Downloading importlib_metadata-7.0.0-py3-none-any.whl (23 kB)\n", + "Requirement already satisfied: googleapis-common-protos~=1.52 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.63.0)\n", + "Collecting opentelemetry-exporter-otlp-proto-common==1.24.0 (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb)\n", + " Downloading opentelemetry_exporter_otlp_proto_common-1.24.0-py3-none-any.whl (17 kB)\n", + "Collecting opentelemetry-proto==1.24.0 (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb)\n", + " Downloading opentelemetry_proto-1.24.0-py3-none-any.whl (50 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.8/50.8 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting opentelemetry-instrumentation-asgi==0.45b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n", + " Downloading opentelemetry_instrumentation_asgi-0.45b0-py3-none-any.whl (14 kB)\n", + "Collecting opentelemetry-instrumentation==0.45b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n", + " Downloading opentelemetry_instrumentation-0.45b0-py3-none-any.whl (28 kB)\n", + "Collecting 
opentelemetry-semantic-conventions==0.45b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n", + " Downloading opentelemetry_semantic_conventions-0.45b0-py3-none-any.whl (36 kB)\n", + "Collecting opentelemetry-util-http==0.45b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n", + " Downloading opentelemetry_util_http-0.45b0-py3-none-any.whl (6.9 kB)\n", + "Requirement already satisfied: setuptools>=16.0 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-instrumentation==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (67.7.2)\n", + "Requirement already satisfied: wrapt<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-instrumentation==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (1.14.1)\n", + "Collecting asgiref~=3.0 (from opentelemetry-instrumentation-asgi==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n", + " Downloading asgiref-3.8.1-py3-none-any.whl (23 kB)\n", + "Collecting monotonic>=1.5 (from posthog>=2.4.0->chromadb)\n", + " Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n", + "Collecting backoff>=1.10.0 (from posthog>=2.4.0->chromadb)\n", + " Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.18.2 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (2.18.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.7)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n", + "Requirement already 
satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from tokenizers>=0.13.2->chromadb) (0.20.3)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.10/dist-packages (from typer>=0.9.0->chromadb) (8.1.7)\n", + "Collecting h11>=0.8 (from uvicorn[standard]>=0.18.3->chromadb)\n", + " Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting httptools>=0.5.0 (from uvicorn[standard]>=0.18.3->chromadb)\n", + " Downloading httptools-0.6.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (341 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.4/341.4 kB\u001b[0m \u001b[31m37.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting python-dotenv>=0.13 (from uvicorn[standard]>=0.18.3->chromadb)\n", + " Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n", + "Collecting uvloop!=0.15.0,!=0.15.1,>=0.14.0 (from uvicorn[standard]>=0.18.3->chromadb)\n", + " Downloading uvloop-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m103.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting watchfiles>=0.13 (from uvicorn[standard]>=0.18.3->chromadb)\n", + " Downloading watchfiles-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m74.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting websockets>=10.4 (from uvicorn[standard]>=0.18.3->chromadb)\n", + " Downloading 
websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->anthropic<1,>=0.23.0->langchain-anthropic) (1.2.1)\n", + "Collecting dnspython>=2.0.0 (from email_validator>=2.0.0->fastapi>=0.95.2->chromadb)\n", + " Downloading dnspython-2.6.1-py3-none-any.whl (307 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m307.7/307.7 kB\u001b[0m \u001b[31m35.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting typer>=0.9.0 (from chromadb)\n", + " Downloading typer-0.12.3-py3-none-any.whl (47 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.2/47.2 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting shellingham>=1.3.0 (from typer>=0.9.0->chromadb)\n", + " Downloading shellingham-1.5.4-py2.py3-none-any.whl (9.8 kB)\n", + "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.10/dist-packages (from typer>=0.9.0->chromadb) (13.7.1)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (5.3.3)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.4.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (4.9)\n", + "Collecting httpcore==1.* (from httpx<1,>=0.23.0->anthropic<1,>=0.23.0->langchain-anthropic)\n", + " Downloading 
httpcore-1.0.5-py3-none-any.whl (77 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m11.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (3.14.0)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (2023.6.0)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata<=7.0,>=6.0->opentelemetry-api>=1.2.0->chromadb) (3.18.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2>=2.11.2->fastapi>=0.95.2->chromadb) (2.1.5)\n", + "Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain-core<0.2.0,>=0.1.52->langchain)\n", + " Downloading jsonpointer-2.4-py2.py3-none-any.whl (7.8 kB)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer>=0.9.0->chromadb) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer>=0.9.0->chromadb) (2.16.1)\n", + "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", + "Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime>=1.14.1->chromadb)\n", + " Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from 
sympy->onnxruntime>=1.14.1->chromadb) (1.3.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer>=0.9.0->chromadb) (0.1.2)\n", + "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.6.0)\n", + "Building wheels for collected packages: pypika\n", + " Building wheel for pypika (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for pypika: filename=PyPika-0.48.9-py2.py3-none-any.whl size=53724 sha256=0173162f2de8f5b354e35275c3885edf958f5c4a6a295441c8e8ef5557ae7f2e\n", + " Stored in directory: /root/.cache/pip/wheels/e1/26/51/d0bffb3d2fd82256676d7ad3003faea3bd6dddc9577af665f4\n", + "Successfully built pypika\n", + "Installing collected packages: pypika, monotonic, mmh3, websockets, uvloop, ujson, types-requests, shellingham, python-multipart, python-dotenv, packaging, overrides, orjson, opentelemetry-util-http, opentelemetry-semantic-conventions, opentelemetry-proto, mypy-extensions, jsonpointer, importlib-metadata, humanfriendly, httptools, h11, dnspython, deprecated, chroma-hnswlib, bcrypt, backoff, asgiref, watchfiles, uvicorn, typing-inspect, tiktoken, starlette, scikit-learn, posthog, opentelemetry-exporter-otlp-proto-common, opentelemetry-api, marshmallow, langchainhub, jsonpatch, httpcore, email_validator, coloredlogs, typer, pynndescent, opentelemetry-sdk, opentelemetry-instrumentation, onnxruntime, langsmith, kubernetes, httpx, dataclasses-json, umap-learn, opentelemetry-instrumentation-asgi, opentelemetry-exporter-otlp-proto-grpc, openai, langchain-core, anthropic, opentelemetry-instrumentation-fastapi, langchain-text-splitters, langchain-openai, langchain_community, langchain-anthropic, langchain, fastapi-cli, fastapi, chromadb\n", + " Attempting uninstall: packaging\n", + " Found existing installation: packaging 24.0\n", + " 
Uninstalling packaging-24.0:\n", + " Successfully uninstalled packaging-24.0\n", + " Attempting uninstall: importlib-metadata\n", + " Found existing installation: importlib_metadata 7.1.0\n", + " Uninstalling importlib_metadata-7.1.0:\n", + " Successfully uninstalled importlib_metadata-7.1.0\n", + " Attempting uninstall: scikit-learn\n", + " Found existing installation: scikit-learn 1.2.2\n", + " Uninstalling scikit-learn-1.2.2:\n", + " Successfully uninstalled scikit-learn-1.2.2\n", + " Attempting uninstall: typer\n", + " Found existing installation: typer 0.9.4\n", + " Uninstalling typer-0.9.4:\n", + " Successfully uninstalled typer-0.9.4\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "spacy 3.7.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.3 which is incompatible.\n", + "weasel 0.3.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.3 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed anthropic-0.25.9 asgiref-3.8.1 backoff-2.2.1 bcrypt-4.1.3 chroma-hnswlib-0.7.3 chromadb-0.5.0 coloredlogs-15.0.1 dataclasses-json-0.6.6 deprecated-1.2.14 dnspython-2.6.1 email_validator-2.1.1 fastapi-0.111.0 fastapi-cli-0.0.3 h11-0.14.0 httpcore-1.0.5 httptools-0.6.1 httpx-0.27.0 humanfriendly-10.0 importlib-metadata-7.0.0 jsonpatch-1.33 jsonpointer-2.4 kubernetes-29.0.0 langchain-0.1.20 langchain-anthropic-0.1.12 langchain-core-0.1.52 langchain-openai-0.1.7 langchain-text-splitters-0.0.2 langchain_community-0.0.38 langchainhub-0.1.15 langsmith-0.1.59 marshmallow-3.21.2 mmh3-4.1.0 monotonic-1.6 mypy-extensions-1.0.0 onnxruntime-1.17.3 openai-1.30.1 opentelemetry-api-1.24.0 opentelemetry-exporter-otlp-proto-common-1.24.0 opentelemetry-exporter-otlp-proto-grpc-1.24.0 opentelemetry-instrumentation-0.45b0 opentelemetry-instrumentation-asgi-0.45b0 opentelemetry-instrumentation-fastapi-0.45b0 
opentelemetry-proto-1.24.0 opentelemetry-sdk-1.24.0 opentelemetry-semantic-conventions-0.45b0 opentelemetry-util-http-0.45b0 orjson-3.10.3 overrides-7.7.0 packaging-23.2 posthog-3.5.0 pynndescent-0.5.12 pypika-0.48.9 python-dotenv-1.0.1 python-multipart-0.0.9 scikit-learn-1.4.2 shellingham-1.5.4 starlette-0.37.2 tiktoken-0.7.0 typer-0.12.3 types-requests-2.31.0.20240406 typing-inspect-0.9.0 ujson-5.10.0 umap-learn-0.5.6 uvicorn-0.29.0 uvloop-0.19.0 watchfiles-0.21.0 websockets-12.0\n" + ] + } + ], + "source": [ + "pip install -U langchain umap-learn scikit-learn langchain_community tiktoken langchain-openai langchainhub chromadb langchain-anthropic" + ] + }, + { + "cell_type": "markdown", + "id": "ea54c848-0df6-474e-b266-218a2acf67d3", + "metadata": { + "id": "ea54c848-0df6-474e-b266-218a2acf67d3" + }, + "source": [ + "# RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval\n", + "\n", + "The [RAPTOR](https://arxiv.org/pdf/2401.18059.pdf) paper presents an interesting approach for indexing and retrieval of documents:\n", + "\n", + "* The `leafs` are a set of starting documents\n", + "* Leafs are embedded and clustered\n", + "* Clusters are then summarized into higher level (more abstract) consolidations of information across similar documents\n", + "\n", + "This process is done recursively, resulting in a \"tree\" going from raw docs (`leafs`) to more abstract summaries.\n", + "\n", + "We can apply this at varying scales; `leafs` can be:\n", + "\n", + "* Text chunks from a single doc (as shown in the paper)\n", + "* Full docs (as we show below)\n", + "\n", + "With longer context LLMs, it's possible to perform this over full documents.\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "# Optional, add tracing in LangSmith\n", + "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", + "os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'\n", + "os.environ[\"LANGCHAIN_PROJECT\"] = \"RAPTOR\"\n", + 
"os.environ['LANGCHAIN_API_KEY'] = ''" + ], + "metadata": { + "id": "lk-aKD_W1kwq" + }, + "id": "lk-aKD_W1kwq", + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "markdown", + "id": "083dd961-b401-4fc6-867c-8f8950059b02", + "metadata": { + "id": "083dd961-b401-4fc6-867c-8f8950059b02" + }, + "source": [ + "### Docs\n", + "\n", + "Let's apply this to LangChain's LCEL documentation.\n", + "\n", + "In this case, each `doc` is a unique web page of the LCEL docs.\n", + "\n", + "The context varies from < 2k tokens on up to > 10k tokens." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b17c1331-373f-491d-8b53-ccf634e68c8e", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 733 + }, + "id": "b17c1331-373f-491d-8b53-ccf634e68c8e", + "outputId": "37db4d71-7316-4525-b04d-53d48a23c5eb" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "<function matplotlib.pyplot.show(close=None, block=None)>" + ], + "text/html": [ + "<div style=\"max-width:800px; border: 1px solid var(--colab-border-color);\"><style>\n", + " pre.function-repr-contents {\n", + " overflow-x: auto;\n", + " padding: 8px 12px;\n", + " max-height: 500px;\n", + " }\n", + "\n", + " pre.function-repr-contents.function-repr-contents-collapsed {\n", + " cursor: pointer;\n", + " max-height: 100px;\n", + " }\n", + " </style>\n", + " <pre style=\"white-space: initial; background:\n", + " var(--colab-secondary-surface-color); padding: 8px 12px;\n", + " border-bottom: 1px solid var(--colab-border-color);\"><b>matplotlib.pyplot.show</b><br/>def show(*args, **kwargs)</pre><pre class=\"function-repr-contents function-repr-contents-collapsed\" style=\"\"><a class=\"filepath\" style=\"display:none\" href=\"#\">/usr/local/lib/python3.10/dist-packages/matplotlib/pyplot.py</a>Display all open figures.\n", + "\n", + "Parameters\n", + "----------\n", + "block : bool, optional\n", + " Whether to wait for all figures to be closed 
before returning.\n", + "\n", + " If `True` block and run the GUI main loop until all figure windows\n", + " are closed.\n", + "\n", + " If `False` ensure that all figure windows are displayed and return\n", + " immediately. In this case, you are responsible for ensuring\n", + " that the event loop is running to have responsive figures.\n", + "\n", + " Defaults to True in non-interactive mode and to False in interactive\n", + " mode (see `.pyplot.isinteractive`).\n", + "\n", + "See Also\n", + "--------\n", + "ion : Enable interactive mode, which shows / updates the figure after\n", + " every plotting command, so that calling ``show()`` is not necessary.\n", + "ioff : Disable interactive mode.\n", + "savefig : Save the figure to an image file instead of showing it on screen.\n", + "\n", + "Notes\n", + "-----\n", + "**Saving figures to file and showing a window at the same time**\n", + "\n", + "If you want an image file as well as a user interface window, use\n", + "`.pyplot.savefig` before `.pyplot.show`. At the end of (a blocking)\n", + "``show()`` the figure is closed and thus unregistered from pyplot. Calling\n", + "`.pyplot.savefig` afterwards would save a new and thus empty figure. This\n", + "limitation of command order does not apply if the show is non-blocking or\n", + "if you keep a reference to the figure and use `.Figure.savefig`.\n", + "\n", + "**Auto-show in jupyter notebooks**\n", + "\n", + "The jupyter backends (activated via ``%matplotlib inline``,\n", + "``%matplotlib notebook``, or ``%matplotlib widget``), call ``show()`` at\n", + "the end of every cell by default. 
Thus, you usually don't have to call it\n", + "explicitly there.</pre>\n", + " <script>\n", + " if (google.colab.kernel.accessAllowed && google.colab.files && google.colab.files.view) {\n", + " for (const element of document.querySelectorAll('.filepath')) {\n", + " element.style.display = 'block'\n", + " element.onclick = (event) => {\n", + " event.preventDefault();\n", + " event.stopPropagation();\n", + " google.colab.files.view(element.textContent, 401);\n", + " };\n", + " }\n", + " }\n", + " for (const element of document.querySelectorAll('.function-repr-contents')) {\n", + " element.onclick = (event) => {\n", + " event.preventDefault();\n", + " event.stopPropagation();\n", + " element.classList.toggle('function-repr-contents-collapsed');\n", + " };\n", + " }\n", + " </script>\n", + " </div>" + ] + }, + "metadata": {}, + "execution_count": 3 + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "<Figure size 1000x600 with 1 Axes>" + ], + "image/png": "\n" + }, + "metadata": {} + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import tiktoken\n", + "from bs4 import BeautifulSoup as Soup\n", + "from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader\n", + "\n", + "\n", + "def num_tokens_from_string(string: str, encoding_name: str) -> int:\n", + " \"\"\"Returns the number of tokens in a text string.\"\"\"\n", + " encoding = tiktoken.get_encoding(encoding_name)\n", + " num_tokens = len(encoding.encode(string))\n", + " return num_tokens\n", + "\n", + "\n", + "# LCEL docs\n", + "url = \"https://www.sciencedirect.com/science/article/pii/S135964462400117X\"\n", + "loader = RecursiveUrlLoader(\n", + " url=url, max_depth=1, extractor=lambda x: Soup(x, \"html.parser\").text\n", + ")\n", + "docs = loader.load()\n", + "\n", + "# LCEL w/ PydanticOutputParser (outside the primary LCEL docs)\n", + "url = \"https://www.chemicalqdevice.com/cancer-drug-discovery-innovation\"\n", + "loader = 
RecursiveUrlLoader(\n", + " url=url, max_depth=1, extractor=lambda x: Soup(x, \"html.parser\").text\n", + ")\n", + "docs_pydantic = loader.load()\n", + "\n", + "# LCEL w/ Self Query (outside the primary LCEL docs)\n", + "url = \"https://www.chemicalqdevice.com/cancer-drug-discovery-ai\"\n", + "loader = RecursiveUrlLoader(\n", + " url=url, max_depth=1, extractor=lambda x: Soup(x, \"html.parser\").text\n", + ")\n", + "docs_sq = loader.load()\n", + "\n", + "# Doc texts\n", + "docs.extend([*docs_pydantic, *docs_sq])\n", + "docs_texts = [d.page_content for d in docs]\n", + "\n", + "# Calculate the number of tokens for each document\n", + "counts = [num_tokens_from_string(d, \"cl100k_base\") for d in docs_texts]\n", + "\n", + "# Plotting the histogram of token counts\n", + "plt.figure(figsize=(10, 6))\n", + "plt.hist(counts, bins=30, color=\"blue\", edgecolor=\"black\", alpha=0.7)\n", + "plt.title(\"Histogram of Token Counts\")\n", + "plt.xlabel(\"Token Count\")\n", + "plt.ylabel(\"Frequency\")\n", + "plt.grid(axis=\"y\", alpha=0.75)\n", + "\n", + "# Display the histogram\n", + "plt.show" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "70750603-ec82-4439-9b32-d22014b5ff2c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "70750603-ec82-4439-9b32-d22014b5ff2c", + "outputId": "1ea8ed1c-2bf9-4be5-8e31-d5ec8a2b64ba" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Num tokens in all context: 9514\n" + ] + } + ], + "source": [ + "# Doc texts concat\n", + "d_sorted = sorted(docs, key=lambda x: x.metadata[\"source\"])\n", + "d_reversed = list(reversed(d_sorted))\n", + "concatenated_content = \"\\n\\n\\n --- \\n\\n\\n\".join(\n", + " [doc.page_content for doc in d_reversed]\n", + ")\n", + "print(\n", + " \"Num tokens in all context: %s\"\n", + " % num_tokens_from_string(concatenated_content, \"cl100k_base\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 
5, + "id": "25ca3cf2-0f6b-40f9-a2ff-285a8dcb33dc", + "metadata": { + "id": "25ca3cf2-0f6b-40f9-a2ff-285a8dcb33dc" + }, + "outputs": [], + "source": [ + "# Doc texts split\n", + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "\n", + "chunk_size_tok = 2000\n", + "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n", + " chunk_size=chunk_size_tok, chunk_overlap=0\n", + ")\n", + "texts_split = text_splitter.split_text(concatenated_content)" + ] + }, + { + "cell_type": "markdown", + "id": "797a5469-0942-45a5-adb6-f12e05d76798", + "metadata": { + "id": "797a5469-0942-45a5-adb6-f12e05d76798" + }, + "source": [ + "## Models\n", + "\n", + "We can test various models, including the new [Claude 3](https://www.anthropic.com/news/claude-3-family) family.\n", + "\n", + "Be sure to set the relevant API keys:\n", + "\n", + "* `ANTHROPIC_API_KEY`\n", + "* `OPENAI_API_KEY`" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "033e71d3-5dc8-42a3-a0b7-4df116048c14", + "metadata": { + "id": "033e71d3-5dc8-42a3-a0b7-4df116048c14" + }, + "outputs": [], + "source": [ + "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", + "\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embd = OpenAIEmbeddings()\n", + "\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "model = ChatOpenAI(temperature=0, model=\"gpt-4o-2024-05-13\")\n", + "\n", + "# from langchain_anthropic import ChatAnthropic\n", + "\n", + "# model = ChatAnthropic(temperature=0, model=\"claude-3-opus-20240229\")" + ] + }, + { + "cell_type": "markdown", + "id": "5c63db01-cf95-4c17-ae5d-8dc7267ad58a", + "metadata": { + "id": "5c63db01-cf95-4c17-ae5d-8dc7267ad58a" + }, + "source": [ + "### Tree Construction\n", + "\n", + "The clustering approach in tree construction includes a few interesting ideas.\n", + "\n", + "**GMM (Gaussian Mixture Model)**\n", + "\n", + "- Model the distribution of data points across different clusters\n", + "- Optimal number of
clusters by evaluating the model's Bayesian Information Criterion (BIC)\n", + "\n", + "**UMAP (Uniform Manifold Approximation and Projection)**\n", + "\n", + "- Supports clustering\n", + "- Reduces the dimensionality of high-dimensional data\n", + "- UMAP helps to highlight the natural grouping of data points based on their similarities\n", + "\n", + "**Local and Global Clustering**\n", + "\n", + "- Used to analyze data at different scales\n", + "- Both fine-grained and broader patterns within the data are captured effectively\n", + "\n", + "**Thresholding**\n", + "\n", + "- Applied in the context of GMM to determine cluster membership\n", + "- Based on the probability distribution (assignment of data points to ≥ 1 cluster)\n", + "---\n", + "\n", + "Code for GMM and thresholding is from Sarthi et al., as noted in the two sources below:\n", + "\n", + "* [Original repo](https://github.com/parthsarthi03/raptor/blob/master/raptor/cluster_tree_builder.py)\n", + "* [Minor tweaks](https://github.com/run-llama/llama_index/blob/main/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/clustering.py)\n", + "\n", + "Full credit to both authors."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a849980c-27d4-48e0-87a0-c2a5143cb8c0", + "metadata": { + "id": "a849980c-27d4-48e0-87a0-c2a5143cb8c0" + }, + "outputs": [], + "source": [ + "from typing import Dict, List, Optional, Tuple\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import umap\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from sklearn.mixture import GaussianMixture\n", + "\n", + "RANDOM_SEED = 224 # Fixed seed for reproducibility\n", + "\n", + "### --- Code from citations referenced above (added comments and docstrings) --- ###\n", + "\n", + "\n", + "def global_cluster_embeddings(\n", + " embeddings: np.ndarray,\n", + " dim: int,\n", + " n_neighbors: Optional[int] = None,\n", + " metric: str = \"cosine\",\n", + ") -> np.ndarray:\n", + " \"\"\"\n", + " Perform global dimensionality reduction on the embeddings using UMAP.\n", + "\n", + " Parameters:\n", + " - embeddings: The input embeddings as a numpy array.\n", + " - dim: The target dimensionality for the reduced space.\n", + " - n_neighbors: Optional; the number of neighbors to consider for each point.\n", + " If not provided, it defaults to the square root of the number of embeddings.\n", + " - metric: The distance metric to use for UMAP.\n", + "\n", + " Returns:\n", + " - A numpy array of the embeddings reduced to the specified dimensionality.\n", + " \"\"\"\n", + " if n_neighbors is None:\n", + " n_neighbors = int((len(embeddings) - 1) ** 0.5)\n", + " return umap.UMAP(\n", + " n_neighbors=n_neighbors, n_components=dim, metric=metric\n", + " ).fit_transform(embeddings)\n", + "\n", + "\n", + "def local_cluster_embeddings(\n", + " embeddings: np.ndarray, dim: int, num_neighbors: int = 10, metric: str = \"cosine\"\n", + ") -> np.ndarray:\n", + " \"\"\"\n", + " Perform local dimensionality reduction on the embeddings using UMAP, typically after global clustering.\n", + "\n", + 
" Parameters:\n", + " - embeddings: The input embeddings as a numpy array.\n", + " - dim: The target dimensionality for the reduced space.\n", + " - num_neighbors: The number of neighbors to consider for each point.\n", + " - metric: The distance metric to use for UMAP.\n", + "\n", + " Returns:\n", + " - A numpy array of the embeddings reduced to the specified dimensionality.\n", + " \"\"\"\n", + " return umap.UMAP(\n", + " n_neighbors=num_neighbors, n_components=dim, metric=metric\n", + " ).fit_transform(embeddings)\n", + "\n", + "\n", + "def get_optimal_clusters(\n", + " embeddings: np.ndarray, max_clusters: int = 50, random_state: int = RANDOM_SEED\n", + ") -> int:\n", + " \"\"\"\n", + " Determine the optimal number of clusters using the Bayesian Information Criterion (BIC) with a Gaussian Mixture Model.\n", + "\n", + " Parameters:\n", + " - embeddings: The input embeddings as a numpy array.\n", + " - max_clusters: The maximum number of clusters to consider.\n", + " - random_state: Seed for reproducibility.\n", + "\n", + " Returns:\n", + " - An integer representing the optimal number of clusters found.\n", + " \"\"\"\n", + " max_clusters = min(max_clusters, len(embeddings))\n", + " n_clusters = np.arange(1, max_clusters)\n", + " bics = []\n", + " for n in n_clusters:\n", + " gm = GaussianMixture(n_components=n, random_state=random_state)\n", + " gm.fit(embeddings)\n", + " bics.append(gm.bic(embeddings))\n", + " return n_clusters[np.argmin(bics)]\n", + "\n", + "\n", + "def GMM_cluster(embeddings: np.ndarray, threshold: float, random_state: int = 0):\n", + " \"\"\"\n", + " Cluster embeddings using a Gaussian Mixture Model (GMM) based on a probability threshold.\n", + "\n", + " Parameters:\n", + " - embeddings: The input embeddings as a numpy array.\n", + " - threshold: The probability threshold for assigning an embedding to a cluster.\n", + " - random_state: Seed for reproducibility.\n", + "\n", + " Returns:\n", + " - A tuple containing the cluster labels and the 
number of clusters determined.\n", + " \"\"\"\n", + " n_clusters = get_optimal_clusters(embeddings)\n", + " gm = GaussianMixture(n_components=n_clusters, random_state=random_state)\n", + " gm.fit(embeddings)\n", + " probs = gm.predict_proba(embeddings)\n", + " labels = [np.where(prob > threshold)[0] for prob in probs]\n", + " return labels, n_clusters\n", + "\n", + "\n", + "def perform_clustering(\n", + " embeddings: np.ndarray,\n", + " dim: int,\n", + " threshold: float,\n", + ") -> List[np.ndarray]:\n", + " \"\"\"\n", + " Perform clustering on the embeddings by first reducing their dimensionality globally, then clustering\n", + " using a Gaussian Mixture Model, and finally performing local clustering within each global cluster.\n", + "\n", + " Parameters:\n", + " - embeddings: The input embeddings as a numpy array.\n", + " - dim: The target dimensionality for UMAP reduction.\n", + " - threshold: The probability threshold for assigning an embedding to a cluster in GMM.\n", + "\n", + " Returns:\n", + " - A list of numpy arrays, where each array contains the cluster IDs for each embedding.\n", + " \"\"\"\n", + " if len(embeddings) <= dim + 1:\n", + " # Avoid clustering when there's insufficient data\n", + " return [np.array([0]) for _ in range(len(embeddings))]\n", + "\n", + " # Global dimensionality reduction\n", + " reduced_embeddings_global = global_cluster_embeddings(embeddings, dim)\n", + " # Global clustering\n", + " global_clusters, n_global_clusters = GMM_cluster(\n", + " reduced_embeddings_global, threshold\n", + " )\n", + "\n", + " all_local_clusters = [np.array([]) for _ in range(len(embeddings))]\n", + " total_clusters = 0\n", + "\n", + " # Iterate through each global cluster to perform local clustering\n", + " for i in range(n_global_clusters):\n", + " # Extract embeddings belonging to the current global cluster\n", + " global_cluster_embeddings_ = embeddings[\n", + " np.array([i in gc for gc in global_clusters])\n", + " ]\n", + "\n", + " if 
len(global_cluster_embeddings_) == 0:\n", + " continue\n", + " if len(global_cluster_embeddings_) <= dim + 1:\n", + " # Handle small clusters with direct assignment\n", + " local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]\n", + " n_local_clusters = 1\n", + " else:\n", + " # Local dimensionality reduction and clustering\n", + " reduced_embeddings_local = local_cluster_embeddings(\n", + " global_cluster_embeddings_, dim\n", + " )\n", + " local_clusters, n_local_clusters = GMM_cluster(\n", + " reduced_embeddings_local, threshold\n", + " )\n", + "\n", + " # Assign local cluster IDs, adjusting for total clusters already processed\n", + " for j in range(n_local_clusters):\n", + " local_cluster_embeddings_ = global_cluster_embeddings_[\n", + " np.array([j in lc for lc in local_clusters])\n", + " ]\n", + " indices = np.where(\n", + " (embeddings == local_cluster_embeddings_[:, None]).all(-1)\n", + " )[1]\n", + " for idx in indices:\n", + " all_local_clusters[idx] = np.append(\n", + " all_local_clusters[idx], j + total_clusters\n", + " )\n", + "\n", + " total_clusters += n_local_clusters\n", + "\n", + " return all_local_clusters\n", + "\n", + "\n", + "### --- Our code below --- ###\n", + "\n", + "\n", + "def embed(texts):\n", + " \"\"\"\n", + " Generate embeddings for a list of text documents.\n", + "\n", + " This function assumes the existence of an `embd` object with a method `embed_documents`\n", + " that takes a list of texts and returns their embeddings.\n", + "\n", + " Parameters:\n", + " - texts: List[str], a list of text documents to be embedded.\n", + "\n", + " Returns:\n", + " - numpy.ndarray: An array of embeddings for the given text documents.\n", + " \"\"\"\n", + " text_embeddings = embd.embed_documents(texts)\n", + " text_embeddings_np = np.array(text_embeddings)\n", + " return text_embeddings_np\n", + "\n", + "\n", + "def embed_cluster_texts(texts):\n", + " \"\"\"\n", + " Embeds a list of texts and clusters them, returning a DataFrame with 
texts, their embeddings, and cluster labels.\n", + "\n", + " This function combines embedding generation and clustering into a single step. It assumes the existence\n", + " of a previously defined `perform_clustering` function that performs clustering on the embeddings.\n", + "\n", + " Parameters:\n", + " - texts: List[str], a list of text documents to be processed.\n", + "\n", + " Returns:\n", + " - pandas.DataFrame: A DataFrame containing the original texts, their embeddings, and the assigned cluster labels.\n", + " \"\"\"\n", + " text_embeddings_np = embed(texts) # Generate embeddings\n", + " cluster_labels = perform_clustering(\n", + " text_embeddings_np, 10, 0.1\n", + " ) # Perform clustering on the embeddings\n", + " df = pd.DataFrame() # Initialize a DataFrame to store the results\n", + " df[\"text\"] = texts # Store original texts\n", + " df[\"embd\"] = list(text_embeddings_np) # Store embeddings as a list in the DataFrame\n", + " df[\"cluster\"] = cluster_labels # Store cluster labels\n", + " return df\n", + "\n", + "\n", + "def fmt_txt(df: pd.DataFrame) -> str:\n", + " \"\"\"\n", + " Formats the text documents in a DataFrame into a single string.\n", + "\n", + " Parameters:\n", + " - df: DataFrame containing the 'text' column with text documents to format.\n", + "\n", + " Returns:\n", + " - A single string where all text documents are joined by a specific delimiter.\n", + " \"\"\"\n", + " unique_txt = df[\"text\"].tolist()\n", + " return \"--- --- \\n --- --- \".join(unique_txt)\n", + "\n", + "\n", + "def embed_cluster_summarize_texts(\n", + " texts: List[str], level: int\n", + ") -> Tuple[pd.DataFrame, pd.DataFrame]:\n", + " \"\"\"\n", + " Embeds, clusters, and summarizes a list of texts. 
This function first generates embeddings for the texts,\n", + " clusters them based on similarity, expands the cluster assignments for easier processing, and then summarizes\n", + " the content within each cluster.\n", + "\n", + " Parameters:\n", + " - texts: A list of text documents to be processed.\n", + " - level: An integer parameter that could define the depth or detail of processing.\n", + "\n", + " Returns:\n", + " - Tuple containing two DataFrames:\n", + " 1. The first DataFrame (`df_clusters`) includes the original texts, their embeddings, and cluster assignments.\n", + " 2. The second DataFrame (`df_summary`) contains summaries for each cluster, the specified level of detail,\n", + " and the cluster identifiers.\n", + " \"\"\"\n", + "\n", + " # Embed and cluster the texts, resulting in a DataFrame with 'text', 'embd', and 'cluster' columns\n", + " df_clusters = embed_cluster_texts(texts)\n", + "\n", + " # Prepare to expand the DataFrame for easier manipulation of clusters\n", + " expanded_list = []\n", + "\n", + " # Expand DataFrame entries to document-cluster pairings for straightforward processing\n", + " for index, row in df_clusters.iterrows():\n", + " for cluster in row[\"cluster\"]:\n", + " expanded_list.append(\n", + " {\"text\": row[\"text\"], \"embd\": row[\"embd\"], \"cluster\": cluster}\n", + " )\n", + "\n", + " # Create a new DataFrame from the expanded list\n", + " expanded_df = pd.DataFrame(expanded_list)\n", + "\n", + " # Retrieve unique cluster identifiers for processing\n", + " all_clusters = expanded_df[\"cluster\"].unique()\n", + "\n", + " print(f\"--Generated {len(all_clusters)} clusters--\")\n", + "\n", + " # Summarization\n", + " template = \"\"\"Here is a sub-set of Generative AI Drug Discovery doc.\n", + "\n", + " Generative AI Drug Discovery provides a way to improve Drug Discovery.\n", + "\n", + " Give a detailed summary of the documentation provided.\n", + "\n", + " Documentation:\n", + " {context}\n", + " \"\"\"\n", + " prompt 
= ChatPromptTemplate.from_template(template)\n", + " chain = prompt | model | StrOutputParser()\n", + "\n", + " # Format text within each cluster for summarization\n", + " summaries = []\n", + " for i in all_clusters:\n", + " df_cluster = expanded_df[expanded_df[\"cluster\"] == i]\n", + " formatted_txt = fmt_txt(df_cluster)\n", + " summaries.append(chain.invoke({\"context\": formatted_txt}))\n", + "\n", + " # Create a DataFrame to store summaries with their corresponding cluster and level\n", + " df_summary = pd.DataFrame(\n", + " {\n", + " \"summaries\": summaries,\n", + " \"level\": [level] * len(summaries),\n", + " \"cluster\": list(all_clusters),\n", + " }\n", + " )\n", + "\n", + " return df_clusters, df_summary\n", + "\n", + "\n", + "def recursive_embed_cluster_summarize(\n", + " texts: List[str], level: int = 1, n_levels: int = 3\n", + ") -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:\n", + " \"\"\"\n", + " Recursively embeds, clusters, and summarizes texts up to a specified level or until\n", + " the number of unique clusters becomes 1, storing the results at each level.\n", + "\n", + " Parameters:\n", + " - texts: List[str], texts to be processed.\n", + " - level: int, current recursion level (starts at 1).\n", + " - n_levels: int, maximum depth of recursion.\n", + "\n", + " Returns:\n", + " - Dict[int, Tuple[pd.DataFrame, pd.DataFrame]], a dictionary where keys are the recursion\n", + " levels and values are tuples containing the clusters DataFrame and summaries DataFrame at that level.\n", + " \"\"\"\n", + " results = {} # Dictionary to store results at each level\n", + "\n", + " # Perform embedding, clustering, and summarization for the current level\n", + " df_clusters, df_summary = embed_cluster_summarize_texts(texts, level)\n", + "\n", + " # Store the results of the current level\n", + " results[level] = (df_clusters, df_summary)\n", + "\n", + " # Determine if further recursion is possible and meaningful\n", + " unique_clusters = 
df_summary[\"cluster\"].nunique()\n", + " if level < n_levels and unique_clusters > 1:\n", + " # Use summaries as the input texts for the next level of recursion\n", + " new_texts = df_summary[\"summaries\"].tolist()\n", + " next_level_results = recursive_embed_cluster_summarize(\n", + " new_texts, level + 1, n_levels\n", + " )\n", + "\n", + " # Merge the results from the next level into the current results dictionary\n", + " results.update(next_level_results)\n", + "\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f0d8cd3e-cd49-484d-9617-1b9811cc08b3", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "f0d8cd3e-cd49-484d-9617-1b9811cc08b3", + "outputId": "f357d357-b457-4c67-d7a6-06d360c34533" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--Generated 1 clusters--\n" + ] + } + ], + "source": [ + "# Build tree\n", + "leaf_texts = docs_texts\n", + "results = recursive_embed_cluster_summarize(leaf_texts, level=1, n_levels=3)" + ] + }, + { + "cell_type": "markdown", + "id": "e80d7098-5d16-4fa6-837c-968e5c9f118d", + "metadata": { + "id": "e80d7098-5d16-4fa6-837c-968e5c9f118d" + }, + "source": [ + "The paper reports best performance from `collapsed tree retrieval`.\n", + "\n", + "This involves flattening the tree structure into a single layer and then applying a k-nearest neighbors (kNN) search across all nodes simultaneously.\n", + "\n", + "We simply do this below."
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d28ba9e6-9124-41a8-b4fd-55a6ef4ac062", + "metadata": { + "id": "d28ba9e6-9124-41a8-b4fd-55a6ef4ac062" + }, + "outputs": [], + "source": [ + "from langchain_community.vectorstores import Chroma\n", + "\n", + "# Initialize all_texts with leaf_texts\n", + "all_texts = leaf_texts.copy()\n", + "\n", + "# Iterate through the results to extract summaries from each level and add them to all_texts\n", + "for level in sorted(results.keys()):\n", + " # Extract summaries from the current level's DataFrame\n", + " summaries = results[level][1][\"summaries\"].tolist()\n", + " # Extend all_texts with the summaries from the current level\n", + " all_texts.extend(summaries)\n", + "\n", + "# Now, use all_texts to build the vectorstore with Chroma\n", + "vectorstore = Chroma.from_texts(texts=all_texts, embedding=embd)\n", + "retriever = vectorstore.as_retriever()" + ] + }, + { + "cell_type": "markdown", + "id": "0d497627-44c6-41f7-bb63-1d858d3f188f", + "metadata": { + "id": "0d497627-44c6-41f7-bb63-1d858d3f188f" + }, + "source": [ + "Now we can use our flattened, indexed tree in a RAG chain." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9d6c894b-b3a3-4a01-b779-3e98ea382ff5", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 54 + }, + "id": "9d6c894b-b3a3-4a01-b779-3e98ea382ff5", + "outputId": "fd62d9e3-2dca-4b62-d4ea-6c261c614e1c" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'In Generative AI Drug Discovery, cancer is being addressed through the use of advanced AI models like GPT and BERT to design new drugs.
A specific example is the fine-tuning of Meta Llama 3 for cancer drug discovery, which involves creating de novo proteins tailored for therapeutic purposes.'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "from langchain import hub\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "\n", + "# Prompt\n", + "prompt = hub.pull(\"rlm/rag-prompt\")\n", + "\n", + "\n", + "# Post-processing\n", + "def format_docs(docs):\n", + " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", + "\n", + "\n", + "# Chain\n", + "rag_chain = (\n", + " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", + " | prompt\n", + " | model\n", + " | StrOutputParser()\n", + ")\n", + "\n", + "# Question\n", + "rag_chain.invoke(\"How is cancer being addressed in Generative AI Drug Discovery? Give me a specific example.\")" + ] + }, + { + "cell_type": "markdown", + "id": "0c585b37-ad83-4069-8f5d-4a6a3e15128d", + "metadata": { + "id": "0c585b37-ad83-4069-8f5d-4a6a3e15128d" + }, + "source": [ + "Trace:\n", + "\n", + "https://smith.langchain.com/public/1dabf475-1675-4494-b16c-928fbf079851/r" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "colab": { + "provenance": [], + "machine_shape": "hm", + "gpuType": "L4" + }, + "accelerator": "GPU" + }, + "nbformat": 4, + "nbformat_minor": 5 +}