{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "h2dngSJ8J1h8" }, "source": [ "### Annotations for the CHIA dataset using mistral-7B" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "y6X9ot2qJ1h-", "outputId": "7cd81d9a-d713-4ecb-c84a-8b452bb05301" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for transformers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.0/105.0 MB\u001b[0m \u001b[31m8.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m279.7/279.7 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Building wheel for peft (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for accelerate (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n" ] } ], "source": [ "!pip install -q -U git+https://github.com/huggingface/transformers.git\n", "!pip install -q -U bitsandbytes\n", "!pip install -q -U git+https://github.com/huggingface/peft.git\n", "!pip install -q -U git+https://github.com/huggingface/accelerate.git" ] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fEbMdJLLKCwX", "outputId": "23871af7-1e63-4d17-9b84-c659be597f11" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "code", "source": [ "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextGenerationPipeline\n", "import torch\n", "import os\n", "from utils import *" ], "metadata": { "id": "yxdJkCNNKaem" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [ "model_name = \"mistralai/Mistral-7B-v0.1\"" ], "metadata": { "id": "12ZiQ6jMKhTU" }, "execution_count": 4, "outputs": [] }, { "cell_type": "code", "source": [ "# Load base model(Mistral 7B)\n", "bnb_config = BitsAndBytesConfig(\n", " load_in_4bit= True,\n", " bnb_4bit_quant_type= \"nf4\",\n", " bnb_4bit_compute_dtype= torch.bfloat16,\n", " bnb_4bit_use_double_quant= False,\n", ")\n", "model = AutoModelForCausalLM.from_pretrained(\n", " model_name,\n", " quantization_config=bnb_config,\n", " device_map={\"\": 0}\n", ")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 368, "referenced_widgets": [ "623351d84656420faa70ae4889536f78", "6e8db672b53f4ca3b65e861a8977e7f1", "c626b5aa49ba4a8ab4364d4a299baa1f", "d9a04c47e5aa4ccdafcff20d276bbbb1", "8d274858eef7465b8cb32b2556f65afb", 
"8665bd0d3d4d425786bbb64bdc57a36b", "ee3d9951387948089cd9360d9a9871ba", "b6269fd3f5324daa812c212b368ff0ad", "55b69b66ea004214b3d66af7843a7c99", "01e179ba9b6d4989bb908266137fab42", "51aacda23c324a9db4730410989c33e8", "f23ed67b7458460a86bc4c010542675a", "df6e7f48e48647cf9085a5143f7afd3c", "fca53ddc2b39498087fb8d76d4ccba83", "495a05fbb4984421897ee2beea733c63", "4290143e54344798bc7c79236338da91", "f3df546b544a4086a93e835f52f1374a", "044f3fdb95394ed4909e52d58761f60a", "44a254651d6d4d2f96627de75bc5e7dd", "a86ef33bb25644a5a313f1387bfab6e0", "39ea441013f94dd1b79abca45f7aa4e5", "f10d1774aa89441e9bbf7bd48633eeea", "2382371f37a74c86888483132a686039", "62c9043d2fb949998931c38ef6a84d66", "f549d20a326b4eeaa9e8f8694afc0f9d", "e6c39c8ec6d147e2b4376bf0a442ed1b", "8f2db049d8d24fc0b804a451b97f38b1", "cf2f5777aaa943218c01002705ad49a4", "a1d585f864ec4f96a1facb1c8fd1f228", "32fbb6cf09ae4485b5e800d739a11ef9", "4e98e5b117bd4a90866bc9bf8d397365", "6fa04422b4f2473ebed8cc51bfb8bbf9", "ebe4f4facd81490aaae3d9df1f84ef7a", "d5f20cf2d1354e54916a41d01958d663", "3f28950c5a1b433fad59457f3bfd0986", "060d6ac506cb40d4a01202a1f43773a0", "6718ab690cbd471a80b04df221148ccb", "d0a90bebc6ba424193998f6aa2871161", "630bbfdad2514be7bd6480610ed94aae", "61ced3c82e3d4299ac96838fee912cf7", "6b6cad15707a4fc4b9a6798d25ab0fec", "922a346ec1c443ce8c5082015e3b771d", "d54b32439cf5402b8f0a5a2a71d23ffb", "7867c50a7a46448a85e733a08a14074d", "ad18e4416984499686863f51bc363a75", "7c623e92720348ad8be453d2f9c8c10e", "67f9b070448648698e34ba6dce2140e4", "98c3b7b7483d40a7a92de523fa972dd0", "9ee6cee7294d45cb955f70fb1d7ec588", "c4040a2d270a4738b80e25d49db207d4", "c49a18d373fa4949b3be1c9ab15e99fd", "84d4c96405f945a58f3cbb6bd524ba08", "6f3ae0267d954f26bc961f6d645f2b9a", "972e6dc2d6e54a948e5e4690cc9b93ab", "d7db41f69a624e79bc57b15e9d2f9696", "91eebcebff2847dd923301c7722e2c99", "271682b7217b4205b43dfd89e801efe2", "b25a6da9b9cb48ebb01d8f1e8577cbd6", "1590c52cba7a41388b30be59b8225a3d", "aeb3fb1572034a84927e8df385126915", "a35ab30440bc40b7a6cb4262d6289c2c", "548e397f8e3a47a9bcaa7ba2c1f4ae48", "56fb139dd067428696acafeb2a9ee5e2", "7462a3aac96d442e9d6b93af57dc0be9", "4621e0960953441c9012a40de237e416", "4517f87fdc9e44aa9b8426d11613e76a", "d08ce15ed7ad4efb828b74f915784ec3", "7a99db0f716544448a19210f8690f9b0", "4ae5f91a5ddb47e6ad710dbba9e80187", "7d800a2659d44bec977835eb3db4326e", "81d1f7011258462699d12ee5b0a70a5d", "206fc938b0334936adb194547cfb7cc5", "72abe3b940ab473c82ba6b903530af65", "e56a79bf5c7f440c978478e88a26e0f2", "9c77ea2e33874b9e81d0d397a4662b43", "8772b2b1134a45bba71734c4400fa2c0", "8c914abb6ac5489e9c13046b66a7dd5d" ] }, "id": "-lQ8vJ4sKiNh", "outputId": "0e0cbe37-d56f-4cef-da0d-c38fc06682f9" }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "config.json: 0%| | 0.00/571 [00:00 or I- depending on if they are the beginning or the inside of the entity.\n", " Please, just answer the question for this specific example and 
{ "cell_type": "code", "source": [ "print(\"Second prompt:\")\n", "print(build_prompt(first_sentence, 2))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "n8X0eXX4PXwz", "outputId": "20374009-1ba6-4ed1-c057-0ca60257a776" }, "execution_count": 21, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Second prompt:\n", "I am working on a named entity recognition problem, in the context of clinical\n", " trials eligibility criteria. I will show you the list of entities:\n", " - Condition\n", " - Value\n", " - Drug\n", " - Procedure\n", " - Measurement\n", " - Temporal\n", " - Observation\n", " - Person\n", " - Mood\n", " - Device\n", "\n", " Your task consists of annotating the named entities in a given sentence in the format I will explain to you.\n", " I will explain it with some examples:\n", "\n", " Example 1:\n", " Input: Patients who have received prior chemotherapy for unresectable disease.\n", " Output: Patients who have received prior chemotherapy for unresectable disease.\n", "\n", " Example 2:\n", " Input: Patients with any other severe concurrent disease, which in the judgment of the investigator, would make the patient inappropriate for entry into this study.\n", " Output: Patients with any other severe concurrent disease, which in the judgment of the investigator, would make the patient inappropriate for entry into this study.\n", "\n", " As you can see, in each example, the extracted entities are enclosed using the syntax: text of the entity.\n", "\n", " Please now annotate as explained before the following sentence:\n", "\n", " Input: 18 years or older patients who are proven to be infected by Helicobacter pylori based on positive in Urea Breath Test or positive in histopathologic examination of biopsy in antrum and corpus of gaster through esophagoduodenoscopy .\n", "\n" ] } ] },
{ "cell_type": "code", "source": [ "from tqdm import tqdm" ], "metadata": { "id": "igfJpR5SVevy" }, "execution_count": 27, "outputs": [] },
{ "cell_type": "code", "source": [ "# generate annotations for the first prompt\n", "for file in tqdm(files[:20]):\n", "    with open(f\"{data_path}{file}\", 'r+', encoding='utf-8') as fd:\n", "        sentences = fd.readlines()\n", "    annotated_sentences = []\n", "    for sentence in sentences:\n", "        if len(sentence.strip()) == 0:\n", "            continue\n", "        prompt = build_prompt(sentence, 1)\n", "        output = pipe(prompt, max_new_tokens=500, return_full_text=False, handle_long_generation=\"hole\")[0]['generated_text']\n", "        output = output.split('\\n\\n')[0]  # keep only the first paragraph of the output to avoid noise\n", "        annotated_sentences.append(output)\n", "    with open(f\"./drive/MyDrive/HandsOn-NLP/data/Annotations_Mistral_Prompt_1/{file}\", 'w+', encoding='utf-8') as fd:\n", "        fd.write('\\n\\n'.join(annotated_sentences))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AVdymoJWQD0P", "outputId": "f7f0af37-04cb-4f51-874d-31e23dc1f79d" }, "execution_count": 32, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "\r 0%| | 0/6 [00:00