Biomedical-Text-Extract / Git / [0f33c7] /pubmed/Notebooks/Pubmed_Data_Extraction

Models:
philipB/
Biomedical-Text-Extract
Downloads: 1
[0f33c7]: / pubmed / Notebooks / Pubmed_Data_Extraction_1.ipynb
History
Download this file
1232 lines (1231 with data), 102.6 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6ee9654d-4a35-4256-92f1-3af38249b7d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from Bio import Entrez\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8fecb811-5b11-4fec-be8a-82b2d7e9f0de",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "# Set TOKENIZERS_PARALLELISM to \"false\" in this process's environment.\n",
    "# NOTE(review): presumably consumed by the tokenizer library behind the transformers\n",
    "# warnings shown in a later cell's output - confirm it is set before that import runs.\n",
    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0b0aef8e-3f83-49c0-8b07-1ae6d5600e69",
   "metadata": {},
   "outputs": [],
   "source": [
    "from Bio import Entrez\n",
    "from datetime import datetime\n",
    "\n",
    "def search_pubmed(email, diseases=None, years_back=5, retmax=1000):\n",
    "    \"\"\"\n",
    "    Search PubMed for recent articles on a set of diseases.\n",
    "\n",
    "    One `Entrez.esearch` request is issued per disease, restricted to publication\n",
    "    dates from `years_back` years before the current year through the current year.\n",
    "\n",
    "    Args:\n",
    "    - email (str): Email address for Entrez login.\n",
    "    - diseases (list of str, optional): Search terms, one query per term. Defaults to\n",
    "      Diabetes, Cardiovascular disease, Cancer, Alzheimer's and Dementia.\n",
    "    - years_back (int, optional): How many years before the current year the\n",
    "      publication-date range starts. Default is 5.\n",
    "    - retmax (int, optional): Maximum number of IDs requested per term. Default is 1000.\n",
    "\n",
    "    Returns:\n",
    "    - list : List of unique PubMed IDs, in first-seen order.\n",
    "    \"\"\"\n",
    "\n",
    "    # Define email for entrez login\n",
    "    Entrez.email = email\n",
    "\n",
    "    # Setup publication-date range ending at the current year\n",
    "    current_year = datetime.now().year\n",
    "    date_range = f\"{current_year - years_back}[PDAT] : {current_year}[PDAT]\"\n",
    "\n",
    "    # Fall back to the default top 5 list of diseases\n",
    "    if diseases is None:\n",
    "        diseases = [\"Diabetes\", \"Cardiovascular disease\", \"Cancer\", \"Alzheimer's\", \"Dementia\"]\n",
    "\n",
    "    # Initialize list to collect all PubMed IDs\n",
    "    pubmed_ids = []\n",
    "\n",
    "    for disease in diseases:\n",
    "        query = f\"{disease} AND {date_range}\"\n",
    "        handle = Entrez.esearch(db='pubmed', term=query, retmax=retmax)\n",
    "        try:\n",
    "            record = Entrez.read(handle)\n",
    "        finally:\n",
    "            # Release the connection even when parsing the response fails\n",
    "            handle.close()\n",
    "\n",
    "        # Append the list of IDs for the current disease to the master list\n",
    "        pubmed_ids.extend(record['IdList'])\n",
    "\n",
    "    # An article can match several disease queries; keep only its first occurrence\n",
    "    # so it is not fetched (and counted) more than once downstream.\n",
    "    return list(dict.fromkeys(pubmed_ids))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "eea4cde5-ea69-4696-ae96-f343fd91f6d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from http.client import IncompleteRead\n",
    "\n",
    "def fetch_articles(email, ids_list, retries=3):\n",
    "    \"\"\"\n",
    "    Fetch details for a list of PubMed IDs.\n",
    "\n",
    "    The IDs are sent in a single `Entrez.efetch` request. When the response is cut\n",
    "    short (`IncompleteRead`), the request is repeated, up to `retries` attempts in total.\n",
    "\n",
    "    Args:\n",
    "    - email (str): Email address for Entrez login.\n",
    "    - ids_list (list): List of PubMed IDs.\n",
    "    - retries (int, optional): Maximum number of attempts. Default is 3.\n",
    "\n",
    "    Returns:\n",
    "    - The parsed result of `Entrez.read`; the articles are stored under its\n",
    "      'PubmedArticle' key. None when `retries` is less than 1, because no request is made.\n",
    "\n",
    "    Raises:\n",
    "    - IncompleteRead: If the last attempt still ends in an incomplete read.\n",
    "    \"\"\"\n",
    "    # str() lets callers pass integer IDs as well as strings\n",
    "    ids = ','.join(str(pubmed_id) for pubmed_id in ids_list)\n",
    "    Entrez.email = email\n",
    "\n",
    "    for attempt in range(1, retries + 1):\n",
    "        try:\n",
    "            # Fetch article details\n",
    "            handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids)\n",
    "            try:\n",
    "                return Entrez.read(handle)\n",
    "            finally:\n",
    "                # Close the connection on success and on a failed read alike\n",
    "                handle.close()\n",
    "        except IncompleteRead:\n",
    "            print(f\"Incomplete read error encountered. Attempt {attempt} of {retries}. Retrying...\")\n",
    "            if attempt == retries:\n",
    "                print(\"Maximum retries reached. Raising last exception.\")\n",
    "                raise\n",
    "\n",
    "    # Only reached when retries < 1 and the loop never ran\n",
    "    return None\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1727fcc6-d10d-4f3e-9820-d045d63c226a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_article_details(paper):\n",
    "    \"\"\"\n",
    "    Extract specific details from a PubMed article, including citation count.\n",
    "\n",
    "    Args:\n",
    "    - paper (dict): Dictionary of article details.\n",
    "\n",
    "    Returns:\n",
    "    - tuple: Extracted article details, including citation count.\n",
    "    \"\"\"\n",
    "\n",
    "    title = paper.get('MedlineCitation', {}).get('Article', {}).get('ArticleTitle', 'No Title').lower()\n",
    "    abstract_data = paper.get('MedlineCitation', {}).get('Article', {}).get('Abstract', {}).get('AbstractText', ['No Abstract'])\n",
    "    abstract = abstract_data[0].lower() if isinstance(abstract_data, list) else abstract_data.lower()\n",
    "    journal = paper.get('MedlineCitation', {}).get('Article', {}).get('Journal', {}).get('Title', 'No Journal').lower()\n",
    "    language = paper.get('MedlineCitation', {}).get('Article', {}).get('Language', ['No Language'])[0]\n",
    "    pubdate = paper.get('MedlineCitation', {}).get('Article', {}).get('Journal', {}).get('JournalIssue', {}).get('PubDate', {})\n",
    "    year = pubdate.get('Year', 'No Data')\n",
    "    month = pubdate.get('Month', 'No Data')\n",
    "    authors_data = paper.get('MedlineCitation', {}).get('Article', {}).get('AuthorList', [])\n",
    "    authors_list = []\n",
    "    affiliations_list = []\n",
    "\n",
    "    for author in authors_data:\n",
    "        # Initialize variables for each author\n",
    "        author_name = None\n",
    "        affiliation = 'No Affiliation'\n",
    "\n",
    "        # Check for author name and concatenate if present\n",
    "        if 'LastName' in author and 'ForeName' in author:\n",
    "            author_name = f\"{author['LastName']} {author['ForeName']}\"\n",
    "            authors_list.append(author_name)\n",
    "\n",
    "            # Check if 'AffiliationInfo' exists and is not an empty list\n",
    "            affiliation_info = author.get('AffiliationInfo')\n",
    "            if affiliation_info and isinstance(affiliation_info, list) and affiliation_info[0]:\n",
    "                affiliation = affiliation_info[0].get('Affiliation', 'No Affiliation').lower()\n",
    "\n",
    "        # Append affiliation to the list\n",
    "        affiliations_list.append(affiliation)\n",
    "\n",
    "    # Get Citation Count\n",
    "    #citation_count = paper.get('CitationCount', 'No Citation Count')\n",
    "\n",
    "    # Join the authors and affiliations into strings\n",
    "    authors = ', '.join(authors_list)\n",
    "    affiliations = ', '.join(affiliations_list)\n",
    "\n",
    "    # Return the extracted information\n",
    "    return title, abstract, journal, language, year, month, authors, affiliations\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "68777e37-ce8d-493c-99b6-b742e4e0540a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "def create_dataframe(email, ids_list, chunk_size=1000):\n",
    "    \"\"\"\n",
    "    Create a DataFrame containing details of PubMed articles, including citation count.\n",
    "\n",
    "    This function fetches articles from PubMed in chunks and extracts relevant details\n",
    "    such as title, abstract, journal, etc., to populate a DataFrame.\n",
    "\n",
    "    Args:\n",
    "    - email (str): Email address for Entrez login.\n",
    "    - ids_list (list of str): List of PubMed IDs to fetch.\n",
    "    - chunk_size (int, optional): The number of articles to fetch in each request. Default is 1000.\n",
    "\n",
    "    Returns:\n",
    "    - pandas.DataFrame: A DataFrame where each row represents an article and columns\n",
    "      contain details like title, abstract, journal, language, year, month, study type,\n",
    "      authors, affiliations, and citation count.\n",
    "    \"\"\"\n",
    "    pubmed_df = {\n",
    "        'Title': [], 'Abstract': [], 'Journal': [], 'Language': [], 'Year': [], 'Month': [],\n",
    "         'Authors': [], 'Affiliations': []\n",
    "    }\n",
    "\n",
    "    for chunk_i in range(0, len(ids_list), chunk_size):\n",
    "        chunk = ids_list[chunk_i:chunk_i + chunk_size]\n",
    "        papers = fetch_articles(email, chunk)\n",
    "\n",
    "        if papers is None or 'PubmedArticle' not in papers:\n",
    "            print(f\"Warning: No data returned for chunk starting at index {chunk_i}\")\n",
    "            continue\n",
    "\n",
    "        for paper in papers[\"PubmedArticle\"]:\n",
    "            # Extract article details from the paper\n",
    "            title, abstract, journal, language, year, month, authors, affiliations = extract_article_details(paper)\n",
    "\n",
    "            # Append the details to the respective lists in the dictionary\n",
    "            pubmed_df['Title'].append(title)\n",
    "            pubmed_df['Abstract'].append(abstract)\n",
    "            pubmed_df['Journal'].append(journal)\n",
    "            pubmed_df['Language'].append(language)\n",
    "            pubmed_df['Year'].append(year)\n",
    "            pubmed_df['Month'].append(month)\n",
    "            pubmed_df['Authors'].append(authors)\n",
    "            pubmed_df['Affiliations'].append(affiliations)\n",
    "\n",
    "    # Convert the dictionary to a pandas DataFrame\n",
    "    pubmed_df = pd.DataFrame(pubmed_df)\n",
    "\n",
    "    return pubmed_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa356505-232c-4b70-8903-dfe681391b20",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1cac8556-b429-4867-852f-c7514ecec1bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Email address handed to the Entrez helpers; defined once so both calls agree\n",
    "ENTREZ_EMAIL = \"fhirshotlearning@gmail.com\"\n",
    "\n",
    "# Collect the PubMed IDs to download\n",
    "ids_list = search_pubmed(ENTREZ_EMAIL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7a37470d-52f2-47c2-9a74-c40c0c74bed7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the article details for those IDs, one row per article\n",
    "pubmed_df = create_dataframe(ENTREZ_EMAIL, ids_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5ecbecda-105e-41fd-a41c-8e06e8389562",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSV file with the journal impact factors, resolved relative to the working directory\n",
    "impact_factor_path = 'CopyofImpactFactor2024.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "7e311313-5603-41c0-baee-b675d2779b90",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "def merge_impact_factors(pubmed_df, impact_factor_csv_path, journal_col='Journal'):\n",
    "    \"\"\"\n",
    "    Merge impact factors into the PubMed articles DataFrame, retain articles with impact factors,\n",
    "    and drop columns that only contain NaN values.\n",
    "\n",
    "    Args:\n",
    "    - pubmed_df (DataFrame): DataFrame containing PubMed articles.\n",
    "    - impact_factor_csv_path (str): Path to the CSV file with impact factors.\n",
    "    - journal_col (str): Column name for journal titles in the PubMed DataFrame.\n",
    "\n",
    "    Returns:\n",
    "    - DataFrame: The merged DataFrame with impact factors and without NaN-only columns.\n",
    "    \"\"\"\n",
    "\n",
    "    # Load the impact factor CSV file\n",
    "    impact_factors_df = pd.read_csv(impact_factor_csv_path)\n",
    "\n",
    "    # Format the journal titles consistently (strip whitespaces and convert to lowercase)\n",
    "    pubmed_df[journal_col] = pubmed_df[journal_col].str.strip().str.lower()\n",
    "    impact_factors_df['Name'] = impact_factors_df['Name'].str.strip().str.lower()\n",
    "    impact_factors_df['Abbr Name'] = impact_factors_df['Abbr Name'].str.strip().str.lower()\n",
    "\n",
    "    # Attempt to merge based on multiple keys: Name, Abbreviated Name, ISSN, and EISSN\n",
    "    merged_df = pubmed_df.merge(\n",
    "        impact_factors_df,\n",
    "        how='left',\n",
    "        left_on=journal_col,\n",
    "        right_on='Name'\n",
    "    )\n",
    "\n",
    "    # Attempt merging with additional identifiers if no matches are found\n",
    "    if merged_df['JIF'].isna().all():\n",
    "        merged_df = pubmed_df.merge(\n",
    "            impact_factors_df,\n",
    "            how='left',\n",
    "            left_on=journal_col,\n",
    "            right_on='Abbr Name'\n",
    "        )\n",
    "    elif merged_df['JIF'].isna().all() and 'ISSN' in pubmed_df.columns:\n",
    "        merged_df = pubmed_df.merge(\n",
    "            impact_factors_df,\n",
    "            how='left',\n",
    "            left_on='ISSN',\n",
    "            right_on='ISSN'\n",
    "        )\n",
    "    elif merged_df['JIF'].isna().all() and 'EISSN' in pubmed_df.columns:\n",
    "        merged_df = pubmed_df.merge(\n",
    "            impact_factors_df,\n",
    "            how='left',\n",
    "            left_on='EISSN',\n",
    "            right_on='EISSN'\n",
    "        )\n",
    "\n",
    "    # Rename relevant columns for clarity\n",
    "    merged_df.rename(columns={\n",
    "        'JIF': 'Impact_Factor',\n",
    "        'JIF5Years': 'Impact_Factor_5Years',\n",
    "        'Category': 'Journal_Category'\n",
    "    }, inplace=True)\n",
    "\n",
    "    # Retain only articles with available impact factors\n",
    "    merged_df = merged_df.dropna(subset=['Impact_Factor'])\n",
    "\n",
    "    # Drop columns that only contain NaN values\n",
    "    merged_df = merged_df.dropna(axis=1, how='all')\n",
    "    return merged_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "fa46bdec-50c9-4019-93f7-acedfb1bb832",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach impact factors to the articles; articles without an impact factor are dropped\n",
    "final_df = merge_impact_factors(pubmed_df,impact_factor_path , 'Journal')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "3a91bd10-c030-4d50-a119-1a0d2024639b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 3267 entries, 0 to 4996\n",
      "Data columns (total 15 columns):\n",
      " #   Column                Non-Null Count  Dtype \n",
      "---  ------                --------------  ----- \n",
      " 0   Title                 3267 non-null   object\n",
      " 1   Abstract              3267 non-null   object\n",
      " 2   Journal               3267 non-null   object\n",
      " 3   Language              3267 non-null   object\n",
      " 4   Year                  3267 non-null   object\n",
      " 5   Month                 3267 non-null   object\n",
      " 6   Authors               3267 non-null   object\n",
      " 7   Affiliations          3267 non-null   object\n",
      " 8   Name                  3267 non-null   object\n",
      " 9   Abbr Name             3267 non-null   object\n",
      " 10  ISSN                  2299 non-null   object\n",
      " 11  EISSN                 3252 non-null   object\n",
      " 12  Impact_Factor         3267 non-null   object\n",
      " 13  Impact_Factor_5Years  3199 non-null   object\n",
      " 14  Journal_Category      3267 non-null   object\n",
      "dtypes: object(15)\n",
      "memory usage: 408.4+ KB\n"
     ]
    }
   ],
   "source": [
    "# Summarise the merged frame: row count, column names, non-null counts and dtypes\n",
    "final_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "5920f61c-41f4-4993-b25c-8b69ddea2b25",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "19db3af468154700bfaa0e219316dbb0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
      "  warnings.warn(\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/transformers/convert_slow_tokenizer.py:551: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.\n",
      "  warnings.warn(\n",
      "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 567 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 393 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 470 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 446 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 465 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 390 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 502 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 411 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 615 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 471 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 944 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 903 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 1048 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 436 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 445 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 495 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 540 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 474 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 399 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 469 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 459 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 2139 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 1313 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 759 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 389 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 510 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 477 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 386 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 454 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 571 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 462 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 475 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 409 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 595 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 433 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 428 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 407 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 514 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 506 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 412 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 488 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 815 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 627 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 447 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 763 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 408 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 855 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 385 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 591 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 653 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 487 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 857 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 468 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 448 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 458 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 456 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 623 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 913 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 482 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 440 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 413 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 426 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 509 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 422 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 450 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 538 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 388 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 410 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 746 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 439 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 451 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 391 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 449 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 496 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 866 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 714 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 697 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 674 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 414 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 438 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 562 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 478 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 1133 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 473 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 644 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 534 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 387 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 508 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 402 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 460 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 499 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 421 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 392 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 405 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 452 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 611 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 662 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 515 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 519 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 787 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 605 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 500 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 443 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 598 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 677 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 585 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 575 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 1155 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 818 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 1096 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 642 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 634 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 566 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 724 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 501 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 778 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 968 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 453 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 863 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 823 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 400 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 16409 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 569 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 676 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 493 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 630 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 625 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 1989 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 498 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 479 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 555 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 432 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 806 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 954 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 951 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 638 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 461 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 764 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 5213 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 442 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 617 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 398 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 4391 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 578 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 481 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 570 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 416 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 902 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 542 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 406 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 431 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 673 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 5394 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 418 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 466 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 732 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 521 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 688 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 403 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 5453 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 602 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 704 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 484 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 3418 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 16097 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 397 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 425 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 576 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 1519 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 669 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 829 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 430 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 17852 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 36368 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 522 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 409 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 499 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 578 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 397 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 430 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 414 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 572 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 479 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 445 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 393 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 470 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 593 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 398 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 386 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 438 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 446 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 427 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 541 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 395 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 433 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 387 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 461 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 420 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 453 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 477 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 411 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 388 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 455 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 698 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 468 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 483 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 407 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 557 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 440 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 403 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 404 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 437 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 396 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 391 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 419 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 467 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 598 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 835 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 472 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 492 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 471 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 526 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 399 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 485 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 410 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 390 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 522 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 424 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 587 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 592 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 408 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 486 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 426 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 415 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 495 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 406 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 394 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 434 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 463 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 389 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 413 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 527 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 449 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 432 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 503 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 501 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 425 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 392 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 400 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 428 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 491 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 633 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 678 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 448 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 478 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 533 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 511 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n",
      "/Users/amulya/miniforge3/envs/pytorch-test/lib/python3.8/site-packages/gliner/data_processing/processor.py:269: UserWarning: Sentence of length 457 has been truncated to 384\n",
      "  warnings.warn(f\"Sentence of length {len(tokens)} has been truncated to {max_len}\")\n"
     ]
    }
   ],
   "source": [
    "from gliner import GLiNER\n",
    "import pandas as pd\n",
    "\n",
    "# Load the GLiNER model\n",
    "model = GLiNER.from_pretrained(\"urchade/gliner_medium-v2.1\")\n",
    "\n",
    "# Labels for entity prediction\n",
    "labels_universities = [\"Organization\"]\n",
    "labels_study_types = [\"Study Type\"]\n",
    "\n",
    "# Function to extract universities from affiliations using GLiNER\n",
    "def extract_universities_gliner(affiliation):\n",
    "    \"\"\"\n",
    "    Extract universities from the affiliation string using GLiNER.\n",
    "\n",
    "    Args:\n",
    "    - affiliation (str): The affiliation string.\n",
    "\n",
    "    Returns:\n",
    "    - str: Extracted university names.\n",
    "    \"\"\"\n",
    "    if not isinstance(affiliation, str) or affiliation.strip() == \"\":\n",
    "        return \"Unknown\"\n",
    "\n",
    "    # Perform entity prediction using GLiNER\n",
    "    entities = model.predict_entities(affiliation, labels_universities, threshold=0.5)\n",
    "\n",
    "    # Extract universities from the identified entities\n",
    "    universities = [entity[\"text\"] for entity in entities if entity[\"label\"] == \"Organization\"]\n",
    "\n",
    "    # Return universities as a comma-separated string or 'Unknown' if none found\n",
    "    return \", \".join(universities) if universities else \"Unknown\"\n",
    "\n",
    "# Function to extract study types from abstract using GLiNER\n",
    "def extract_study_type_gliner(abstract):\n",
    "    \"\"\"\n",
    "    Extract study types from the abstract text using GLiNER.\n",
    "\n",
    "    Args:\n",
    "    - abstract (str): Abstract of the study.\n",
    "\n",
    "    Returns:\n",
    "    - str: The type of study.\n",
    "    \"\"\"\n",
    "    if not isinstance(abstract, str) or abstract.strip() == \"\":\n",
    "        return \"Unknown\"\n",
    "\n",
    "    # Perform entity prediction using GLiNER\n",
    "    entities = model.predict_entities(abstract, labels_study_types, threshold=0.5)\n",
    "\n",
    "    # Extract study type from the identified entities\n",
    "    study_types = [entity[\"text\"] for entity in entities if entity[\"label\"] == \"Study Type\"]\n",
    "\n",
    "    # Return the first matched study type or 'Unknown' if none found\n",
    "    return study_types[0] if study_types else \"Unknown\"\n",
    "\n",
    "# Apply the GLiNER extraction functions to the DataFrame\n",
    "final_df['Universities'] = final_df['Affiliations'].apply(extract_universities_gliner)\n",
    "final_df['Study_Type_Extracted'] = final_df['Abstract'].apply(extract_study_type_gliner)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "33075d4c-83c1-4578-9971-5807d7696bcb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def standardize_university_names(universities_column):\n",
    "    standardized_names = []\n",
    "    for university in universities_column:\n",
    "        if university.lower() == 'unknown':\n",
    "            standardized_names.append('Unknown')\n",
    "            continue\n",
    "\n",
    "        # Extract main university name using regex\n",
    "        match = re.search(r'([a-zA-Z]+\\s*(university|institute|college|academy|school))', university, re.IGNORECASE)\n",
    "        if match:\n",
    "            standardized_names.append(match.group(0).strip().lower())\n",
    "        else:\n",
    "            standardized_names.append('Unknown')\n",
    "\n",
    "    return standardized_names\n",
    "\n",
    "final_df['Standardized_University'] = standardize_university_names(final_df['Universities'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "b0d4eee4-2fec-41bd-9761-36e4b33c8cce",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                               Title  \\\n",
      "0  corrigendum to \"consensus report on glucagon-l...   \n",
      "1  human primary macrophages can transmit coxsack...   \n",
      "2  association between insulin-associated gene po...   \n",
      "3  improvement of glycemia risk index and continu...   \n",
      "4  transmucosal glucagon rapidly increases blood ...   \n",
      "\n",
      "                                            Abstract  \\\n",
      "0                                        no abstract   \n",
      "1                                        no abstract   \n",
      "2  while statins are effective at managing lipid ...   \n",
      "3  managing glycemia during ramadan is challengin...   \n",
      "4  to evaluate the effect of transmucosal glucago...   \n",
      "\n",
      "                                      Journal Language  Year Month  \\\n",
      "0  journal of diabetes science and technology      eng  2024   Nov   \n",
      "1                 journal of medical virology      eng  2024   Dec   \n",
      "2  european journal of clinical investigation      eng  2024   Nov   \n",
      "3  journal of diabetes science and technology      eng  2024   Nov   \n",
      "4      journal of feline medicine and surgery      eng  2024   Nov   \n",
      "\n",
      "                                             Authors  \\\n",
      "0                                                      \n",
      "1                             Brisse Morgan, Ly Hinh   \n",
      "2  Park Minju, Kim Jung Sun, Park Yoon-A, Lee Da ...   \n",
      "3                Al Hayek Ayman, Al Dawish Mohamed A   \n",
      "4  Cohen Emily A, Porter Lauren, Crews Chiquitha ...   \n",
      "\n",
      "                                        Affiliations  \\\n",
      "0                                                      \n",
      "1  viral immunity and pathogenesis unit, laborato...   \n",
      "2  college of pharmacy and graduate school of pha...   \n",
      "3  department of endocrinology and diabetes, diab...   \n",
      "4  department of small animal clinical sciences, ...   \n",
      "\n",
      "                                         Name             Abbr Name  ...  \\\n",
      "0  journal of diabetes science and technology  j diabetes sci techn  ...   \n",
      "1                 journal of medical virology           j med virol  ...   \n",
      "2  european journal of clinical investigation     eur j clin invest  ...   \n",
      "3  journal of diabetes science and technology  j diabetes sci techn  ...   \n",
      "4      journal of feline medicine and surgery     j feline med surg  ...   \n",
      "\n",
      "       EISSN Impact_Factor Impact_Factor_5Years  \\\n",
      "0  1932-2968           4.1                  NaN   \n",
      "1  1096-9071           6.8                  6.6   \n",
      "2  1365-2362           4.4                  4.7   \n",
      "3  1932-2968           4.1                  NaN   \n",
      "4  1532-2750           1.9                  2.0   \n",
      "\n",
      "                              Journal_Category  \\\n",
      "0         ENDOCRINOLOGY & METABOLISM|Q2|49/186   \n",
      "1                             VIROLOGY|Q1|4/41   \n",
      "2  MEDICINE, RESEARCH & EXPERIMENTAL|Q2|57/189   \n",
      "3         ENDOCRINOLOGY & METABOLISM|Q2|49/186   \n",
      "4                VETERINARY SCIENCES|Q2|46/167   \n",
      "\n",
      "                                        Universities Study_Type_Extracted  \\\n",
      "0                                            Unknown              Unknown   \n",
      "1  national institute of allergy and infectious d...              Unknown   \n",
      "2  college of pharmacy, ewha womans university, c...              Unknown   \n",
      "3  department of endocrinology and diabetes, diab...              Unknown   \n",
      "4  university of florida, college of veterinary m...              Unknown   \n",
      "\n",
      "  Standardized_University University Rank Research_Score  \n",
      "0                 unknown        NaN  NaN            NaN  \n",
      "1      national institute        NaN  NaN            NaN  \n",
      "2       womans university        NaN  NaN            NaN  \n",
      "3                 unknown        NaN  NaN            NaN  \n",
      "4                 unknown        NaN  NaN            NaN  \n",
      "\n",
      "[5 rows x 21 columns]\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "\n",
    "# Function to fetch rankings from API and merge with final_df\n",
    "def extract_and_merge_university_ranking(final_df, api_url):\n",
    "    \"\"\"\n",
    "    Fetch university rankings from a JSON API and left-merge them onto final_df.\n",
    "\n",
    "    The response is read as a JSON object whose 'data' key holds a list of records;\n",
    "    the 'name', 'rank' and 'scores_research' fields of each record are used.\n",
    "    University names are lowercased on both sides before matching.\n",
    "\n",
    "    Args:\n",
    "        final_df (DataFrame): Existing DataFrame with a column named 'Standardized_University'.\n",
    "            It is not modified; a new DataFrame is returned on success.\n",
    "        api_url (str): URL to the API that provides university rankings.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame: final_df (same row count) with 'University', 'Rank' and 'Research_Score'\n",
    "        columns added; rows without a matching ranking get NaN. If the request fails or\n",
    "        the body is not valid JSON, final_df is returned as-is, without those columns.\n",
    "    \"\"\"\n",
    "    headers = {\n",
    "        \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3\"\n",
    "    }\n",
    "\n",
    "    try:\n",
    "        # A timeout keeps a stalled connection from hanging the notebook kernel forever\n",
    "        response = requests.get(api_url, headers=headers, timeout=30)\n",
    "        response.raise_for_status()  # Raise an error for bad responses\n",
    "        data = response.json()\n",
    "    except (requests.exceptions.RequestException, ValueError) as e:\n",
    "        # ValueError covers a non-JSON body on requests versions whose decode error\n",
    "        # does not derive from RequestException\n",
    "        print(f\"An error occurred while fetching university rankings: {e}\")\n",
    "        return final_df\n",
    "\n",
    "    # Keep only records that carry a usable name; a missing 'name' would otherwise\n",
    "    # crash on .lower(). Names are lowercased for standardization.\n",
    "    records = [\n",
    "        {\n",
    "            'University': university['name'].lower(),\n",
    "            'Rank': university.get('rank'),\n",
    "            'Research_Score': university.get('scores_research'),\n",
    "        }\n",
    "        for university in data.get('data', [])\n",
    "        if isinstance(university.get('name'), str)\n",
    "    ]\n",
    "\n",
    "    # Passing columns explicitly keeps the merge keys present even when no records came back\n",
    "    ranking_df = pd.DataFrame(records, columns=['University', 'Rank', 'Research_Score'])\n",
    "\n",
    "    # Strip the '=' marker from rank values; Rank is kept as a string (no int conversion)\n",
    "    ranking_df['Rank'] = ranking_df['Rank'].replace('=', '', regex=True).astype(str)\n",
    "\n",
    "    # One row per university name so the left merge cannot multiply rows of final_df\n",
    "    ranking_df = ranking_df.drop_duplicates(subset='University', keep='first')\n",
    "\n",
    "    # Lowercase the match key on a new frame so the caller's DataFrame is left untouched\n",
    "    standardized_df = final_df.assign(\n",
    "        Standardized_University=final_df['Standardized_University'].str.lower()\n",
    "    )\n",
    "\n",
    "    # Left merge: every row of final_df is kept, ranking columns are NaN when unmatched\n",
    "    return standardized_df.merge(\n",
    "        ranking_df,\n",
    "        left_on='Standardized_University',\n",
    "        right_on='University',\n",
    "        how='left',\n",
    "        suffixes=('', '_Ranking'),\n",
    "    )\n",
    "\n",
    "# Times Higher Education World University Rankings 2024 JSON feed\n",
    "api_url = \"https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2024_0__91239a4509dc50911f1949984e3fb8c5.json\"\n",
    "\n",
    "# Attach the ranking columns to final_df, then preview the first five rows\n",
    "pubmed_final = extract_and_merge_university_ranking(final_df=final_df, api_url=api_url)\n",
    "print(pubmed_final.head(n=5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "88596576-6ff5-4804-8d35-873db537446a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the merged PubMed + ranking table as CSV in the current working directory\n",
    "pubmed_final.to_csv(path_or_buf='pubmed_data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f59468a7-e4d3-4dbd-aed9-814d34ad94dc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}