AIAgents4Pharma / Git / [3af7d7] /aiagents4pharma/talk2biomodels/api/kegg.py

Models:
Amanda-D/
AIAgents4Pharma
Downloads: 1
[3af7d7]: / aiagents4pharma / talk2biomodels / api / kegg.py
History
Download this file
84 lines (70 with data), 2.9 kB

"""
This module contains the API for fetching Kegg database
"""
import re
from typing import List, Dict
import requests

def fetch_from_api(base_url: str, query: str) -> str:
    """Fetch data from the given API endpoint."""
    try:
        response = requests.get(base_url + query, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for query {query}: {e}")
        return ""

def fetch_kegg_names(ids: List[str], batch_size: int = 10) -> Dict[str, str]:
    """
    Fetch the names of multiple KEGG entries using the KEGG REST API in batches.

    Args:
        ids (List[str]): List of KEGG IDs.
        batch_size (int): Maximum number of IDs to include in a single request.

    Returns:
        Dict[str, str]: A mapping of KEGG IDs to their names.
    """
    if not ids:
        return {}

    base_url = "https://rest.kegg.jp/get/"
    entry_name_map = {}

    # Process IDs in batches
    for i in range(0, len(ids), batch_size):
        batch = ids[i:i + batch_size]
        query = "+".join(batch)
        entry_data = fetch_from_api(base_url, query)

        # if not entry_data:
        #     continue
        entries = entry_data.split("///")
        for entry in entries:
            if not entry.strip():
                continue
            lines = entry.strip().split("\n")
            entry_line = next((line for line in lines
                                if line.startswith("ENTRY")), None)
            name_line = next((line for line in lines
                                if line.startswith("NAME")), None)

            # if not entry_line and not name_line:
            #     continue
            entry_id = entry_line.split()[1]
            # Split multiple names in the NAME field and clean them
            names = [
                re.sub(r'[^a-zA-Z0-9\s]', '', name).strip()
                for name in name_line.replace("NAME", "").strip().split(";")
            ]
            # Join cleaned names into a single string
            entry_name_map[entry_id] = " ".join(names).strip()

    return entry_name_map

def fetch_kegg_annotations(data: List[Dict[str, str]],
                           batch_size: int = 10) -> Dict[str, Dict[str, str]]:
    """Fetch KEGG entry descriptions grouped by database type."""
    grouped_data = {}
    for entry in data:
        db_type = entry["Database"].lower()
        grouped_data.setdefault(db_type, []).append(entry["Id"])

    results = {}
    for db_type, ids in grouped_data.items():
        results[db_type] = fetch_kegg_names(ids, batch_size=batch_size)

    return results

# def get_protein_name_or_label(data: List[Dict[str, str]],
#                               batch_size: int = 10) -> Dict[str, Dict[str, str]]:
#     """Fetch descriptions for KEGG-related identifiers."""
#     return fetch_kegg_annotations(data, batch_size=batch_size)