AIAgents4Pharma / Git / [3af7d7] /docs/notebooks/talk2knowledgegraphs/tutorial_uniprot

Models:
Amanda-D/
AIAgents4Pharma
Downloads: 1
[3af7d7]: / docs / notebooks / talk2knowledgegraphs / tutorial_uniprot_mapping.ipynb
History
Download this file
698 lines (697 with data), 29.9 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import necessary libraries\n",
    "import time\n",
    "import json\n",
    "import zlib\n",
    "import requests\n",
    "from requests.adapters import HTTPAdapter, Retry\n",
    "from urllib.parse import urlparse, parse_qs, urlencode\n",
    "import pandas as pd\n",
    "import os\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define variables to perform UniProt ID mapping\n",
    "# Adopted from https://www.uniprot.org/help/id_mapping\n",
    "API_URL = \"https://rest.uniprot.org\"\n",
    "POLLING_INTERVAL = 5\n",
    "retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])\n",
    "session = requests.Session()\n",
    "session.mount(\"https://\", HTTPAdapter(max_retries=retries))\n",
    "\n",
    "def submit_id_mapping(from_db, to_db, ids) -> str:\n",
    "    \"\"\"\n",
    "    Function to submit a job to perform ID mapping.\n",
    "\n",
    "    Args:\n",
    "        from_db (str): The source database.\n",
    "        to_db (str): The target database.\n",
    "        ids (list): The list of IDs to map.\n",
    "\n",
    "    Returns:\n",
    "        str: The job ID.\n",
    "    \"\"\"\n",
    "    request = requests.post(f\"{API_URL}/idmapping/run\",\n",
    "                            data={\"from\": from_db,\n",
    "                                  \"to\": to_db,\n",
    "                                  \"ids\": \",\".join(ids)},)\n",
    "    try:\n",
    "        request.raise_for_status()\n",
    "    except requests.HTTPError:\n",
    "        print(request.json())\n",
    "        raise\n",
    "\n",
    "    return request.json()[\"jobId\"]\n",
    "\n",
    "def check_id_mapping_results_ready(job_id):\n",
    "    \"\"\"\n",
    "    Function to check if the ID mapping results are ready.\n",
    "\n",
    "    Args:\n",
    "        job_id (str): The job ID.\n",
    "\n",
    "    Returns:\n",
    "        bool: True if the results are ready, False otherwise.\n",
    "    \"\"\"\n",
    "    while True:\n",
    "        request = session.get(f\"{API_URL}/idmapping/status/{job_id}\")\n",
    "\n",
    "        try:\n",
    "            request.raise_for_status()\n",
    "        except requests.HTTPError:\n",
    "            print(request.json())\n",
    "            raise\n",
    "\n",
    "        j = request.json()\n",
    "        if \"jobStatus\" in j:\n",
    "            if j[\"jobStatus\"] in (\"NEW\", \"RUNNING\"):\n",
    "                print(f\"Retrying in {POLLING_INTERVAL}s\")\n",
    "                time.sleep(POLLING_INTERVAL)\n",
    "            else:\n",
    "                raise Exception(j[\"jobStatus\"])\n",
    "        else:\n",
    "            return bool(j[\"results\"] or j[\"failedIds\"])\n",
    "\n",
    "def get_id_mapping_results_link(job_id):\n",
    "    \"\"\"\n",
    "    Function to get the link to the ID mapping results.\n",
    "\n",
    "    Args:\n",
    "        job_id (str): The job ID.\n",
    "\n",
    "    Returns:\n",
    "        str: The link to the ID mapping results.\n",
    "    \"\"\"\n",
    "    url = f\"{API_URL}/idmapping/details/{job_id}\"\n",
    "    request = requests.Session().get(url)\n",
    "\n",
    "    try:\n",
    "        request.raise_for_status()\n",
    "    except requests.HTTPError:\n",
    "        print(request.json())\n",
    "        raise\n",
    "\n",
    "    return request.json()[\"redirectURL\"]\n",
    "\n",
    "def decode_results(response, file_format, compressed):\n",
    "    \"\"\"\n",
    "    Function to decode the ID mapping results.\n",
    "\n",
    "    Args:\n",
    "        response (requests.Response): The response object.\n",
    "        file_format (str): The file format of the results.\n",
    "        compressed (bool): Whether the results are compressed.\n",
    "\n",
    "    Returns:\n",
    "        str: The ID mapping results\n",
    "    \"\"\"\n",
    "\n",
    "    if compressed:\n",
    "        decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)\n",
    "        if file_format == \"json\":\n",
    "            j = json.loads(decompressed.decode(\"utf-8\"))\n",
    "            return j\n",
    "        elif file_format == \"tsv\":\n",
    "            return [line for line in decompressed.decode(\"utf-8\").split(\"\\n\") if line]\n",
    "        elif file_format == \"xlsx\":\n",
    "            return [decompressed]\n",
    "        elif file_format == \"xml\":\n",
    "            return [decompressed.decode(\"utf-8\")]\n",
    "        else:\n",
    "            return decompressed.decode(\"utf-8\")\n",
    "    elif file_format == \"json\":\n",
    "        return response.json()\n",
    "    elif file_format == \"tsv\":\n",
    "        return [line for line in response.text.split(\"\\n\") if line]\n",
    "    elif file_format == \"xlsx\":\n",
    "        return [response.content]\n",
    "    elif file_format == \"xml\":\n",
    "        return [response.text]\n",
    "    return response.text\n",
    "\n",
    "def get_id_mapping_results_stream(url):\n",
    "    \"\"\"\n",
    "    Function to get the ID mapping results from a stream.\n",
    "\n",
    "    Args:\n",
    "        url (str): The URL to the ID mapping results.\n",
    "\n",
    "    Returns:\n",
    "        str: The ID mapping results.\n",
    "    \"\"\"\n",
    "    if \"/stream/\" not in url:\n",
    "        url = url.replace(\"/results/\", \"/results/stream/\")\n",
    "\n",
    "    request = session.get(url)\n",
    "\n",
    "    try:\n",
    "        request.raise_for_status()\n",
    "    except requests.HTTPError:\n",
    "        print(request.json())\n",
    "        raise\n",
    "\n",
    "    parsed = urlparse(url)\n",
    "    query = parse_qs(parsed.query)\n",
    "    file_format = query[\"format\"][0] if \"format\" in query else \"json\"\n",
    "    compressed = (\n",
    "        query[\"compressed\"][0].lower() == \"true\" if \"compressed\" in query else False\n",
    "    )\n",
    "    return decode_results(request, file_format, compressed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Submit a job to perform ID mapping\n",
    "inputs = ['6774', '3569']\n",
    "job_id = submit_id_mapping(\n",
    "    from_db=\"GeneID\", to_db=\"UniProtKB\", ids=inputs\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8556e200d5f3bb6ab102e25e58225fa49fa05e88\n"
     ]
    }
   ],
   "source": [
    "# Print the job ID\n",
    "print(job_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check and get the ID mapping results\n",
    "if check_id_mapping_results_ready(job_id):\n",
    "    link = get_id_mapping_results_link(job_id)\n",
    "    mapping_results = get_id_mapping_results_stream(link)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the results to a pickle file\n",
    "local_dir = '../../../../data/primekg_ibd/'\n",
    "if not os.path.exists(local_dir):\n",
    "    os.makedirs(local_dir)\n",
    "with open(os.path.join(local_dir, 'primekg_ibd_protein_mapped.pkl'), 'wb') as f:\n",
    "    pickle.dump(mapping_results[\"results\"], f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>from</th>\n",
       "      <th>to</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6774</td>\n",
       "      <td>{'entryType': 'UniProtKB reviewed (Swiss-Prot)...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>6774</td>\n",
       "      <td>{'entryType': 'UniProtKB unreviewed (TrEMBL)',...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3569</td>\n",
       "      <td>{'entryType': 'UniProtKB reviewed (Swiss-Prot)...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3569</td>\n",
       "      <td>{'entryType': 'UniProtKB unreviewed (TrEMBL)',...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3569</td>\n",
       "      <td>{'entryType': 'UniProtKB unreviewed (TrEMBL)',...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   from                                                 to\n",
       "0  6774  {'entryType': 'UniProtKB reviewed (Swiss-Prot)...\n",
       "1  6774  {'entryType': 'UniProtKB unreviewed (TrEMBL)',...\n",
       "2  3569  {'entryType': 'UniProtKB reviewed (Swiss-Prot)...\n",
       "3  3569  {'entryType': 'UniProtKB unreviewed (TrEMBL)',...\n",
       "4  3569  {'entryType': 'UniProtKB unreviewed (TrEMBL)',..."
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert mapping results to a dataframe\n",
    "protein_mapped_df = pd.DataFrame(mapping_results[\"results\"])\n",
    "protein_mapped_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "UniProtKB unreviewed (TrEMBL)      5\n",
       "UniProtKB reviewed (Swiss-Prot)    2\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Checking duplicated entries based on their entryType\n",
    "protein_mapped_df.apply(lambda x: x['to']['entryType'], axis=1).value_counts(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>from</th>\n",
       "      <th>to</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6774</td>\n",
       "      <td>{'entryType': 'UniProtKB reviewed (Swiss-Prot)...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3569</td>\n",
       "      <td>{'entryType': 'UniProtKB reviewed (Swiss-Prot)...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   from                                                 to\n",
       "0  6774  {'entryType': 'UniProtKB reviewed (Swiss-Prot)...\n",
       "1  3569  {'entryType': 'UniProtKB reviewed (Swiss-Prot)..."
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# There are two entryType. We choose the reviewed one.\n",
    "protein_reviewed_df = protein_mapped_df[protein_mapped_df.apply(lambda x: x['to']['entryType'], axis=1) == 'UniProtKB reviewed (Swiss-Prot)']\n",
    "protein_reviewed_df.reset_index(drop=True, inplace=True)\n",
    "protein_reviewed_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
      "C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>from</th>\n",
       "      <th>to</th>\n",
       "      <th>entryType</th>\n",
       "      <th>primaryAccession</th>\n",
       "      <th>secondaryAccessions</th>\n",
       "      <th>uniProtkbId</th>\n",
       "      <th>entryAudit</th>\n",
       "      <th>annotationScore</th>\n",
       "      <th>organism</th>\n",
       "      <th>proteinExistence</th>\n",
       "      <th>proteinDescription</th>\n",
       "      <th>genes</th>\n",
       "      <th>comments</th>\n",
       "      <th>features</th>\n",
       "      <th>keywords</th>\n",
       "      <th>references</th>\n",
       "      <th>uniProtKBCrossReferences</th>\n",
       "      <th>sequence</th>\n",
       "      <th>extraAttributes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6774</td>\n",
       "      <td>{'entryType': 'UniProtKB reviewed (Swiss-Prot)...</td>\n",
       "      <td>UniProtKB reviewed (Swiss-Prot)</td>\n",
       "      <td>P40763</td>\n",
       "      <td>[A8K7B8, K7ENL3, O14916, Q9BW54]</td>\n",
       "      <td>STAT3_HUMAN</td>\n",
       "      <td>{'firstPublicDate': '1995-02-01', 'lastAnnotat...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>{'scientificName': 'Homo sapiens', 'commonName...</td>\n",
       "      <td>1: Evidence at protein level</td>\n",
       "      <td>{'recommendedName': {'fullName': {'evidences':...</td>\n",
       "      <td>[{'geneName': {'evidences': [{'evidenceCode': ...</td>\n",
       "      <td>[{'texts': [{'evidences': [{'evidenceCode': 'E...</td>\n",
       "      <td>[{'type': 'Initiator methionine', 'location': ...</td>\n",
       "      <td>[{'id': 'KW-0002', 'category': 'Technical term...</td>\n",
       "      <td>[{'referenceNumber': 1, 'citation': {'id': '75...</td>\n",
       "      <td>[{'database': 'EMBL', 'id': 'L29277', 'propert...</td>\n",
       "      <td>{'value': 'MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLA...</td>\n",
       "      <td>{'countByCommentType': {'FUNCTION': 1, 'SUBUNI...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3569</td>\n",
       "      <td>{'entryType': 'UniProtKB reviewed (Swiss-Prot)...</td>\n",
       "      <td>UniProtKB reviewed (Swiss-Prot)</td>\n",
       "      <td>P05231</td>\n",
       "      <td>[Q9UCU2, Q9UCU3, Q9UCU4]</td>\n",
       "      <td>IL6_HUMAN</td>\n",
       "      <td>{'firstPublicDate': '1987-08-13', 'lastAnnotat...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>{'scientificName': 'Homo sapiens', 'commonName...</td>\n",
       "      <td>1: Evidence at protein level</td>\n",
       "      <td>{'recommendedName': {'fullName': {'evidences':...</td>\n",
       "      <td>[{'geneName': {'evidences': [{'evidenceCode': ...</td>\n",
       "      <td>[{'texts': [{'evidences': [{'evidenceCode': 'E...</td>\n",
       "      <td>[{'type': 'Signal', 'location': {'start': {'va...</td>\n",
       "      <td>[{'id': 'KW-0002', 'category': 'Technical term...</td>\n",
       "      <td>[{'referenceNumber': 1, 'citation': {'id': '34...</td>\n",
       "      <td>[{'database': 'EMBL', 'id': 'X04430', 'propert...</td>\n",
       "      <td>{'value': 'MNSFSTSAFGPVAFSLGLLLVLPAAFPAPVPPGED...</td>\n",
       "      <td>{'countByCommentType': {'FUNCTION': 3, 'SUBUNI...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   from                                                 to  \\\n",
       "0  6774  {'entryType': 'UniProtKB reviewed (Swiss-Prot)...   \n",
       "1  3569  {'entryType': 'UniProtKB reviewed (Swiss-Prot)...   \n",
       "\n",
       "                         entryType primaryAccession  \\\n",
       "0  UniProtKB reviewed (Swiss-Prot)           P40763   \n",
       "1  UniProtKB reviewed (Swiss-Prot)           P05231   \n",
       "\n",
       "                secondaryAccessions  uniProtkbId  \\\n",
       "0  [A8K7B8, K7ENL3, O14916, Q9BW54]  STAT3_HUMAN   \n",
       "1          [Q9UCU2, Q9UCU3, Q9UCU4]    IL6_HUMAN   \n",
       "\n",
       "                                          entryAudit  annotationScore  \\\n",
       "0  {'firstPublicDate': '1995-02-01', 'lastAnnotat...              5.0   \n",
       "1  {'firstPublicDate': '1987-08-13', 'lastAnnotat...              5.0   \n",
       "\n",
       "                                            organism  \\\n",
       "0  {'scientificName': 'Homo sapiens', 'commonName...   \n",
       "1  {'scientificName': 'Homo sapiens', 'commonName...   \n",
       "\n",
       "               proteinExistence  \\\n",
       "0  1: Evidence at protein level   \n",
       "1  1: Evidence at protein level   \n",
       "\n",
       "                                  proteinDescription  \\\n",
       "0  {'recommendedName': {'fullName': {'evidences':...   \n",
       "1  {'recommendedName': {'fullName': {'evidences':...   \n",
       "\n",
       "                                               genes  \\\n",
       "0  [{'geneName': {'evidences': [{'evidenceCode': ...   \n",
       "1  [{'geneName': {'evidences': [{'evidenceCode': ...   \n",
       "\n",
       "                                            comments  \\\n",
       "0  [{'texts': [{'evidences': [{'evidenceCode': 'E...   \n",
       "1  [{'texts': [{'evidences': [{'evidenceCode': 'E...   \n",
       "\n",
       "                                            features  \\\n",
       "0  [{'type': 'Initiator methionine', 'location': ...   \n",
       "1  [{'type': 'Signal', 'location': {'start': {'va...   \n",
       "\n",
       "                                            keywords  \\\n",
       "0  [{'id': 'KW-0002', 'category': 'Technical term...   \n",
       "1  [{'id': 'KW-0002', 'category': 'Technical term...   \n",
       "\n",
       "                                          references  \\\n",
       "0  [{'referenceNumber': 1, 'citation': {'id': '75...   \n",
       "1  [{'referenceNumber': 1, 'citation': {'id': '34...   \n",
       "\n",
       "                            uniProtKBCrossReferences  \\\n",
       "0  [{'database': 'EMBL', 'id': 'L29277', 'propert...   \n",
       "1  [{'database': 'EMBL', 'id': 'X04430', 'propert...   \n",
       "\n",
       "                                            sequence  \\\n",
       "0  {'value': 'MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLA...   \n",
       "1  {'value': 'MNSFSTSAFGPVAFSLGLLLVLPAAFPAPVPPGED...   \n",
       "\n",
       "                                     extraAttributes  \n",
       "0  {'countByCommentType': {'FUNCTION': 1, 'SUBUNI...  \n",
       "1  {'countByCommentType': {'FUNCTION': 3, 'SUBUNI...  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for key in protein_reviewed_df['to'][0].keys():\n",
    "    protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
    "protein_reviewed_df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}