698 lines (697 with data), 29.9 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Import necessary libraries\n",
"import time\n",
"import json\n",
"import zlib\n",
"import requests\n",
"from requests.adapters import HTTPAdapter, Retry\n",
"from urllib.parse import urlparse, parse_qs, urlencode\n",
"import pandas as pd\n",
"import os\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Define variables to perform UniProt ID mapping.\n",
"# Adapted from https://www.uniprot.org/help/id_mapping\n",
"# Base URL that every ID-mapping endpoint below is built from.\n",
"API_URL = \"https://rest.uniprot.org\"\n",
"# Seconds to sleep between two status polls while a job is still NEW or RUNNING.\n",
"POLLING_INTERVAL = 5\n",
"# Retry policy: at most 5 retries, backoff factor 0.25, forced on the listed 5xx statuses.\n",
"retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])\n",
"# Shared session; the retry policy is mounted for every https:// URL.\n",
"session = requests.Session()\n",
"session.mount(\"https://\", HTTPAdapter(max_retries=retries))\n",
"\n",
"def submit_id_mapping(from_db, to_db, ids) -> str:\n",
"    \"\"\"\n",
"    Submit an ID mapping job to the UniProt REST API.\n",
"\n",
"    Args:\n",
"        from_db (str): The source database.\n",
"        to_db (str): The target database.\n",
"        ids (list | str): The IDs to map. A string is sent unchanged (a single\n",
"            ID or an already comma-separated list); any other iterable is\n",
"            joined with commas after converting each item to str.\n",
"\n",
"    Returns:\n",
"        str: The job ID.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If the API answers with an error status.\n",
"    \"\"\"\n",
"    # Joining a bare string would split it into single characters, so pass it through.\n",
"    joined_ids = ids if isinstance(ids, str) else \",\".join(str(item) for item in ids)\n",
"\n",
"    # Go through the shared session like the other helpers instead of a bare requests.post.\n",
"    request = session.post(\n",
"        f\"{API_URL}/idmapping/run\",\n",
"        data={\"from\": from_db, \"to\": to_db, \"ids\": joined_ids},\n",
"    )\n",
"    try:\n",
"        request.raise_for_status()\n",
"    except requests.HTTPError:\n",
"        # The error body is not guaranteed to be JSON; the diagnostic print\n",
"        # must never replace the HTTPError with a decoding error.\n",
"        try:\n",
"            print(request.json())\n",
"        except ValueError:\n",
"            print(request.text)\n",
"        raise\n",
"\n",
"    return request.json()[\"jobId\"]\n",
"\n",
"def check_id_mapping_results_ready(job_id):\n",
"    \"\"\"\n",
"    Poll the status endpoint until the ID mapping job has finished.\n",
"\n",
"    While the job reports NEW or RUNNING, the function sleeps for\n",
"    POLLING_INTERVAL seconds and asks again.\n",
"\n",
"    Args:\n",
"        job_id (str): The job ID.\n",
"\n",
"    Returns:\n",
"        bool: True if the response carries results or failed IDs, False otherwise.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If a status request answers with an error status.\n",
"        Exception: If the job reports any status other than NEW or RUNNING;\n",
"            the status string is the exception message.\n",
"    \"\"\"\n",
"    while True:\n",
"        request = session.get(f\"{API_URL}/idmapping/status/{job_id}\")\n",
"\n",
"        try:\n",
"            request.raise_for_status()\n",
"        except requests.HTTPError:\n",
"            # The error body is not guaranteed to be JSON; keep the HTTPError visible.\n",
"            try:\n",
"                print(request.json())\n",
"            except ValueError:\n",
"                print(request.text)\n",
"            raise\n",
"\n",
"        payload = request.json()\n",
"        if \"jobStatus\" not in payload:\n",
"            # .get() so a response lacking one of the two keys yields False\n",
"            # instead of a KeyError.\n",
"            return bool(payload.get(\"results\") or payload.get(\"failedIds\"))\n",
"\n",
"        if payload[\"jobStatus\"] not in (\"NEW\", \"RUNNING\"):\n",
"            raise Exception(payload[\"jobStatus\"])\n",
"\n",
"        print(f\"Retrying in {POLLING_INTERVAL}s\")\n",
"        time.sleep(POLLING_INTERVAL)\n",
"\n",
"def get_id_mapping_results_link(job_id):\n",
"    \"\"\"\n",
"    Fetch the URL under which the results of an ID mapping job are served.\n",
"\n",
"    Args:\n",
"        job_id (str): The job ID.\n",
"\n",
"    Returns:\n",
"        str: The value of \"redirectURL\" in the job details.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If the details request answers with an error status.\n",
"    \"\"\"\n",
"    url = f\"{API_URL}/idmapping/details/{job_id}\"\n",
"    # Reuse the shared session: a throwaway requests.Session() is never closed\n",
"    # and does not carry the mounted retry adapter.\n",
"    request = session.get(url)\n",
"\n",
"    try:\n",
"        request.raise_for_status()\n",
"    except requests.HTTPError:\n",
"        # The error body is not guaranteed to be JSON; keep the HTTPError visible.\n",
"        try:\n",
"            print(request.json())\n",
"        except ValueError:\n",
"            print(request.text)\n",
"        raise\n",
"\n",
"    return request.json()[\"redirectURL\"]\n",
"\n",
"def decode_results(response, file_format, compressed):\n",
"    \"\"\"\n",
"    Decode the body of an ID mapping response according to its format.\n",
"\n",
"    Args:\n",
"        response (requests.Response): The response object.\n",
"        file_format (str): The file format of the results.\n",
"        compressed (bool): Whether the results are compressed.\n",
"\n",
"    Returns:\n",
"        dict | list | str: The parsed JSON for \"json\"; the list of non-empty\n",
"        lines for \"tsv\"; a one-element list holding the raw bytes for \"xlsx\"\n",
"        or the text for \"xml\"; the body as text for any other format.\n",
"    \"\"\"\n",
"    if not compressed:\n",
"        if file_format == \"json\":\n",
"            return response.json()\n",
"        if file_format == \"xlsx\":\n",
"            return [response.content]\n",
"        body = response.text\n",
"    else:\n",
"        # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.\n",
"        raw = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)\n",
"        if file_format == \"xlsx\":\n",
"            # Binary payload: hand it back without decoding.\n",
"            return [raw]\n",
"        body = raw.decode(\"utf-8\")\n",
"        if file_format == \"json\":\n",
"            return json.loads(body)\n",
"\n",
"    if file_format == \"tsv\":\n",
"        return [row for row in body.split(\"\\n\") if row]\n",
"    if file_format == \"xml\":\n",
"        return [body]\n",
"    return body\n",
"\n",
"def get_id_mapping_results_stream(url):\n",
"    \"\"\"\n",
"    Download the ID mapping results from the stream endpoint and decode them.\n",
"\n",
"    A URL without \"/stream/\" is rewritten from \"/results/\" to\n",
"    \"/results/stream/\" before the request is sent. The format and the\n",
"    compression flag are read from the URL query (\"format\", default \"json\";\n",
"    \"compressed\", default false).\n",
"\n",
"    Args:\n",
"        url (str): The URL to the ID mapping results.\n",
"\n",
"    Returns:\n",
"        dict | list | str: Whatever decode_results() yields for that format.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If the request answers with an error status.\n",
"    \"\"\"\n",
"    if \"/stream/\" not in url:\n",
"        url = url.replace(\"/results/\", \"/results/stream/\")\n",
"\n",
"    request = session.get(url)\n",
"\n",
"    try:\n",
"        request.raise_for_status()\n",
"    except requests.HTTPError:\n",
"        # The error body is not guaranteed to be JSON (the URL may ask for\n",
"        # another format); keep the HTTPError visible.\n",
"        try:\n",
"            print(request.json())\n",
"        except ValueError:\n",
"            print(request.text)\n",
"        raise\n",
"\n",
"    query = parse_qs(urlparse(url).query)\n",
"    file_format = query.get(\"format\", [\"json\"])[0]\n",
"    compressed = query.get(\"compressed\", [\"false\"])[0].lower() == \"true\"\n",
"    return decode_results(request, file_format, compressed)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Submit a job to perform ID mapping\n",
"# GeneID identifiers to map; in the recorded outputs further down,\n",
"# 6774 resolves to STAT3_HUMAN and 3569 to IL6_HUMAN.\n",
"inputs = ['6774', '3569']\n",
"job_id = submit_id_mapping(\n",
" from_db=\"GeneID\", to_db=\"UniProtKB\", ids=inputs\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8556e200d5f3bb6ab102e25e58225fa49fa05e88\n"
]
}
],
"source": [
"# Print the job ID (the same ID is passed to the status and details calls below)\n",
"print(job_id)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Check and get the ID mapping results\n",
"if not check_id_mapping_results_ready(job_id):\n",
"    # Stop here with a clear message; otherwise `mapping_results` stays\n",
"    # undefined and a later cell fails with a NameError.\n",
"    raise RuntimeError(f\"ID mapping job {job_id} returned neither results nor failed IDs\")\n",
"link = get_id_mapping_results_link(job_id)\n",
"mapping_results = get_id_mapping_results_stream(link)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Save the results to a pickle file\n",
"local_dir = '../../../../data/primekg_ibd/'\n",
"# exist_ok=True replaces the separate os.path.exists() check, which could\n",
"# race with another process creating the directory in between.\n",
"os.makedirs(local_dir, exist_ok=True)\n",
"with open(os.path.join(local_dir, 'primekg_ibd_protein_mapped.pkl'), 'wb') as f:\n",
"    pickle.dump(mapping_results[\"results\"], f)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>from</th>\n",
" <th>to</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6774</td>\n",
" <td>{'entryType': 'UniProtKB reviewed (Swiss-Prot)...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6774</td>\n",
" <td>{'entryType': 'UniProtKB unreviewed (TrEMBL)',...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3569</td>\n",
" <td>{'entryType': 'UniProtKB reviewed (Swiss-Prot)...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3569</td>\n",
" <td>{'entryType': 'UniProtKB unreviewed (TrEMBL)',...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3569</td>\n",
" <td>{'entryType': 'UniProtKB unreviewed (TrEMBL)',...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" from to\n",
"0 6774 {'entryType': 'UniProtKB reviewed (Swiss-Prot)...\n",
"1 6774 {'entryType': 'UniProtKB unreviewed (TrEMBL)',...\n",
"2 3569 {'entryType': 'UniProtKB reviewed (Swiss-Prot)...\n",
"3 3569 {'entryType': 'UniProtKB unreviewed (TrEMBL)',...\n",
"4 3569 {'entryType': 'UniProtKB unreviewed (TrEMBL)',..."
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Convert mapping results to a dataframe.\n",
"# As the rendered output shows, each row has a 'from' column (the submitted ID)\n",
"# and a 'to' column holding the mapped UniProtKB entry as a dict.\n",
"protein_mapped_df = pd.DataFrame(mapping_results[\"results\"])\n",
"protein_mapped_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"UniProtKB unreviewed (TrEMBL) 5\n",
"UniProtKB reviewed (Swiss-Prot) 2\n",
"Name: count, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count how many mapped entries there are per entryType\n",
"protein_mapped_df.apply(\n",
"    lambda row: row['to']['entryType'], axis=1\n",
").value_counts(normalize=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>from</th>\n",
" <th>to</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6774</td>\n",
" <td>{'entryType': 'UniProtKB reviewed (Swiss-Prot)...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3569</td>\n",
" <td>{'entryType': 'UniProtKB reviewed (Swiss-Prot)...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" from to\n",
"0 6774 {'entryType': 'UniProtKB reviewed (Swiss-Prot)...\n",
"1 3569 {'entryType': 'UniProtKB reviewed (Swiss-Prot)..."
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# There are two entry types. We keep only the reviewed (Swiss-Prot) entries.\n",
"is_reviewed = protein_mapped_df['to'].map(lambda entry: entry['entryType']) == 'UniProtKB reviewed (Swiss-Prot)'\n",
"# A non-inplace reset_index() returns a new frame, so later column assignments\n",
"# do not act on a slice of protein_mapped_df (no SettingWithCopyWarning).\n",
"protein_reviewed_df = protein_mapped_df.loc[is_reviewed].reset_index(drop=True)\n",
"protein_reviewed_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n",
"C:\\Users\\mulyadi\\AppData\\Local\\Temp\\ipykernel_8372\\1443167319.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>from</th>\n",
" <th>to</th>\n",
" <th>entryType</th>\n",
" <th>primaryAccession</th>\n",
" <th>secondaryAccessions</th>\n",
" <th>uniProtkbId</th>\n",
" <th>entryAudit</th>\n",
" <th>annotationScore</th>\n",
" <th>organism</th>\n",
" <th>proteinExistence</th>\n",
" <th>proteinDescription</th>\n",
" <th>genes</th>\n",
" <th>comments</th>\n",
" <th>features</th>\n",
" <th>keywords</th>\n",
" <th>references</th>\n",
" <th>uniProtKBCrossReferences</th>\n",
" <th>sequence</th>\n",
" <th>extraAttributes</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6774</td>\n",
" <td>{'entryType': 'UniProtKB reviewed (Swiss-Prot)...</td>\n",
" <td>UniProtKB reviewed (Swiss-Prot)</td>\n",
" <td>P40763</td>\n",
" <td>[A8K7B8, K7ENL3, O14916, Q9BW54]</td>\n",
" <td>STAT3_HUMAN</td>\n",
" <td>{'firstPublicDate': '1995-02-01', 'lastAnnotat...</td>\n",
" <td>5.0</td>\n",
" <td>{'scientificName': 'Homo sapiens', 'commonName...</td>\n",
" <td>1: Evidence at protein level</td>\n",
" <td>{'recommendedName': {'fullName': {'evidences':...</td>\n",
" <td>[{'geneName': {'evidences': [{'evidenceCode': ...</td>\n",
" <td>[{'texts': [{'evidences': [{'evidenceCode': 'E...</td>\n",
" <td>[{'type': 'Initiator methionine', 'location': ...</td>\n",
" <td>[{'id': 'KW-0002', 'category': 'Technical term...</td>\n",
" <td>[{'referenceNumber': 1, 'citation': {'id': '75...</td>\n",
" <td>[{'database': 'EMBL', 'id': 'L29277', 'propert...</td>\n",
" <td>{'value': 'MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLA...</td>\n",
" <td>{'countByCommentType': {'FUNCTION': 1, 'SUBUNI...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3569</td>\n",
" <td>{'entryType': 'UniProtKB reviewed (Swiss-Prot)...</td>\n",
" <td>UniProtKB reviewed (Swiss-Prot)</td>\n",
" <td>P05231</td>\n",
" <td>[Q9UCU2, Q9UCU3, Q9UCU4]</td>\n",
" <td>IL6_HUMAN</td>\n",
" <td>{'firstPublicDate': '1987-08-13', 'lastAnnotat...</td>\n",
" <td>5.0</td>\n",
" <td>{'scientificName': 'Homo sapiens', 'commonName...</td>\n",
" <td>1: Evidence at protein level</td>\n",
" <td>{'recommendedName': {'fullName': {'evidences':...</td>\n",
" <td>[{'geneName': {'evidences': [{'evidenceCode': ...</td>\n",
" <td>[{'texts': [{'evidences': [{'evidenceCode': 'E...</td>\n",
" <td>[{'type': 'Signal', 'location': {'start': {'va...</td>\n",
" <td>[{'id': 'KW-0002', 'category': 'Technical term...</td>\n",
" <td>[{'referenceNumber': 1, 'citation': {'id': '34...</td>\n",
" <td>[{'database': 'EMBL', 'id': 'X04430', 'propert...</td>\n",
" <td>{'value': 'MNSFSTSAFGPVAFSLGLLLVLPAAFPAPVPPGED...</td>\n",
" <td>{'countByCommentType': {'FUNCTION': 3, 'SUBUNI...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" from to \\\n",
"0 6774 {'entryType': 'UniProtKB reviewed (Swiss-Prot)... \n",
"1 3569 {'entryType': 'UniProtKB reviewed (Swiss-Prot)... \n",
"\n",
" entryType primaryAccession \\\n",
"0 UniProtKB reviewed (Swiss-Prot) P40763 \n",
"1 UniProtKB reviewed (Swiss-Prot) P05231 \n",
"\n",
" secondaryAccessions uniProtkbId \\\n",
"0 [A8K7B8, K7ENL3, O14916, Q9BW54] STAT3_HUMAN \n",
"1 [Q9UCU2, Q9UCU3, Q9UCU4] IL6_HUMAN \n",
"\n",
" entryAudit annotationScore \\\n",
"0 {'firstPublicDate': '1995-02-01', 'lastAnnotat... 5.0 \n",
"1 {'firstPublicDate': '1987-08-13', 'lastAnnotat... 5.0 \n",
"\n",
" organism \\\n",
"0 {'scientificName': 'Homo sapiens', 'commonName... \n",
"1 {'scientificName': 'Homo sapiens', 'commonName... \n",
"\n",
" proteinExistence \\\n",
"0 1: Evidence at protein level \n",
"1 1: Evidence at protein level \n",
"\n",
" proteinDescription \\\n",
"0 {'recommendedName': {'fullName': {'evidences':... \n",
"1 {'recommendedName': {'fullName': {'evidences':... \n",
"\n",
" genes \\\n",
"0 [{'geneName': {'evidences': [{'evidenceCode': ... \n",
"1 [{'geneName': {'evidences': [{'evidenceCode': ... \n",
"\n",
" comments \\\n",
"0 [{'texts': [{'evidences': [{'evidenceCode': 'E... \n",
"1 [{'texts': [{'evidences': [{'evidenceCode': 'E... \n",
"\n",
" features \\\n",
"0 [{'type': 'Initiator methionine', 'location': ... \n",
"1 [{'type': 'Signal', 'location': {'start': {'va... \n",
"\n",
" keywords \\\n",
"0 [{'id': 'KW-0002', 'category': 'Technical term... \n",
"1 [{'id': 'KW-0002', 'category': 'Technical term... \n",
"\n",
" references \\\n",
"0 [{'referenceNumber': 1, 'citation': {'id': '75... \n",
"1 [{'referenceNumber': 1, 'citation': {'id': '34... \n",
"\n",
" uniProtKBCrossReferences \\\n",
"0 [{'database': 'EMBL', 'id': 'L29277', 'propert... \n",
"1 [{'database': 'EMBL', 'id': 'X04430', 'propert... \n",
"\n",
" sequence \\\n",
"0 {'value': 'MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLA... \n",
"1 {'value': 'MNSFSTSAFGPVAFSLGLLLVLPAAFPAPVPPGED... \n",
"\n",
" extraAttributes \n",
"0 {'countByCommentType': {'FUNCTION': 1, 'SUBUNI... \n",
"1 {'countByCommentType': {'FUNCTION': 3, 'SUBUNI... "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Expand every key of the first UniProtKB entry dict into its own column;\n",
"# entries lacking a key get 'N/A'.\n",
"# assign() builds a new frame instead of setting columns one by one on a\n",
"# filtered slice, which is what raised SettingWithCopyWarning for every key.\n",
"entry_keys = list(protein_reviewed_df['to'].iloc[0].keys())\n",
"protein_reviewed_df = protein_reviewed_df.assign(**{\n",
"    key: [entry.get(key, 'N/A') for entry in protein_reviewed_df['to']]\n",
"    for key in entry_keys\n",
"})\n",
"protein_reviewed_df.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}