[6e90e5]: / code_final / celltypist_allsamples.ipynb

Download this file

388 lines (387 with data), 10.3 kB

{
 "cells": [
  {
   "cell_type": "raw",
   "id": "c5ef4175-16c5-475c-99b8-63356b5ce1e0",
   "metadata": {},
   "source": [
    "Author: Aya Balbaa\n",
    "\n",
    "email: ab72@sanger.ac.uk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b4350c18-08b7-42e7-bf11-e605f3fea13c",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import celltypist\n",
    "import pandas as pd\n",
    "import scanpy as sc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5c505999-d754-4925-be28-a6041ce7a5ae",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Load the main AnnData object that will be annotated (an scVI/leiden object, going by its file name).\n",
    "adata = sc.read('/lustre/scratch126/cellgen/team298/ab72/CTCL/objects/all_samples_scvi_4kbydonor_6dim_100hidd_leiden_broad_ct.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b9344ee4-f2d5-41f3-8b1d-45b49690c422",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[10000.],\n",
       "        [10000.],\n",
       "        [10000.],\n",
       "        ...,\n",
       "        [10000.],\n",
       "        [10000.],\n",
       "        [10000.]])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: invert the log1p transform and sum each row (cell).\n",
    "# Every row summing to 10,000 means X holds log1p-transformed counts normalised to 10,000 per cell.\n",
    "adata.X.expm1().sum(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5097fbeb-e696-445e-86b1-859a3abded60",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Load the object whose X is stored below as the raw-count layer (raw counts, going by its file name).\n",
    "adata2 = sc.read('/lustre/scratch126/cellgen/team298/ab72/CTCL/objects-new/all_samples_raw.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0443bbd9-3fc1-43dc-b480-28f3c22154f9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Attach the raw counts as a layer. A layer is matched to adata purely by position, so make sure\n",
    "# cells and genes line up by name first; otherwise a same-shaped but differently ordered matrix\n",
    "# would be stored without any error.\n",
    "if adata2.obs_names.equals(adata.obs_names) and adata2.var_names.equals(adata.var_names):\n",
    "    adata.layers['raw_counts'] = adata2.X.copy()\n",
    "else:\n",
    "    # Reorder/subset adata2 to adata's cells and genes (raises if a name is missing from adata2).\n",
    "    adata.layers['raw_counts'] = adata2[adata.obs_names, adata.var_names].X.copy()"
   ]
  },
  {
   "cell_type": "raw",
   "id": "3906e7a9-159a-4cd5-8237-b3d9bb0a4f3e",
   "metadata": {},
   "source": [
    "Add Ensembl (ENSG) gene IDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a931614c-5fb0-4a18-91d5-055238a60838",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Load the CTCL object whose .var carries the `gene_ids` column (Ensembl IDs) used for the mapping below.\n",
    "ctcl = sc.read('/lustre/scratch126/cellgen/team298/ab72/CTCL/ctcl_cellbender_raw_dbrmv_QCfiltered_17_5.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "c160629f-3622-445a-bc2d-3462c764366f",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gene_ids</th>\n",
       "      <th>feature_types</th>\n",
       "      <th>n_cells</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>MIR1302-2HG</th>\n",
       "      <td>ENSG00000243485</td>\n",
       "      <td>Gene Expression</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AL627309.1</th>\n",
       "      <td>ENSG00000238009</td>\n",
       "      <td>Gene Expression</td>\n",
       "      <td>208</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AL627309.3</th>\n",
       "      <td>ENSG00000239945</td>\n",
       "      <td>Gene Expression</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AL627309.5</th>\n",
       "      <td>ENSG00000241860</td>\n",
       "      <td>Gene Expression</td>\n",
       "      <td>419</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AP006222.2</th>\n",
       "      <td>ENSG00000286448</td>\n",
       "      <td>Gene Expression</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AC141272.1</th>\n",
       "      <td>ENSG00000277836</td>\n",
       "      <td>Gene Expression</td>\n",
       "      <td>119</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AC023491.2</th>\n",
       "      <td>ENSG00000278633</td>\n",
       "      <td>Gene Expression</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AC007325.1</th>\n",
       "      <td>ENSG00000276017</td>\n",
       "      <td>Gene Expression</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AC007325.4</th>\n",
       "      <td>ENSG00000278817</td>\n",
       "      <td>Gene Expression</td>\n",
       "      <td>3240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AC007325.2</th>\n",
       "      <td>ENSG00000277196</td>\n",
       "      <td>Gene Expression</td>\n",
       "      <td>706</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>29834 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    gene_ids    feature_types  n_cells\n",
       "MIR1302-2HG  ENSG00000243485  Gene Expression        3\n",
       "AL627309.1   ENSG00000238009  Gene Expression      208\n",
       "AL627309.3   ENSG00000239945  Gene Expression        7\n",
       "AL627309.5   ENSG00000241860  Gene Expression      419\n",
       "AP006222.2   ENSG00000286448  Gene Expression       82\n",
       "...                      ...              ...      ...\n",
       "AC141272.1   ENSG00000277836  Gene Expression      119\n",
       "AC023491.2   ENSG00000278633  Gene Expression        3\n",
       "AC007325.1   ENSG00000276017  Gene Expression        9\n",
       "AC007325.4   ENSG00000278817  Gene Expression     3240\n",
       "AC007325.2   ENSG00000277196  Gene Expression      706\n",
       "\n",
       "[29834 rows x 3 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the gene annotation table: the index holds gene symbols, `gene_ids` holds Ensembl IDs.\n",
    "ctcl.var"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "22ab4901-9357-4037-831a-0078882abfc7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Keep the current var_names (presumably gene symbols) in a column so they remain available\n",
    "# after var_names are switched to Ensembl IDs.\n",
    "adata.var['genes'] = adata.var_names.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a0dcb09d-a0bf-45c2-8399-fa46cb534cc2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Build a gene-symbol -> Ensembl-ID lookup from ctcl.var (index = ctcl.var_names, values = gene_ids).\n",
    "gene_ids_mapping = pd.Series(ctcl.var['gene_ids'].values, index=ctcl.var_names)\n",
    "\n",
    "# Series.map needs a unique lookup index; fail early with a readable message otherwise.\n",
    "assert gene_ids_mapping.index.is_unique, 'ctcl.var_names must be unique to map symbols to Ensembl IDs'\n",
    "\n",
    "# Look up each entry of adata.var['genes']; names absent from ctcl come back as NaN.\n",
    "adata.var['gene_ids'] = adata.var['genes'].map(gene_ids_mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "d16cd320-bbac-4b75-8590-15895e51e11f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# A gene without a mapped Ensembl ID would end up with NaN as its var_name; stop here with a\n",
    "# clear message instead of carrying unnamed features into annotation and the saved file.\n",
    "unmapped = adata.var['gene_ids'].isna()\n",
    "assert not unmapped.any(), f'{int(unmapped.sum())} genes have no Ensembl ID in adata.var gene_ids'\n",
    "\n",
    "# Switch var_names to Ensembl IDs.\n",
    "adata.var_names = adata.var['gene_ids']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "4b38bdae-16a8-48bb-ba60-2abb9a6b4424",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🔬 Input data has 885959 cells and 15777 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 3334 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 30\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "IOStream.flush timed out\n",
      "πŸ—³οΈ Majority voting the predictions\n",
      "βœ… Majority voting done!\n"
     ]
    }
   ],
   "source": [
    "# Annotate cells with a custom CellTypist model. CellTypist reads adata.X, which should hold\n",
    "# log1p-transformed counts normalised to 10,000 per cell.\n",
    "# majority_voting=True additionally refines the per-cell labels within over-clusters; the log of\n",
    "# this cell shows the over-clustering being derived from the neighbourhood graph already in adata.\n",
    "# NOTE(review): the model is a pickle file - only load it from a trusted location.\n",
    "predictions = celltypist.annotate(\n",
    "    adata,\n",
    "    model='/lustre/scratch126/cellgen/team298/ab72/celltypist_model_from_Pasha.pkl',\n",
    "    majority_voting=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "0347947c-5fb3-40a5-a242-7e898535b0ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the prediction columns into adata.obs and rebind `adata` to the returned object.\n",
    "# NOTE(review): the prefix has no separator, so the new columns are presumably named like\n",
    "# 'ctpredicted_labels' / 'ctmajority_voting' - confirm this is intended before relying on the names.\n",
    "adata = predictions.to_adata(prefix=\"ct\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "25f77b41-75bb-41df-955a-58139e3602a3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Save the annotated object under a new file name; the input .h5ad loaded at the top is not overwritten.\n",
    "adata.write_h5ad('/lustre/scratch126/cellgen/team298/ab72/CTCL/objects-new/all_samples_scvi_4kbydonor_6dim_100hidd_leiden_broad_ct_celltypist.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3326b56-5ad1-4059-87dc-81696de5a83f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "“multiome”",
   "language": "python",
   "name": "multiome"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}