388 lines (387 with data), 10.3 kB
{
"cells": [
{
"cell_type": "raw",
"id": "c5ef4175-16c5-475c-99b8-63356b5ce1e0",
"metadata": {},
"source": [
"Author: Aya Balbaa\n",
"\n",
"email: ab72@sanger.ac.uk"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b4350c18-08b7-42e7-bf11-e605f3fea13c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import scanpy as sc\n",
"import celltypist"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5c505999-d754-4925-be28-a6041ce7a5ae",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"adata=sc.read('/lustre/scratch126/cellgen/team298/ab72/CTCL/objects/all_samples_scvi_4kbydonor_6dim_100hidd_leiden_broad_ct.h5ad')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b9344ee4-f2d5-41f3-8b1d-45b49690c422",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[10000.],\n",
" [10000.],\n",
" [10000.],\n",
" ...,\n",
" [10000.],\n",
" [10000.],\n",
" [10000.]])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"adata.X.expm1().sum(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5097fbeb-e696-445e-86b1-859a3abded60",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"adata2=sc.read('/lustre/scratch126/cellgen/team298/ab72/CTCL/objects-new/all_samples_raw.h5ad')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0443bbd9-3fc1-43dc-b480-28f3c22154f9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"adata.layers['raw_counts']=adata2.X.copy()"
]
},
{
"cell_type": "raw",
"id": "3906e7a9-159a-4cd5-8237-b3d9bb0a4f3e",
"metadata": {},
"source": [
"add ENS gene ids"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a931614c-5fb0-4a18-91d5-055238a60838",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"ctcl=sc.read('/lustre/scratch126/cellgen/team298/ab72/CTCL/ctcl_cellbender_raw_dbrmv_QCfiltered_17_5.h5ad')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c160629f-3622-445a-bc2d-3462c764366f",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>gene_ids</th>\n",
" <th>feature_types</th>\n",
" <th>n_cells</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>MIR1302-2HG</th>\n",
" <td>ENSG00000243485</td>\n",
" <td>Gene Expression</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AL627309.1</th>\n",
" <td>ENSG00000238009</td>\n",
" <td>Gene Expression</td>\n",
" <td>208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AL627309.3</th>\n",
" <td>ENSG00000239945</td>\n",
" <td>Gene Expression</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AL627309.5</th>\n",
" <td>ENSG00000241860</td>\n",
" <td>Gene Expression</td>\n",
" <td>419</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AP006222.2</th>\n",
" <td>ENSG00000286448</td>\n",
" <td>Gene Expression</td>\n",
" <td>82</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AC141272.1</th>\n",
" <td>ENSG00000277836</td>\n",
" <td>Gene Expression</td>\n",
" <td>119</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AC023491.2</th>\n",
" <td>ENSG00000278633</td>\n",
" <td>Gene Expression</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AC007325.1</th>\n",
" <td>ENSG00000276017</td>\n",
" <td>Gene Expression</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AC007325.4</th>\n",
" <td>ENSG00000278817</td>\n",
" <td>Gene Expression</td>\n",
" <td>3240</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AC007325.2</th>\n",
" <td>ENSG00000277196</td>\n",
" <td>Gene Expression</td>\n",
" <td>706</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>29834 rows Γ 3 columns</p>\n",
"</div>"
],
"text/plain": [
" gene_ids feature_types n_cells\n",
"MIR1302-2HG ENSG00000243485 Gene Expression 3\n",
"AL627309.1 ENSG00000238009 Gene Expression 208\n",
"AL627309.3 ENSG00000239945 Gene Expression 7\n",
"AL627309.5 ENSG00000241860 Gene Expression 419\n",
"AP006222.2 ENSG00000286448 Gene Expression 82\n",
"... ... ... ...\n",
"AC141272.1 ENSG00000277836 Gene Expression 119\n",
"AC023491.2 ENSG00000278633 Gene Expression 3\n",
"AC007325.1 ENSG00000276017 Gene Expression 9\n",
"AC007325.4 ENSG00000278817 Gene Expression 3240\n",
"AC007325.2 ENSG00000277196 Gene Expression 706\n",
"\n",
"[29834 rows x 3 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ctcl.var"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "22ab4901-9357-4037-831a-0078882abfc7",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"adata.var['genes']=adata.var_names.copy()\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a0dcb09d-a0bf-45c2-8399-fa46cb534cc2",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"gene_ids_mapping = pd.Series(ctcl.var['gene_ids'].values, index=ctcl.var_names)\n",
"\n",
"# Step 3: Map these IDs to the 'genes' in adata.var\n",
"adata.var['gene_ids'] = adata.var['genes'].map(gene_ids_mapping)\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d16cd320-bbac-4b75-8590-15895e51e11f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"adata.var_names= adata.var['gene_ids']"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "4b38bdae-16a8-48bb-ba60-2abb9a6b4424",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"π¬ Input data has 885959 cells and 15777 genes\n",
"π Matching reference genes in the model\n",
"𧬠3334 features used for prediction\n",
"βοΈ Scaling input data\n",
"ποΈ Predicting labels\n",
"β
Prediction done!\n",
"π Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
"βοΈ Over-clustering input data with resolution set to 30\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"IOStream.flush timed out\n",
"π³οΈ Majority voting the predictions\n",
"β
Majority voting done!\n"
]
}
],
"source": [
"import celltypist\n",
"predictions = celltypist.annotate(adata, model = '/lustre/scratch126/cellgen/team298/ab72/celltypist_model_from_Pasha.pkl', majority_voting = True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "0347947c-5fb3-40a5-a242-7e898535b0ab",
"metadata": {},
"outputs": [],
"source": [
"adata = predictions.to_adata(prefix= \"ct\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "25f77b41-75bb-41df-955a-58139e3602a3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"adata.write_h5ad('/lustre/scratch126/cellgen/team298/ab72/CTCL/objects-new/all_samples_scvi_4kbydonor_6dim_100hidd_leiden_broad_ct_celltypist.h5ad')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3326b56-5ad1-4059-87dc-81696de5a83f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "βmultiomeβ",
"language": "python",
"name": "multiome"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}