[6e90e5]: / code_final / ExpressionSet_perparation.ipynb

Download this file

1275 lines (1274 with data), 56.2 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "single-replacement",
   "metadata": {},
   "source": [
    "### Bulk ExpressionSet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "abroad-circle",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading required package: GenomicRanges\n",
      "\n",
      "Loading required package: stats4\n",
      "\n",
      "Loading required package: BiocGenerics\n",
      "\n",
      "Loading required package: parallel\n",
      "\n",
      "\n",
      "Attaching package: ‘BiocGenerics’\n",
      "\n",
      "\n",
      "The following objects are masked from ‘package:parallel’:\n",
      "\n",
      "    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n",
      "    clusterExport, clusterMap, parApply, parCapply, parLapply,\n",
      "    parLapplyLB, parRapply, parSapply, parSapplyLB\n",
      "\n",
      "\n",
      "The following objects are masked from ‘package:stats’:\n",
      "\n",
      "    IQR, mad, sd, var, xtabs\n",
      "\n",
      "\n",
      "The following objects are masked from ‘package:base’:\n",
      "\n",
      "    anyDuplicated, append, as.data.frame, basename, cbind, colnames,\n",
      "    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,\n",
      "    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,\n",
      "    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,\n",
      "    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,\n",
      "    union, unique, unsplit, which, which.max, which.min\n",
      "\n",
      "\n",
      "Loading required package: S4Vectors\n",
      "\n",
      "\n",
      "Attaching package: ‘S4Vectors’\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:base’:\n",
      "\n",
      "    expand.grid\n",
      "\n",
      "\n",
      "Loading required package: IRanges\n",
      "\n",
      "Loading required package: GenomeInfoDb\n",
      "\n",
      "Loading required package: Biobase\n",
      "\n",
      "Welcome to Bioconductor\n",
      "\n",
      "    Vignettes contain introductory material; view with\n",
      "    'browseVignettes()'. To cite Bioconductor, see\n",
      "    'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n",
      "\n",
      "\n",
      "Loading required package: DelayedArray\n",
      "\n",
      "Loading required package: matrixStats\n",
      "\n",
      "\n",
      "Attaching package: ‘matrixStats’\n",
      "\n",
      "\n",
      "The following objects are masked from ‘package:Biobase’:\n",
      "\n",
      "    anyMissing, rowMedians\n",
      "\n",
      "\n",
      "\n",
      "Attaching package: ‘DelayedArray’\n",
      "\n",
      "\n",
      "The following objects are masked from ‘package:matrixStats’:\n",
      "\n",
      "    colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n",
      "\n",
      "\n",
      "The following objects are masked from ‘package:base’:\n",
      "\n",
      "    aperm, apply, rowsum\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "library(SummarizedExperiment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "experimental-disability",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the bulk read-count matrix. check.names = FALSE keeps the column names exactly as they\n",
    "# appear in the file (e.g. \"AD_004_non-lesional\") instead of rewriting them into syntactic names.\n",
    "Matrix <- read.table(\n",
    "    file = \"/lustre/scratch117/cellgen/team205/rl20/CTCL/deconvolution/GSE121212_readcount_rmdup.txt\",\n",
    "    check.names = FALSE\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "graduate-mining",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"dataframe\">\n",
       "<caption>A data.frame: 6 × 147</caption>\n",
       "<thead>\n",
       "\t<tr><th></th><th scope=col>AD_004_lesional</th><th scope=col>AD_004_non-lesional</th><th scope=col>AD_005_lesional</th><th scope=col>AD_005_non-lesional</th><th scope=col>AD_006_lesional</th><th scope=col>AD_006_non-lesional</th><th scope=col>AD_007_lesional</th><th scope=col>AD_007_non-lesional</th><th scope=col>AD_009_lesional</th><th scope=col>AD_009_non-lesional</th><th scope=col>⋯</th><th scope=col>AD_033_chronic_lesion</th><th scope=col>AD_034_non-lesional</th><th scope=col>AD_034_chronic_lesion</th><th scope=col>AD_032_chronic_lesion</th><th scope=col>AD_035_non-lesional</th><th scope=col>AD_035_chronic_lesion</th><th scope=col>AD_036_non-lesional</th><th scope=col>AD_036_chronic_lesion</th><th scope=col>AD_037_non-lesional</th><th scope=col>AD_037_chronic_lesion</th></tr>\n",
       "\t<tr><th></th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>⋯</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;int&gt;</th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>5S_rRNA</th><td>  0</td><td>  3</td><td>   9</td><td>  12</td><td>  2</td><td>  3</td><td>   6</td><td>  4</td><td>   2</td><td>   5</td><td>⋯</td><td>   1</td><td>   2</td><td>   3</td><td>   3</td><td>  3</td><td>   4</td><td>   3</td><td>   5</td><td>   6</td><td>   2</td></tr>\n",
       "\t<tr><th scope=row>7SK</th><td>  9</td><td>  6</td><td>  25</td><td>  25</td><td>  7</td><td>  8</td><td>   9</td><td>  8</td><td>   6</td><td>  15</td><td>⋯</td><td>   3</td><td>   8</td><td>   2</td><td>   5</td><td> 12</td><td>   5</td><td>   7</td><td>   5</td><td>   8</td><td>   3</td></tr>\n",
       "\t<tr><th scope=row>A1BG</th><td>  2</td><td>  0</td><td>   4</td><td>   3</td><td>  1</td><td>  1</td><td>   0</td><td>  0</td><td>   4</td><td>   0</td><td>⋯</td><td>   1</td><td>   1</td><td>   2</td><td>   0</td><td>  1</td><td>   2</td><td>   0</td><td>   0</td><td>   1</td><td>   1</td></tr>\n",
       "\t<tr><th scope=row>A1BG-AS1</th><td>  5</td><td>  0</td><td>  11</td><td>   8</td><td>  7</td><td>  8</td><td>  13</td><td>  2</td><td>   6</td><td>  19</td><td>⋯</td><td>   4</td><td>   0</td><td>   7</td><td>   7</td><td>  4</td><td>   4</td><td>  10</td><td>  13</td><td>   8</td><td>   4</td></tr>\n",
       "\t<tr><th scope=row>A1CF</th><td>  2</td><td>  3</td><td>   2</td><td>   8</td><td>  1</td><td>  2</td><td>   0</td><td>  5</td><td>   1</td><td>   5</td><td>⋯</td><td>   3</td><td>   0</td><td>   1</td><td>   0</td><td>  0</td><td>   2</td><td>   0</td><td>   0</td><td>   1</td><td>   0</td></tr>\n",
       "\t<tr><th scope=row>A2M</th><td>308</td><td>353</td><td>2544</td><td>1366</td><td>978</td><td>406</td><td>2761</td><td>928</td><td>2796</td><td>1891</td><td>⋯</td><td>1463</td><td>1391</td><td>1714</td><td>1998</td><td>683</td><td>1154</td><td>2361</td><td>2374</td><td>1040</td><td>2755</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "A data.frame: 6 × 147\n",
       "\\begin{tabular}{r|lllllllllllllllllllll}\n",
       "  & AD\\_004\\_lesional & AD\\_004\\_non-lesional & AD\\_005\\_lesional & AD\\_005\\_non-lesional & AD\\_006\\_lesional & AD\\_006\\_non-lesional & AD\\_007\\_lesional & AD\\_007\\_non-lesional & AD\\_009\\_lesional & AD\\_009\\_non-lesional & ⋯ & AD\\_033\\_chronic\\_lesion & AD\\_034\\_non-lesional & AD\\_034\\_chronic\\_lesion & AD\\_032\\_chronic\\_lesion & AD\\_035\\_non-lesional & AD\\_035\\_chronic\\_lesion & AD\\_036\\_non-lesional & AD\\_036\\_chronic\\_lesion & AD\\_037\\_non-lesional & AD\\_037\\_chronic\\_lesion\\\\\n",
       "  & <int> & <int> & <int> & <int> & <int> & <int> & <int> & <int> & <int> & <int> & ⋯ & <int> & <int> & <int> & <int> & <int> & <int> & <int> & <int> & <int> & <int>\\\\\n",
       "\\hline\n",
       "\t5S\\_rRNA &   0 &   3 &    9 &   12 &   2 &   3 &    6 &   4 &    2 &    5 & ⋯ &    1 &    2 &    3 &    3 &   3 &    4 &    3 &    5 &    6 &    2\\\\\n",
       "\t7SK &   9 &   6 &   25 &   25 &   7 &   8 &    9 &   8 &    6 &   15 & ⋯ &    3 &    8 &    2 &    5 &  12 &    5 &    7 &    5 &    8 &    3\\\\\n",
       "\tA1BG &   2 &   0 &    4 &    3 &   1 &   1 &    0 &   0 &    4 &    0 & ⋯ &    1 &    1 &    2 &    0 &   1 &    2 &    0 &    0 &    1 &    1\\\\\n",
       "\tA1BG-AS1 &   5 &   0 &   11 &    8 &   7 &   8 &   13 &   2 &    6 &   19 & ⋯ &    4 &    0 &    7 &    7 &   4 &    4 &   10 &   13 &    8 &    4\\\\\n",
       "\tA1CF &   2 &   3 &    2 &    8 &   1 &   2 &    0 &   5 &    1 &    5 & ⋯ &    3 &    0 &    1 &    0 &   0 &    2 &    0 &    0 &    1 &    0\\\\\n",
       "\tA2M & 308 & 353 & 2544 & 1366 & 978 & 406 & 2761 & 928 & 2796 & 1891 & ⋯ & 1463 & 1391 & 1714 & 1998 & 683 & 1154 & 2361 & 2374 & 1040 & 2755\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "A data.frame: 6 × 147\n",
       "\n",
       "| <!--/--> | AD_004_lesional &lt;int&gt; | AD_004_non-lesional &lt;int&gt; | AD_005_lesional &lt;int&gt; | AD_005_non-lesional &lt;int&gt; | AD_006_lesional &lt;int&gt; | AD_006_non-lesional &lt;int&gt; | AD_007_lesional &lt;int&gt; | AD_007_non-lesional &lt;int&gt; | AD_009_lesional &lt;int&gt; | AD_009_non-lesional &lt;int&gt; | ⋯ ⋯ | AD_033_chronic_lesion &lt;int&gt; | AD_034_non-lesional &lt;int&gt; | AD_034_chronic_lesion &lt;int&gt; | AD_032_chronic_lesion &lt;int&gt; | AD_035_non-lesional &lt;int&gt; | AD_035_chronic_lesion &lt;int&gt; | AD_036_non-lesional &lt;int&gt; | AD_036_chronic_lesion &lt;int&gt; | AD_037_non-lesional &lt;int&gt; | AD_037_chronic_lesion &lt;int&gt; |\n",
       "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
       "| 5S_rRNA |   0 |   3 |    9 |   12 |   2 |   3 |    6 |   4 |    2 |    5 | ⋯ |    1 |    2 |    3 |    3 |   3 |    4 |    3 |    5 |    6 |    2 |\n",
       "| 7SK |   9 |   6 |   25 |   25 |   7 |   8 |    9 |   8 |    6 |   15 | ⋯ |    3 |    8 |    2 |    5 |  12 |    5 |    7 |    5 |    8 |    3 |\n",
       "| A1BG |   2 |   0 |    4 |    3 |   1 |   1 |    0 |   0 |    4 |    0 | ⋯ |    1 |    1 |    2 |    0 |   1 |    2 |    0 |    0 |    1 |    1 |\n",
       "| A1BG-AS1 |   5 |   0 |   11 |    8 |   7 |   8 |   13 |   2 |    6 |   19 | ⋯ |    4 |    0 |    7 |    7 |   4 |    4 |   10 |   13 |    8 |    4 |\n",
       "| A1CF |   2 |   3 |    2 |    8 |   1 |   2 |    0 |   5 |    1 |    5 | ⋯ |    3 |    0 |    1 |    0 |   0 |    2 |    0 |    0 |    1 |    0 |\n",
       "| A2M | 308 | 353 | 2544 | 1366 | 978 | 406 | 2761 | 928 | 2796 | 1891 | ⋯ | 1463 | 1391 | 1714 | 1998 | 683 | 1154 | 2361 | 2374 | 1040 | 2755 |\n",
       "\n"
      ],
      "text/plain": [
       "         AD_004_lesional AD_004_non-lesional AD_005_lesional\n",
       "5S_rRNA    0               3                    9           \n",
       "7SK        9               6                   25           \n",
       "A1BG       2               0                    4           \n",
       "A1BG-AS1   5               0                   11           \n",
       "A1CF       2               3                    2           \n",
       "A2M      308             353                 2544           \n",
       "         AD_005_non-lesional AD_006_lesional AD_006_non-lesional\n",
       "5S_rRNA    12                  2               3                \n",
       "7SK        25                  7               8                \n",
       "A1BG        3                  1               1                \n",
       "A1BG-AS1    8                  7               8                \n",
       "A1CF        8                  1               2                \n",
       "A2M      1366                978             406                \n",
       "         AD_007_lesional AD_007_non-lesional AD_009_lesional\n",
       "5S_rRNA     6              4                    2           \n",
       "7SK         9              8                    6           \n",
       "A1BG        0              0                    4           \n",
       "A1BG-AS1   13              2                    6           \n",
       "A1CF        0              5                    1           \n",
       "A2M      2761            928                 2796           \n",
       "         AD_009_non-lesional ⋯ AD_033_chronic_lesion AD_034_non-lesional\n",
       "5S_rRNA     5                ⋯    1                     2               \n",
       "7SK        15                ⋯    3                     8               \n",
       "A1BG        0                ⋯    1                     1               \n",
       "A1BG-AS1   19                ⋯    4                     0               \n",
       "A1CF        5                ⋯    3                     0               \n",
       "A2M      1891                ⋯ 1463                  1391               \n",
       "         AD_034_chronic_lesion AD_032_chronic_lesion AD_035_non-lesional\n",
       "5S_rRNA     3                     3                    3                \n",
       "7SK         2                     5                   12                \n",
       "A1BG        2                     0                    1                \n",
       "A1BG-AS1    7                     7                    4                \n",
       "A1CF        1                     0                    0                \n",
       "A2M      1714                  1998                  683                \n",
       "         AD_035_chronic_lesion AD_036_non-lesional AD_036_chronic_lesion\n",
       "5S_rRNA     4                     3                   5                 \n",
       "7SK         5                     7                   5                 \n",
       "A1BG        2                     0                   0                 \n",
       "A1BG-AS1    4                    10                  13                 \n",
       "A1CF        2                     0                   0                 \n",
       "A2M      1154                  2361                2374                 \n",
       "         AD_037_non-lesional AD_037_chronic_lesion\n",
       "5S_rRNA     6                   2                 \n",
       "7SK         8                   3                 \n",
       "A1BG        1                   1                 \n",
       "A1BG-AS1    8                   4                 \n",
       "A1CF        1                   0                 \n",
       "A2M      1040                2755                 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".list-inline {list-style: none; margin:0; padding: 0}\n",
       ".list-inline>li {display: inline-block}\n",
       ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
       "</style>\n",
       "<ol class=list-inline><li>31362</li><li>147</li></ol>\n"
      ],
      "text/latex": [
       "\\begin{enumerate*}\n",
       "\\item 31362\n",
       "\\item 147\n",
       "\\end{enumerate*}\n"
      ],
      "text/markdown": [
       "1. 31362\n",
       "2. 147\n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "[1] 31362   147"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "head(Matrix);dim(Matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "jewish-sessions",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".list-inline {list-style: none; margin:0; padding: 0}\n",
       ".list-inline>li {display: inline-block}\n",
       ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
       "</style>\n",
       "<ol class=list-inline><li>31362</li><li>147</li></ol>\n"
      ],
      "text/latex": [
       "\\begin{enumerate*}\n",
       "\\item 31362\n",
       "\\item 147\n",
       "\\end{enumerate*}\n"
      ],
      "text/markdown": [
       "1. 31362\n",
       "2. 147\n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "[1] 31362   147"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Keep only the first occurrence of every gene name, then report the resulting dimensions.\n",
    "is_genename_uniq <- !duplicated(row.names(Matrix))\n",
    "Matrix <- Matrix[which(is_genename_uniq), , drop = FALSE]\n",
    "dim(Matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "professional-pattern",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [],
      "text/latex": [],
      "text/markdown": [],
      "text/plain": [
       "named integer(0)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Total counts per sample (column). colSums() accumulates in double precision, so large totals\n",
    "# cannot overflow to NA the way sum() over integer columns can.\n",
    "tt <- colSums(Matrix)\n",
    "# Indices of samples whose total count is zero (empty in the recorded run).\n",
    "del <- which(tt == 0)\n",
    "del"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "excess-salvation",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"dataframe\">\n",
       "<caption>A data.frame: 6 × 1</caption>\n",
       "<thead>\n",
       "\t<tr><th></th><th scope=col>gene_name</th></tr>\n",
       "\t<tr><th></th><th scope=col>&lt;chr&gt;</th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>5S_rRNA</th><td>5S_rRNA </td></tr>\n",
       "\t<tr><th scope=row>7SK</th><td>7SK     </td></tr>\n",
       "\t<tr><th scope=row>A1BG</th><td>A1BG    </td></tr>\n",
       "\t<tr><th scope=row>A1BG-AS1</th><td>A1BG-AS1</td></tr>\n",
       "\t<tr><th scope=row>A1CF</th><td>A1CF    </td></tr>\n",
       "\t<tr><th scope=row>A2M</th><td>A2M     </td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "A data.frame: 6 × 1\n",
       "\\begin{tabular}{r|l}\n",
       "  & gene\\_name\\\\\n",
       "  & <chr>\\\\\n",
       "\\hline\n",
       "\t5S\\_rRNA & 5S\\_rRNA \\\\\n",
       "\t7SK & 7SK     \\\\\n",
       "\tA1BG & A1BG    \\\\\n",
       "\tA1BG-AS1 & A1BG-AS1\\\\\n",
       "\tA1CF & A1CF    \\\\\n",
       "\tA2M & A2M     \\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "A data.frame: 6 × 1\n",
       "\n",
       "| <!--/--> | gene_name &lt;chr&gt; |\n",
       "|---|---|\n",
       "| 5S_rRNA | 5S_rRNA  |\n",
       "| 7SK | 7SK      |\n",
       "| A1BG | A1BG     |\n",
       "| A1BG-AS1 | A1BG-AS1 |\n",
       "| A1CF | A1CF     |\n",
       "| A2M | A2M      |\n",
       "\n"
      ],
      "text/plain": [
       "         gene_name\n",
       "5S_rRNA  5S_rRNA  \n",
       "7SK      7SK      \n",
       "A1BG     A1BG     \n",
       "A1BG-AS1 A1BG-AS1 \n",
       "A1CF     A1CF     \n",
       "A2M      A2M      "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Feature (gene) annotation: one row per gene in Matrix, keyed by gene name.\n",
    "fdat_df <- data.frame(\n",
    "    gene_name = rownames(Matrix),\n",
    "    row.names = rownames(Matrix),\n",
    "    stringsAsFactors = FALSE\n",
    ")\n",
    "# Description of each fdat_df column, keyed by column name.\n",
    "metadata_fdat_df <- data.frame(\n",
    "    labelDescription = \"gene name\",\n",
    "    row.names = \"gene_name\"\n",
    ")\n",
    "head(fdat_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "collaborative-inspiration",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"dataframe\">\n",
       "<caption>A data.frame: 6 × 2</caption>\n",
       "<thead>\n",
       "\t<tr><th></th><th scope=col>sample_type</th><th scope=col>condition</th></tr>\n",
       "\t<tr><th></th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>AD_004_lesional</th><td>AD_L </td><td>AD</td></tr>\n",
       "\t<tr><th scope=row>AD_004_non-lesional</th><td>AD_NL</td><td>AD</td></tr>\n",
       "\t<tr><th scope=row>AD_005_lesional</th><td>AD_L </td><td>AD</td></tr>\n",
       "\t<tr><th scope=row>AD_005_non-lesional</th><td>AD_NL</td><td>AD</td></tr>\n",
       "\t<tr><th scope=row>AD_006_lesional</th><td>AD_L </td><td>AD</td></tr>\n",
       "\t<tr><th scope=row>AD_006_non-lesional</th><td>AD_NL</td><td>AD</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "A data.frame: 6 × 2\n",
       "\\begin{tabular}{r|ll}\n",
       "  & sample\\_type & condition\\\\\n",
       "  & <chr> & <chr>\\\\\n",
       "\\hline\n",
       "\tAD\\_004\\_lesional & AD\\_L  & AD\\\\\n",
       "\tAD\\_004\\_non-lesional & AD\\_NL & AD\\\\\n",
       "\tAD\\_005\\_lesional & AD\\_L  & AD\\\\\n",
       "\tAD\\_005\\_non-lesional & AD\\_NL & AD\\\\\n",
       "\tAD\\_006\\_lesional & AD\\_L  & AD\\\\\n",
       "\tAD\\_006\\_non-lesional & AD\\_NL & AD\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "A data.frame: 6 × 2\n",
       "\n",
       "| <!--/--> | sample_type &lt;chr&gt; | condition &lt;chr&gt; |\n",
       "|---|---|---|\n",
       "| AD_004_lesional | AD_L  | AD |\n",
       "| AD_004_non-lesional | AD_NL | AD |\n",
       "| AD_005_lesional | AD_L  | AD |\n",
       "| AD_005_non-lesional | AD_NL | AD |\n",
       "| AD_006_lesional | AD_L  | AD |\n",
       "| AD_006_non-lesional | AD_NL | AD |\n",
       "\n"
      ],
      "text/plain": [
       "                    sample_type condition\n",
       "AD_004_lesional     AD_L        AD       \n",
       "AD_004_non-lesional AD_NL       AD       \n",
       "AD_005_lesional     AD_L        AD       \n",
       "AD_005_non-lesional AD_NL       AD       \n",
       "AD_006_lesional     AD_L        AD       \n",
       "AD_006_non-lesional AD_NL       AD       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Per-sample annotation table; it is read as tab-separated text with a header row.\n",
    "metadata <- read.table(\n",
    "    file = \"/lustre/scratch117/cellgen/team205/rl20/CTCL/deconvolution/GSE121212_meta.xls\",\n",
    "    header = TRUE,\n",
    "    sep = \"\\t\",\n",
    "    check.names = FALSE\n",
    ")\n",
    "head(metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "declared-cookbook",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"dataframe\">\n",
       "<caption>A data.frame: 6 × 2</caption>\n",
       "<thead>\n",
       "\t<tr><th></th><th scope=col>sample_type</th><th scope=col>condition</th></tr>\n",
       "\t<tr><th></th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;chr&gt;</th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>AD_004_lesional</th><td>AD_L </td><td>AD</td></tr>\n",
       "\t<tr><th scope=row>AD_004_non-lesional</th><td>AD_NL</td><td>AD</td></tr>\n",
       "\t<tr><th scope=row>AD_005_lesional</th><td>AD_L </td><td>AD</td></tr>\n",
       "\t<tr><th scope=row>AD_005_non-lesional</th><td>AD_NL</td><td>AD</td></tr>\n",
       "\t<tr><th scope=row>AD_006_lesional</th><td>AD_L </td><td>AD</td></tr>\n",
       "\t<tr><th scope=row>AD_006_non-lesional</th><td>AD_NL</td><td>AD</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "A data.frame: 6 × 2\n",
       "\\begin{tabular}{r|ll}\n",
       "  & sample\\_type & condition\\\\\n",
       "  & <chr> & <chr>\\\\\n",
       "\\hline\n",
       "\tAD\\_004\\_lesional & AD\\_L  & AD\\\\\n",
       "\tAD\\_004\\_non-lesional & AD\\_NL & AD\\\\\n",
       "\tAD\\_005\\_lesional & AD\\_L  & AD\\\\\n",
       "\tAD\\_005\\_non-lesional & AD\\_NL & AD\\\\\n",
       "\tAD\\_006\\_lesional & AD\\_L  & AD\\\\\n",
       "\tAD\\_006\\_non-lesional & AD\\_NL & AD\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "A data.frame: 6 × 2\n",
       "\n",
       "| <!--/--> | sample_type &lt;chr&gt; | condition &lt;chr&gt; |\n",
       "|---|---|---|\n",
       "| AD_004_lesional | AD_L  | AD |\n",
       "| AD_004_non-lesional | AD_NL | AD |\n",
       "| AD_005_lesional | AD_L  | AD |\n",
       "| AD_005_non-lesional | AD_NL | AD |\n",
       "| AD_006_lesional | AD_L  | AD |\n",
       "| AD_006_non-lesional | AD_NL | AD |\n",
       "\n"
      ],
      "text/plain": [
       "                    sample_type condition\n",
       "AD_004_lesional     AD_L        AD       \n",
       "AD_004_non-lesional AD_NL       AD       \n",
       "AD_005_lesional     AD_L        AD       \n",
       "AD_005_non-lesional AD_NL       AD       \n",
       "AD_006_lesional     AD_L        AD       \n",
       "AD_006_non-lesional AD_NL       AD       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Sample (phenotype) annotation: sample_type and condition taken from metadata,\n",
    "# keyed by the same row names as metadata.\n",
    "pdat_df <- data.frame(\n",
    "    sample_type = metadata$sample_type,\n",
    "    condition = metadata$condition,\n",
    "    row.names = rownames(metadata),\n",
    "    stringsAsFactors = FALSE\n",
    ")\n",
    "head(pdat_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "following-sleeve",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Description of each pdat_df column, keyed by column name.\n",
    "metadata_pdat_df <- data.frame(\n",
    "    labelDescription = c(\"lesion_nonlesion\", \"Condition\"),\n",
    "    row.names = c(\"sample_type\", \"condition\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "racial-modem",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"dataframe\">\n",
       "<caption>A data.frame: 2 × 1</caption>\n",
       "<thead>\n",
       "\t<tr><th></th><th scope=col>labelDescription</th></tr>\n",
       "\t<tr><th></th><th scope=col>&lt;chr&gt;</th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>sample_type</th><td>lesion_nonlesion</td></tr>\n",
       "\t<tr><th scope=row>condition</th><td>Condition       </td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "A data.frame: 2 × 1\n",
       "\\begin{tabular}{r|l}\n",
       "  & labelDescription\\\\\n",
       "  & <chr>\\\\\n",
       "\\hline\n",
       "\tsample\\_type & lesion\\_nonlesion\\\\\n",
       "\tcondition & Condition       \\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "A data.frame: 2 × 1\n",
       "\n",
       "| <!--/--> | labelDescription &lt;chr&gt; |\n",
       "|---|---|\n",
       "| sample_type | lesion_nonlesion |\n",
       "| condition | Condition        |\n",
       "\n"
      ],
      "text/plain": [
       "            labelDescription\n",
       "sample_type lesion_nonlesion\n",
       "condition   Condition       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "metadata_pdat_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "pursuant-switch",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the count columns by sample name, in the order of pdat_df's rows, so that\n",
    "# column i of Matrix corresponds to row i of pdat_df.\n",
    "Matrix <- Matrix[ ,rownames(pdat_df)] ### the order of rownames(pdat_df) and colnames(Matrix) might be different"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "comfortable-prague",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "TRUE"
      ],
      "text/latex": [
       "TRUE"
      ],
      "text/markdown": [
       "TRUE"
      ],
      "text/plain": [
       "[1] TRUE"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Verify that count columns and sample annotation rows are in the same order.\n",
    "# identical() gives a strict TRUE/FALSE, whereas all.equal() returns a character description\n",
    "# (not FALSE) on mismatch; stopifnot() halts the run if the samples are misaligned.\n",
    "sample_order_ok <- identical(colnames(Matrix), rownames(pdat_df))\n",
    "stopifnot(sample_order_ok)\n",
    "sample_order_ok"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "ruled-spencer",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap the sample and gene annotation tables together with their column descriptions.\n",
    "pheno_adf <- new(\"AnnotatedDataFrame\", data = pdat_df, varMetadata = metadata_pdat_df)\n",
    "feature_adf <- new(\"AnnotatedDataFrame\", data = fdat_df, varMetadata = metadata_fdat_df)\n",
    "\n",
    "# Assemble the bulk ExpressionSet; data.matrix() converts the count data frame to a numeric matrix.\n",
    "bulk_eset <- ExpressionSet(\n",
    "    assayData = data.matrix(Matrix),\n",
    "    phenoData = pheno_adf,\n",
    "    featureData = feature_adf\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "renewable-mentor",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ExpressionSet (storageMode: lockedEnvironment)\n",
       "assayData: 31362 features, 147 samples \n",
       "  element names: exprs \n",
       "protocolData: none\n",
       "phenoData\n",
       "  sampleNames: AD_004_lesional AD_004_non-lesional ...\n",
       "    AD_037_chronic_lesion (147 total)\n",
       "  varLabels: sample_type condition\n",
       "  varMetadata: labelDescription\n",
       "featureData\n",
       "  featureNames: 5S_rRNA 7SK ... snoZ5 (31362 total)\n",
       "  fvarLabels: gene_name\n",
       "  fvarMetadata: labelDescription\n",
       "experimentData: use 'experimentData(object)'\n",
       "Annotation:  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "bulk_eset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "personalized-power",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the bulk ExpressionSet to disk as an RDS file.\n",
    "saveRDS(bulk_eset, file=\"/lustre/scratch117/cellgen/team205/rl20/CTCL/deconvolution/GSE121212_readcount_ExpressionSet.rds\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "floral-undergraduate",
   "metadata": {},
   "source": [
    "### Single-cell ExpressionSet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "passive-sponsorship",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading required package: reticulate\n",
      "\n",
      "Attaching SeuratObject\n",
      "\n",
      "\n",
      "Attaching package: ‘Seurat’\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:SummarizedExperiment’:\n",
      "\n",
      "    Assays\n",
      "\n",
      "\n",
      "\n",
      "Attaching package: ‘Matrix’\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:S4Vectors’:\n",
      "\n",
      "    expand\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "library(sceasy)\n",
    "library(reticulate)\n",
    "library(anndata)\n",
    "library(Seurat)\n",
    "library(BisqueRNA)\n",
    "library(Biobase)\n",
    "library(Matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "determined-permit",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning message in asMethod(object):\n",
      "“sparse->dense coercion: allocating vector of size 7.3 GiB”\n"
     ]
    }
   ],
   "source": [
    "# Single-cell AnnData file to convert into a Seurat object.\n",
    "h5ad_file <- \"/lustre/scratch126/cellgen/team205/rl20/CTCL/object_revision/All_samples_final_20240707_sub0.08_for_deconv.h5ad\"\n",
    "sdata <- read_h5ad(h5ad_file)\n",
    "\n",
    "# sdata$X is a sparse cells x genes matrix. Transpose it while it is still sparse rather than going\n",
    "# through as.matrix(), which allocated a 7.3 GiB dense copy in the recorded run.\n",
    "seurat_object <- CreateSeuratObject(\n",
    "    counts = as(Matrix::t(sdata$X), \"CsparseMatrix\"),\n",
    "    meta.data = sdata$obs\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "criminal-manor",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "An object of class Seurat \n",
       "15777 features across 62199 samples within 1 assay \n",
       "Active assay: RNA (15777 features, 0 variable features)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "seurat_object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "beginning-tongue",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  [[ suppressing 20 column names ‘AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1’, ‘AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1’, ‘AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1’ ... ]]\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "20 x 20 sparse Matrix of class \"dgCMatrix\"\n",
       "                                                \n",
       "SAMD11   . . . . . . . . . . . . . . . . . . . .\n",
       "NOC2L    1 1 . . 2 . . 1 . . . 3 1 . . . . . . .\n",
       "KLHL17   . . . . . . . . . . . . . . . . . . . .\n",
       "PLEKHN1  2 . . . . . . . 1 . . 1 . . . . . . . .\n",
       "HES4     . . 1 . . . . . . . 2 . . . . . 1 . . .\n",
       "ISG15    . . . . . . . 1 . . . . . . . . . . . .\n",
       "AGRN     . . . . . . . . . 1 . . . . 1 . . . . .\n",
       "RNF223   . . . . . . . . . . . . . . . . . . . .\n",
       "C1orf159 . . . . . . . . . . 1 . . . . . . . . .\n",
       "TTLL10   . . . . . . . . . . . . . . . . . . . .\n",
       "TNFRSF18 3 1 1 . 1 . . 5 . . . 1 3 . . 1 . . . .\n",
       "TNFRSF4  . . . . . . . 3 . 1 . . . . . . . . . .\n",
       "SDF4     6 1 5 1 1 3 . 2 . . . 3 . 2 2 1 1 . . .\n",
       "B3GALT6  . . . . . . . . . . . . . . . . . . . .\n",
       "C1QTNF12 . . 1 . . . . . . . . . . . 1 . . . 1 .\n",
       "UBE2J2   . 1 . . 3 . . 1 . . 4 1 . 1 . . 2 . . .\n",
       "SCNN1D   . . 1 . . . . . . . . . . . . . . . . .\n",
       "ACAP3    . . . . . 1 . . . . . . . . . . . . . .\n",
       "PUSL1    . . . . . . . . 1 . . . . . . . . . . .\n",
       "INTS11   3 . . . . 1 . . . . 2 . . 2 1 1 . . . ."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Fetch the raw counts (genes x cells) of the RNA assay through the Seurat accessor,\n",
    "# then preview the top-left 20 x 20 corner.\n",
    "gene_exprs.matrix <- GetAssayData(object = seurat_object, assay = \"RNA\", slot = \"counts\")\n",
    "gene_exprs.matrix[seq_len(20), seq_len(20)]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "marine-philosophy",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"dataframe\">\n",
       "<caption>A data.frame: 6 × 22</caption>\n",
       "<thead>\n",
       "\t<tr><th></th><th scope=col>orig.ident</th><th scope=col>nCount_RNA</th><th scope=col>nFeature_RNA</th><th scope=col>batch</th><th scope=col>n_counts</th><th scope=col>n_genes</th><th scope=col>percent_mito</th><th scope=col>donor</th><th scope=col>tech</th><th scope=col>sample_type</th><th scope=col>⋯</th><th scope=col>broad_ct</th><th scope=col>ctpredicted_labels</th><th scope=col>ctover_clustering</th><th scope=col>ctmajority_voting</th><th scope=col>ctconf_score</th><th scope=col>cell_type</th><th scope=col>site</th><th scope=col>tissue</th><th scope=col>Sex</th><th scope=col>type</th></tr>\n",
       "\t<tr><th></th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;int&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>⋯</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1</th><td>SeuratProject</td><td>14602</td><td>3142</td><td>0</td><td>27577</td><td>3474</td><td>0.0011241252</td><td>CTCL1</td><td>10x</td><td>CTCL</td><td>⋯</td><td>Keratinocytes</td><td>Differentiated_KC*</td><td>286</td><td>Differentiated_KC </td><td>0.96744886</td><td>Differentiated_KC </td><td>lesion</td><td>Epidermis</td><td>Female</td><td>NA</td></tr>\n",
       "\t<tr><th scope=row>AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1</th><td>SeuratProject</td><td>13463</td><td>3311</td><td>0</td><td>21021</td><td>3642</td><td>0.0004281433</td><td>CTCL1</td><td>10x</td><td>CTCL</td><td>⋯</td><td>Keratinocytes</td><td>Differentiated_KC*</td><td>492</td><td>Differentiated_KC*</td><td>0.13851590</td><td>Differentiated_KC*</td><td>lesion</td><td>Epidermis</td><td>Female</td><td>NA</td></tr>\n",
       "\t<tr><th scope=row>AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1</th><td>SeuratProject</td><td>14551</td><td>3472</td><td>0</td><td>27837</td><td>3824</td><td>0.0024787153</td><td>CTCL1</td><td>10x</td><td>CTCL</td><td>⋯</td><td>Keratinocytes</td><td>Differentiated_KC*</td><td>535</td><td>Differentiated_KC </td><td>0.01321663</td><td>Differentiated_KC </td><td>lesion</td><td>Epidermis</td><td>Female</td><td>NA</td></tr>\n",
       "\t<tr><th scope=row>AAAGCAAAGCGTAATA-0_CTCL1_CTCL1_CTCL1</th><td>SeuratProject</td><td>11215</td><td>2632</td><td>0</td><td>17545</td><td>2943</td><td>0.0056426334</td><td>CTCL1</td><td>10x</td><td>CTCL</td><td>⋯</td><td>Keratinocytes</td><td>Differentiated_KC*</td><td>298</td><td>Differentiated_KC*</td><td>0.90835925</td><td>Differentiated_KC*</td><td>lesion</td><td>Epidermis</td><td>Female</td><td>NA</td></tr>\n",
       "\t<tr><th scope=row>AAAGCAAGTCCTGCTT-0_CTCL1_CTCL1_CTCL1</th><td>SeuratProject</td><td> 6150</td><td>2384</td><td>0</td><td> 9136</td><td>2665</td><td>0.0010945710</td><td>CTCL1</td><td>10x</td><td>CTCL</td><td>⋯</td><td>Keratinocytes</td><td>VE2               </td><td>368</td><td>Differentiated_KC*</td><td>0.04688790</td><td>Differentiated_KC*</td><td>lesion</td><td>Epidermis</td><td>Female</td><td>NA</td></tr>\n",
       "\t<tr><th scope=row>AAAGCAATCCCATTTA-0_CTCL1_CTCL1_CTCL1</th><td>SeuratProject</td><td> 5296</td><td>1860</td><td>0</td><td>11803</td><td>2101</td><td>0.0155045325</td><td>CTCL1</td><td>10x</td><td>CTCL</td><td>⋯</td><td>Keratinocytes</td><td>Differentiated_KC </td><td>438</td><td>Differentiated_KC </td><td>0.67155350</td><td>Differentiated_KC </td><td>lesion</td><td>Epidermis</td><td>Female</td><td>NA</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "A data.frame: 6 × 22\n",
       "\\begin{tabular}{r|lllllllllllllllllllll}\n",
       "  & orig.ident & nCount\\_RNA & nFeature\\_RNA & batch & n\\_counts & n\\_genes & percent\\_mito & donor & tech & sample\\_type & ⋯ & broad\\_ct & ctpredicted\\_labels & ctover\\_clustering & ctmajority\\_voting & ctconf\\_score & cell\\_type & site & tissue & Sex & type\\\\\n",
       "  & <fct> & <dbl> & <int> & <fct> & <dbl> & <dbl> & <dbl> & <fct> & <fct> & <fct> & ⋯ & <fct> & <fct> & <fct> & <fct> & <dbl> & <fct> & <fct> & <fct> & <fct> & <fct>\\\\\n",
       "\\hline\n",
       "\tAAACCTGCATCACAAC-0\\_CTCL1\\_CTCL1\\_CTCL1 & SeuratProject & 14602 & 3142 & 0 & 27577 & 3474 & 0.0011241252 & CTCL1 & 10x & CTCL & ⋯ & Keratinocytes & Differentiated\\_KC* & 286 & Differentiated\\_KC  & 0.96744886 & Differentiated\\_KC  & lesion & Epidermis & Female & NA\\\\\n",
       "\tAAACCTGCATGTAAGA-0\\_CTCL1\\_CTCL1\\_CTCL1 & SeuratProject & 13463 & 3311 & 0 & 21021 & 3642 & 0.0004281433 & CTCL1 & 10x & CTCL & ⋯ & Keratinocytes & Differentiated\\_KC* & 492 & Differentiated\\_KC* & 0.13851590 & Differentiated\\_KC* & lesion & Epidermis & Female & NA\\\\\n",
       "\tAAACGGGGTCGACTGC-0\\_CTCL1\\_CTCL1\\_CTCL1 & SeuratProject & 14551 & 3472 & 0 & 27837 & 3824 & 0.0024787153 & CTCL1 & 10x & CTCL & ⋯ & Keratinocytes & Differentiated\\_KC* & 535 & Differentiated\\_KC  & 0.01321663 & Differentiated\\_KC  & lesion & Epidermis & Female & NA\\\\\n",
       "\tAAAGCAAAGCGTAATA-0\\_CTCL1\\_CTCL1\\_CTCL1 & SeuratProject & 11215 & 2632 & 0 & 17545 & 2943 & 0.0056426334 & CTCL1 & 10x & CTCL & ⋯ & Keratinocytes & Differentiated\\_KC* & 298 & Differentiated\\_KC* & 0.90835925 & Differentiated\\_KC* & lesion & Epidermis & Female & NA\\\\\n",
       "\tAAAGCAAGTCCTGCTT-0\\_CTCL1\\_CTCL1\\_CTCL1 & SeuratProject &  6150 & 2384 & 0 &  9136 & 2665 & 0.0010945710 & CTCL1 & 10x & CTCL & ⋯ & Keratinocytes & VE2                & 368 & Differentiated\\_KC* & 0.04688790 & Differentiated\\_KC* & lesion & Epidermis & Female & NA\\\\\n",
       "\tAAAGCAATCCCATTTA-0\\_CTCL1\\_CTCL1\\_CTCL1 & SeuratProject &  5296 & 1860 & 0 & 11803 & 2101 & 0.0155045325 & CTCL1 & 10x & CTCL & ⋯ & Keratinocytes & Differentiated\\_KC  & 438 & Differentiated\\_KC  & 0.67155350 & Differentiated\\_KC  & lesion & Epidermis & Female & NA\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "A data.frame: 6 × 22\n",
       "\n",
       "| <!--/--> | orig.ident &lt;fct&gt; | nCount_RNA &lt;dbl&gt; | nFeature_RNA &lt;int&gt; | batch &lt;fct&gt; | n_counts &lt;dbl&gt; | n_genes &lt;dbl&gt; | percent_mito &lt;dbl&gt; | donor &lt;fct&gt; | tech &lt;fct&gt; | sample_type &lt;fct&gt; | ⋯ ⋯ | broad_ct &lt;fct&gt; | ctpredicted_labels &lt;fct&gt; | ctover_clustering &lt;fct&gt; | ctmajority_voting &lt;fct&gt; | ctconf_score &lt;dbl&gt; | cell_type &lt;fct&gt; | site &lt;fct&gt; | tissue &lt;fct&gt; | Sex &lt;fct&gt; | type &lt;fct&gt; |\n",
       "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
       "| AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1 | SeuratProject | 14602 | 3142 | 0 | 27577 | 3474 | 0.0011241252 | CTCL1 | 10x | CTCL | ⋯ | Keratinocytes | Differentiated_KC* | 286 | Differentiated_KC  | 0.96744886 | Differentiated_KC  | lesion | Epidermis | Female | NA |\n",
       "| AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1 | SeuratProject | 13463 | 3311 | 0 | 21021 | 3642 | 0.0004281433 | CTCL1 | 10x | CTCL | ⋯ | Keratinocytes | Differentiated_KC* | 492 | Differentiated_KC* | 0.13851590 | Differentiated_KC* | lesion | Epidermis | Female | NA |\n",
       "| AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1 | SeuratProject | 14551 | 3472 | 0 | 27837 | 3824 | 0.0024787153 | CTCL1 | 10x | CTCL | ⋯ | Keratinocytes | Differentiated_KC* | 535 | Differentiated_KC  | 0.01321663 | Differentiated_KC  | lesion | Epidermis | Female | NA |\n",
       "| AAAGCAAAGCGTAATA-0_CTCL1_CTCL1_CTCL1 | SeuratProject | 11215 | 2632 | 0 | 17545 | 2943 | 0.0056426334 | CTCL1 | 10x | CTCL | ⋯ | Keratinocytes | Differentiated_KC* | 298 | Differentiated_KC* | 0.90835925 | Differentiated_KC* | lesion | Epidermis | Female | NA |\n",
       "| AAAGCAAGTCCTGCTT-0_CTCL1_CTCL1_CTCL1 | SeuratProject |  6150 | 2384 | 0 |  9136 | 2665 | 0.0010945710 | CTCL1 | 10x | CTCL | ⋯ | Keratinocytes | VE2                | 368 | Differentiated_KC* | 0.04688790 | Differentiated_KC* | lesion | Epidermis | Female | NA |\n",
       "| AAAGCAATCCCATTTA-0_CTCL1_CTCL1_CTCL1 | SeuratProject |  5296 | 1860 | 0 | 11803 | 2101 | 0.0155045325 | CTCL1 | 10x | CTCL | ⋯ | Keratinocytes | Differentiated_KC  | 438 | Differentiated_KC  | 0.67155350 | Differentiated_KC  | lesion | Epidermis | Female | NA |\n",
       "\n"
      ],
      "text/plain": [
       "                                     orig.ident    nCount_RNA nFeature_RNA\n",
       "AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1 SeuratProject 14602      3142        \n",
       "AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1 SeuratProject 13463      3311        \n",
       "AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1 SeuratProject 14551      3472        \n",
       "AAAGCAAAGCGTAATA-0_CTCL1_CTCL1_CTCL1 SeuratProject 11215      2632        \n",
       "AAAGCAAGTCCTGCTT-0_CTCL1_CTCL1_CTCL1 SeuratProject  6150      2384        \n",
       "AAAGCAATCCCATTTA-0_CTCL1_CTCL1_CTCL1 SeuratProject  5296      1860        \n",
       "                                     batch n_counts n_genes percent_mito donor\n",
       "AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1 0     27577    3474    0.0011241252 CTCL1\n",
       "AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1 0     21021    3642    0.0004281433 CTCL1\n",
       "AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1 0     27837    3824    0.0024787153 CTCL1\n",
       "AAAGCAAAGCGTAATA-0_CTCL1_CTCL1_CTCL1 0     17545    2943    0.0056426334 CTCL1\n",
       "AAAGCAAGTCCTGCTT-0_CTCL1_CTCL1_CTCL1 0      9136    2665    0.0010945710 CTCL1\n",
       "AAAGCAATCCCATTTA-0_CTCL1_CTCL1_CTCL1 0     11803    2101    0.0155045325 CTCL1\n",
       "                                     tech sample_type ⋯ broad_ct     \n",
       "AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1 10x  CTCL        ⋯ Keratinocytes\n",
       "AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1 10x  CTCL        ⋯ Keratinocytes\n",
       "AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1 10x  CTCL        ⋯ Keratinocytes\n",
       "AAAGCAAAGCGTAATA-0_CTCL1_CTCL1_CTCL1 10x  CTCL        ⋯ Keratinocytes\n",
       "AAAGCAAGTCCTGCTT-0_CTCL1_CTCL1_CTCL1 10x  CTCL        ⋯ Keratinocytes\n",
       "AAAGCAATCCCATTTA-0_CTCL1_CTCL1_CTCL1 10x  CTCL        ⋯ Keratinocytes\n",
       "                                     ctpredicted_labels ctover_clustering\n",
       "AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1 Differentiated_KC* 286              \n",
       "AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1 Differentiated_KC* 492              \n",
       "AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1 Differentiated_KC* 535              \n",
       "AAAGCAAAGCGTAATA-0_CTCL1_CTCL1_CTCL1 Differentiated_KC* 298              \n",
       "AAAGCAAGTCCTGCTT-0_CTCL1_CTCL1_CTCL1 VE2                368              \n",
       "AAAGCAATCCCATTTA-0_CTCL1_CTCL1_CTCL1 Differentiated_KC  438              \n",
       "                                     ctmajority_voting  ctconf_score\n",
       "AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1 Differentiated_KC  0.96744886  \n",
       "AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1 Differentiated_KC* 0.13851590  \n",
       "AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1 Differentiated_KC  0.01321663  \n",
       "AAAGCAAAGCGTAATA-0_CTCL1_CTCL1_CTCL1 Differentiated_KC* 0.90835925  \n",
       "AAAGCAAGTCCTGCTT-0_CTCL1_CTCL1_CTCL1 Differentiated_KC* 0.04688790  \n",
       "AAAGCAATCCCATTTA-0_CTCL1_CTCL1_CTCL1 Differentiated_KC  0.67155350  \n",
       "                                     cell_type          site   tissue    Sex   \n",
       "AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1 Differentiated_KC  lesion Epidermis Female\n",
       "AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1 Differentiated_KC* lesion Epidermis Female\n",
       "AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1 Differentiated_KC  lesion Epidermis Female\n",
       "AAAGCAAAGCGTAATA-0_CTCL1_CTCL1_CTCL1 Differentiated_KC* lesion Epidermis Female\n",
       "AAAGCAAGTCCTGCTT-0_CTCL1_CTCL1_CTCL1 Differentiated_KC* lesion Epidermis Female\n",
       "AAAGCAATCCCATTTA-0_CTCL1_CTCL1_CTCL1 Differentiated_KC  lesion Epidermis Female\n",
       "                                     type\n",
       "AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1 NA  \n",
       "AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1 NA  \n",
       "AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1 NA  \n",
       "AAAGCAAAGCGTAATA-0_CTCL1_CTCL1_CTCL1 NA  \n",
       "AAAGCAAGTCCTGCTT-0_CTCL1_CTCL1_CTCL1 NA  \n",
       "AAAGCAATCCCATTTA-0_CTCL1_CTCL1_CTCL1 NA  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Peek at the first rows of the per-cell metadata table.\n",
    "head(seurat_object[[]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "sacred-retreat",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"dataframe\">\n",
       "<caption>A data.frame: 6 × 2</caption>\n",
       "<thead>\n",
       "\t<tr><th></th><th scope=col>donor</th><th scope=col>cell_type</th></tr>\n",
       "\t<tr><th></th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1</th><td>CTCL1</td><td>Differentiated_KC </td></tr>\n",
       "\t<tr><th scope=row>AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1</th><td>CTCL1</td><td>Differentiated_KC*</td></tr>\n",
       "\t<tr><th scope=row>AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1</th><td>CTCL1</td><td>Differentiated_KC </td></tr>\n",
       "\t<tr><th scope=row>AAAGCAAAGCGTAATA-0_CTCL1_CTCL1_CTCL1</th><td>CTCL1</td><td>Differentiated_KC*</td></tr>\n",
       "\t<tr><th scope=row>AAAGCAAGTCCTGCTT-0_CTCL1_CTCL1_CTCL1</th><td>CTCL1</td><td>Differentiated_KC*</td></tr>\n",
       "\t<tr><th scope=row>AAAGCAATCCCATTTA-0_CTCL1_CTCL1_CTCL1</th><td>CTCL1</td><td>Differentiated_KC </td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "A data.frame: 6 × 2\n",
       "\\begin{tabular}{r|ll}\n",
       "  & donor & cell\\_type\\\\\n",
       "  & <fct> & <fct>\\\\\n",
       "\\hline\n",
       "\tAAACCTGCATCACAAC-0\\_CTCL1\\_CTCL1\\_CTCL1 & CTCL1 & Differentiated\\_KC \\\\\n",
       "\tAAACCTGCATGTAAGA-0\\_CTCL1\\_CTCL1\\_CTCL1 & CTCL1 & Differentiated\\_KC*\\\\\n",
       "\tAAACGGGGTCGACTGC-0\\_CTCL1\\_CTCL1\\_CTCL1 & CTCL1 & Differentiated\\_KC \\\\\n",
       "\tAAAGCAAAGCGTAATA-0\\_CTCL1\\_CTCL1\\_CTCL1 & CTCL1 & Differentiated\\_KC*\\\\\n",
       "\tAAAGCAAGTCCTGCTT-0\\_CTCL1\\_CTCL1\\_CTCL1 & CTCL1 & Differentiated\\_KC*\\\\\n",
       "\tAAAGCAATCCCATTTA-0\\_CTCL1\\_CTCL1\\_CTCL1 & CTCL1 & Differentiated\\_KC \\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "A data.frame: 6 × 2\n",
       "\n",
       "| <!--/--> | donor &lt;fct&gt; | cell_type &lt;fct&gt; |\n",
       "|---|---|---|\n",
       "| AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1 | CTCL1 | Differentiated_KC  |\n",
       "| AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1 | CTCL1 | Differentiated_KC* |\n",
       "| AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1 | CTCL1 | Differentiated_KC  |\n",
       "| AAAGCAAAGCGTAATA-0_CTCL1_CTCL1_CTCL1 | CTCL1 | Differentiated_KC* |\n",
       "| AAAGCAAGTCCTGCTT-0_CTCL1_CTCL1_CTCL1 | CTCL1 | Differentiated_KC* |\n",
       "| AAAGCAATCCCATTTA-0_CTCL1_CTCL1_CTCL1 | CTCL1 | Differentiated_KC  |\n",
       "\n"
      ],
      "text/plain": [
       "                                     donor cell_type         \n",
       "AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1 CTCL1 Differentiated_KC \n",
       "AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1 CTCL1 Differentiated_KC*\n",
       "AAACGGGGTCGACTGC-0_CTCL1_CTCL1_CTCL1 CTCL1 Differentiated_KC \n",
       "AAAGCAAAGCGTAATA-0_CTCL1_CTCL1_CTCL1 CTCL1 Differentiated_KC*\n",
       "AAAGCAAGTCCTGCTT-0_CTCL1_CTCL1_CTCL1 CTCL1 Differentiated_KC*\n",
       "AAAGCAATCCCATTTA-0_CTCL1_CTCL1_CTCL1 CTCL1 Differentiated_KC "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Per-cell phenotype table: keep only the donor and cell type annotations.\n",
    "# (An earlier variant of this cell selected donor_id / broad_cell_type / anno instead.)\n",
    "pheno.matrix <- seurat_object@meta.data[c(\"donor\", \"cell_type\")]\n",
    "head(pheno.matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "surface-nevada",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"dataframe\">\n",
       "<caption>A data.frame: 2 × 1</caption>\n",
       "<thead>\n",
       "\t<tr><th></th><th scope=col>labelDescription</th></tr>\n",
       "\t<tr><th></th><th scope=col>&lt;chr&gt;</th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>donor</th><td>Donor ID </td></tr>\n",
       "\t<tr><th scope=row>cell_type</th><td>Cell type</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "A data.frame: 2 × 1\n",
       "\\begin{tabular}{r|l}\n",
       "  & labelDescription\\\\\n",
       "  & <chr>\\\\\n",
       "\\hline\n",
       "\tdonor & Donor ID \\\\\n",
       "\tcell\\_type & Cell type\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "A data.frame: 2 × 1\n",
       "\n",
       "| <!--/--> | labelDescription &lt;chr&gt; |\n",
       "|---|---|\n",
       "| donor | Donor ID  |\n",
       "| cell_type | Cell type |\n",
       "\n"
      ],
      "text/plain": [
       "          labelDescription\n",
       "donor     Donor ID        \n",
       "cell_type Cell type       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Human-readable labels for the phenotype columns; one row per column, keyed by column name.\n",
    "metadata <- data.frame(\n",
    "    row.names = c(\"donor\", \"cell_type\"),\n",
    "    labelDescription = c(\"Donor ID\", \"Cell type\")\n",
    ")\n",
    "metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "deadly-venture",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove every feature (row) that has no non-zero count in any cell.\n",
    "# rowSums() on the sparse matrix returns exactly one value per row. The previous\n",
    "# tabulate(summary(m)$i) mask is only max(i) long, so it can be shorter than nrow()\n",
    "# when the last rows are empty, and it treats explicitly stored zeros as expression.\n",
    "# Matrix is already attached by the imports cell, so it is not re-loaded here.\n",
    "gene_exprs.matrix <- gene_exprs.matrix[Matrix::rowSums(gene_exprs.matrix != 0) > 0, , drop = FALSE]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "medical-settlement",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".list-inline {list-style: none; margin:0; padding: 0}\n",
       ".list-inline>li {display: inline-block}\n",
       ".list-inline>li:not(:last-child)::after {content: \"\\00b7\"; padding: 0 .5ex}\n",
       "</style>\n",
       "<ol class=list-inline><li>15767</li><li>62199</li></ol>\n"
      ],
      "text/latex": [
       "\\begin{enumerate*}\n",
       "\\item 15767\n",
       "\\item 62199\n",
       "\\end{enumerate*}\n"
      ],
      "text/markdown": [
       "1. 15767\n",
       "2. 62199\n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "[1] 15767 62199"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Dimensions of the count matrix (features x cells) at this point in the notebook.\n",
    "dim(gene_exprs.matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "dated-injection",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "n_slices=1\n",
      "\n",
      "converting slice 1/1\n",
      "\n",
      "columns 1:55815\n",
      "\n",
      "cbind dense submatrices\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Densify the sparse count matrix with the SCOPfunctions helper (intended for large matrices).\n",
    "# TRUE is spelled out because T is an ordinary variable that can be reassigned.\n",
    "expr_matrix_den <- SCOPfunctions::utils_big_as.matrix(gene_exprs.matrix, n_slices_init = 1, verbose = TRUE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "conscious-terrorist",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning message in asMethod(object):\n",
      "“sparse->dense coercion: allocating vector of size 7.3 GiB”\n"
     ]
    }
   ],
   "source": [
    "# Bundle the dense count matrix and the annotated per-cell phenotype table\n",
    "# into a Biobase ExpressionSet.\n",
    "SC.eset <- ExpressionSet(\n",
    "    assayData = as.matrix(gene_exprs.matrix),\n",
    "    phenoData = AnnotatedDataFrame(data = pheno.matrix, varMetadata = metadata)\n",
    ")\n",
    "# Large-matrix alternative: pass the pre-densified expr_matrix_den instead.\n",
    "#SC.eset <- ExpressionSet(\n",
    "#    assayData = expr_matrix_den,\n",
    "#    phenoData = AnnotatedDataFrame(data = pheno.matrix, varMetadata = metadata)\n",
    "#)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "final-hardwood",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ExpressionSet (storageMode: lockedEnvironment)\n",
       "assayData: 15767 features, 62199 samples \n",
       "  element names: exprs \n",
       "protocolData: none\n",
       "phenoData\n",
       "  sampleNames: AAACCTGCATCACAAC-0_CTCL1_CTCL1_CTCL1\n",
       "    AAACCTGCATGTAAGA-0_CTCL1_CTCL1_CTCL1 ... GACGTGCTCACATACG-92_S5\n",
       "    (62199 total)\n",
       "  varLabels: donor cell_type\n",
       "  varMetadata: labelDescription\n",
       "featureData: none\n",
       "experimentData: use 'experimentData(object)'\n",
       "Annotation:  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Print the ExpressionSet summary to confirm feature/sample counts and phenotype columns.\n",
    "SC.eset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "minimal-raising",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the ExpressionSet next to the input file. The output name is derived from\n",
    "# h5ad_file (extension swapped for \"_ExpressionSet.rds\") so the two paths cannot drift apart.\n",
    "eset_file <- paste0(tools::file_path_sans_ext(h5ad_file), \"_ExpressionSet.rds\")\n",
    "saveRDS(SC.eset, file = eset_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "enormous-helping",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "4.0.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}