Diff of /docs/preprocess.html [000000] .. [2c6b19]

Switch to unified view

a b/docs/preprocess.html
1
<!doctype html>
2
<html lang="en">
3
<head>
4
<meta charset="utf-8">
5
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
6
<meta name="generator" content="pdoc 0.8.1" />
7
<title>VITAE.preprocess API documentation</title>
8
<meta name="description" content="" />
9
<link href='https://cdnjs.cloudflare.com/ajax/libs/normalize/8.0.0/normalize.min.css' rel='stylesheet'>
10
<link href='https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/8.0.0/sanitize.min.css' rel='stylesheet'>
11
<link href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/github.min.css" rel="stylesheet">
12
<style>.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance 
em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
13
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
14
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
15
<script async src='https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS_CHTML'></script>
16
</head>
17
<body>
18
<main>
19
<article id="content">
20
<header>
21
<h1 class="title">Module <code>VITAE.preprocess</code></h1>
22
</header>
23
<section id="section-intro">
24
<details class="source">
25
<summary>
26
<span>Expand source code</span>
27
</summary>
28
<pre><code class="python"># -*- coding: utf-8 -*-
29
from typing import Optional
30
import numpy as np
31
import pandas as pd
32
from skmisc import loess
33
from sklearn import preprocessing
34
import warnings
35
from sklearn.decomposition import PCA
36
from VITAE.utils import _check_expression, _check_variability
37
38
39
def normalize_gene_expression(x, K : float = 1e4, transform_fn : str = &#39;log&#39;):
40
    &#39;&#39;&#39;Normalize the gene expression counts for each cell by the total expression counts, 
41
    divide this by a size scale factor, which is determined by total counts and a constant K
42
    then log-transforms the result.
43
44
    Parameters
45
    ----------
46
    x : np.array
47
        \([N, G^{raw}]\) The raw count data.
48
    K : float, optional
49
        The normalizing constant.
50
    transform_fn : str, optional
51
        Either &#39;log&#39; or &#39;sqrt&#39;.
52
53
    Returns
54
    ----------
55
    x_normalized : np.array
56
        \([N, G^{raw}]\) The log-normalized data.
57
    scale_factor : np.array
58
        \([N, ]\) The scale factors.
59
    &#39;&#39;&#39;          
60
    scale_factor = np.sum(x,axis=1, keepdims=True)/K
61
    if transform_fn==&#39;log&#39;:
62
        x_normalized = np.log(x/scale_factor + 1)
63
    else:
64
        x_normalized = np.where(x&gt;0, np.sqrt(x/scale_factor), 0)
65
66
    print(&#39;min normalized value: &#39; + str(np.min(x_normalized)))
67
    print(&#39;max normalized value: &#39; + str(np.max(x_normalized)))
68
    return x_normalized, scale_factor
69
70
71
def feature_select(x, gene_num : int = 2000):
72
    &#39;&#39;&#39;Select highly variable genes (HVGs)
73
    (See [Stuart *et al*, (2019)](https://www.nature.com/articles/nbt.4096) and its early version [preprint](https://www.biorxiv.org/content/10.1101/460147v1.full.pdf)
74
    Page 12-13: Data preprocessing - Feature selection for individual datasets).
75
76
    Parameters
77
    ----------
78
    x : np.array
79
        \([N, G^{raw}]\) The raw count data.
80
    gene_num : int, optional
81
        The number of genes to retain.
82
83
    Returns
84
    ----------
85
    x : np.array
86
        \([N, G]\) The count data after gene selection.
87
    index : np.array
88
        \([G, ]\) The selected index of genes.
89
    &#39;&#39;&#39;     
90
    
91
92
    n, p = x.shape
93
94
    # mean and variance of each gene of the unnormalized data  
95
    mean, var = np.mean(x, axis=0), np.var(x, axis=0, ddof=1)
96
97
    # model log10(var)~log10(mean) by local fitting of polynomials of degree 2
98
    loess_model = loess.loess(np.log10(mean), np.log10(var), 
99
                    span = 0.3, degree = 2, family=&#39;gaussian&#39;
100
                    )
101
    loess_model.fit()
102
    fitted = loess_model.outputs.fitted_values
103
104
    # standardized feature
105
    z = (x - mean)/np.sqrt(10**fitted)
106
107
    # clipped the standardized features to remove outliers
108
    z = np.clip(z, -np.inf, np.sqrt(n))
109
110
    # the variance of standardized features across all cells represents a measure of
111
    # single cell dispersion after controlling for mean expression    
112
    feature_score = np.sum(z**2, axis=0)/(n-1)
113
    
114
    # feature selection
115
    index = feature_score.argsort()[::-1][0:gene_num]
116
117
    return x[:, index], index
118
119
120
def preprocess(adata = None, processed: bool = False, dimred: bool = False, 
121
            x = None, c = None, label_names = None, raw_cell_names = None, raw_gene_names = None,  
122
            K: float = 1e4, transform_fn: str = &#39;log&#39;, gene_num: int = 2000, data_type: str = &#39;UMI&#39;, 
123
            npc: int = 64, random_state=0):
124
    &#39;&#39;&#39;Preprocess count data.
125
126
    Parameters
127
    ----------
128
    adata : AnnData, optional
129
        The scanpy object.
130
    processed : boolean
131
        Whether adata has been processed.
132
    dimred : boolean
133
        Whether the processed adata is after dimension reduction.
134
    x : np.array, optional
135
        \([N^{raw}, G^{raw}]\) The raw count matrix.
136
    c : np.array
137
        \([N^{raw}, s]\) The covariate matrix.
138
    label_names : np.array 
139
        \([N^{raw}, ]\) The true or estimated cell types.
140
    raw_cell_names : np.array  
141
        \([N^{raw}, ]\) The names of cells.
142
    raw_gene_names : np.array
143
        \([G^{raw}, ]\) The names of genes.
144
    K : int, optional
145
        The normalizing constant.
146
    transform_fn : str
147
        The transform function used to normalize the gene expression after scaling. Either &#39;log&#39; or &#39;sqrt&#39;.
148
    gene_num : int, optional
149
        The number of genes to retain.
150
    data_type : str, optional
151
        &#39;UMI&#39;, &#39;non-UMI&#39;, or &#39;Gaussian&#39;.
152
    npc : int, optional
153
        The number of PCs to retain, only used if `data_type=&#39;Gaussian&#39;`.
154
    random_state : int, optional
155
        The random state for PCA. With different random states, the resulted PCA scores are slightly different.
156
157
    Returns
158
    ----------
159
    x_normalized : np.array
160
        \([N, G]\) The preprocessed matrix.
161
    expression : np.array
162
        \([N, G^{raw}]\) The expression matrix after log-normalization and before scaling.
163
    x : np.array
164
        \([N, G]\) The raw count matrix after gene selections.
165
    c : np.array
166
        \([N, s]\) The covariates.
167
    cell_names : np.array
168
        \([N, ]\) The cell names.
169
    gene_names : np.array
170
        \([G^{raw}, ]\) The gene names.
171
    selected_gene_names : 
172
        \([G, ]\) The selected gene names.
173
    scale_factor : 
174
        \([N, ]\) The scale factors.
175
    labels : np.array
176
        \([N, ]\) The encoded labels.
177
    label_names : np.array
178
        \([N, ]\) The label names.
179
    le : sklearn.preprocessing.LabelEncoder
180
        The label encoder.
181
    gene_scalar : sklearn.preprocessing.StandardScaler
182
        The gene scaler.
183
    &#39;&#39;&#39;
184
    # if input is a scanpy data
185
    if adata is not None:
186
        import scanpy as sc
187
        
188
        # if the input scanpy is processed
189
        if processed: 
190
            x_normalized = x = adata.X
191
            gene_names = adata.var_names.values
192
            expression = None
193
            scale_factor = np.ones(x.shape[0])
194
        # if the input scanpy is not processed
195
        else: 
196
            dimred = False
197
            x = adata.X.copy()
198
            adata, expression, gene_names, cell_mask, gene_mask, gene_mask2 = _recipe_seurat(adata, gene_num)
199
            x_normalized = adata.X.copy()
200
            scale_factor = adata.obs.counts_per_cell.values / 1e4
201
            x = x[cell_mask,:][:,gene_mask][:,gene_mask2]
202
            
203
        if label_names is None:
204
            try:
205
                label_names = adata.obs.cell_types
206
            except:
207
                if label_names is not None and processed is False:
208
                    label_names = label_names[cell_mask]
209
            
210
        cell_names = adata.obs_names.values
211
        selected_gene_names = adata.var_names.values
212
        gene_scalar = None
213
    
214
    # if input is a count matrix
215
    else:
216
        # remove cells that have no expression
217
        expressed = _check_expression(x)
218
        print(&#39;Removing %d cells without expression.&#39;%(np.sum(expressed==0)))
219
        x = x[expressed==1,:]
220
        if c is not None:
221
            c = c[expressed==1,:]
222
        if label_names is not None:
223
            label_names = label_names[expressed==1]        
224
        
225
        # remove genes without variability
226
        variable = _check_variability(x)
227
        print(&#39;Removing %d genes without variability.&#39;%(np.sum(variable==0)))
228
        x = x[:, variable==1]
229
        gene_names = raw_gene_names[variable==1]
230
231
        # log-normalization
232
        expression, scale_factor = normalize_gene_expression(x, K, transform_fn)
233
        
234
        # feature selection
235
        x, index = feature_select(x, gene_num)
236
        selected_expression = expression[:, index]
237
        
238
        # per-gene standardization
239
        gene_scalar = preprocessing.StandardScaler()
240
        x_normalized = gene_scalar.fit_transform(selected_expression)
241
    
242
        cell_names = raw_cell_names[expressed==1]
243
        selected_gene_names = gene_names[index]
244
245
    if (data_type==&#39;Gaussian&#39;) and (dimred is False):
246
        # use arpack solver and extend precision to get deterministic result
247
        pca = PCA(n_components = npc, random_state=random_state, svd_solver=&#39;arpack&#39;)
248
        x_normalized = x = pca.fit_transform(x_normalized.astype(np.float64)).astype(np.float32)
249
250
    if c is not None:
251
        c_scalar = preprocessing.StandardScaler()
252
        c = c_scalar.fit_transform(c)
253
254
    if label_names is None:
255
        warnings.warn(&#39;No labels for cells!&#39;)
256
        labels = None
257
        le = None
258
    else:
259
        le = preprocessing.LabelEncoder()
260
        labels = le.fit_transform(label_names)
261
        print(&#39;Number of cells in each class: &#39;)
262
        table = pd.value_counts(label_names)
263
        table.index = pd.Series(le.transform(table.index).astype(str)) \
264
            + &#39; &lt;---&gt; &#39; + table.index
265
        table = table.sort_index()
266
        print(table)
267
        
268
    return (x_normalized, expression, x, c, 
269
        cell_names, gene_names, selected_gene_names, 
270
        scale_factor, labels, label_names, le, gene_scalar)
271
272
273
def _recipe_seurat(adata, gene_num):
274
    &#34;&#34;&#34;
275
    Normalization and filtering as of Seurat [Satija15]_.
276
    This uses a particular preprocessing
277
    &#34;&#34;&#34;
278
    import scanpy as sc
279
    cell_mask = sc.pp.filter_cells(adata, min_genes=200, inplace=False)[0]
280
    adata = adata[cell_mask,:]
281
    gene_mask = sc.pp.filter_genes(adata, min_cells=3, inplace=False)[0]
282
    adata = adata[:,gene_mask]
283
    gene_names = adata.var_names.values
284
285
    sc.pp.normalize_total(adata, target_sum=1e4, key_added=&#39;counts_per_cell&#39;)
286
    filter_result = sc.pp.filter_genes_dispersion(
287
        adata.X, min_mean=0.0125, max_mean=3, min_disp=0.5, log=False, n_top_genes=gene_num)
288
    
289
    sc.pp.log1p(adata)
290
    expression = adata.X.copy()
291
    adata._inplace_subset_var(filter_result.gene_subset)  # filter genes
292
    sc.pp.scale(adata, max_value=10)
293
    return adata, expression, gene_names, cell_mask, gene_mask, filter_result.gene_subset</code></pre>
294
</details>
295
</section>
296
<section>
297
</section>
298
<section>
299
</section>
300
<section>
301
<h2 class="section-title" id="header-functions">Functions</h2>
302
<dl>
303
<dt id="VITAE.preprocess.normalize_gene_expression"><code class="name flex">
304
<span>def <span class="ident">normalize_gene_expression</span></span>(<span>x, K: float = 10000.0, transform_fn: str = 'log')</span>
305
</code></dt>
306
<dd>
307
<div class="desc"><p>Normalize the gene expression counts of each cell by a size scale factor,
308
which is the cell's total expression count divided by a constant K,
309
then apply a log(1 + x) or square-root transform (chosen by <code>transform_fn</code>) to the result.</p>
310
<h2 id="parameters">Parameters</h2>
311
<dl>
312
<dt><strong><code>x</code></strong> :&ensp;<code>np.array</code></dt>
313
<dd><span><span class="MathJax_Preview">[N, G^{raw}]</span><script type="math/tex">[N, G^{raw}]</script></span> The raw count data.</dd>
314
<dt><strong><code>K</code></strong> :&ensp;<code>float</code>, optional</dt>
315
<dd>The normalizing constant.</dd>
316
<dt><strong><code>transform_fn</code></strong> :&ensp;<code>str</code>, optional</dt>
317
<dd>Either 'log' or 'sqrt'.</dd>
318
</dl>
319
<h2 id="returns">Returns</h2>
320
<dl>
321
<dt><strong><code>x_normalized</code></strong> :&ensp;<code>np.array</code></dt>
322
<dd><span><span class="MathJax_Preview">[N, G^{raw}]</span><script type="math/tex">[N, G^{raw}]</script></span> The normalized data, log- or square-root-transformed according to <code>transform_fn</code>.</dd>
323
<dt><strong><code>scale_factor</code></strong> :&ensp;<code>np.array</code></dt>
324
<dd><span><span class="MathJax_Preview">[N, ]</span><script type="math/tex">[N, ]</script></span> The scale factors.</dd>
325
</dl></div>
326
<details class="source">
327
<summary>
328
<span>Expand source code</span>
329
</summary>
330
<pre><code class="python">def normalize_gene_expression(x, K : float = 1e4, transform_fn : str = &#39;log&#39;):
331
    &#39;&#39;&#39;Normalize the gene expression counts for each cell by the total expression counts, 
332
    divide this by a size scale factor, which is determined by total counts and a constant K
333
    then log-transforms the result.
334
335
    Parameters
336
    ----------
337
    x : np.array
338
        \([N, G^{raw}]\) The raw count data.
339
    K : float, optional
340
        The normalizing constant.
341
    transform_fn : str, optional
342
        Either &#39;log&#39; or &#39;sqrt&#39;.
343
344
    Returns
345
    ----------
346
    x_normalized : np.array
347
        \([N, G^{raw}]\) The log-normalized data.
348
    scale_factor : np.array
349
        \([N, ]\) The scale factors.
350
    &#39;&#39;&#39;          
351
    scale_factor = np.sum(x,axis=1, keepdims=True)/K
352
    if transform_fn==&#39;log&#39;:
353
        x_normalized = np.log(x/scale_factor + 1)
354
    else:
355
        x_normalized = np.where(x&gt;0, np.sqrt(x/scale_factor), 0)
356
357
    print(&#39;min normalized value: &#39; + str(np.min(x_normalized)))
358
    print(&#39;max normalized value: &#39; + str(np.max(x_normalized)))
359
    return x_normalized, scale_factor</code></pre>
360
</details>
361
</dd>
362
<dt id="VITAE.preprocess.feature_select"><code class="name flex">
363
<span>def <span class="ident">feature_select</span></span>(<span>x, gene_num: int = 2000)</span>
364
</code></dt>
365
<dd>
366
<div class="desc"><p>Select highly variable genes (HVGs)
367
(See <a href="https://www.nature.com/articles/nbt.4096">Stuart <em>et al</em>, (2019)</a> and its early version <a href="https://www.biorxiv.org/content/10.1101/460147v1.full.pdf">preprint</a>
368
Page 12-13: Data preprocessing - Feature selection for individual datasets).</p>
369
<h2 id="parameters">Parameters</h2>
370
<dl>
371
<dt><strong><code>x</code></strong> :&ensp;<code>np.array</code></dt>
372
<dd><span><span class="MathJax_Preview">[N, G^{raw}]</span><script type="math/tex">[N, G^{raw}]</script></span> The raw count data.</dd>
373
<dt><strong><code>gene_num</code></strong> :&ensp;<code>int</code>, optional</dt>
374
<dd>The number of genes to retain.</dd>
375
</dl>
376
<h2 id="returns">Returns</h2>
377
<dl>
378
<dt><strong><code>x</code></strong> :&ensp;<code>np.array</code></dt>
379
<dd><span><span class="MathJax_Preview">[N, G]</span><script type="math/tex">[N, G]</script></span> The count data after gene selection.</dd>
380
<dt><strong><code>index</code></strong> :&ensp;<code>np.array</code></dt>
381
<dd><span><span class="MathJax_Preview">[G, ]</span><script type="math/tex">[G, ]</script></span> The selected index of genes.</dd>
382
</dl></div>
383
<details class="source">
384
<summary>
385
<span>Expand source code</span>
386
</summary>
387
<pre><code class="python">def feature_select(x, gene_num : int = 2000):
388
    &#39;&#39;&#39;Select highly variable genes (HVGs)
389
    (See [Stuart *et al*, (2019)](https://www.nature.com/articles/nbt.4096) and its early version [preprint](https://www.biorxiv.org/content/10.1101/460147v1.full.pdf)
390
    Page 12-13: Data preprocessing - Feature selection for individual datasets).
391
392
    Parameters
393
    ----------
394
    x : np.array
395
        \([N, G^{raw}]\) The raw count data.
396
    gene_num : int, optional
397
        The number of genes to retain.
398
399
    Returns
400
    ----------
401
    x : np.array
402
        \([N, G]\) The count data after gene selection.
403
    index : np.array
404
        \([G, ]\) The selected index of genes.
405
    &#39;&#39;&#39;     
406
    
407
408
    n, p = x.shape
409
410
    # mean and variance of each gene of the unnormalized data  
411
    mean, var = np.mean(x, axis=0), np.var(x, axis=0, ddof=1)
412
413
    # model log10(var)~log10(mean) by local fitting of polynomials of degree 2
414
    loess_model = loess.loess(np.log10(mean), np.log10(var), 
415
                    span = 0.3, degree = 2, family=&#39;gaussian&#39;
416
                    )
417
    loess_model.fit()
418
    fitted = loess_model.outputs.fitted_values
419
420
    # standardized feature
421
    z = (x - mean)/np.sqrt(10**fitted)
422
423
    # clipped the standardized features to remove outliers
424
    z = np.clip(z, -np.inf, np.sqrt(n))
425
426
    # the variance of standardized features across all cells represents a measure of
427
    # single cell dispersion after controlling for mean expression    
428
    feature_score = np.sum(z**2, axis=0)/(n-1)
429
    
430
    # feature selection
431
    index = feature_score.argsort()[::-1][0:gene_num]
432
433
    return x[:, index], index</code></pre>
434
</details>
435
</dd>
436
<dt id="VITAE.preprocess.preprocess"><code class="name flex">
437
<span>def <span class="ident">preprocess</span></span>(<span>adata=None, processed: bool = False, dimred: bool = False, x=None, c=None, label_names=None, raw_cell_names=None, raw_gene_names=None, K: float = 10000.0, transform_fn: str = 'log', gene_num: int = 2000, data_type: str = 'UMI', npc: int = 64, random_state=0)</span>
438
</code></dt>
439
<dd>
440
<div class="desc"><p>Preprocess count data.</p>
441
<h2 id="parameters">Parameters</h2>
442
<dl>
443
<dt><strong><code>adata</code></strong> :&ensp;<code>AnnData</code>, optional</dt>
444
<dd>The scanpy object.</dd>
445
<dt><strong><code>processed</code></strong> :&ensp;<code>boolean</code></dt>
446
<dd>Whether adata has been processed.</dd>
447
<dt><strong><code>dimred</code></strong> :&ensp;<code>boolean</code></dt>
448
<dd>Whether the processed adata is after dimension reduction.</dd>
449
<dt><strong><code>x</code></strong> :&ensp;<code>np.array</code>, optional</dt>
450
<dd><span><span class="MathJax_Preview">[N^{raw}, G^{raw}]</span><script type="math/tex">[N^{raw}, G^{raw}]</script></span> The raw count matrix.</dd>
451
<dt><strong><code>c</code></strong> :&ensp;<code>np.array</code></dt>
452
<dd><span><span class="MathJax_Preview">[N^{raw}, s]</span><script type="math/tex">[N^{raw}, s]</script></span> The covariate matrix.</dd>
453
<dt><strong><code>label_names</code></strong> :&ensp;<code>np.array</code></dt>
454
<dd><span><span class="MathJax_Preview">[N^{raw}, ]</span><script type="math/tex">[N^{raw}, ]</script></span> The true or estimated cell types.</dd>
455
<dt><strong><code>raw_cell_names</code></strong> :&ensp;<code>np.array
456
</code></dt>
457
<dd><span><span class="MathJax_Preview">[N^{raw}, ]</span><script type="math/tex">[N^{raw}, ]</script></span> The names of cells.</dd>
458
<dt><strong><code>raw_gene_names</code></strong> :&ensp;<code>np.array</code></dt>
459
<dd><span><span class="MathJax_Preview">[G^{raw}, ]</span><script type="math/tex">[G^{raw}, ]</script></span> The names of genes.</dd>
460
<dt><strong><code>K</code></strong> :&ensp;<code>float</code>, optional</dt>
461
<dd>The normalizing constant.</dd>
462
<dt><strong><code>transform_fn</code></strong> :&ensp;<code>str</code></dt>
463
<dd>The transform function used to normalize the gene expression after scaling. Either 'log' or 'sqrt'.</dd>
464
<dt><strong><code>gene_num</code></strong> :&ensp;<code>int</code>, optional</dt>
465
<dd>The number of genes to retain.</dd>
466
<dt><strong><code>data_type</code></strong> :&ensp;<code>str</code>, optional</dt>
467
<dd>'UMI', 'non-UMI', or 'Gaussian'.</dd>
468
<dt><strong><code>npc</code></strong> :&ensp;<code>int</code>, optional</dt>
469
<dd>The number of PCs to retain, only used if <code>data_type='Gaussian'</code>.</dd>
470
<dt><strong><code>random_state</code></strong> :&ensp;<code>int</code>, optional</dt>
471
<dd>The random state for PCA. With different random states, the resulting PCA scores differ slightly.</dd>
472
</dl>
473
<h2 id="returns">Returns</h2>
474
<dl>
475
<dt><strong><code>x_normalized</code></strong> :&ensp;<code>np.array</code></dt>
476
<dd><span><span class="MathJax_Preview">[N, G]</span><script type="math/tex">[N, G]</script></span> The preprocessed matrix.</dd>
477
<dt><strong><code>expression</code></strong> :&ensp;<code>np.array</code></dt>
478
<dd><span><span class="MathJax_Preview">[N, G^{raw}]</span><script type="math/tex">[N, G^{raw}]</script></span> The expression matrix after log-normalization and before scaling.</dd>
479
<dt><strong><code>x</code></strong> :&ensp;<code>np.array</code></dt>
480
<dd><span><span class="MathJax_Preview">[N, G]</span><script type="math/tex">[N, G]</script></span> The raw count matrix after gene selections.</dd>
481
<dt><strong><code>c</code></strong> :&ensp;<code>np.array</code></dt>
482
<dd><span><span class="MathJax_Preview">[N, s]</span><script type="math/tex">[N, s]</script></span> The covariates.</dd>
483
<dt><strong><code>cell_names</code></strong> :&ensp;<code>np.array</code></dt>
484
<dd><span><span class="MathJax_Preview">[N, ]</span><script type="math/tex">[N, ]</script></span> The cell names.</dd>
485
<dt><strong><code>gene_names</code></strong> :&ensp;<code>np.array</code></dt>
486
<dd><span><span class="MathJax_Preview">[G^{raw}, ]</span><script type="math/tex">[G^{raw}, ]</script></span> The gene names.</dd>
487
<dt><strong><code>selected_gene_names</code></strong> :&ensp;<code>np.array</code></dt>
488
<dd><span><span class="MathJax_Preview">[G, ]</span><script type="math/tex">[G, ]</script></span> The selected gene names.</dd>
489
<dt><strong><code>scale_factor</code></strong> :&ensp;<code>np.array</code></dt>
490
<dd><span><span class="MathJax_Preview">[N, ]</span><script type="math/tex">[N, ]</script></span> The scale factors.</dd>
491
<dt><strong><code>labels</code></strong> :&ensp;<code>np.array</code></dt>
492
<dd><span><span class="MathJax_Preview">[N, ]</span><script type="math/tex">[N, ]</script></span> The encoded labels.</dd>
493
<dt><strong><code>label_names</code></strong> :&ensp;<code>np.array</code></dt>
494
<dd><span><span class="MathJax_Preview">[N, ]</span><script type="math/tex">[N, ]</script></span> The label names.</dd>
495
<dt><strong><code>le</code></strong> :&ensp;<code>sklearn.preprocessing.LabelEncoder</code></dt>
496
<dd>The label encoder.</dd>
497
<dt><strong><code>gene_scalar</code></strong> :&ensp;<code>sklearn.preprocessing.StandardScaler</code></dt>
498
<dd>The gene scaler.</dd>
499
</dl></div>
500
<details class="source">
501
<summary>
502
<span>Expand source code</span>
503
</summary>
504
<pre><code class="python">def preprocess(adata = None, processed: bool = False, dimred: bool = False, 
505
            x = None, c = None, label_names = None, raw_cell_names = None, raw_gene_names = None,  
506
            K: float = 1e4, transform_fn: str = &#39;log&#39;, gene_num: int = 2000, data_type: str = &#39;UMI&#39;, 
507
            npc: int = 64, random_state=0):
508
    &#39;&#39;&#39;Preprocess count data.
509
510
    Parameters
511
    ----------
512
    adata : AnnData, optional
513
        The scanpy object.
514
    processed : boolean
515
        Whether adata has been processed.
516
    dimred : boolean
517
        Whether the processed adata is after dimension reduction.
518
    x : np.array, optional
519
        \([N^{raw}, G^{raw}]\) The raw count matrix.
520
    c : np.array
521
        \([N^{raw}, s]\) The covariate matrix.
522
    label_names : np.array 
523
        \([N^{raw}, ]\) The true or estimated cell types.
524
    raw_cell_names : np.array  
525
        \([N^{raw}, ]\) The names of cells.
526
    raw_gene_names : np.array
527
        \([G^{raw}, ]\) The names of genes.
528
    K : int, optional
529
        The normalizing constant.
530
    transform_fn : str
531
        The transform function used to normalize the gene expression after scaling. Either &#39;log&#39; or &#39;sqrt&#39;.
532
    gene_num : int, optional
533
        The number of genes to retain.
534
    data_type : str, optional
535
        &#39;UMI&#39;, &#39;non-UMI&#39;, or &#39;Gaussian&#39;.
536
    npc : int, optional
537
        The number of PCs to retain, only used if `data_type=&#39;Gaussian&#39;`.
538
    random_state : int, optional
539
        The random state for PCA. With different random states, the resulted PCA scores are slightly different.
540
541
    Returns
542
    ----------
543
    x_normalized : np.array
544
        \([N, G]\) The preprocessed matrix.
545
    expression : np.array
546
        \([N, G^{raw}]\) The expression matrix after log-normalization and before scaling.
547
    x : np.array
548
        \([N, G]\) The raw count matrix after gene selections.
549
    c : np.array
550
        \([N, s]\) The covariates.
551
    cell_names : np.array
552
        \([N, ]\) The cell names.
553
    gene_names : np.array
554
        \([G^{raw}, ]\) The gene names.
555
    selected_gene_names : 
556
        \([G, ]\) The selected gene names.
557
    scale_factor : 
558
        \([N, ]\) The scale factors.
559
    labels : np.array
560
        \([N, ]\) The encoded labels.
561
    label_names : np.array
562
        \([N, ]\) The label names.
563
    le : sklearn.preprocessing.LabelEncoder
564
        The label encoder.
565
    gene_scalar : sklearn.preprocessing.StandardScaler
566
        The gene scaler.
567
    &#39;&#39;&#39;
568
    # if input is a scanpy data
569
    if adata is not None:
570
        import scanpy as sc
571
        
572
        # if the input scanpy is processed
573
        if processed: 
574
            x_normalized = x = adata.X
575
            gene_names = adata.var_names.values
576
            expression = None
577
            scale_factor = np.ones(x.shape[0])
578
        # if the input scanpy is not processed
579
        else: 
580
            dimred = False
581
            x = adata.X.copy()
582
            adata, expression, gene_names, cell_mask, gene_mask, gene_mask2 = _recipe_seurat(adata, gene_num)
583
            x_normalized = adata.X.copy()
584
            scale_factor = adata.obs.counts_per_cell.values / 1e4
585
            x = x[cell_mask,:][:,gene_mask][:,gene_mask2]
586
            
587
        if label_names is None:
588
            try:
589
                label_names = adata.obs.cell_types
590
            except:
591
                if label_names is not None and processed is False:
592
                    label_names = label_names[cell_mask]
593
            
594
        cell_names = adata.obs_names.values
595
        selected_gene_names = adata.var_names.values
596
        gene_scalar = None
597
    
598
    # if input is a count matrix
599
    else:
600
        # remove cells that have no expression
601
        expressed = _check_expression(x)
602
        print(&#39;Removing %d cells without expression.&#39;%(np.sum(expressed==0)))
603
        x = x[expressed==1,:]
604
        if c is not None:
605
            c = c[expressed==1,:]
606
        if label_names is not None:
607
            label_names = label_names[expressed==1]        
608
        
609
        # remove genes without variability
610
        variable = _check_variability(x)
611
        print(&#39;Removing %d genes without variability.&#39;%(np.sum(variable==0)))
612
        x = x[:, variable==1]
613
        gene_names = raw_gene_names[variable==1]
614
615
        # log-normalization
616
        expression, scale_factor = normalize_gene_expression(x, K, transform_fn)
617
        
618
        # feature selection
619
        x, index = feature_select(x, gene_num)
620
        selected_expression = expression[:, index]
621
        
622
        # per-gene standardization
623
        gene_scalar = preprocessing.StandardScaler()
624
        x_normalized = gene_scalar.fit_transform(selected_expression)
625
    
626
        cell_names = raw_cell_names[expressed==1]
627
        selected_gene_names = gene_names[index]
628
629
    if (data_type==&#39;Gaussian&#39;) and (dimred is False):
630
        # use arpack solver and extend precision to get deterministic result
631
        pca = PCA(n_components = npc, random_state=random_state, svd_solver=&#39;arpack&#39;)
632
        x_normalized = x = pca.fit_transform(x_normalized.astype(np.float64)).astype(np.float32)
633
634
    if c is not None:
635
        c_scalar = preprocessing.StandardScaler()
636
        c = c_scalar.fit_transform(c)
637
638
    if label_names is None:
639
        warnings.warn(&#39;No labels for cells!&#39;)
640
        labels = None
641
        le = None
642
    else:
643
        le = preprocessing.LabelEncoder()
644
        labels = le.fit_transform(label_names)
645
        print(&#39;Number of cells in each class: &#39;)
646
        table = pd.value_counts(label_names)
647
        table.index = pd.Series(le.transform(table.index).astype(str)) \
648
            + &#39; &lt;---&gt; &#39; + table.index
649
        table = table.sort_index()
650
        print(table)
651
        
652
    return (x_normalized, expression, x, c, 
653
        cell_names, gene_names, selected_gene_names, 
654
        scale_factor, labels, label_names, le, gene_scalar)</code></pre>
655
</details>
656
</dd>
657
</dl>
658
</section>
659
<section>
660
</section>
661
</article>
662
<nav id="sidebar">
663
<h1>Index</h1>
664
<div class="toc">
665
<ul></ul>
666
</div>
667
<ul id="index">
668
<li><h3>Super-module</h3>
669
<ul>
670
<li><code><a title="VITAE" href="index.html">VITAE</a></code></li>
671
</ul>
672
</li>
673
<li><h3><a href="#header-functions">Functions</a></h3>
674
<ul class="">
675
<li><code><a title="VITAE.preprocess.normalize_gene_expression" href="#VITAE.preprocess.normalize_gene_expression">normalize_gene_expression</a></code></li>
676
<li><code><a title="VITAE.preprocess.feature_select" href="#VITAE.preprocess.feature_select">feature_select</a></code></li>
677
<li><code><a title="VITAE.preprocess.preprocess" href="#VITAE.preprocess.preprocess">preprocess</a></code></li>
678
</ul>
679
</li>
680
</ul>
681
</nav>
682
</main>
683
<footer id="footer">
684
<p>Generated by <a href="https://pdoc3.github.io/pdoc"><cite>pdoc</cite> 0.8.1</a>.</p>
685
</footer>
686
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
687
<script>hljs.initHighlightingOnLoad()</script>
688
</body>
689
</html>