Module `VITAE.preprocess`

Expand source code

# -*- coding: utf-8 -*-
from typing import Optional
import numpy as np
import pandas as pd
from skmisc import loess
from sklearn import preprocessing
import warnings
from sklearn.decomposition import PCA
from VITAE.utils import _check_expression, _check_variability


def normalize_gene_expression(x, K : float = 1e4, transform_fn : str = 'log'):
    '''Normalize the gene expression counts for each cell by the total expression counts, 
    divide this by a size scale factor, which is determined by total counts and a constant K
    then log-transforms the result.

    Parameters
    ----------
    x : np.array
        \([N, G^{raw}]\) The raw count data.
    K : float, optional
        The normalizing constant.
    transform_fn : str, optional
        Either 'log' or 'sqrt'.

    Returns
    ----------
    x_normalized : np.array
        \([N, G^{raw}]\) The log-normalized data.
    scale_factor : np.array
        \([N, ]\) The scale factors.
    '''          
    scale_factor = np.sum(x,axis=1, keepdims=True)/K
    if transform_fn=='log':
        x_normalized = np.log(x/scale_factor + 1)
    else:
        x_normalized = np.where(x>0, np.sqrt(x/scale_factor), 0)

    print('min normalized value: ' + str(np.min(x_normalized)))
    print('max normalized value: ' + str(np.max(x_normalized)))
    return x_normalized, scale_factor


def feature_select(x, gene_num : int = 2000):
    '''Select highly variable genes (HVGs)
    (See [Stuart *et al*, (2019)](https://www.nature.com/articles/nbt.4096) and its early version [preprint](https://www.biorxiv.org/content/10.1101/460147v1.full.pdf)
    Page 12-13: Data preprocessing - Feature selection for individual datasets).

    Parameters
    ----------
    x : np.array
        \([N, G^{raw}]\) The raw count data.
    gene_num : int, optional
        The number of genes to retain.

    Returns
    ----------
    x : np.array
        \([N, G]\) The count data after gene selection.
    index : np.array
        \([G, ]\) The selected index of genes.
    '''     
    

    n, p = x.shape

    # mean and variance of each gene of the unnormalized data  
    mean, var = np.mean(x, axis=0), np.var(x, axis=0, ddof=1)

    # model log10(var)~log10(mean) by local fitting of polynomials of degree 2
    loess_model = loess.loess(np.log10(mean), np.log10(var), 
                    span = 0.3, degree = 2, family='gaussian'
                    )
    loess_model.fit()
    fitted = loess_model.outputs.fitted_values

    # standardized feature
    z = (x - mean)/np.sqrt(10**fitted)

    # clipped the standardized features to remove outliers
    z = np.clip(z, -np.inf, np.sqrt(n))

    # the variance of standardized features across all cells represents a measure of
    # single cell dispersion after controlling for mean expression    
    feature_score = np.sum(z**2, axis=0)/(n-1)
    
    # feature selection
    index = feature_score.argsort()[::-1][0:gene_num]

    return x[:, index], index


def preprocess(adata = None, processed: bool = False, dimred: bool = False, 
            x = None, c = None, label_names = None, raw_cell_names = None, raw_gene_names = None,  
            K: float = 1e4, transform_fn: str = 'log', gene_num: int = 2000, data_type: str = 'UMI', 
            npc: int = 64, random_state=0):
    '''Preprocess count data.

    Parameters
    ----------
    adata : AnnData, optional
        The scanpy object.
    processed : boolean
        Whether adata has been processed.
    dimred : boolean
        Whether the processed adata is after dimension reduction.
    x : np.array, optional
        \([N^{raw}, G^{raw}]\) The raw count matrix.
    c : np.array
        \([N^{raw}, s]\) The covariate matrix.
    label_names : np.array 
        \([N^{raw}, ]\) The true or estimated cell types.
    raw_cell_names : np.array  
        \([N^{raw}, ]\) The names of cells.
    raw_gene_names : np.array
        \([G^{raw}, ]\) The names of genes.
    K : int, optional
        The normalizing constant.
    transform_fn : str
        The transform function used to normalize the gene expression after scaling. Either 'log' or 'sqrt'.
    gene_num : int, optional
        The number of genes to retain.
    data_type : str, optional
        'UMI', 'non-UMI', or 'Gaussian'.
    npc : int, optional
        The number of PCs to retain, only used if `data_type='Gaussian'`.
    random_state : int, optional
        The random state for PCA. With different random states, the resulted PCA scores are slightly different.

    Returns
    ----------
    x_normalized : np.array
        \([N, G]\) The preprocessed matrix.
    expression : np.array
        \([N, G^{raw}]\) The expression matrix after log-normalization and before scaling.
    x : np.array
        \([N, G]\) The raw count matrix after gene selections.
    c : np.array
        \([N, s]\) The covariates.
    cell_names : np.array
        \([N, ]\) The cell names.
    gene_names : np.array
        \([G^{raw}, ]\) The gene names.
    selected_gene_names : 
        \([G, ]\) The selected gene names.
    scale_factor : 
        \([N, ]\) The scale factors.
    labels : np.array
        \([N, ]\) The encoded labels.
    label_names : np.array
        \([N, ]\) The label names.
    le : sklearn.preprocessing.LabelEncoder
        The label encoder.
    gene_scalar : sklearn.preprocessing.StandardScaler
        The gene scaler.
    '''
    # if input is a scanpy data
    if adata is not None:
        import scanpy as sc
        
        # if the input scanpy is processed
        if processed: 
            x_normalized = x = adata.X
            gene_names = adata.var_names.values
            expression = None
            scale_factor = np.ones(x.shape[0])
        # if the input scanpy is not processed
        else: 
            dimred = False
            x = adata.X.copy()
            adata, expression, gene_names, cell_mask, gene_mask, gene_mask2 = _recipe_seurat(adata, gene_num)
            x_normalized = adata.X.copy()
            scale_factor = adata.obs.counts_per_cell.values / 1e4
            x = x[cell_mask,:][:,gene_mask][:,gene_mask2]
            
        if label_names is None:
            try:
                label_names = adata.obs.cell_types
            except:
                if label_names is not None and processed is False:
                    label_names = label_names[cell_mask]
            
        cell_names = adata.obs_names.values
        selected_gene_names = adata.var_names.values
        gene_scalar = None
    
    # if input is a count matrix
    else:
        # remove cells that have no expression
        expressed = _check_expression(x)
        print('Removing %d cells without expression.'%(np.sum(expressed==0)))
        x = x[expressed==1,:]
        if c is not None:
            c = c[expressed==1,:]
        if label_names is not None:
            label_names = label_names[expressed==1]        
        
        # remove genes without variability
        variable = _check_variability(x)
        print('Removing %d genes without variability.'%(np.sum(variable==0)))
        x = x[:, variable==1]
        gene_names = raw_gene_names[variable==1]

        # log-normalization
        expression, scale_factor = normalize_gene_expression(x, K, transform_fn)
        
        # feature selection
        x, index = feature_select(x, gene_num)
        selected_expression = expression[:, index]
        
        # per-gene standardization
        gene_scalar = preprocessing.StandardScaler()
        x_normalized = gene_scalar.fit_transform(selected_expression)
    
        cell_names = raw_cell_names[expressed==1]
        selected_gene_names = gene_names[index]

    if (data_type=='Gaussian') and (dimred is False):
        # use arpack solver and extend precision to get deterministic result
        pca = PCA(n_components = npc, random_state=random_state, svd_solver='arpack')
        x_normalized = x = pca.fit_transform(x_normalized.astype(np.float64)).astype(np.float32)

    if c is not None:
        c_scalar = preprocessing.StandardScaler()
        c = c_scalar.fit_transform(c)

    if label_names is None:
        warnings.warn('No labels for cells!')
        labels = None
        le = None
    else:
        le = preprocessing.LabelEncoder()
        labels = le.fit_transform(label_names)
        print('Number of cells in each class: ')
        table = pd.value_counts(label_names)
        table.index = pd.Series(le.transform(table.index).astype(str)) \
            + ' <---> ' + table.index
        table = table.sort_index()
        print(table)
        
    return (x_normalized, expression, x, c, 
        cell_names, gene_names, selected_gene_names, 
        scale_factor, labels, label_names, le, gene_scalar)


def _recipe_seurat(adata, gene_num):
    """
    Normalization and filtering as of Seurat [Satija15]_.
    This uses a particular preprocessing
    """
    import scanpy as sc
    cell_mask = sc.pp.filter_cells(adata, min_genes=200, inplace=False)[0]
    adata = adata[cell_mask,:]
    gene_mask = sc.pp.filter_genes(adata, min_cells=3, inplace=False)[0]
    adata = adata[:,gene_mask]
    gene_names = adata.var_names.values

    sc.pp.normalize_total(adata, target_sum=1e4, key_added='counts_per_cell')
    filter_result = sc.pp.filter_genes_dispersion(
        adata.X, min_mean=0.0125, max_mean=3, min_disp=0.5, log=False, n_top_genes=gene_num)
    
    sc.pp.log1p(adata)
    expression = adata.X.copy()
    adata._inplace_subset_var(filter_result.gene_subset)  # filter genes
    sc.pp.scale(adata, max_value=10)
    return adata, expression, gene_names, cell_mask, gene_mask, filter_result.gene_subset

Functions

def normalize_gene_expression(x, K: float = 10000.0, transform_fn: str = 'log')

Normalize the gene expression counts for each cell by the total expression counts, divide this by a size scale factor, which is determined by total counts and a constant K then log-transforms the result.

Parameters

x : np.array: $[N, G^{raw}]$ The raw count data.
K : float, optional: The normalizing constant.
transform_fn : str, optional: Either 'log' or 'sqrt'.

Returns

x_normalized : np.array: $[N, G^{raw}]$ The log-normalized data.
scale_factor : np.array: $[N, ]$ The scale factors.

Expand source code

def normalize_gene_expression(x, K : float = 1e4, transform_fn : str = 'log'):
    '''Normalize the gene expression counts for each cell by the total expression counts, 
    divide this by a size scale factor, which is determined by total counts and a constant K
    then log-transforms the result.

    Parameters
    ----------
    x : np.array
        \([N, G^{raw}]\) The raw count data.
    K : float, optional
        The normalizing constant.
    transform_fn : str, optional
        Either 'log' or 'sqrt'.

    Returns
    ----------
    x_normalized : np.array
        \([N, G^{raw}]\) The log-normalized data.
    scale_factor : np.array
        \([N, ]\) The scale factors.
    '''          
    scale_factor = np.sum(x,axis=1, keepdims=True)/K
    if transform_fn=='log':
        x_normalized = np.log(x/scale_factor + 1)
    else:
        x_normalized = np.where(x>0, np.sqrt(x/scale_factor), 0)

    print('min normalized value: ' + str(np.min(x_normalized)))
    print('max normalized value: ' + str(np.max(x_normalized)))
    return x_normalized, scale_factor

def feature_select(x, gene_num: int = 2000)

Select highly variable genes (HVGs) (See Stuart et al, (2019) and its early version preprint Page 12-13: Data preprocessing - Feature selection for individual datasets).

Parameters

x : np.array: $[N, G^{raw}]$ The raw count data.
gene_num : int, optional: The number of genes to retain.

Returns

x : np.array: $[N, G]$ The count data after gene selection.
index : np.array: $[G, ]$ The selected index of genes.

Expand source code

def feature_select(x, gene_num : int = 2000):
    '''Select highly variable genes (HVGs)
    (See [Stuart *et al*, (2019)](https://www.nature.com/articles/nbt.4096) and its early version [preprint](https://www.biorxiv.org/content/10.1101/460147v1.full.pdf)
    Page 12-13: Data preprocessing - Feature selection for individual datasets).

    Parameters
    ----------
    x : np.array
        \([N, G^{raw}]\) The raw count data.
    gene_num : int, optional
        The number of genes to retain.

    Returns
    ----------
    x : np.array
        \([N, G]\) The count data after gene selection.
    index : np.array
        \([G, ]\) The selected index of genes.
    '''     
    

    n, p = x.shape

    # mean and variance of each gene of the unnormalized data  
    mean, var = np.mean(x, axis=0), np.var(x, axis=0, ddof=1)

    # model log10(var)~log10(mean) by local fitting of polynomials of degree 2
    loess_model = loess.loess(np.log10(mean), np.log10(var), 
                    span = 0.3, degree = 2, family='gaussian'
                    )
    loess_model.fit()
    fitted = loess_model.outputs.fitted_values

    # standardized feature
    z = (x - mean)/np.sqrt(10**fitted)

    # clipped the standardized features to remove outliers
    z = np.clip(z, -np.inf, np.sqrt(n))

    # the variance of standardized features across all cells represents a measure of
    # single cell dispersion after controlling for mean expression    
    feature_score = np.sum(z**2, axis=0)/(n-1)
    
    # feature selection
    index = feature_score.argsort()[::-1][0:gene_num]

    return x[:, index], index

def preprocess(adata=None, processed: bool = False, dimred: bool = False, x=None, c=None, label_names=None, raw_cell_names=None, raw_gene_names=None, K: float = 10000.0, transform_fn: str = 'log', gene_num: int = 2000, data_type: str = 'UMI', npc: int = 64, random_state=0)

Preprocess count data.

Parameters

adata : AnnData, optional: The scanpy object.
processed : boolean: Whether adata has been processed.
dimred : boolean: Whether the processed adata is after dimension reduction.
x : np.array, optional: $[N^{raw}, G^{raw}]$ The raw count matrix.
c : np.array: $[N^{raw}, s]$ The covariate matrix.
label_names : np.array: $[N^{raw}, ]$ The true or estimated cell types.
raw_cell_names : np.array: $[N^{raw}, ]$ The names of cells.
raw_gene_names : np.array: $[G^{raw}, ]$ The names of genes.
K : int, optional: The normalizing constant.
transform_fn : str: The transform function used to normalize the gene expression after scaling. Either 'log' or 'sqrt'.
gene_num : int, optional: The number of genes to retain.
data_type : str, optional: 'UMI', 'non-UMI', or 'Gaussian'.
npc : int, optional: The number of PCs to retain, only used if data_type='Gaussian'.
random_state : int, optional: The random state for PCA. With different random states, the resulted PCA scores are slightly different.

Returns

x_normalized : np.array: $[N, G]$ The preprocessed matrix.
expression : np.array: $[N, G^{raw}]$ The expression matrix after log-normalization and before scaling.
x : np.array: $[N, G]$ The raw count matrix after gene selections.
c : np.array: $[N, s]$ The covariates.
cell_names : np.array: $[N, ]$ The cell names.
gene_names : np.array: $[G^{raw}, ]$ The gene names.
selected_gene_names: $[G, ]$ The selected gene names.
scale_factor: $[N, ]$ The scale factors.
labels : np.array: $[N, ]$ The encoded labels.
label_names : np.array: $[N, ]$ The label names.
le : sklearn.preprocessing.LabelEncoder: The label encoder.
gene_scalar : sklearn.preprocessing.StandardScaler: The gene scaler.

Expand source code

def preprocess(adata = None, processed: bool = False, dimred: bool = False, 
            x = None, c = None, label_names = None, raw_cell_names = None, raw_gene_names = None,  
            K: float = 1e4, transform_fn: str = 'log', gene_num: int = 2000, data_type: str = 'UMI', 
            npc: int = 64, random_state=0):
    '''Preprocess count data.

    Parameters
    ----------
    adata : AnnData, optional
        The scanpy object.
    processed : boolean
        Whether adata has been processed.
    dimred : boolean
        Whether the processed adata is after dimension reduction.
    x : np.array, optional
        \([N^{raw}, G^{raw}]\) The raw count matrix.
    c : np.array
        \([N^{raw}, s]\) The covariate matrix.
    label_names : np.array 
        \([N^{raw}, ]\) The true or estimated cell types.
    raw_cell_names : np.array  
        \([N^{raw}, ]\) The names of cells.
    raw_gene_names : np.array
        \([G^{raw}, ]\) The names of genes.
    K : int, optional
        The normalizing constant.
    transform_fn : str
        The transform function used to normalize the gene expression after scaling. Either 'log' or 'sqrt'.
    gene_num : int, optional
        The number of genes to retain.
    data_type : str, optional
        'UMI', 'non-UMI', or 'Gaussian'.
    npc : int, optional
        The number of PCs to retain, only used if `data_type='Gaussian'`.
    random_state : int, optional
        The random state for PCA. With different random states, the resulted PCA scores are slightly different.

    Returns
    ----------
    x_normalized : np.array
        \([N, G]\) The preprocessed matrix.
    expression : np.array
        \([N, G^{raw}]\) The expression matrix after log-normalization and before scaling.
    x : np.array
        \([N, G]\) The raw count matrix after gene selections.
    c : np.array
        \([N, s]\) The covariates.
    cell_names : np.array
        \([N, ]\) The cell names.
    gene_names : np.array
        \([G^{raw}, ]\) The gene names.
    selected_gene_names : 
        \([G, ]\) The selected gene names.
    scale_factor : 
        \([N, ]\) The scale factors.
    labels : np.array
        \([N, ]\) The encoded labels.
    label_names : np.array
        \([N, ]\) The label names.
    le : sklearn.preprocessing.LabelEncoder
        The label encoder.
    gene_scalar : sklearn.preprocessing.StandardScaler
        The gene scaler.
    '''
    # if input is a scanpy data
    if adata is not None:
        import scanpy as sc
        
        # if the input scanpy is processed
        if processed: 
            x_normalized = x = adata.X
            gene_names = adata.var_names.values
            expression = None
            scale_factor = np.ones(x.shape[0])
        # if the input scanpy is not processed
        else: 
            dimred = False
            x = adata.X.copy()
            adata, expression, gene_names, cell_mask, gene_mask, gene_mask2 = _recipe_seurat(adata, gene_num)
            x_normalized = adata.X.copy()
            scale_factor = adata.obs.counts_per_cell.values / 1e4
            x = x[cell_mask,:][:,gene_mask][:,gene_mask2]
            
        if label_names is None:
            try:
                label_names = adata.obs.cell_types
            except:
                if label_names is not None and processed is False:
                    label_names = label_names[cell_mask]
            
        cell_names = adata.obs_names.values
        selected_gene_names = adata.var_names.values
        gene_scalar = None
    
    # if input is a count matrix
    else:
        # remove cells that have no expression
        expressed = _check_expression(x)
        print('Removing %d cells without expression.'%(np.sum(expressed==0)))
        x = x[expressed==1,:]
        if c is not None:
            c = c[expressed==1,:]
        if label_names is not None:
            label_names = label_names[expressed==1]        
        
        # remove genes without variability
        variable = _check_variability(x)
        print('Removing %d genes without variability.'%(np.sum(variable==0)))
        x = x[:, variable==1]
        gene_names = raw_gene_names[variable==1]

        # log-normalization
        expression, scale_factor = normalize_gene_expression(x, K, transform_fn)
        
        # feature selection
        x, index = feature_select(x, gene_num)
        selected_expression = expression[:, index]
        
        # per-gene standardization
        gene_scalar = preprocessing.StandardScaler()
        x_normalized = gene_scalar.fit_transform(selected_expression)
    
        cell_names = raw_cell_names[expressed==1]
        selected_gene_names = gene_names[index]

    if (data_type=='Gaussian') and (dimred is False):
        # use arpack solver and extend precision to get deterministic result
        pca = PCA(n_components = npc, random_state=random_state, svd_solver='arpack')
        x_normalized = x = pca.fit_transform(x_normalized.astype(np.float64)).astype(np.float32)

    if c is not None:
        c_scalar = preprocessing.StandardScaler()
        c = c_scalar.fit_transform(c)

    if label_names is None:
        warnings.warn('No labels for cells!')
        labels = None
        le = None
    else:
        le = preprocessing.LabelEncoder()
        labels = le.fit_transform(label_names)
        print('Number of cells in each class: ')
        table = pd.value_counts(label_names)
        table.index = pd.Series(le.transform(table.index).astype(str)) \
            + ' <---> ' + table.index
        table = table.sort_index()
        print(table)
        
    return (x_normalized, expression, x, c, 
        cell_names, gene_names, selected_gene_names, 
        scale_factor, labels, label_names, le, gene_scalar)