MultiVelo / Git / [0ad989] /src/multivelo/auxiliary.py

Models:
AlyssaS/
MultiVelo
Downloads: 1
[0ad989]: / src / multivelo / auxiliary.py
History
Download this file
1037 lines (908 with data), 42.3 kB

import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix, csr_matrix, diags
from umap.umap_ import fuzzy_simplicial_set
from anndata import AnnData
import scanpy as sc
import scvelo as scv
import pandas as pd
from tqdm.auto import tqdm
import scipy
import os
import sys
from joblib import Parallel, delayed
from tqdm.auto import tqdm

current_path = os.path.dirname(__file__)
src_path = os.path.join(current_path, "..")
sys.path.append(src_path)

from multivelo import mv_logging as logg
from multivelo import settings

current_path = os.path.dirname(__file__)

sys.path.append(current_path)

from pyWNN import pyWNN

def do_count(fastqs, input_loc, output_loc, whitelist_path=None, tech="10XV3", strand=None, threads=8, memory=4):

    """Get spliced and unspliced counts from fastq data.

    Makes use of the kallisto-bustools count function:
    https://www.kallistobus.tools/kb_usage/kb_count/
    kb-python.readthedocs.io/en/latest/autoapi/kb_python/count/index.html

    Parameters
    ----------
    fastqs: `List[str]`
        The file locations of the fastqs to process.
    input_loc: `str`
        The folder location of the reference files.
        The folder should contain an index file with the name "index.idx", a 
        transcripts-to-gene file with the name "t2g.txt", a cDNA 
        transcripts-to-capture file with the name "cdna_t2c.txt", and an
        intron transcripts-to-captured file with the name intron_t2c.txt.
    output_loc: `str`
        The desired folder location of the output of the function.
    whitelist_path: `str` (default: `None`)
        Path to a barcode whitelist to use to replace the selected technology's
        whitelist.
    tech: `str` (default: `10XV3`)
        The technology used to collect the single-cell data. 
    strand: `str` (default: `None`)
        The strandedness desired to process the data.
    threads: `int`
        The number of threads to use for parallel processing.
    memory: `int`
        Maximum memory (in GB) to use while processing.

    Returns
    -------
    adata_count: :class:`~anndata.AnnData`
        An AnnData object containing all the spliced and unspliced counts,
        as well as associated gene names.

    """

    # convert the number of threads and the amount of allocated memory
    # into correctly-formatted strings for running kb count
    thread_string = str(threads)
    memory_string = str(memory) + "G"

    # locations of important files
    index_loc = input_loc + "/index.idx"
    t2g_loc = input_loc + "/t2g.txt"

    cdna_t2c = input_loc + "/cdna_t2c.txt"
    intron_t2c = input_loc + "/intron_t2c.txt"

    # keep the original argv values in case the user specifies it
    orig_argv = sys.argv

    # assemble the input array to use for kb count
    input_array = ["count",
                "count",
                "-i", index_loc, "-g", t2g_loc,
                "-x", tech,
                "-o", output_loc,
                "-t", thread_string, "-m", memory_string,
                "--workflow", "lamanno",
                "-c1", cdna_t2c,
                "-c2", intron_t2c,
                "--h5ad"]

    # specify the stranded-ness of the run
    if strand is not None:
        input_array.append("--strand")
        input_array.append(strand)

    # specify the whitelist path of the run
    if whitelist_path is not None:
        input_array.append("-w")
        input_array.append(whitelist_path)

    # add the fastq's we're doing the run on
    for fastq in fastqs:
        input_array.append(fastq)

    # use our assembled input array as the parameters for kb count
    sys.argv = input_array

    # run kb count
    kbm.main()

    # set argv back to its original value
    sys.argv = orig_argv

    # get the anndata object for 
    path = output_loc + "/counts_unfiltered"
    adata_count = sc.read(path + "/adata.h5ad")

    return adata_count


def prepare_gene_mat(var_dict, peaks, gene_mat, adata_atac_X_copy, i):

    for peak in peaks:
        if peak in var_dict:
            peak_index = var_dict[peak]

            gene_mat[:, i] += adata_atac_X_copy[:, peak_index]


def aggregate_peaks_10x(adata_atac, peak_annot_file, linkage_file,
                        peak_dist=10000, min_corr=0.5, gene_body=False,
                        return_dict=False, parallel=False, n_jobs=1):

    """Peak to gene aggregation.

    This function aggregates promoter and enhancer peaks to genes based on the
    10X linkage file.

    Parameters
    ----------
    adata_atac: :class:`~anndata.AnnData`
        ATAC anndata object which stores raw peak counts.
    peak_annot_file: `str`
        Peak annotation file from 10X CellRanger ARC.
    linkage_file: `str`
        Peak-gene linkage file from 10X CellRanger ARC. This file stores highly
        correlated peak-peak and peak-gene pair information.
    peak_dist: `int` (default: 10000)
        Maximum distance for peaks to be included for a gene.
    min_corr: `float` (default: 0.5)
        Minimum correlation for a peak to be considered as enhancer.
    gene_body: `bool` (default: `False`)
        Whether to add gene body peaks to the associated promoters.
    return_dict: `bool` (default: `False`)
        Whether to return promoter and enhancer dictionaries.

    Returns
    -------
    A new ATAC anndata object which stores gene aggreagted peak counts.
    Additionally, if `return_dict==True`:
        A dictionary which stores genes and promoter peaks.
        And a dictionary which stores genes and enhancer peaks.
    """
    promoter_dict = {}
    distal_dict = {}
    gene_body_dict = {}
    corr_dict = {}

    # read annotations
    with open(peak_annot_file) as f:
        header = next(f)
        tmp = header.split('\t')
        if len(tmp) == 4:
            cellranger_version = 1
        elif len(tmp) == 6:
            cellranger_version = 2
        else:
            raise ValueError('Peak annotation file should contain 4 columns '
                             '(CellRanger ARC 1.0.0) or 6 columns (CellRanger '
                             'ARC 2.0.0)')

        logg.update(f'CellRanger ARC identified as {cellranger_version}.0.0',
                    v=1)

        if cellranger_version == 1:
            for line in f:
                tmp = line.rstrip().split('\t')
                tmp1 = tmp[0].split('_')
                peak = f'{tmp1[0]}:{tmp1[1]}-{tmp1[2]}'
                if tmp[1] != '':
                    genes = tmp[1].split(';')
                    dists = tmp[2].split(';')
                    types = tmp[3].split(';')
                    for i, gene in enumerate(genes):
                        dist = dists[i]
                        annot = types[i]
                        if annot == 'promoter':
                            if gene not in promoter_dict:
                                promoter_dict[gene] = [peak]
                            else:
                                promoter_dict[gene].append(peak)
                        elif annot == 'distal':
                            if dist == '0':
                                if gene not in gene_body_dict:
                                    gene_body_dict[gene] = [peak]
                                else:
                                    gene_body_dict[gene].append(peak)
                            else:
                                if gene not in distal_dict:
                                    distal_dict[gene] = [peak]
                                else:
                                    distal_dict[gene].append(peak)
        else:
            for line in f:
                tmp = line.rstrip().split('\t')
                peak = f'{tmp[0]}:{tmp[1]}-{tmp[2]}'
                gene = tmp[3]
                dist = tmp[4]
                annot = tmp[5]
                if annot == 'promoter':
                    if gene not in promoter_dict:
                        promoter_dict[gene] = [peak]
                    else:
                        promoter_dict[gene].append(peak)
                elif annot == 'distal':
                    if dist == '0':
                        if gene not in gene_body_dict:
                            gene_body_dict[gene] = [peak]
                        else:
                            gene_body_dict[gene].append(peak)
                    else:
                        if gene not in distal_dict:
                            distal_dict[gene] = [peak]
                        else:
                            distal_dict[gene].append(peak)

    # read linkages
    with open(linkage_file) as f:
        for line in f:
            tmp = line.rstrip().split('\t')
            if tmp[12] == "peak-peak":
                peak1 = f'{tmp[0]}:{tmp[1]}-{tmp[2]}'
                peak2 = f'{tmp[3]}:{tmp[4]}-{tmp[5]}'
                tmp2 = tmp[6].split('><')[0][1:].split(';')
                tmp3 = tmp[6].split('><')[1][:-1].split(';')
                corr = float(tmp[7])
                for t2 in tmp2:
                    gene1 = t2.split('_')
                    for t3 in tmp3:
                        gene2 = t3.split('_')
                        # one of the peaks is in promoter, peaks belong to the
                        # same gene or are close in distance
                        if (((gene1[1] == "promoter") !=
                            (gene2[1] == "promoter")) and
                            ((gene1[0] == gene2[0]) or
                             (float(tmp[11]) < peak_dist))):

                            if gene1[1] == "promoter":
                                gene = gene1[0]
                            else:
                                gene = gene2[0]
                            if gene in corr_dict:
                                # peak 1 is in promoter, peak 2 is not in gene
                                # body -> peak 2 is added to gene 1
                                if (peak2 not in corr_dict[gene] and
                                    gene1[1] == "promoter" and
                                    (gene2[0] not in gene_body_dict or
                                     peak2 not in gene_body_dict[gene2[0]])):

                                    corr_dict[gene][0].append(peak2)
                                    corr_dict[gene][1].append(corr)
                                # peak 2 is in promoter, peak 1 is not in gene
                                # body -> peak 1 is added to gene 2
                                if (peak1 not in corr_dict[gene] and
                                    gene2[1] == "promoter" and
                                    (gene1[0] not in gene_body_dict or
                                     peak1 not in gene_body_dict[gene1[0]])):

                                    corr_dict[gene][0].append(peak1)
                                    corr_dict[gene][1].append(corr)
                            else:
                                # peak 1 is in promoter, peak 2 is not in gene
                                # body -> peak 2 is added to gene 1
                                if (gene1[1] == "promoter" and
                                    (gene2[0] not in
                                     gene_body_dict
                                     or peak2 not in
                                     gene_body_dict[gene2[0]])):

                                    corr_dict[gene] = [[peak2], [corr]]
                                # peak 2 is in promoter, peak 1 is not in gene
                                # body -> peak 1 is added to gene 2
                                if (gene2[1] == "promoter" and
                                    (gene1[0] not in
                                     gene_body_dict
                                     or peak1 not in
                                     gene_body_dict[gene1[0]])):

                                    corr_dict[gene] = [[peak1], [corr]]
            elif tmp[12] == "peak-gene":
                peak1 = f'{tmp[0]}:{tmp[1]}-{tmp[2]}'
                tmp2 = tmp[6].split('><')[0][1:].split(';')
                gene2 = tmp[6].split('><')[1][:-1]
                corr = float(tmp[7])
                for t2 in tmp2:
                    gene1 = t2.split('_')
                    # peak 1 belongs to gene 2 or are close in distance
                    # -> peak 1 is added to gene 2
                    if ((gene1[0] == gene2) or (float(tmp[11]) < peak_dist)):
                        gene = gene1[0]
                        if gene in corr_dict:
                            if (peak1 not in corr_dict[gene] and
                                gene1[1] != "promoter" and
                                (gene1[0] not in gene_body_dict or
                                 peak1 not in gene_body_dict[gene1[0]])):

                                corr_dict[gene][0].append(peak1)
                                corr_dict[gene][1].append(corr)
                        else:
                            if (gene1[1] != "promoter" and
                                (gene1[0] not in gene_body_dict or
                                 peak1 not in gene_body_dict[gene1[0]])):
                                corr_dict[gene] = [[peak1], [corr]]
            elif tmp[12] == "gene-peak":
                peak2 = f'{tmp[3]}:{tmp[4]}-{tmp[5]}'
                gene1 = tmp[6].split('><')[0][1:]
                tmp3 = tmp[6].split('><')[1][:-1].split(';')
                corr = float(tmp[7])
                for t3 in tmp3:
                    gene2 = t3.split('_')
                    # peak 2 belongs to gene 1 or are close in distance
                    # -> peak 2 is added to gene 1
                    if ((gene1 == gene2[0]) or (float(tmp[11]) < peak_dist)):
                        gene = gene1
                        if gene in corr_dict:
                            if (peak2 not in corr_dict[gene] and
                                gene2[1] != "promoter" and
                                (gene2[0] not in gene_body_dict or
                                 peak2 not in gene_body_dict[gene2[0]])):

                                corr_dict[gene][0].append(peak2)
                                corr_dict[gene][1].append(corr)
                        else:
                            if (gene2[1] != "promoter" and
                                (gene2[0] not in gene_body_dict or
                                 peak2 not in gene_body_dict[gene2[0]])):

                                corr_dict[gene] = [[peak2], [corr]]

    gene_dict = promoter_dict
    enhancer_dict = {}
    promoter_genes = list(promoter_dict.keys())
    logg.update(f'Found {len(promoter_genes)} genes with promoter peaks', 1)
    for gene in promoter_genes:
        if gene_body:  # add gene-body peaks
            if gene in gene_body_dict:
                for peak in gene_body_dict[gene]:
                    if peak not in gene_dict[gene]:
                        gene_dict[gene].append(peak)
        enhancer_dict[gene] = []
        if gene in corr_dict:  # add enhancer peaks
            for j, peak in enumerate(corr_dict[gene][0]):
                corr = corr_dict[gene][1][j]
                if corr > min_corr:
                    if peak not in gene_dict[gene]:
                        gene_dict[gene].append(peak)
                        enhancer_dict[gene].append(peak)

    # aggregate to genes
    adata_atac_X_copy = adata_atac.X.A
    gene_mat = np.zeros((adata_atac.shape[0], len(promoter_genes)))
    var_names = adata_atac.var_names.to_numpy()
    var_dict = {}

    for i, name in enumerate(var_names):
        var_dict.update({name: i})

    # if we only want to run one job at a time, then no parallelization
    # is necessary
    if n_jobs == 1:
        parallel = False

    if parallel:
        # if we want to run in parallel, modify the gene_mat variable with
        # multiple cores, calling prepare_gene_mat with joblib.Parallel()
        Parallel(n_jobs=n_jobs,
                 require='sharedmem')(
                 delayed(prepare_gene_mat)(var_dict,
                                           gene_dict[promoter_genes[i]],
                                           gene_mat,
                                           adata_atac_X_copy,
                                           i)for i in tqdm(range(
                                               len(promoter_genes))))

    else:
        # if we aren't running in parallel, just call prepare_gene_mat
        # from a for loop
        for i, gene in tqdm(enumerate(promoter_genes),
                            total=len(promoter_genes)):
            prepare_gene_mat(var_dict,
                             gene_dict[promoter_genes[i]],
                             gene_mat,
                             adata_atac_X_copy,
                             i)

    gene_mat[gene_mat < 0] = 0
    gene_mat = AnnData(X=csr_matrix(gene_mat))
    gene_mat.obs_names = pd.Index(list(adata_atac.obs_names))
    gene_mat.var_names = pd.Index(promoter_genes)
    gene_mat = gene_mat[:, gene_mat.X.sum(0) > 0]
    if return_dict:
        return gene_mat, promoter_dict, enhancer_dict
    else:
        return gene_mat


def tfidf_norm(adata_atac, scale_factor=1e4, copy=False):
    """TF-IDF normalization.

    This function normalizes counts in an AnnData object with TF-IDF.

    Parameters
    ----------
    adata_atac: :class:`~anndata.AnnData`
        ATAC anndata object.
    scale_factor: `float` (default: 1e4)
        Value to be multiplied after normalization.
    copy: `bool` (default: `False`)
        Whether to return a copy or modify `.X` directly.

    Returns
    -------
    If `copy==True`, a new ATAC anndata object which stores normalized counts
    in `.X`.
    """
    npeaks = adata_atac.X.sum(1)
    npeaks_inv = csr_matrix(1.0/npeaks)
    tf = adata_atac.X.multiply(npeaks_inv)
    idf = diags(np.ravel(adata_atac.X.shape[0] / adata_atac.X.sum(0))).log1p()
    if copy:
        adata_atac_copy = adata_atac.copy()
        adata_atac_copy.X = tf.dot(idf) * scale_factor
        return adata_atac_copy
    else:
        adata_atac.X = tf.dot(idf) * scale_factor


def gen_wnn(adata_rna, adata_adt, dims, nn, random_state=0):
    """Computes inputs for KNN smoothing.

    This function calculates the nn_idx and nn_dist matrices needed
    to run knn_smooth_chrom().

    Parameters
    ----------
    adata_rna: :class:`~anndata.AnnData`
        RNA anndata object.
    adata_atac: :class:`~anndata.AnnData`
        ATAC anndata object.
    dims: `List[int]`
        Dimensions of data for RNA (index=0) and ATAC (index=1)
    nn: `int` (default: `None`)
        Top N neighbors to extract for each cell in the connectivities matrix.

    Returns
    -------
    nn_idx: `np.darray` (default: `None`)
        KNN index matrix of size (cells, k).
    nn_dist: `np.darray` (default: `None`)
        KNN distance matrix of size (cells, k).
    """

    # make a copy of the original adata objects so as to keep them unchanged
    rna_copy = adata_rna.copy()
    adt_copy = adata_adt.copy()

    sc.tl.pca(rna_copy,
              n_comps=dims[0],
              random_state=np.random.RandomState(seed=42),
              use_highly_variable=True)  # run PCA on RNA

    lsi = scipy.sparse.linalg.svds(adt_copy.X, k=dims[1])  # run SVD on ADT

    # get the lsi result
    adt_copy.obsm['X_lsi'] = lsi[0]

    # add the PCA from adt to rna
    rna_copy.obsm['X_rna_pca'] = rna_copy.obsm.pop('X_pca')
    rna_copy.obsm['X_adt_lsi'] = adt_copy.obsm['X_lsi']

    # run WNN
    WNNobj = pyWNN(rna_copy,
                      reps=['X_rna_pca', 'X_adt_lsi'],
                      npcs=dims,
                      n_neighbors=nn,
                      seed=42)

    adata_seurat = WNNobj.compute_wnn(rna_copy)

    # get the matrix storing the distances between each cell and its neighbors
    cx = scipy.sparse.coo_matrix(adata_seurat.obsp["WNN_distance"])

    # the number of cells
    cells = adata_seurat.obsp['WNN_distance'].shape[0]

    # define the shape of our final results
    # and make the arrays that will hold the results
    new_shape = (cells, nn)
    nn_dist = np.zeros(shape=new_shape)
    nn_idx = np.zeros(shape=new_shape)

    # new_col defines what column we store data in
    # our result arrays
    new_col = 0

    # loop through the distance matrices
    for i, j, v in zip(cx.row, cx.col, cx.data):

        # store the distances between neighbor cells
        nn_dist[i][new_col % nn] = v

        # for each cell's row, store the row numbers of its neighbor cells
        # (1-indexing instead of 0- is a holdover from R multimodalneighbors())
        nn_idx[i][new_col % nn] = int(j) + 1

        new_col += 1

    return nn_idx, nn_dist


def knn_smooth_chrom(adata_atac, nn_idx=None, nn_dist=None, conn=None,
                     n_neighbors=None):
    """KNN smoothing.

    This function smooth (impute) the count matrix with k nearest neighbors.
    The inputs can be either KNN index and distance matrices or a pre-computed
    connectivities matrix (for example in adata_rna object).

    Parameters
    ----------
    adata_atac: :class:`~anndata.AnnData`
        ATAC anndata object.
    nn_idx: `np.darray` (default: `None`)
        KNN index matrix of size (cells, k).
    nn_dist: `np.darray` (default: `None`)
        KNN distance matrix of size (cells, k).
    conn: `csr_matrix` (default: `None`)
        Pre-computed connectivities matrix.
    n_neighbors: `int` (default: `None`)
        Top N neighbors to extract for each cell in the connectivities matrix.

    Returns
    -------
    `.layers['Mc']` stores imputed values.
    """
    if nn_idx is not None and nn_dist is not None:
        if nn_idx.shape[0] != adata_atac.shape[0]:
            raise ValueError('Number of rows of KNN indices does not equal to '
                             'number of observations.')
        if nn_dist.shape[0] != adata_atac.shape[0]:
            raise ValueError('Number of rows of KNN distances does not equal '
                             'to number of observations.')
        X = coo_matrix(([], ([], [])), shape=(nn_idx.shape[0], 1))
        conn, sigma, rho, dists = fuzzy_simplicial_set(X, nn_idx.shape[1],
                                                       None, None,
                                                       knn_indices=nn_idx-1,
                                                       knn_dists=nn_dist,
                                                       return_dists=True)
    elif conn is not None:
        pass
    else:
        raise ValueError('Please input nearest neighbor indices and distances,'
                         ' or a connectivities matrix of size n x n, with '
                         'columns being neighbors.'
                         ' For example, RNA connectivities can usually be '
                         'found in adata.obsp.')

    conn = conn.tocsr().copy()
    n_counts = (conn > 0).sum(1).A1
    if n_neighbors is not None and n_neighbors < n_counts.min():
        conn = top_n_sparse(conn, n_neighbors)
    conn.setdiag(1)
    conn_norm = conn.multiply(1.0 / conn.sum(1)).tocsr()
    adata_atac.layers['Mc'] = csr_matrix.dot(conn_norm, adata_atac.X)
    adata_atac.obsp['connectivities'] = conn


def calculate_qc_metrics(adata, **kwargs):
    """Basic QC metrics.

    This function calculate basic QC metrics with
    `scanpy.pp.calculate_qc_metrics`.
    Additionally, total counts and the ratio of unspliced and spliced matrices,
    as well as the cell cycle scores (with `scvelo.tl.score_genes_cell_cycle`)
    will be computed.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        RNA anndata object. Required fields: `unspliced` and `spliced`.
    Additional parameters passed to `scanpy.pp.calculate_qc_metrics`.

    Returns
    -------
    Outputs of `scanpy.pp.calculate_qc_metrics` and
    `scvelo.tl.score_genes_cell_cycle`. total_unspliced, total_spliced: `.var`
        total counts of unspliced and spliced matrices.
    unspliced_ratio: `.var`
        ratio of unspliced counts vs (unspliced + spliced counts).
    cell_cycle_score: `.var`
        cell cycle score difference between G2M_score and S_score.
    """
    sc.pp.calculate_qc_metrics(adata, **kwargs)
    if 'spliced' not in adata.layers:
        raise ValueError('Spliced matrix not found in adata.layers')
    if 'unspliced' not in adata.layers:
        raise ValueError('Unspliced matrix not found in adata.layers')

    logg.update(adata.layers['spliced'].shape, v=1)

    total_s = np.nansum(adata.layers['spliced'].toarray(), axis=1)
    total_u = np.nansum(adata.layers['unspliced'].toarray(), axis=1)

    logg.update(total_u.shape, v=1)

    adata.obs['total_unspliced'] = total_u
    adata.obs['total_spliced'] = total_s
    adata.obs['unspliced_ratio'] = total_u / (total_s + total_u)
    scv.tl.score_genes_cell_cycle(adata)
    adata.obs['cell_cycle_score'] = (adata.obs['G2M_score']
                                     - adata.obs['S_score'])


def ellipse_fit(adata,
                genes,
                color_by='quantile',
                n_cols=8,
                title=None,
                figsize=None,
                axis_on=False,
                pointsize=2,
                linewidth=2
                ):
    """Fit ellipses to unspliced and spliced phase portraits.

    This function plots the ellipse fits on the unspliced-spliced phase
    portraits.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        RNA anndata object. Required fields: `Mu` and `Ms`.
    genes: `str`,  list of `str`
        List of genes to plot.
    color_by: `str` (default: `quantile`)
        Color by the four quantiles based on ellipse fit if `quantile`. Other
        common values are leiden, louvain, celltype, etc.
        If not `quantile`, the color field must be present in `.uns`, which
        can be pre-computed with `scanpy.pl.scatter`.
        For `quantile`, red, orange, green, and blue represent quantile left,
        top, right, and bottom, respectively.
        If `quantile_scores`, `multivelo.compute_quantile_scores` function
        must have been run.
    n_cols: `int` (default: 8)
        Number of columns to plot on each row.
    figsize: `tuple` (default: `None`)
        Total figure size.
    title: `tuple` (default: `None`)
        Title of the figure. Default is `Ellipse Fit`.
    axis_on: `bool` (default: `False`)
        Whether to show axis labels.
    pointsize: `float` (default: 2)
        Point size for scatter plots.
    linewidth: `float` (default: 2)
        Line width for ellipse.
    """
    by_quantile = color_by == 'quantile'
    by_quantile_score = color_by == 'quantile_scores'
    if not by_quantile and not by_quantile_score:
        types = adata.obs[color_by].cat.categories
        colors = adata.uns[f'{color_by}_colors']
    gn = len(genes)
    if gn < n_cols:
        n_cols = gn
    fig, axs = plt.subplots(-(-gn // n_cols), n_cols, squeeze=False,
                            figsize=(2 * n_cols, 2.4 * (-(-gn // n_cols)))
                            if figsize is None else figsize)
    count = 0
    for gene in genes:
        u = np.array(adata[:, gene].layers['Mu'])
        s = np.array(adata[:, gene].layers['Ms'])
        row = count // n_cols
        col = count % n_cols
        non_zero = (u > 0) & (s > 0)
        if np.sum(non_zero) < 10:
            count += 1
            fig.delaxes(axs[row, col])
            continue

        mean_u, mean_s = np.mean(u[non_zero]), np.mean(s[non_zero])
        std_u, std_s = np.std(u[non_zero]), np.std(s[non_zero])
        u_ = (u - mean_u)/std_u
        s_ = (s - mean_s)/std_s
        X = np.reshape(s_[non_zero], (-1, 1))
        Y = np.reshape(u_[non_zero], (-1, 1))

        # Ax^2 + Bxy + Cy^2 + Dx + Ey + 1 = 0
        A = np.hstack([X**2, X * Y, Y**2, X, Y])
        b = -np.ones_like(X)
        x, res, _, _ = np.linalg.lstsq(A, b)
        x = x.squeeze()
        A, B, C, D, E = x
        good_fit = B**2 - 4*A*C < 0
        theta = np.arctan(B/(A - C))/2 \
            if x[0] > x[2] \
            else np.pi/2 + np.arctan(B/(A - C))/2
        good_fit = good_fit & (theta < np.pi/2) & (theta > 0)
        if not good_fit:
            count += 1
            fig.delaxes(axs[row, col])
            continue
        x_coord = np.linspace((-mean_s)/std_s, (np.max(s)-mean_s)/std_s, 500)
        y_coord = np.linspace((-mean_u)/std_u, (np.max(u)-mean_u)/std_u, 500)
        X_coord, Y_coord = np.meshgrid(x_coord, y_coord)
        Z_coord = (A * X_coord**2 + B * X_coord * Y_coord + C * Y_coord**2 +
                   D * X_coord + E * Y_coord + 1)

        M0 = np.array([
             A, B/2, D/2,
             B/2, C, E/2,
             D/2, E/2, 1,
        ]).reshape(3, 3)
        M = np.array([
            A, B/2,
            B/2, C,
        ]).reshape(2, 2)
        l1, l2 = np.sort(np.linalg.eigvals(M))
        xc = (B*E - 2*C*D)/(4*A*C - B**2)
        yc = (B*D - 2*A*E)/(4*A*C - B**2)
        slope_major = np.tan(theta)
        theta2 = np.pi/2 + theta
        slope_minor = np.tan(theta2)
        a = np.sqrt(-np.linalg.det(M0)/np.linalg.det(M)/l2)
        b = np.sqrt(-np.linalg.det(M0)/np.linalg.det(M)/l1)
        xtop = xc + a*np.cos(theta)
        ytop = yc + a*np.sin(theta)
        xbot = xc - a*np.cos(theta)
        ybot = yc - a*np.sin(theta)
        xtop2 = xc + b*np.cos(theta2)
        ytop2 = yc + b*np.sin(theta2)
        xbot2 = xc - b*np.cos(theta2)
        ybot2 = yc - b*np.sin(theta2)
        mse = res[0] / np.sum(non_zero)
        major = lambda x, y: (y - yc) - (slope_major * (x - xc))
        minor = lambda x, y: (y - yc) - (slope_minor * (x - xc))
        quant1 = (major(s_, u_) > 0) & (minor(s_, u_) < 0)
        quant2 = (major(s_, u_) > 0) & (minor(s_, u_) > 0)
        quant3 = (major(s_, u_) < 0) & (minor(s_, u_) > 0)
        quant4 = (major(s_, u_) < 0) & (minor(s_, u_) < 0)
        if (np.sum(quant1 | quant4) < 10) or (np.sum(quant2 | quant3) < 10):
            count += 1
            continue

        if by_quantile:
            axs[row, col].scatter(s_[quant1], u_[quant1], s=pointsize,
                                  c='tab:red', alpha=0.6)
            axs[row, col].scatter(s_[quant2], u_[quant2], s=pointsize,
                                  c='tab:orange', alpha=0.6)
            axs[row, col].scatter(s_[quant3], u_[quant3], s=pointsize,
                                  c='tab:green', alpha=0.6)
            axs[row, col].scatter(s_[quant4], u_[quant4], s=pointsize,
                                  c='tab:blue', alpha=0.6)
        elif by_quantile_score:
            if 'quantile_scores' not in adata.layers:
                raise ValueError('Please run multivelo.compute_quantile_scores'
                                 ' first to compute quantile scores.')
            axs[row, col].scatter(s_, u_, s=pointsize,
                                  c=adata[:, gene].layers['quantile_scores'],
                                  cmap='RdBu_r', alpha=0.7)
        else:
            for i in range(len(types)):
                filt = adata.obs[color_by] == types[i]
                axs[row, col].scatter(s_[filt], u_[filt], s=pointsize,
                                      c=colors[i], alpha=0.7)
        axs[row, col].contour(X_coord, Y_coord, Z_coord, levels=[0],
                              colors=('r'), linewidths=linewidth, alpha=0.7)
        axs[row, col].scatter([xc], [yc], c='black', s=5, zorder=2)
        axs[row, col].scatter([0], [0], c='black', s=5, zorder=2)
        axs[row, col].plot([xtop, xbot], [ytop, ybot], color='b',
                           linestyle='dashed', linewidth=linewidth, alpha=0.7)
        axs[row, col].plot([xtop2, xbot2], [ytop2, ybot2], color='g',
                           linestyle='dashed', linewidth=linewidth, alpha=0.7)

        axs[row, col].set_title(f'{gene} {mse:.3g}')
        axs[row, col].set_xlabel('s')
        axs[row, col].set_ylabel('u')
        common_range = [(np.min([(-mean_s)/std_s, (-mean_u)/std_u])
                        - (0.05*np.max(s)/std_s)),
                        (np.max([(np.max(s)-mean_s)/std_s,
                                 (np.max(u)-mean_u)/std_u])
                        + (0.05*np.max(s)/std_s))]
        axs[row, col].set_xlim(common_range)
        axs[row, col].set_ylim(common_range)
        if not axis_on:
            axs[row, col].xaxis.set_ticks_position('none')
            axs[row, col].yaxis.set_ticks_position('none')
            axs[row, col].get_xaxis().set_visible(False)
            axs[row, col].get_yaxis().set_visible(False)
            axs[row, col].xaxis.set_ticks_position('none')
            axs[row, col].yaxis.set_ticks_position('none')
            axs[row, col].set_frame_on(False)
        count += 1

    for i in range(col+1, n_cols):
        fig.delaxes(axs[row, i])
    if title is not None:
        fig.suptitle(title, fontsize=15)
    else:
        fig.suptitle('Ellipse Fit', fontsize=15)
    fig.tight_layout(rect=[0, 0.1, 1, 0.98])


def compute_quantile_scores(adata,
                            n_pcs=30,
                            n_neighbors=30
                            ):
    """Fit ellipses to unspliced and spliced phase portraits and compute
        quantile scores.

    This function fit ellipses to unspliced-spliced phase portraits. The cells
    are split into four groups (quantiles) based on the axes of the ellipse.
    Then the function assigns each quantile a score: -3 for left, -1 for top, 1
    for right, and 3 for bottom. These gene-specific values are smoothed with a
    connectivities matrix. This is similar to the RNA velocity gene time
    assignment.

    In addition, a 2-bit tuple is assigned to each of the four quantiles, (0,0)
    for left, (1,0) for top, (1,1) for right, and (0,1) for bottom. This is to
    mimic the distance relationship between quantiles.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        RNA anndata object. Required fields: `Mu` and `Ms`.
    n_pcs: `int` (default: 30)
        Number of principal components to compute connectivities.
    n_neighbors: `int` (default: 30)
        Number of nearest neighbors to compute connectivities.

    Returns
    -------
    quantile_scores: `.layers`
        gene-specific quantile scores
    quantile_scores_1st_bit, quantile_scores_2nd_bit: `.layers`
        2-bit assignment for gene quantiles
    quantile_score_sum: `.obs`
        aggreagted quantile scores
    quantile_genes: `.var`
        genes with good quantilty ellipse fits
    """
    neighbors = Neighbors(adata)
    neighbors.compute_neighbors(n_neighbors=n_neighbors, knn=True, n_pcs=n_pcs)
    conn = neighbors.connectivities
    conn.setdiag(1)
    conn_norm = conn.multiply(1.0 / conn.sum(1)).tocsr()

    quantile_scores = np.zeros(adata.shape)
    quantile_scores_2bit = np.zeros((adata.shape[0], adata.shape[1], 2))
    quantile_gene = np.full(adata.n_vars, False)
    quality_gene_idx = []
    for idx, gene in enumerate(adata.var_names):
        u = np.array(adata[:, gene].layers['Mu'])
        s = np.array(adata[:, gene].layers['Ms'])
        non_zero = (u > 0) & (s > 0)
        if np.sum(non_zero) < 10:
            continue

        mean_u, mean_s = np.mean(u[non_zero]), np.mean(s[non_zero])
        std_u, std_s = np.std(u[non_zero]), np.std(s[non_zero])
        u_ = (u - mean_u)/std_u
        s_ = (s - mean_s)/std_s
        X = np.reshape(s_[non_zero], (-1, 1))
        Y = np.reshape(u_[non_zero], (-1, 1))

        # Ax^2 + Bxy + Cy^2 + Dx + Ey + 1 = 0
        A = np.hstack([X**2, X * Y, Y**2, X, Y])
        b = -np.ones_like(X)
        x, res, _, _ = np.linalg.lstsq(A, b)
        x = x.squeeze()
        A, B, C, D, E = x
        good_fit = B**2 - 4*A*C < 0
        theta = np.arctan(B/(A - C))/2 \
            if x[0] > x[2] \
            else np.pi/2 + np.arctan(B/(A - C))/2
        good_fit = good_fit & (theta < np.pi/2) & (theta > 0)
        if not good_fit:
            continue

        x_coord = np.linspace((-mean_s)/std_s, (np.max(s)-mean_s)/std_s, 500)
        y_coord = np.linspace((-mean_u)/std_u, (np.max(u)-mean_u)/std_u, 500)
        X_coord, Y_coord = np.meshgrid(x_coord, y_coord)
        M = np.array([
            A, B/2,
            B/2, C,
        ]).reshape(2, 2)
        l1, l2 = np.sort(np.linalg.eigvals(M))
        xc = (B*E - 2*C*D)/(4*A*C - B**2)
        yc = (B*D - 2*A*E)/(4*A*C - B**2)
        slope_major = np.tan(theta)
        theta2 = np.pi/2 + theta
        slope_minor = np.tan(theta2)
        major = lambda x, y: (y - yc) - (slope_major * (x - xc))
        minor = lambda x, y: (y - yc) - (slope_minor * (x - xc))

        quant1 = (major(s_, u_) > 0) & (minor(s_, u_) < 0)
        quant2 = (major(s_, u_) > 0) & (minor(s_, u_) > 0)
        quant3 = (major(s_, u_) < 0) & (minor(s_, u_) > 0)
        quant4 = (major(s_, u_) < 0) & (minor(s_, u_) < 0)
        if (np.sum(quant1 | quant4) < 10) or (np.sum(quant2 | quant3) < 10):
            continue

        quantile_scores[:, idx:idx+1] = ((-3.) * quant1 + (-1.) * quant2 + 1.
                                         * quant3 + 3. * quant4)
        quantile_scores_2bit[:, idx:idx+1, 0] = 1. * (quant1 | quant2)
        quantile_scores_2bit[:, idx:idx+1, 1] = 1. * (quant2 | quant3)
        quality_gene_idx.append(idx)

    quantile_scores = csr_matrix.dot(conn_norm, quantile_scores)
    quantile_scores_2bit[:, :, 0] = csr_matrix.dot(conn_norm,
                                                   quantile_scores_2bit[:,
                                                                        :, 0])
    quantile_scores_2bit[:, :, 1] = csr_matrix.dot(conn_norm,
                                                   quantile_scores_2bit[:,
                                                                        :, 1])
    adata.layers['quantile_scores'] = quantile_scores
    adata.layers['quantile_scores_1st_bit'] = quantile_scores_2bit[:, :, 0]
    adata.layers['quantile_scores_2nd_bit'] = quantile_scores_2bit[:, :, 1]
    quantile_gene[quality_gene_idx] = True

    if settings.VERBOSITY >= 1:
        perc_good = np.sum(quantile_gene) / adata.n_vars * 100

    logg.update(f'{np.sum(quantile_gene)}/{adata.n_vars} - {perc_good:.3g}%'
                'genes have good ellipse fits', v=1)

    adata.obs['quantile_score_sum'] = \
        np.sum(adata[:, quantile_gene].layers['quantile_scores'], axis=1)
    adata.var['quantile_genes'] = quantile_gene


def cluster_by_quantile(adata,
                        plot=False,
                        n_clusters=None,
                        affinity='euclidean',
                        linkage='ward'
                        ):
    """Cluster genes based on 2-bit quantile scores.

    This function cluster similar genes based on their 2-bit quantile score
    assignments from ellipse fit.
    Hierarchical cluster is done with `sklean.cluster.AgglomerativeClustering`.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        RNA anndata object. Required fields: `Mu` and `Ms`.
    plot: `bool` (default: `False`)
        Plot the hierarchical clusters.
    n_clusters: `int` (default: None)
        The number of clusters to keep.
    affinity: `str` (default: `euclidean`)
        Metric used to compute linkage. Passed to
        `sklean.cluster.AgglomerativeClustering`.
    linkage: `str` (default: `ward`)
        Linkage criterion to use. Passed to
        `sklean.cluster.AgglomerativeClustering`.

    Returns
    -------
    quantile_cluster: `.var`
        cluster assignments of genes based on quantiles
    """
    from sklearn.cluster import AgglomerativeClustering
    if 'quantile_scores_1st_bit' not in adata.layers.keys():
        raise ValueError("Quantile scores not found. Please run "
                         "compute_quantile_scores function first.")
    quantile_gene = adata.var['quantile_genes']
    if plot or n_clusters is None:
        cluster = AgglomerativeClustering(distance_threshold=0,
                                          n_clusters=None,
                                          affinity=affinity,
                                          linkage=linkage)
        cluster = cluster.fit(np.vstack((adata[:, quantile_gene]
                                         .layers['quantile_scores_1st_bit'],
                                         adata[:, quantile_gene]
                                         .layers['quantile_scores_2nd_bit']))
                                .transpose())

        # https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
        def plot_dendrogram(model, **kwargs):
            from scipy.cluster.hierarchy import dendrogram
            counts = np.zeros(model.children_.shape[0])
            n_samples = len(model.labels_)
            for i, merge in enumerate(model.children_):
                current_count = 0
                for child_idx in merge:
                    if child_idx < n_samples:
                        current_count += 1
                    else:
                        current_count += counts[child_idx - n_samples]
                counts[i] = current_count
            linkage_matrix = np.column_stack([model.children_,
                                              model.distances_,
                                              counts]).astype(float)
            dendrogram(linkage_matrix, **kwargs)

        plot_dendrogram(cluster, truncate_mode='level', p=5, no_labels=True)

    if n_clusters is not None:
        n_clusters = int(n_clusters)
        cluster = AgglomerativeClustering(n_clusters=n_clusters,
                                          affinity=affinity,
                                          linkage=linkage)
        cluster = cluster.fit_predict(np.vstack((adata[:, quantile_gene].layers
                                                 ['quantile_scores_1st_bit'],
                                                 adata[:, quantile_gene].layers
                                                 ['quantile_scores_2nd_bit']))
                                        .transpose())
        quantile_cluster = np.full(adata.n_vars, -1)
        quantile_cluster[quantile_gene] = cluster
        adata.var['quantile_cluster'] = quantile_cluster