inmoose / Git / [ea0fd6] /inmoose/deseq2/replicates.py

Models:
AlyssaS/
inmoose
Downloads: 1
[ea0fd6]: / inmoose / deseq2 / replicates.py
History
Download this file
106 lines (88 with data), 4.2 kB

# -----------------------------------------------------------------------------
# Copyright (C) 2013-2022 Michael I. Love, Constantin Ahlmann-Eltze
# Copyright (C) 2023 Maximilien Colange

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------

# This file is based on the file 'R/core.R' of the Bioconductor DESeq2 package
# (version 3.16).


import numpy as np

from ..utils import Factor


def collapseReplicates(obj, groupby, run, renameCols=True):
    """
    Collapse technical replicates in an AnnData or DESeqDataSet

    Collapse the samples (rows) in `obj` by summing within levels
    of a grouping factor `groupby`. The purpose of this function
    if to sum up read counts from technical replicates to create an object
    with a single row of read counts for each sample.
    Note: by "technical replicates", we mean multiple sequencing runs of the same
    library, in contrast to "biological replicates" in which multiple
    libraries are prepared from separate biological units.
    Optionally renames the columns of returned object with the levels of the
    grouping factor.

    Arguments
    ---------
    obj : AnnData or DESeqDataSet
    groupby : Factor
        a grouping factor, as long as the rows of obj
    run : ??, optional
        the names of each unique row in obj
        if provided, a new row "runsCollapsed" will be added to obj.obs
        which pastes together the names of `run`
    renameCols : bool, optional
        whether to rename the rows of the returned object using the levels
        of the grouping factor

    Returns
    -------
    AnnData or DESeqDataSet
        an object with as many rows as levels in `groupby`.
        This object has count data which is summed from the various rows which
        are grouped together, and its .obs is subset using the first row
        for each group in `groupby`.

    Examples
    --------
    >>> dds = makeExampleDESeqDataSet(m=12)
    >>> # make data with two technical replicates for three samples
    >>> dds.obs["sample"] = Factor(np.repeat(np.arange(1,10), [2,1,1,2,1,1,2,1,1]))
    >>> dds.obs["run"] = [f"run{i}" for i in range(12)]
    >>> ddsColl = collapseReplicates(dds, dds.obs["sample"], dds.obs["run"])
    >>> # examine the clinical data and rows names of the collapsed data
    >>> ddsColl.obs
    ???
    >>> ddsColl.index
    ???
    >>> # check that the sum of the counts for "sample0" is the same
    ... # as the counts in the "sample0" row in ddsColl
    >>> matchFirstLevel = dds.obs["sample"] == dds.obs["sample"].categories[0]
    >>> np.all(np.sum(dds[matchFirstLevel,:].counts(), axis=0) == ddsColl[0,:].counts())
    True
    """
    if not isinstance(groupby, Factor):
        groupby = Factor(groupby)
    groupby = groupby.droplevels()
    if len(groupby) != obj.n_obs:
        raise ValueError("groupby should have as many elements as observations in obj")

    countdata = [
        np.sum(obj.counts()[groupby == c, :], axis=0) for c in groupby.categories
    ]
    rowsToKeep = [np.nonzero(groupby == c)[0][0] for c in groupby.categories]
    collapsed = obj[rowsToKeep, :]
    collapsed.X = countdata
    if run is not None:
        if len(groupby) != len(run):
            raise ValueError("groupby and run should have the same length")
        collapsed.obs["runsCollapsed"] = [
            ",".join([f"{e}" for e in run[groupby == c]]) for c in groupby.categories
        ]

    if renameCols:
        collapsed.index = groupby.categories

    assert np.sum(obj.counts()) == np.sum(collapsed.counts())

    return collapsed