SingleCellMultiOmics / Git / [fedac4] /singlecellmultiomics/utils/lda.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
[fedac4]: / singlecellmultiomics / utils / lda.py
History
Download this file
70 lines (58 with data), 2.3 kB

import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

class SCMO_LDA(LatentDirichletAllocation):
    """
    This class is a slight expansion of the sklearn LatentDirichletAllocation,
    and implements two scmo specific additions:

    - reconstruction of the count matrix using reconstruct()
    - keep count matrix structure (the sample index) when performing
      fit_transform

    Example:
        >>> from singlecellmultiomics.utils import SCMO_LDA
        >>> lda = SCMO_LDA(n_jobs=-1)
        >>> topics = lda.fit_transform(X)
        >>> imputed_X = lda.reconstruct(X)
    """

    def fit_transform(self, X: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """
        Fit the model on X and return the topic weights of every sample.

        Args:
            X: sample by gene/genomic location pandas DataFrame
            **kwargs: forwarded unchanged to
                LatentDirichletAllocation.fit_transform

        Returns:
            topics : sample by topic pandas DataFrame which carries the
                index of X
        """
        topic_weights = super().fit_transform(X, **kwargs)
        return pd.DataFrame(topic_weights, index=X.index)

    def reconstruct(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Reconstruct imputed count matrix
        This method converts the LDA components to probabilities
        and then multiplies these by the total counts observed for
        each cell

        Example:
            >>> from singlecellmultiomics.utils import SCMO_LDA
            >>> lda = SCMO_LDA(n_jobs=-1)
            >>> topics = lda.fit_transform(X)
            >>> imputed_X = lda.reconstruct(X)

        Warning:
            Make sure to call .fit_transform or .fit first!

        Args:
            X: sample by gene/genomic location pandas DataFrame

        Returns:
            reconstructed_count_frame : sample by gene/genomic location pandas DataFrame

        """
        # Obtain topic weights for each cell (one row per sample of X)
        topic_weights = np.asarray(self.transform(X))

        # Convert lda components to probabilities: every topic row is
        # divided by its own sum
        topic_probabilities = (
            self.components_ / self.components_.sum(axis=1)[:, np.newaxis]
        )

        # Total counts observed per cell, as a column vector so it scales
        # every row of the reconstructed matrix
        total_cuts = X.sum(axis=1).values[:, np.newaxis]

        # Mixing the topic probabilities by the per-cell topic weights is a
        # single matrix product; scaling by the totals converts the
        # probabilities back to counts. This replaces a per-cell Python loop.
        return pd.DataFrame(
            (topic_weights @ topic_probabilities) * total_cuts,
            index=X.index,
            columns=X.columns,
        )