Download this file

70 lines (58 with data), 2.3 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
class SCMO_LDA(LatentDirichletAllocation):
    """
    Slight expansion of the sklearn LatentDirichletAllocation.

    Implements two scmo specific additions:

    - reconstruction of the count matrix using reconstruct()
    - keep the sample index of the count matrix when performing fit_transform

    Example:
        >>> from singlecellmultiomics.utils import SCMO_LDA
        >>> lda = SCMO_LDA(n_jobs=-1)
        >>> topics = lda.fit_transform(X)
        >>> imputed_X = lda.reconstruct(X)
    """

    def fit_transform(self, X: pd.DataFrame, y=None, **kwargs) -> pd.DataFrame:
        """
        Fit the model on X and return the per-sample topic weights.

        Args:
            X: sample by gene/genomic location pandas DataFrame
            y: accepted and forwarded unchanged so the method follows the
                scikit-learn ``fit_transform(X, y=None, **fit_params)``
                convention (callers such as Pipeline pass ``y`` positionally)
            **kwargs: forwarded unchanged to the parent ``fit_transform``

        Returns:
            topics : sample by topic pandas DataFrame carrying the index of X
        """
        return pd.DataFrame(
            super().fit_transform(X, y, **kwargs),
            index=X.index,
        )

    def reconstruct(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Reconstruct imputed count matrix

        This method converts the LDA components to probabilities
        and then multiplies these by the total counts observed for
        each cell

        Example:
            >>> from singlecellmultiomics.utils import SCMO_LDA
            >>> lda = SCMO_LDA(n_jobs=-1)
            >>> topics = lda.fit_transform(X)
            >>> imputed_X = lda.reconstruct(X)

        Warning:
            Make sure to call .fit_transform or .fit first!

        Args:
            X: sample by gene/genomic location pandas DataFrame

        Returns:
            reconstructed_count_frame : sample by gene/genomic location pandas DataFrame
        """
        # Topic weights for each cell (one row per sample, one column per
        # topic). np.asarray keeps the maths positional even if the transform
        # output happens to be a labelled frame.
        topic_weights = np.asarray(self.transform(X))

        # Convert lda components to probabilities: every topic row sums to 1
        # over the features.
        feature_probs = self.components_ / self.components_.sum(
            axis=1, keepdims=True
        )

        # Total counts observed per cell; used to scale probabilities back
        # to counts.
        total_cuts = X.sum(axis=1).to_numpy()

        # One matrix product mixes the topic profiles for all cells at once,
        # replacing a per-cell Python loop.
        expected_counts = (topic_weights @ feature_probs) * total_cuts[:, np.newaxis]

        return pd.DataFrame(expected_counts, index=X.index, columns=X.columns)