|
a |
|
b/docs/source/clustering.rst |
|
|
1 |
================================ |
|
|
2 |
Consensus Clustering |
|
|
3 |
================================ |
|
|
4 |
|
|
|
5 |
InMoose implements consensus clustering [Monti2003]_, an unsupervised cluster |
|
|
6 |
discovery algorithm. InMoose implementation is based on a previous |
|
|
7 |
`implementation by Žiga Sajovic <https://github.com/ZigaSajovic/Consensus_Clustering>`_ |
|
|
8 |
|
|
|
9 |
Cohort Stratification |
|
|
10 |
===================== |
|
|
11 |
|
|
|
12 |
.. currentmodule:: inmoose.consensus_clustering |
|
|
13 |
|
|
|
14 |
We illustrate the clustering-based stratification capabilities of InMoose. |
|
|
15 |
|
|
|
16 |
We start by simulating RNA-Seq data, using the :mod:`sim` module of InMoose. |
|
|
17 |
|
|
|
18 |
.. repl:: |
|
|
19 |
import numpy as np |
|
|
20 |
import pandas as pd |
|
|
21 |
from inmoose.sim import sim_rnaseq |
|
|
22 |
|
|
|
23 |
# number of genes |
|
|
24 |
N = 1000 |
|
|
25 |
# number of samples |
|
|
26 |
M = 1000 |
|
|
27 |
assert M % 10 == 0 |
|
|
28 |
P = M // 10 # 10% of M, helper variable |
|
|
29 |
|
|
|
30 |
# 3 batches: 20% 30% 50% of the samples |
|
|
31 |
batch = (2 * P) * [0] + (3 * P) * [1] + (5 * P) * [2] |
|
|
32 |
batch = np.array([f"batch{b}" for b in batch]) |
|
|
33 |
batch0 = batch == "batch0" |
|
|
34 |
batch1 = batch == "batch1" |
|
|
35 |
batch2 = batch == "batch2" |
|
|
36 |
|
|
|
37 |
# 2 condition groups |
|
|
38 |
# - group 1: 50% batch 1, 33% batch 2, 60% batch 3 |
|
|
39 |
# - group 2: 50% batch 1, 67% batch 2, 40% batch 3 |
|
|
40 |
group = P * [0] + P * [1] + P * [0] + (2 * P) * [1] + (2 * P) * [0] + (3 * P) * [1] |
|
|
41 |
group = np.array([f"group{g}" for g in group]) |
|
|
42 |
assert len(batch) == M and len(group) == M |
|
|
43 |
|
|
|
44 |
# store clinical metadata (i.e. batch and group) as a DataFrame |
|
|
45 |
clinical = pd.DataFrame({"batch": batch, "group": group}) |
|
|
46 |
clinical.index = [f"sample{i}" for i in range(M)] |
|
|
47 |
|
|
|
48 |
# simulate data |
|
|
49 |
# random_state passes a seed to the PRNG for reproducibility |
|
|
50 |
counts = sim_rnaseq(N, M, batch=batch, group=group, random_state=42).T |
|
|
51 |
|
|
|
52 |
|
|
|
53 |
We then run the consensus clustering algorithm. |
|
|
54 |
|
|
|
55 |
.. repl:: |
|
|
56 |
from inmoose.consensus_clustering.consensus_clustering import consensusClustering |
|
|
57 |
from sklearn.cluster import AgglomerativeClustering |
|
|
58 |
|
|
|
59 |
cc = consensusClustering(AgglomerativeClustering) |
|
|
60 |
cc.compute_consensus_clustering(counts, random_state=None) |
|
|
61 |
|
|
|
62 |
|
|
|
63 |
We can now look at the clusters found. |
|
|
64 |
|
|
|
65 |
.. repl-quiet:: |
|
|
66 |
from matplotlib import rcParams |
|
|
67 |
# repl default config replaces '.' by '-' in the savefig.directory :/ |
|
|
68 |
rcParams['savefig.directory'] = rcParams['savefig.directory'].replace("readthedocs-org", "readthedocs.org") |
|
|
69 |
|
|
|
70 |
|
|
|
71 |
.. repl:: |
|
|
72 |
from anndata import AnnData |
|
|
73 |
from inmoose.utils import Factor |
|
|
74 |
import scanpy as sc |
|
|
75 |
|
|
|
76 |
|
|
|
77 |
ad = AnnData(counts, obs=clinical) |
|
|
78 |
for k in range(2, 11): |
|
|
79 |
# Factor ensures that cluster ID are interpreted as categorical data |
|
|
80 |
ad.obs[f"k={k}"] = Factor(cc.predict(k)) |
|
|
81 |
|
|
|
82 |
# compute the PCA |
|
|
83 |
sc.tl.pca(ad) |
|
|
84 |
# plot the PCA |
|
|
85 |
sc.pl.pca(ad, color=[f"k={k}" for k in range(2, 11)], return_fig=True).show() |
|
|
86 |
|
|
|
87 |
|
|
|
88 |
|
|
|
89 |
References |
|
|
90 |
========== |
|
|
91 |
|
|
|
92 |
.. [Monti2003] S. Monti, P. Tamayo, J. Mesirov, T. Golub. 2003. Consensus |
|
|
93 |
Clustering: A Resampling-Based Method for Class Discovery and Visualization |
|
|
94 |
of Gene Expression Microarray Data. *Machine Learning* 52(1). |
|
|
95 |
:doi:`https://doi.org/10.1023/A:1023949509487` |
|
|
96 |
|
|
|
97 |
|
|
|
98 |
Code documentation |
|
|
99 |
================== |
|
|
100 |
|
|
|
101 |
.. autosummary:: |
|
|
102 |
:toctree: generated/ |
|
|
103 |
|
|
|
104 |
~consensus_clustering.consensusClustering |
|
|
105 |
|