# b/paper/Application on integrating mouse brain datasets/run_VITAE.py
# Standard library
import importlib
import os
import random
import sys

# Third-party
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn
import tensorflow as tf
from scipy import sparse

# Project
import VITAE
## Preprocess datasets from "Molecular logic of cellular diversification in the mouse cerebral cortex".
## One can download the dataset and meta data through the URL:
## https://singlecell.broadinstitute.org/single_cell/study/SCP1290/molecular-logic-of-cellular-diversification-in-the-mammalian-cerebral-cortex
## The transform.h5ad file used here has already been normalized to 10,000 counts
## per cell and log(x+1)-transformed.


###################################### Preprocess Di Bella's dataset #################################################
dd = sc.read("transform.h5ad")
metadata = pd.read_csv('metaData_scDevSC.txt', delimiter='\t', index_col=0)
# The metadata table's first data row is skipped ([1:metadata.shape[0]]) —
# presumably a secondary type-description header row; confirm against the file.
dd.obs['Day'] = metadata['orig_ident'][1:metadata.shape[0]]
dd.obs['Clusters'] = metadata['New_cellType'][1:metadata.shape[0]]
# Fix an explicit, ordered category list for the cell-type labels.
dd.obs['Clusters'] = pd.Categorical(dd.obs['Clusters'], categories=[
    'Apical progenitors', 'Intermediate progenitors', 'Migrating neurons',
    'Immature neurons', 'Cajal Retzius cells', 'CThPN', 'SCPN',
    'NP', 'Layer 6b', 'Layer 4', 'DL CPN', 'DL_CPN_1', 'DL_CPN_2', 'UL CPN',
    'Interneurons', 'Astrocytes', 'Oligodendrocytes', 'Microglia',
    'Cycling glial cells', 'Ependymocytes', 'Endothelial cells',
    'VLMC', 'Pericytes', 'Red blood cells', 'Doublet', 'Low quality cells'
], ordered=True)
dd.obs['S_Score'] = pd.to_numeric(metadata['S_Score'][1:metadata.shape[0]])
dd.obs['G2M_Score'] = pd.to_numeric(metadata['G2M_Score'][1:metadata.shape[0]])
# Drop cells annotated as doublets or low-quality.
dd = dd[~dd.obs['Clusters'].isin(['Doublet', 'Low quality cells'])]
# Convert index and annotation columns back to plain lists, stripping the
# categorical dtype/metadata left over from the subsetting above.
dd.obs.index = dd.obs.index.tolist()
dd.obs['Day'] = dd.obs['Day'].tolist()
dd.obs['Clusters'] = dd.obs['Clusters'].tolist()
dd.obs['S_Score'] = dd.obs['S_Score'].tolist()
dd.obs['G2M_Score'] = dd.obs['G2M_Score'].tolist()

sc.pp.highly_variable_genes(dd, flavor="seurat")
sc.pp.scale(dd, max_value=10)

# Label the three E18 sequencing batches individually (NaN for all other cells).
day18 = np.array([np.nan] * dd.shape[0]).astype(object)
day18[(dd.obs["Day"] == "E18").values] = "E18"
day18[(dd.obs["Day"] == "E18_S1").values] = "E18_S1"
day18[(dd.obs["Day"] == "E18_S3").values] = "E18_S3"
dd.obs["merge_day_18"] = day18

# Same for the two P1 batches.
dayP1 = np.array([np.nan] * dd.shape[0]).astype(object)
dayP1[dd.obs["Day"] == "P1"] = "P1"
dayP1[dd.obs["Day"] == "P1_S1"] = "P1_S1"
dd.obs["merge_P1"] = dayP1

dd.obs["merge_day_18"] = dd.obs["merge_day_18"].astype("category")
dd.obs["merge_P1"] = dd.obs["merge_P1"].astype("category")

dd.obs["Source"] = "Di Bella"
###################################### Preprocess Yuzwa's and Ruan's merged dataset #################################################
# NOTE(review): `load_data` is not defined or imported in this file — presumably
# a VITAE helper (e.g. VITAE.utils.load_data); confirm against the VITAE package.
mouse = load_data(path='data/', file_name="mouse_brain_merged")
sc.pp.normalize_total(mouse, target_sum=1e4)
sc.pp.log1p(mouse)
sc.pp.highly_variable_genes(mouse, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pp.scale(mouse, max_value=10)
sc.tl.pca(mouse, n_comps=64)
sc.pp.neighbors(mouse, n_neighbors=20)
sc.tl.umap(mouse)

# covariate_2 == 0 marks Ruan's cells; all remaining cells come from Yuzwa.
mouse_source = np.repeat("Yuzwa", mouse.shape[0])
mouse_source[mouse.obs["covariate_2"] == 0] = "Ruan"
mouse.obs["Source"] = mouse_source
sc.pl.umap(mouse, color="Source", save="_mouse_Source.pdf")

# The first three characters of each cell index encode the day label
# (e.g. "E15") — TODO confirm against the barcode format.
mouse_day = mouse.obs.index.values.copy()
mouse_day = [x[:3] for x in mouse_day]
mouse.obs["Day"] = mouse_day
# Align column names with the Di Bella annotations prepared above.
mouse.obs.rename(columns={"grouping": "Clusters", "covariate_0": "S_Score",
                          "covariate_1": "G2M_Score"}, inplace=True)
###################################### Merge two datasets #################################################

dd = dd.concatenate(mouse, join="inner")
## A gene counts as highly variable in the merged data if it was highly
## variable in either source (concatenate suffixes: -0 first dataset, -1 second).
dd.var["highly_variable"] = dd.var["highly_variable-0"].astype(bool) | dd.var["highly_variable-1"].astype(bool)

## Unify the differing cell-type names used by the two datasets.
group_dict = {"Immature Neuron": "Immature neurons",
              "NEC": "Apical progenitors",
              "RGC": "Apical progenitors",
              "Layer I": "Cajal Retzius cells",
              "OPC": "Oligodendrocytes",
              "Interneurons": "Interneurons",
              "Endothelial Cell": "Endothelial cells",
              "Microglia": "Microglia",
              "Pericyte": "Pericytes",
              # NOTE(review): this entry maps in the opposite direction of the
              # others (full name -> abbreviation); confirm it is intentional.
              "Intermediate progenitors": "IPC"}

# Remap labels through group_dict; labels without an entry pass through unchanged.
c = dd.obs["Clusters"].values.copy()
c = [group_dict.get(x, x) for x in c]
dd.obs["tidy_clusters"] = c.copy()

## One-hot indicator covariates for the source dataset of each cell.
a = np.zeros(dd.shape[0])
a[dd.obs["Source"] == "Ruan"] = 1
dd.obs["cov1"] = a

a = np.zeros(dd.shape[0])
a[dd.obs["Source"] == "Yuzwa"] = 1
dd.obs["cov2"] = a

a = np.zeros(dd.shape[0])
a[dd.obs["Source"] == "Di Bella"] = 1
dd.obs["cov3"] = a

## Numeric codes for the three E18 batches (NaN elsewhere).
a = np.array([np.nan] * dd.shape[0])
a[dd.obs["Day"] == "E18"] = 1
a[dd.obs["Day"] == "E18_S1"] = 2
a[dd.obs["Day"] == "E18_S3"] = 3
dd.obs["merge_18"] = a

## Numeric codes for the two P1 batches (NaN elsewhere).
a = np.array([np.nan] * dd.shape[0])
a[dd.obs["Day"] == "P1"] = 1
a[dd.obs["Day"] == "P1_S1"] = 2
dd.obs["merge_P1"] = a

dd.obs["merge_18"] = dd.obs["merge_18"].astype("category")
dd.obs["merge_P1"] = dd.obs["merge_P1"].astype("category")

## String labels corresponding to the P1 batch codes above.
a = np.array([np.nan] * dd.shape[0]).astype(object)
a[dd.obs["merge_P1"] == 1] = "P1"
a[dd.obs["merge_P1"] == 2] = "P1_S1"
dd.obs["merge_day_P1"] = a

## Relabel SCPN cells from days E14-E16 as "SCPN1", splitting early SCPN
## into their own cluster label.
a = dd.obs["tidy_clusters"].values.copy().astype(str)
a[(dd.obs["Day"].isin(["E14", "E15", "E16"])) & (a == "SCPN")] = "SCPN1"
dd.obs["Cluster2"] = a.copy()
###################################### Naive Merge #################################################
## See what happens if we directly merge the datasets (no integration).
dd.obs["Source"] = dd.obs["Source"].astype(pd.CategoricalDtype(
    ['Ruan', 'Yuzwa', 'Di Bella'], ordered=True))

sc.tl.pca(dd, n_comps=64)
sc.pp.neighbors(dd, n_neighbors=20)
sc.tl.umap(dd)
seaborn.set(rc={'figure.figsize': (6, 6)}, style="white")
sc.pl.umap(dd, color="Source", save="_naive_Source.pdf", size=5,
           palette=["#F8766D", "#00BFC4", "#E5E8E8"], alpha=0.5,
           na_color="#E5E8E8")

## 14-color ggplot-style hue palette for the day labels.
color_ggplot = ["#F8766D", "#E38900", "#C49A00", "#99A800", "#53B400",
                "#00BC56", "#00C094", "#00BFC4", "#00B6EB", "#06A4FF",
                "#A58AFF", "#DF70F8", "#FB61D7", "#FF66A8"]
seaborn.set(rc={'figure.figsize': (5, 5)}, style="white")
sc.pl.umap(dd, color="Day", size=2, alpha=0.4, palette=color_ggplot,
           save="_naive_merge_day.pdf")
###################################### Train VITAE #################################################

## Fix all RNG seeds (Python, NumPy, TensorFlow) for reproducibility.
seed = 400
tf.keras.backend.clear_session()
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

model = VITAE.VITAE(adata=dd,
                    # adata_layer_counts = 'counts',
                    hidden_layers=[48, 24],
                    latent_space_dim=16,
                    model_type='Gaussian',
                    # Cell-cycle scores plus the Ruan/Yuzwa source indicators
                    # are passed as covariates — presumably adjusted for by the
                    # model; see the VITAE documentation.
                    covariates=['S_Score', 'G2M_Score', "cov1", "cov2"],
                    npc=96)

model.pre_train(learning_rate=1e-3, early_stopping_tolerance=0.01,
                early_stopping_relative=True, L=1, phi=1, verbose=False,
                gamma=1)
# Initialize the latent trajectory structure from the unified cluster labels.
model.init_latent_space(cluster_label='Cluster2', ratio_prune=0.45)
model.train(beta=1, learning_rate=1e-3, early_stopping_tolerance=0.01,
            early_stopping_relative=True, gamma=0, phi=1, verbose=False)
model.posterior_estimation(batch_size=32, L=10)
model.save_model(path_to_file="model.checkpoint")