# Source: paper/Application on integrating mouse brain datasets/run_VITAE.py
# (recovered from a diff-viewer export; interleaved line-number artifacts removed)
import scanpy as sc
import numpy as np
import pandas as pd
import os
from scipy import sparse
import sys
import importlib
import VITAE
import seaborn

import random
import tensorflow as tf
## Preprocess datasets from "Molecular logic of cellular diversification in the mouse cerebral cortex"
## One can download the dataset and meta data through the URL:
## https://singlecell.broadinstitute.org/single_cell/study/SCP1290/molecular-logic-of-cellular-diversification-in-the-mammalian-cerebral-cortex
## The transform.h5ad file we used means the data has been normalized to 10000
## counts per cell and log(x+1) transformed.

###################################### Preprocess Di Bella's dataset #################################################
dd = sc.read("transform.h5ad")
metadata = pd.read_csv('metaData_scDevSC.txt', delimiter='\t', index_col=0)

# Row 0 of the Broad Institute metadata file is a second header (type
# annotations), so it is skipped when copying columns into dd.obs.
dd.obs['Day'] = metadata['orig_ident'][1:metadata.shape[0]]
dd.obs['Clusters'] = metadata['New_cellType'][1:metadata.shape[0]]
dd.obs['Clusters'] = pd.Categorical(dd.obs['Clusters'], categories=[
    'Apical progenitors', 'Intermediate progenitors', 'Migrating neurons',
    'Immature neurons', 'Cajal Retzius cells', 'CThPN', 'SCPN',
    'NP', 'Layer 6b', 'Layer 4', 'DL CPN', 'DL_CPN_1', 'DL_CPN_2', 'UL CPN',
    'Interneurons', 'Astrocytes', 'Oligodendrocytes', 'Microglia',
    'Cycling glial cells', 'Ependymocytes', 'Endothelial cells',
    'VLMC', 'Pericytes', 'Red blood cells', 'Doublet', 'Low quality cells'
    ], ordered=True)
dd.obs['S_Score'] = pd.to_numeric(metadata['S_Score'][1:metadata.shape[0]])
dd.obs['G2M_Score'] = pd.to_numeric(metadata['G2M_Score'][1:metadata.shape[0]])

# Drop doublets and low-quality cells before any downstream analysis.
dd = dd[~dd.obs['Clusters'].isin(['Doublet', 'Low quality cells'])]

# Convert index / categorical columns back to plain lists so the later
# concatenation with the other dataset does not clash on category metadata.
dd.obs.index = dd.obs.index.tolist()
dd.obs['Day'] = dd.obs['Day'].tolist()
dd.obs['Clusters'] = dd.obs['Clusters'].tolist()
dd.obs['S_Score'] = dd.obs['S_Score'].tolist()
dd.obs['G2M_Score'] = dd.obs['G2M_Score'].tolist()

sc.pp.highly_variable_genes(dd, flavor="seurat")
sc.pp.scale(dd, max_value=10)

# Record which E18 sequencing batch each cell came from (NaN for non-E18 cells).
day18 = np.array([np.nan] * dd.shape[0]).astype(object)
day18[(dd.obs["Day"] == "E18").values] = "E18"
day18[(dd.obs["Day"] == "E18_S1").values] = "E18_S1"
day18[(dd.obs["Day"] == "E18_S3").values] = "E18_S3"
dd.obs["merge_day_18"] = day18

# Same bookkeeping for the two P1 batches.
dayP1 = np.array([np.nan] * dd.shape[0]).astype(object)
dayP1[dd.obs["Day"] == "P1"] = "P1"
dayP1[dd.obs["Day"] == "P1_S1"] = "P1_S1"
dd.obs["merge_P1"] = dayP1

dd.obs["merge_day_18"] = dd.obs["merge_day_18"].astype("category")
dd.obs["merge_P1"] = dd.obs["merge_P1"].astype("category")

dd.obs["Source"] = "Di Bella"
###################################### Preprocess Yuzwa's and Ruan's merged dataset #################################################
# NOTE(review): load_data is not defined or imported above — it presumably
# comes from the VITAE package (e.g. `from VITAE.utils import load_data`);
# confirm against the installed VITAE version before running.
mouse = load_data(path='data/', file_name="mouse_brain_merged")
sc.pp.normalize_total(mouse, target_sum=1e4)
sc.pp.log1p(mouse)
sc.pp.highly_variable_genes(mouse, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pp.scale(mouse, max_value=10)
sc.tl.pca(mouse, n_comps=64)
sc.pp.neighbors(mouse, n_neighbors=20)
sc.tl.umap(mouse)

# covariate_2 == 0 marks cells from Ruan's dataset; all others are Yuzwa's.
mouse_source = np.repeat("Yuzwa", mouse.shape[0])
mouse_source[mouse.obs["covariate_2"] == 0] = "Ruan"
mouse.obs["Source"] = mouse_source
sc.pl.umap(mouse, color="Source", save="_mouse_Source.pdf")

# The first three characters of each cell barcode encode the embryonic day
# (e.g. "E13...") — assumption inferred from the slicing below; verify.
mouse_day = [x[:3] for x in mouse.obs.index.values.copy()]
mouse.obs["Day"] = mouse_day
mouse.obs.rename(columns={"grouping": "Clusters", "covariate_0": "S_Score",
                          "covariate_1": "G2M_Score"}, inplace=True)
###################################### Merge two datasets  #################################################

dd = dd.concatenate(mouse, join="inner")
## Define highly_variable in the merged dataset: keep a gene if it was highly
## variable in either source dataset (suffix -0 = Di Bella, -1 = mouse).
dd.var["highly_variable"] = dd.var["highly_variable-0"].astype(bool) | dd.var["highly_variable-1"].astype(bool)

## Unify the cell-type names used by the two datasets.
group_dict = {"Immature Neuron": "Immature neurons",
              "NEC": "Apical progenitors",
              "RGC": "Apical progenitors",
              "Layer I": "Cajal Retzius cells",
              "OPC": "Oligodendrocytes",
              "Interneurons": "Interneurons",
              "Endothelial Cell": "Endothelial cells",
              "Microglia": "Microglia",
              "Pericyte": "Pericytes",
              "Intermediate progenitors": "IPC"}

# Map each cluster label through group_dict; labels not in the dict pass
# through unchanged (dict.get with a default replaces the `== None` test).
dd.obs["tidy_clusters"] = [group_dict.get(x, x) for x in dd.obs["Clusters"].values]

## One-hot indicator covariates for the three data sources.
for i, src in enumerate(["Ruan", "Yuzwa", "Di Bella"], start=1):
    indicator = np.zeros(dd.shape[0])
    indicator[(dd.obs["Source"] == src).values] = 1
    dd.obs[f"cov{i}"] = indicator

# Numeric batch codes for the E18 replicates (NaN for all other cells).
a = np.array([np.nan] * dd.shape[0])
a[dd.obs["Day"] == "E18"] = 1
a[dd.obs["Day"] == "E18_S1"] = 2
a[dd.obs["Day"] == "E18_S3"] = 3
dd.obs["merge_18"] = a

# Numeric batch codes for the P1 replicates.
a = np.array([np.nan] * dd.shape[0])
a[dd.obs["Day"] == "P1"] = 1
a[dd.obs["Day"] == "P1_S1"] = 2
dd.obs["merge_P1"] = a

dd.obs["merge_18"] = dd.obs["merge_18"].astype("category")
dd.obs["merge_P1"] = dd.obs["merge_P1"].astype("category")

# String labels mirroring the numeric P1 batch codes.
a = np.array([np.nan] * dd.shape[0]).astype(object)
a[dd.obs["merge_P1"] == 1] = "P1"
a[dd.obs["merge_P1"] == 2] = "P1_S1"
dd.obs["merge_day_P1"] = a

# Split early (E14–E16) SCPN cells into their own label "SCPN1".
a = dd.obs["tidy_clusters"].values.copy().astype(str)
a[(dd.obs["Day"].isin(["E14", "E15", "E16"])) & (a == "SCPN")] = "SCPN1"
dd.obs["Cluster2"] = a.copy()
###################################### Naive Merge  #################################################
## Visualize what happens if we merge the datasets directly, with no
## batch-effect correction.
dd.obs["Source"] = dd.obs["Source"].astype(pd.CategoricalDtype(
    ['Ruan', 'Yuzwa', 'Di Bella'], ordered=True))

sc.tl.pca(dd, n_comps=64)
sc.pp.neighbors(dd, n_neighbors=20)
sc.tl.umap(dd)
seaborn.set(rc={'figure.figsize': (6, 6)}, style="white")
sc.pl.umap(dd, color="Source", save="_naive_Source.pdf", size=5,
           palette=["#F8766D", "#00BFC4", "#E5E8E8"], alpha=0.5,
           na_color="#E5E8E8")

# ggplot2-style hue palette for the developmental-day coloring.
color_ggplot = ["#F8766D", "#E38900", "#C49A00", "#99A800", "#53B400",
                "#00BC56", "#00C094", "#00BFC4", "#00B6EB", "#06A4FF",
                "#A58AFF", "#DF70F8", "#FB61D7", "#FF66A8"]
seaborn.set(rc={'figure.figsize': (5, 5)}, style="white")
sc.pl.umap(dd, color="Day", size=2, alpha=0.4, palette=color_ggplot,
           save="_naive_merge_day.pdf")
###################################### Train VITAE  #################################################

# Fix all RNG seeds (Python, NumPy, TensorFlow) for reproducibility.
seed = 400
tf.keras.backend.clear_session()
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Gaussian-likelihood VITAE model on the merged data; cell-cycle scores and
# the Ruan/Yuzwa source indicators enter as covariates (Di Bella is the
# baseline, so cov3 is omitted to avoid collinearity — TODO confirm intent).
model = VITAE.VITAE(adata=dd,
                    # adata_layer_counts='counts',
                    hidden_layers=[48, 24],
                    latent_space_dim=16,
                    model_type='Gaussian',
                    covariates=['S_Score', 'G2M_Score', "cov1", "cov2"],
                    npc=96)

# Pre-train the autoencoder, initialize the latent trajectory backbone from
# the curated cluster labels, then train the full model.
model.pre_train(learning_rate=1e-3, early_stopping_tolerance=0.01,
                early_stopping_relative=True, L=1, phi=1, verbose=False,
                gamma=1)
model.init_latent_space(cluster_label='Cluster2', ratio_prune=0.45)
model.train(beta=1, learning_rate=1e-3, early_stopping_tolerance=0.01,
            early_stopping_relative=True, gamma=0, phi=1, verbose=False)
model.posterior_estimation(batch_size=32, L=10)
model.save_model(path_to_file="model.checkpoint")