supplementary_files/create_datasets.py

"""Create simulated multi-omics datasets for the MOVE pipeline.

Builds a sinusoidal mean profile for each data layer defined in SETTINGS,
draws N_SAMPLES observations from a multivariate normal with either a sparse
SPD covariance ("linear" mode) or an identity covariance plus injected
sinusoidal couplings ("non-linear" mode), then writes per-layer TSV files, a
sample ID list, a ground-truth correlation table, and diagnostic plots.
"""

# %%
import random as rnd
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import pearsonr  # only used by the commented-out correlation check
from sklearn.datasets import make_sparse_spd_matrix

from move.data.preprocessing import scale


########################### Hyperparameters ####################################
PROJECT_NAME = "ffa_sim"
MODE = "linear"  # or "non-linear"
SEED_1 = 1234
np.random.seed(SEED_1)
rnd.seed(SEED_1)

COV_ALPHA = 0.99  # probability that a covariance coefficient is zero (higher = sparser)
N_SAMPLES = 500
# SETTINGS = {
#     "proteomics": {
#         "features": 3000,
#         "frequencies": [0.002, 0.01, 0.02],
#         "coefficients": [500, 100, 50],
#         "phase": 0,
#         "offset": 500,
#     },
#     "metagenomics": {
#         "features": 1000,
#         "frequencies": [0.001, 0.05, 0.08],
#         "coefficients": [80, 20, 10],
#         "phase": np.pi / 2,
#         "offset": 400,
#     },
# }

SETTINGS = {
    "LP": {
        "features": 900,
        "frequencies": [0.002, 0.01, 0.02],
        "coefficients": [500, 100, 50],
        "phase": 0,
        "offset": 500,
    },
    "transcriptomics": {
        "features": 1000,
        "frequencies": [0.001, 0.05, 0.08],
        "coefficients": [80, 20, 10],
        "phase": np.pi / 2,
        "offset": 400,
    },
    # "PRS": {
    #     "features": 15,
    #     "frequencies": [0.1, 0.5, 0.8],
    #     "coefficients": [.2, .1, .05],
    #     "phase": np.pi / 2,
    #     "offset": 10,
    # },
    # "age": {
    #     "features": 1,
    #     "frequencies": [0.1, 0.5, 0.8],
    #     "coefficients": [.1, .05, 0.1],
    #     "phase": np.pi / 2,
    #     "offset": 50,
    # },
}
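# Each layer's mean profile is
#   mean_i = offset + sum_k coef_k * (sin(freq_k * i + phase) + 1),
# where i indexes features within the layer, so every term adds a
# non-negative sinusoidal bump on top of the offset (see create_mean_profiles).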

COR_THRES = 0.02  # absolute correlation above which a pair enters the ground-truth file
PAIRS_OF_INTEREST = [(1, 2), (3, 4)]  # ,(77,75),(99,70),(38,2),(67,62)]
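# PAIRS_OF_INTEREST holds the feature index pairs that are coupled in
# "non-linear" mode and are always shown in the correlation scatter plots.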

# Path to store output files
outpath = Path("./") / f"dataset_creation_outputs_{PROJECT_NAME}"
outpath.mkdir(exist_ok=True, parents=True)

################################ Functions ####################################


def get_feature_names(settings):
    """Return layer-prefixed feature names, e.g. "LP_1", "LP_2", ..."""
    all_feature_names = [
        f"{key}_{i+1}"
        for key, config in settings.items()
        for i in range(config["features"])
    ]
    return all_feature_names


def create_mean_profiles(settings):
    """Concatenate each layer's sinusoidal mean profile into a single list."""
    feature_means = []
    for key in settings.keys():
        mean = settings[key]["offset"]
        for freq, coef in zip(
            settings[key]["frequencies"], settings[key]["coefficients"]
        ):
            # The +1 shift keeps every sinusoidal component non-negative
            mean += coef * (
                np.sin(
                    freq * np.arange(settings[key]["features"]) + settings[key]["phase"]
                )
                + 1
            )
        feature_means.extend(list(mean))
    return feature_means


def create_ground_truth_correlations_file(correlations, feature_names):
    """Tabulate feature pairs whose absolute correlation exceeds COR_THRES."""
    n_features = len(feature_names)
    sort_ids = np.argsort(abs(correlations), axis=None)[::-1]  # flattened, length n_features ** 2
    corr = np.take(correlations, sort_ids)  # correlations sorted by decreasing |r|
    sig_ids = sort_ids[abs(corr) > COR_THRES]
    # Recover (row, col) index pairs from the flattened positions
    sig_ids = np.vstack((sig_ids // n_features, sig_ids % n_features)).T
    associations = pd.DataFrame(sig_ids, columns=["feature_a_id", "feature_b_id"])
    a_df = pd.DataFrame(dict(feature_a_name=feature_names))
    a_df.index.name = "feature_a_id"
    a_df.reset_index(inplace=True)
    b_df = pd.DataFrame(dict(feature_b_name=feature_names))
    b_df.index.name = "feature_b_id"
    b_df.reset_index(inplace=True)
    associations = associations.merge(a_df, on="feature_a_id", how="left").merge(
        b_df, on="feature_b_id", how="left"
    )
    associations["Correlation"] = corr[abs(corr) > COR_THRES]
    # Keep one half of the symmetric matrix (this also drops the diagonal)
    associations = associations[associations.feature_a_id > associations.feature_b_id]
    return associations


def plot_score_matrix(
    array, feature_names, cmap="bwr", vmin=None, vmax=None, label_step=50
):
    """Plot a square feature-by-feature score matrix as a heatmap."""
    # Fill in each missing colour-scale bound independently
    if vmin is None:
        vmin = np.min(array)
    if vmax is None:
        vmax = np.max(array)
    fig = plt.figure(figsize=(15, 15))
    plt.imshow(array, cmap=cmap, vmin=vmin, vmax=vmax)
    plt.xticks(
        np.arange(0, len(feature_names), label_step),
        feature_names[::label_step],
        fontsize=8,
        rotation=90,
    )
    plt.yticks(
        np.arange(0, len(feature_names), label_step),
        feature_names[::label_step],
        fontsize=8,
    )
    plt.tight_layout()
    return fig


def plot_feature_profiles(dataset, feature_means):
    """Plot the mean profile plus one thin line per simulated sample."""
    fig = plt.figure(figsize=(15, 5))
    plt.plot(
        np.arange(len(feature_means)), feature_means, lw=1, marker=".", markersize=0
    )
    for sample in dataset:
        plt.plot(
            np.arange(len(feature_means)), sample, lw=0.1, marker=".", markersize=0
        )
    plt.xlabel("Feature number")
    plt.ylabel("Count number")
    plt.title("Patient-specific profiles")
    plt.tight_layout()

    return fig


def plot_feature_correlations(dataset, pairs_2_plot, correlations):
    """Scatter selected feature pairs, labelled with their Pearson correlation."""
    fig = plt.figure()
    for f1, f2 in pairs_2_plot:
        plt.plot(
            dataset[:, f1],
            dataset[:, f2],
            lw=0,
            marker=".",
            markersize=1,
            label=f"{correlations[f1, f2]:.2f}",
        )
    plt.xlabel("Feature A")
    plt.ylabel("Feature B")
    plt.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.1),
        fancybox=True,
        shadow=True,
        ncol=5,
    )
    plt.title("Feature correlations")
    plt.tight_layout()

    return fig


def save_split_datasets(
    settings: dict, project_name, dataset, all_feature_names, n_samples, outpath
):
    """Write the sample ID list plus one TSV file per data layer in settings."""
    # Save index file
    index = pd.DataFrame({"ID": list(np.arange(1, n_samples + 1))})
    index.to_csv(outpath / f"random.{project_name}.ids.txt", index=False)
    # Save continuous files; the column slices rely on settings keeping the
    # same layer order that get_feature_names used
    df = pd.DataFrame(
        dataset, columns=all_feature_names, index=list(np.arange(1, n_samples + 1))
    )
    cum_feat = 0
    for key in settings.keys():
        df_feat = settings[key]["features"]
        df_cont = df.iloc[:, cum_feat : cum_feat + df_feat].copy()
        df_cont.insert(0, "ID", np.arange(1, n_samples + 1))
        df_cont.to_csv(
            outpath / f"random.{project_name}.{key}.tsv", sep="\t", index=False
        )
        cum_feat += df_feat


################################## Main script ##################################
if __name__ == "__main__":
    # %%
    # Add all datasets in a single matrix:
    all_feature_names = get_feature_names(SETTINGS)
    feat_means = create_mean_profiles(SETTINGS)

    # %%
    ###### Covariance matrix definition ######
    if MODE == "linear":
        covariance_matrix = make_sparse_spd_matrix(
            dim=len(all_feature_names),  # renamed to n_dim in newer scikit-learn
            alpha=COV_ALPHA,
            smallest_coef=-30,
            largest_coef=30,
            norm_diag=False,
            random_state=SEED_1,
        )
    elif MODE == "non-linear":
        covariance_matrix = np.identity(len(all_feature_names))
    else:
        raise ValueError(f"Unknown MODE: {MODE!r}")
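
    # In "non-linear" mode the features start out independent (identity
    # covariance); dependence between PAIRS_OF_INTEREST is injected after
    # sampling via the sinusoidal couplings below.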

    ABS_MAX = np.max(abs(covariance_matrix))
    fig = plot_score_matrix(
        covariance_matrix, all_feature_names, vmin=-ABS_MAX, vmax=ABS_MAX
    )
    fig.savefig(outpath / f"Covariance_matrix_{PROJECT_NAME}.png")

    # dataset = np.array(
    #     [
    #         list(np.random.multivariate_normal(feat_means, covariance_matrix))
    #         for _ in range(N_SAMPLES)
    #     ]
    # )

    dataset = np.random.multivariate_normal(feat_means, covariance_matrix, N_SAMPLES)
    print(dataset.shape)
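    # expected: (N_SAMPLES, len(all_feature_names))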

    # Add non-linearities
    if MODE == "non-linear":
        for i, j in PAIRS_OF_INTEREST:
            freq = np.random.choice([4, 5, 6])
            dataset[:, i] += np.sin(freq * dataset[:, j])
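            # The sinusoidal coupling is intended to create a dependence that
            # a linear (Pearson) correlation largely misses.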

    # scale() comes from move.data.preprocessing; its second return value is
    # not needed here
    scaled_dataset, _ = scale(dataset)

    # Actual correlations:
    # correlations = np.empty(np.shape(covariance_matrix))
    # for ifeat in range(len(covariance_matrix)):
    #     for jfeat in range(len(covariance_matrix)):
    #         correlations[ifeat, jfeat] = pearsonr(
    #             dataset[:, ifeat], dataset[:, jfeat]
    #         )[0]

    correlations = np.corrcoef(dataset, rowvar=False)
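    # np.corrcoef is the vectorized equivalent of the pairwise pearsonr loop
    # commented out above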
    fig = plot_score_matrix(correlations, all_feature_names, vmin=-1, vmax=1)
    fig.savefig(outpath / f"Correlations_{PROJECT_NAME}.png", dpi=200)

    # Sort correlations by absolute value and keep pairs above COR_THRES
    associations = create_ground_truth_correlations_file(
        correlations, all_feature_names
    )
    associations.to_csv(outpath / f"changes.{PROJECT_NAME}.txt", sep="\t", index=False)

    # Plot feature profiles per sample
    fig = plot_feature_profiles(dataset, feat_means)
    fig.savefig(outpath / "Multi-omic_profiles.png")

    # Plot correlations
    fig = plot_feature_correlations(dataset, PAIRS_OF_INTEREST, correlations)
    fig.savefig(outpath / "Feature_correlations.png")

    fig = plot_feature_correlations(scaled_dataset, PAIRS_OF_INTEREST, correlations)
    fig.savefig(outpath / "Feature_correlations_scaled.png")

    # Write TSV files with feature values for all samples in each dataset:
    save_split_datasets(
        SETTINGS, PROJECT_NAME, dataset, all_feature_names, N_SAMPLES, outpath
    )
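
    # Optional sanity check: re-read the first layer's TSV and confirm its
    # shape matches the corresponding SETTINGS entry (+1 column for ID)
    first_key = next(iter(SETTINGS))
    check = pd.read_csv(outpath / f"random.{PROJECT_NAME}.{first_key}.tsv", sep="\t")
    assert check.shape == (N_SAMPLES, SETTINGS[first_key]["features"] + 1)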