
supplementary_files/create_datasets.py
"""
Create simulated multi-omic datasets with a known correlation structure.

Samples N_SAMPLES observations from a multivariate normal whose per-feature
means follow sums of sine waves (one block per data type in SETTINGS) and
whose covariance is a sparse SPD matrix. Optionally injects non-linear
feature couplings, then writes per-block TSV files, a sample-ID file, a
ground-truth correlation table, and diagnostic plots.
"""

# %%
import random as rnd
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_sparse_spd_matrix

from move.data.preprocessing import scale


########################### Hyperparameters ####################################
PROJECT_NAME = "ffa_sim"
MODE = "linear"  # either "linear" or "non-linear"
SEED_1 = 1234
np.random.seed(SEED_1)
rnd.seed(SEED_1)

COV_ALPHA = 0.99  # probability that an off-diagonal covariance coefficient is zero
N_SAMPLES = 500
# Alternative configuration, kept for reference:
# SETTINGS = {
#     "proteomics": {
#         "features": 3000,
#         "frequencies": [0.002, 0.01, 0.02],
#         "coefficients": [500, 100, 50],
#         "phase": 0,
#         "offset": 500,
#     },
#     "metagenomics": {
#         "features": 1000,
#         "frequencies": [0.001, 0.05, 0.08],
#         "coefficients": [80, 20, 10],
#         "phase": np.pi / 2,
#         "offset": 400,
#     },
# }

# Each key defines one data block: `features` columns whose mean profile is
# offset + sum_i coefficients[i] * (sin(frequencies[i] * x + phase) + 1),
# with x the feature index (see create_mean_profiles below).
SETTINGS = {
    "LP": {
        "features": 900,
        "frequencies": [0.002, 0.01, 0.02],
        "coefficients": [500, 100, 50],
        "phase": 0,
        "offset": 500,
    },
    "transcriptomics": {
        "features": 1000,
        "frequencies": [0.001, 0.05, 0.08],
        "coefficients": [80, 20, 10],
        "phase": np.pi / 2,
        "offset": 400,
    },
    # "PRS": {
    #     "features": 15,
    #     "frequencies": [0.1, 0.5, 0.8],
    #     "coefficients": [0.2, 0.1, 0.05],
    #     "phase": np.pi / 2,
    #     "offset": 10,
    # },
    # "age": {
    #     "features": 1,
    #     "frequencies": [0.1, 0.5, 0.8],
    #     "coefficients": [0.1, 0.05, 0.1],
    #     "phase": np.pi / 2,
    #     "offset": 50,
    # },
}

COR_THRES = 0.02  # minimum |correlation| reported in the ground-truth file
PAIRS_OF_INTEREST = [(1, 2), (3, 4)]  # ,(77,75),(99,70),(38,2),(67,62)]

# Path to store output files
outpath = Path("./") / f"dataset_creation_outputs_{PROJECT_NAME}"
outpath.mkdir(exist_ok=True, parents=True)

################################ Functions ####################################


def get_feature_names(settings):
    """Return one name per feature, e.g. "LP_1", ..., block by block."""
    all_feature_names = [
        f"{key}_{i+1}"
        for key in settings.keys()
        for i in range(settings[key]["features"])
    ]
    return all_feature_names


def create_mean_profiles(settings):
    """Build one mean value per feature: per block, a sum of sine waves over
    the feature index, shifted to be non-negative and offset."""
    feature_means = []
    for key in settings.keys():
        mean = settings[key]["offset"]
        for freq, coef in zip(
            settings[key]["frequencies"], settings[key]["coefficients"]
        ):
            mean += coef * (
                np.sin(
                    freq * np.arange(settings[key]["features"]) + settings[key]["phase"]
                )
                + 1
            )
        feature_means.extend(list(mean))
    return feature_means
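

# A quick illustration (hypothetical toy settings, not used by the pipeline):
# a single block of 4 features with one sine term and zero phase yields
# offset + coef * (sin(freq * [0, 1, 2, 3]) + 1), i.e.:
#
#   toy = {"toy": {"features": 4, "frequencies": [0.5], "coefficients": [2],
#                  "phase": 0, "offset": 10}}
#   create_mean_profiles(toy)  # ~ [12.0, 12.96, 13.68, 13.99]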


def create_ground_truth_correlations_file(correlations, all_feature_names):
    """Return a table of feature pairs whose absolute Pearson correlation
    exceeds COR_THRES, sorted by decreasing |correlation|."""
    sort_ids = np.argsort(abs(correlations), axis=None)[::-1]  # flat indices
    corr = np.take(correlations, sort_ids)  # correlations in sorted order
    sig_ids = sort_ids[abs(corr) > COR_THRES]
    # Convert flat indices back to (row, column) pairs:
    sig_ids = np.vstack(
        (sig_ids // len(all_feature_names), sig_ids % len(all_feature_names))
    ).T
    associations = pd.DataFrame(sig_ids, columns=["feature_a_id", "feature_b_id"])
    a_df = pd.DataFrame(dict(feature_a_name=all_feature_names))
    a_df.index.name = "feature_a_id"
    a_df.reset_index(inplace=True)
    b_df = pd.DataFrame(dict(feature_b_name=all_feature_names))
    b_df.index.name = "feature_b_id"
    b_df.reset_index(inplace=True)
    associations = associations.merge(a_df, on="feature_a_id", how="left").merge(
        b_df, on="feature_b_id", how="left"
    )
    associations["Correlation"] = corr[abs(corr) > COR_THRES]
    # Keep only one half of the symmetric matrix (and drop the diagonal):
    associations = associations[associations.feature_a_id > associations.feature_b_id]
    return associations
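
# Illustrative output layout (column names as in the code above; the values
# in this sketch are made up):
#   feature_a_id  feature_b_id  feature_a_name      feature_b_name  Correlation
#   942           13            transcriptomics_43  LP_14           0.31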


def plot_score_matrix(
    array, feature_names, cmap="bwr", vmin=None, vmax=None, label_step=50
):
    """Plot a feature-by-feature score matrix as a heatmap."""
    # Default each missing color limit to the data range, independently
    if vmin is None:
        vmin = np.min(array)
    if vmax is None:
        vmax = np.max(array)
    fig = plt.figure(figsize=(15, 15))
    plt.imshow(array, cmap=cmap, vmin=vmin, vmax=vmax)
    plt.xticks(
        np.arange(0, len(feature_names), label_step),
        feature_names[::label_step],
        fontsize=8,
        rotation=90,
    )
    plt.yticks(
        np.arange(0, len(feature_names), label_step),
        feature_names[::label_step],
        fontsize=8,
    )
    plt.tight_layout()
    return fig


def plot_feature_profiles(dataset, feature_means):
    """Plot the mean profile and each sample's profile across all features."""
    fig = plt.figure(figsize=(15, 5))
    plt.plot(
        np.arange(len(feature_means)), feature_means, lw=1, marker=".", markersize=0
    )
    for sample in dataset:
        plt.plot(
            np.arange(len(feature_means)), sample, lw=0.1, marker=".", markersize=0
        )
    plt.xlabel("Feature number")
    plt.ylabel("Count number")
    plt.title("Patient-specific profiles")
    plt.tight_layout()

    return fig


def plot_feature_correlations(dataset, pairs_2_plot, correlations):
    """Scatter selected feature pairs, labeling each with its correlation."""
    fig = plt.figure()
    for f1, f2 in pairs_2_plot:
        plt.plot(
            dataset[:, f1],
            dataset[:, f2],
            lw=0,
            marker=".",
            markersize=1,
            label=f"{correlations[f1, f2]:.2f}",
        )
    plt.xlabel("Feature A")
    plt.ylabel("Feature B")
    plt.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.1),
        fancybox=True,
        shadow=True,
        ncol=5,
    )
    plt.title("Feature correlations")
    plt.tight_layout()

    return fig


def save_split_datasets(
    settings: dict, project_name, dataset, all_feature_names, n_samples, outpath
):
    """Write one TSV per data block plus a file listing the sample IDs."""
    # Save index file
    index = pd.DataFrame({"ID": list(np.arange(1, n_samples + 1))})
    index.to_csv(outpath / f"random.{project_name}.ids.txt", index=False)
    # Save continuous files, slicing the feature matrix block by block
    df = pd.DataFrame(
        dataset, columns=all_feature_names, index=list(np.arange(1, n_samples + 1))
    )
    cum_feat = 0
    for key in settings.keys():
        df_feat = settings[key]["features"]
        df_cont = df.iloc[:, cum_feat : cum_feat + df_feat].copy()
        df_cont.insert(0, "ID", np.arange(1, n_samples + 1))
        df_cont.to_csv(
            outpath / f"random.{project_name}.{key}.tsv", sep="\t", index=False
        )
        cum_feat += df_feat


################################## Main script ##################################
if __name__ == "__main__":
    # %%
    # Concatenate all data blocks into a single feature space:
    all_feature_names = get_feature_names(SETTINGS)
    feat_means = create_mean_profiles(SETTINGS)

    # %%
    ###### Covariance matrix definition ######
    if MODE == "linear":
        # NB: scikit-learn documents smallest_coef/largest_coef in [0, 1];
        # recent versions may reject values outside that range.
        covariance_matrix = make_sparse_spd_matrix(
            dim=len(all_feature_names),
            alpha=COV_ALPHA,
            smallest_coef=-30,
            largest_coef=30,
            norm_diag=False,
            random_state=SEED_1,
        )
    elif MODE == "non-linear":
        covariance_matrix = np.identity(len(all_feature_names))
    else:
        raise ValueError(f"Unknown MODE: {MODE!r}")
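
    # Optional sanity check, a small addition: multivariate normal sampling
    # assumes a positive (semi-)definite covariance, and a Cholesky
    # factorization succeeds exactly when the matrix is SPD.
    try:
        np.linalg.cholesky(covariance_matrix)
    except np.linalg.LinAlgError:
        print("Warning: covariance matrix is not positive definite")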

    ABS_MAX = np.max(abs(covariance_matrix))
    fig = plot_score_matrix(
        covariance_matrix, all_feature_names, vmin=-ABS_MAX, vmax=ABS_MAX
    )
    fig.savefig(outpath / f"Covariance_matrix_{PROJECT_NAME}.png")

    # Draw all samples at once from the multivariate normal
    dataset = np.random.multivariate_normal(feat_means, covariance_matrix, N_SAMPLES)
    print(dataset.shape)  # (N_SAMPLES, total number of features)

    # Add non-linearities: couple selected feature pairs through a sine term
    if MODE == "non-linear":
        for i, j in PAIRS_OF_INTEREST:
            freq = np.random.choice([4, 5, 6])
            dataset[:, i] += np.sin(freq * dataset[:, j])

    scaled_dataset, _ = scale(dataset)
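    # If MOVE is unavailable, a rough stand-in (under the assumption that
    # `scale` z-scores each feature column) would be:
    #   scaled_dataset = (dataset - dataset.mean(0)) / dataset.std(0)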

    # Actual correlations; np.corrcoef over columns is equivalent to pairwise
    # pearsonr but vectorized
    correlations = np.corrcoef(dataset, rowvar=False)
    fig = plot_score_matrix(correlations, all_feature_names, vmin=-1, vmax=1)
    fig.savefig(outpath / f"Correlations_{PROJECT_NAME}.png", dpi=200)

    # Keep correlations above the threshold, sorted by absolute value
    associations = create_ground_truth_correlations_file(
        correlations, all_feature_names
    )
    associations.to_csv(outpath / f"changes.{PROJECT_NAME}.txt", sep="\t", index=False)

    # Plot feature profiles per sample
    fig = plot_feature_profiles(dataset, feat_means)
    fig.savefig(outpath / "Multi-omic_profiles.png")

    # Plot correlations for the raw and the scaled data
    fig = plot_feature_correlations(dataset, PAIRS_OF_INTEREST, correlations)
    fig.savefig(outpath / "Feature_correlations.png")

    fig = plot_feature_correlations(scaled_dataset, PAIRS_OF_INTEREST, correlations)
    fig.savefig(outpath / "Feature_correlations_scaled.png")

    # Write TSV files with feature values for all samples in every data block:
    save_split_datasets(
        SETTINGS, PROJECT_NAME, dataset, all_feature_names, N_SAMPLES, outpath
    )
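
    # Quick round-trip check, a minimal sketch assuming the TSVs above were
    # written with the names used in save_split_datasets:
    for key in SETTINGS:
        df_check = pd.read_csv(outpath / f"random.{PROJECT_NAME}.{key}.tsv", sep="\t")
        # expect one ID column plus one column per simulated feature
        assert df_check.shape == (N_SAMPLES, SETTINGS[key]["features"] + 1)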