[f48632]: / zachs_rerun / make_deltas_feb20.R

Download this file

185 lines (143 with data), 7.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#' @author Emily Yeo
#' @email emily.yeo@colorado.edu
#' @purpose Analysis for the Stanislawski Lab
#' @lab Stanislawski Lab
#' @affiliation University of Colorado Denver - Anschutz Medical, Dept of Biomedical Informatics & Personalized Medicine
###############################
### Reading R data files
###############################
# In[1]: Imports ----
rm(list = ls())
source("zc_functions.R")
library(pacman)
p_load(tools, reticulate, viridis, tidyplots, patchwork, jsonlite, maps, ggvenn, caret, caretEnsemble,
readr, plyr, dplyr, tidyr, purrr, tibble, stringr, psych, randomForest, glmnet, xgboost, ggplot2,
reshape2, scales, gridExtra, plotly, sf, tidyverse)
###############################
### Data Preprocessing
###############################
# In[2]: Data Imports ----
# Define the base path for the data files
output_dir <- "drift_fs/csv/unprocessed_data"
zc_pl_dir <- "unprocessed_input/"
local_path <- "drift_fs/csv/unprocessed_data/"
data_dir <- "drift_fs/csv/processed_data/"
df_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_dfs/5.combined/"
gen_dir <- "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/genetic/"
tax_dir <- "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/taxa/aim2_transformed/genus/"
path_dir <- "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/functional/aim2/"
micom_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/micom/aim2/"
m1_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/clinical/transformed/aim2/"
# Genetic info
gen_all <- read_csv("/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/genetic/genetic_risk.csv")
gen_test <- read_csv(paste0(gen_dir, "genetic_risk_testing.csv"))
gen_train <- read_csv("~/projects/research/Stanislawski/comps/mutli-omic-predictions/data/genetic/genetic_risk_training.csv")
gen_all <- gen_all %>% dplyr::select(c("subject_id", "bmi_prs"))
# Taxa info
genus_all <- read_csv(paste0(tax_dir, "clr_taxa_all_feb20.csv"))
genus_test <- read_csv(paste0(tax_dir, "aim2_clr_testing_feb20.csv"))
genus_train <- read_csv(paste0(tax_dir, "aim2_clr_training_feb20.csv"))
#genus_all <- make_new_columns(genus_all, genus_all$SampleID)
#genus_test <- make_new_columns(genus_test, genus_test$SampleID)
#genus_train <- make_new_columns(genus_train, genus_train$SampleID)
genus_all <- rename_columns_species_to_domain(genus_all) %>%
dplyr::rename("sample_id" = "...1")
genus_test <- rename_columns_species_to_domain(genus_test) # rename the columns
genus_train <- rename_columns_species_to_domain(genus_train)
# Meta data
merge_metadata <- read_csv(paste0(zc_pl_dir, "merge_meta_methyl.csv"))
metadata <- read_csv(paste0(zc_pl_dir, "DRIFT_working_dataset_meta_deltas_filtered_05.21.2024.csv"))
full_raw = read_csv(paste0(m1_dir, "a2_meta_not_Transformed_standard_clinical.csv"))
meta_all <- read_csv(paste0(m1_dir, "a2_meta_Transformed_standard_clinical_feb20.csv"))
meta_test <- read_csv(paste0(m1_dir, "a2_test_samples_standard_clinical_feb20.csv"))
meta_train <- read_csv(paste0(m1_dir, "a2_train_samples_standard_clinical_feb20.csv"))
# MICOM data
micom_all = read_csv(paste0(micom_dir, "flux_all_feb20.csv"))
micom_test = read_csv(paste0(micom_dir, "flux_all_testing_feb20.csv"))
micom_train = read_csv(paste0(micom_dir, "flux_all_training_feb20.csv"))
micom_all$id <- micom_all$sample_id
micom_all <- micom_all %>%
separate(sample_id, into = c("subject_id", "time"), sep = "\\.")
# Pathway
path_all <- read_csv(paste0(path_dir, "clr_taxa_all_feb20.csv"))
path_test <- read_csv(paste0(path_dir, "all_clr_testing_feb20.csv"))
path_train <- read_csv(paste0(path_dir, "all_clr_training_feb20.csv"))
path_all$id <- path_all$SampleID
path_all <- path_all %>%
separate(SampleID,
into = c("subject_id",
"time"), sep = "\\.")
rm(micom_test, micom_train, path_test, path_train, meta_test, meta_train, full_raw,
genus_test, genus_train, gen_test, gen_train)
# In[4.2]: Merge the updated_analysis and metadata ----
# Ensure both datasets have 'record_id' for the join
meta_data <- merge_data(gen_all, meta_all,
inner_join, "subject_id")
meta_full <- gen_all %>% full_join(meta_all, by = "subject_id")
### START MERGING GENUS RA with MICOM
# Full join: Keeps all rows, with NA for non-matching rows
meta_full_path <- meta_full %>% full_join(path_all, by = "subject_id")
g_ra_micom_path <- meta_full_path %>%
full_join(micom_all, by = "id")
g_ra_micom_path <- g_ra_micom_path %>%
full_join(genus_all, by = c("id" = "sample_id"))
### START MERGING GENUS RA & MICOM with Pathway data
meta_slim <- metadata %>%
dplyr::select("subject_id",
"Glucose (6m-BL)", "Glucose (12m-BL)",
"HOMA-IR (6m-BL)", "HOMA-IR (12m-BL)",
"Insulin (6m-BL)", "Insulin (12m-BL)",
"HDL (6m-BL)", "HDL (12m-BL)",
"LDL (6m-BL)", "LDL (12m-BL)",
"Triglyceride lipid (6m-BL)", "Triglyceride lipid (12m-BL)",
"BMI (6m-BL)", "BMI (12m-BL)",
"Weight (6m-BL)", "Weight (12m-6m)", "Weight (12m-BL)" ,
"outcome_wt_fnl_BL", "outcome_wt_fnl_6m", "outcome_wt_fnl_12m")
# Full join: Keeps all rows, with NA for non-matching rows
meta_path_g_ra_micom <- meta_slim %>%
full_join(g_ra_micom_path, by = c("subject_id" = "subject_id.x"))
#rm(meta_data_df, meta_data, merge_meta_data, metadata, merge_metadata)
# In[4.4]: Remove the columns that are not needed ----
colnames(meta_path_g_ra_micom)
tail(colnames(meta_path_g_ra_micom), 300)
# Define columns and pattern to remove
columns_to_remove <- c(
"...1.y", "...1.x", "id",
"subject_id.y", "time.y"
)
pattern_to_remove <- "3m|6m|12m|18m"
BL_pattern <- "3m|6m|12m|18m"
pattern_3m <- "BL|6m|12m|18m"
pattern_6m <- "3m|BL|12m|18m"
pattern_12m <- "3m|6m|BL|18m"
# Remove columns from genus dataset
g_ra_all_BL <- remove_columns(meta_path_g_ra_micom,
columns_to_remove = columns_to_remove,
pattern = BL_pattern)
meta_path_g_ra_micom <- meta_path_g_ra_micom %>%
dplyr::select(-c("subject_id.y", "...1", "time.y",
"...1.y", "...1.x",
"time.x"))
# Remove rows that have 3 and 18m
meta_path_g_ra_micom_fil <- meta_path_g_ra_micom %>%
filter(!grepl("\\.(3m|18m)$", id))
# save these dataframes
save_dir <- "drift_fs/csv/all_omic_processed_data/deltas/"
# check if the directory exists
if (!dir.exists(save_dir)) {
dir.create(save_dir, recursive = TRUE)
}
write.csv(meta_path_g_ra_micom_fil,
paste0(save_dir,
"feb20_g_ra_all_omics_deltas_outer.csv"),
row.names = FALSE)
# Look at missingness for innerjoined df
library(mice)
md.pattern(meta_path_g_ra_micom_fil)
# Percentage of missing data by column
colSums(is.na(meta_path_g_ra_micom_fil)) / nrow(meta_path_g_ra_micom_fil) * 100
# Overall percentage of missing data
sum(is.na(meta_path_g_ra_micom_fil)) / (nrow(meta_path_g_ra_micom_fil) * ncol(meta_path_g_ra_micom_fil)) * 100
# Create a correlation matrix for missingness
missing_corr <- cor(is.na(meta_path_g_ra_micom_fil))
print(meta_path_g_ra_micom_fil)