multiomic_drift / Git / [f48632] /zachs_rerun/zc

Models:
joseph-gordon/
multiomic_drift
Downloads: 1
[f48632]: / zachs_rerun / zc_functions.R
History
Download this file
1100 lines (880 with data), 39.2 kB

# Useful functions 

### Function to load RData file into a new environment and return the environment
load_rdata <- function(file_path) {
  if (file_ext(file_path) == "RData") {
    env <- new.env()
    load(file_path, envir = env)
    message("RData file loaded successfully.")
    return(env)
  } else {
    stop("The file is not an RData file.")
  }
}


### Make new columns 
make_new_columns <- function(data, column_name) {
  data <- data %>%
    mutate(
      subject_id = str_split(column_name, "\\.", simplify = TRUE)[, 1],
      TIMEPOINT = str_split(column_name, "\\.", simplify = TRUE)[, 2]
    )
  return(data)
}


### Filter data
filter_data <- function(data, column, value) {
  data <- data %>%
    filter(column == value)
  return(data)
}


### Merge data
merge_data <- function(data1, data2, join_type, columnname) {
  data <- join_type(data1, data2, by = columnname)
  
  # Remove the duplicated columns and rename as necessary
  data <- data %>%
    dplyr::select(-matches(paste0(columnname, "\\.y$"))) %>%
    rename_with(~ gsub("\\.x$", "", .), ends_with(".x"))
  
  return(data)
}


### Remove columns 
remove_columns <- function(data, columns_to_remove = NULL, pattern = NULL) {
  # Remove specified columns if provided
  if (!is.null(columns_to_remove)) {
    data <- data %>% dplyr::select(-all_of(columns_to_remove))
  }
  
  # Remove columns matching the pattern if provided
  if (!is.null(pattern)) {
    data <- data %>% dplyr::select(-matches(pattern))
  }
  
  return(data)
}


### Extract columns 
extract_columns <- function(data, columns_to_extract = NULL, pattern = NULL) {
  # Case 1: Both columns_to_extract and pattern are specified
  if (!is.null(columns_to_extract) && !is.null(pattern)) {
    data <- data %>% 
      select(all_of(intersect(names(data), 
                              columns_to_extract)), 
             matches(pattern))
    
    # Case 2: Only pattern is specified
  } else if (is.null(columns_to_extract) && !is.null(pattern)) {
    data <- data %>% select(matches(pattern))
    
    # Case 3: Only columns_to_extract is specified
  } else if (!is.null(columns_to_extract) && is.null(pattern)) {
    data <- data %>% select(all_of(columns_to_extract))
  }
  
  # Return data with the selected columns
  return(data)
}


### Function to automatically save all data frames/matrices in an environment as CSV files
save_env_to_csv <- function(env, output_dir) {
  # Ensure the output directory exists
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }
  
  # Loop through the objects in the environment
  for (obj_name in ls(envir = env)) {
    obj <- get(obj_name, envir = env)
    
    # Check if the object is a data frame or matrix
    if (is.data.frame(obj) || is.matrix(obj)) {
      file_path <- file.path(output_dir, paste0(obj_name, ".csv"))
      write.csv(obj, file = file_path)
      message(paste("Saved:", file_path))
    } else {
      message(paste("Skipping:", obj_name, "as it is not a data frame or matrix."))
    }
  }
}

### Function to split column into id and time col
make_new_columns <- function(data, column_name) {
  data <- data %>%
    mutate(
      subject_id = str_split(column_name, "\\.", simplify = TRUE)[, 1],
      TIMEPOINT = str_split(column_name, "\\.", simplify = TRUE)[, 2]
    )
  return(data)
}

### Function to deal with Taxa names 
rename_columns_species_to_domain <- function(dataframe) {
  # Define the order of taxonomic levels
  order <- c("d__", "p__", "c__", "o__", "f__", "g__", "s__")
  
  # Get all column names
  columns <- colnames(dataframe)
  
  # Filter columns starting with 'd__'
  species_columns <- columns[grepl("^d__", columns)]
  
  # Loop through each species column
  for (column in species_columns) {
    # Split by '_{any single letter}__'
    split_column <- unlist(strsplit(column, "_[a-z]__"))
    split_column[1] <- sub("d__", "", split_column[1]) # Remove 'd__' prefix
    
    # Remove empty elements
    split_column <- split_column[split_column != ""]
    
    # Get the lowest level of taxonomy
    order_index <- length(split_column) - 1
    new_column_name <- paste0(order[order_index + 1], split_column[length(split_column)])
    
    # Rename the column in the data frame
    colnames(dataframe)[colnames(dataframe) == column] <- new_column_name
  }
  
  return(dataframe)
}


### Function to standardize and impute the data
process_data <- function(data, columns_to_remove, columns_to_standardize, impute_method = "medianImpute") {
  data_cleaned <- remove_columns(data, columns_to_remove)
  data_standardized <- preprocess_data(data_cleaned, columns_to_standardize, impute_method)
  return(data_standardized)
}

### Function to preprocess the data 
preprocess_data <- function(data, columns_to_standardize, imputation_method) {
  data_imputed <- predict(preProcess(data, method = c(imputation_method)), data)
  data_standardized <- predict(preProcess(data_imputed[, columns_to_standardize], method = c("center", "scale")), data_imputed)
  return(data_standardized)
}

### Function to train all models 
train_all_models <- function(data, target, train_control) {
  lasso_model <- train(
    x = as.data.frame(data[, -which(names(data) %in% c(target, "subject_id"))]),
    y = as.numeric(data[[target]]),
    method = "glmnet",
    trControl = train_control,
    tuneGrid = expand.grid(alpha = 1, lambda = seq(0.1, 1, 0.1))
  )
  
  ridge_model <- train(
    x = as.data.frame(data[, -which(names(data) %in% c(target, "subject_id"))]),
    y = as.numeric(data[[target]]),
    method = "glmnet",
    trControl = train_control,
    tuneGrid = expand.grid(alpha = 0, lambda = seq(0.1, 1, 0.1))
  )
  
  elastic_net_model <- train(
    x = as.data.frame(data[, -which(names(data) %in% c(target, "subject_id"))]),
    y = as.numeric(data[[target]]),
    method = "glmnet",
    trControl = train_control,
    tuneGrid = expand.grid(alpha = seq(0.1, 1, 0.1), lambda = seq(0.1, 1, 0.1))
  )
  
  rf_model <- train(
    x = as.data.frame(data[, -which(names(data) %in% c(target, "subject_id"))]),
    y = as.numeric(data[[target]]),
    method = "rf",
    trControl = train_control,
    tuneGrid = expand.grid(mtry = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)),
    ntree = 10000
  )
  
  xgb_model <- train(
    x = as.data.frame(data[, -which(names(data) %in% c(target, "subject_id"))]),
    y = as.numeric(data[[target]]),
    method = "xgbTree",
    trControl = train_control
  )
  
  caret.list <- caretList(
    x = as.data.frame(data[, -which(names(data) %in% c(target, "subject_id"))]),
    y = as.numeric(data[[target]]),
    trControl = train_control,
    methodList = c("glmnet", "rf", "xgbTree")
  )
  
  ens <- caretEnsemble(caret.list)
  
  return(list(
    lasso_model = lasso_model,
    ridge_model = ridge_model,
    elastic_net_model = elastic_net_model,
    rf_model = rf_model,
    xgb_model = xgb_model,
    ens = ens
  ))
}

### Function to extract importances 
extract_importance_df <- function(model, label) {
  importance <- varImp(model)$importance %>%
    as.data.frame() %>%
    rownames_to_column("Variable")
  colnames(importance)[2] <- label
  return(importance)
}

### Function to combine importances 
combine_importances <- function(model_list, labels) {
  importance_dfs <- map2(model_list, labels, extract_importance_df)
  reduce(importance_dfs, full_join, by = "Variable")
}

### Function to extract the best beta values 
extract_best_betas <- function(model_list, labels) {
  beta_dfs <- map2(model_list, labels, function(model, label) {
    best_lambda <- model$bestTune$lambda
    betas <- as.data.frame(as.matrix(coef(model$finalModel, s = best_lambda))) %>%
      rownames_to_column("Variable")
    colnames(betas)[2] <- label
    return(betas)
  })
  beta_combined <- reduce(beta_dfs, full_join, by = "Variable") %>%
    mutate(across(everything(), ~ replace_na(., 0)))
  return(beta_combined)
}

### Function to replace NA values 
replace_na <- function(x, value) {
  ifelse(is.na(x), value, x)
}

### Helper function to calculate model performance metrics for either training or testing data
calculate_metrics <- function(model, data, target_var, model_name, data_type) {
  predictions <- predict(model, data)
  actuals <- data[[target_var]]
  
  r2 <- caret::R2(predictions, actuals)
  mae <- caret::MAE(predictions, actuals)
  rmse <- caret::RMSE(predictions, actuals)
  
  return(data.frame(Model = model_name, DataType = data_type, R2 = r2, MAE = mae, RMSE = rmse))
}


### Function to run all the models 
train_and_save_models <- function(data, target_var, 
                                  train_control, result_prefix, 
                                  test_size = 0.3) {
  # Split data into training and testing sets
  set.seed(123) # Ensure reproducibility
  train_indices <- sample(seq_len(nrow(data)), size = (1 - test_size) * nrow(data))
  train_data <- data[train_indices, ]
  test_data <- data[-train_indices, ]
  
  # Train models
  results <- train_all_models(train_data, target_var, train_control)
  
  # Combine importances
  feature_importance <- combine_importances(
    list(results$rf_model, results$lasso_model, 
         results$ridge_model, results$elastic_net_model, results$xgb_model),
    c("RF_Importance", "Lasso_Importance", 
      "Ridge_Importance", "Enet_Importance", "XGBoost_Importance")
  )
  # ensure output directory exists
  if (!dir.exists("drift_fs/csv/results/")) {
    dir.create("drift_fs/csv/results/", recursive = TRUE)
  }
  
  write.csv(feature_importance, paste0("drift_fs/csv/results/", 
                                       result_prefix, "_feature_importance.csv"), 
            row.names = FALSE)
  
  # Extract best betas
  beta_coefficients <- extract_best_betas(
    list(results$lasso_model, results$ridge_model, results$elastic_net_model),
    c("Lasso_Beta", "Ridge_Beta", "Enet_Beta")
  )
  write.csv(beta_coefficients, paste0("drift_fs/csv/results/", result_prefix, "_beta.csv"), row.names = FALSE)
  
  # Initialize an empty DataFrame to store performance metrics
  metrics_df <- data.frame(Model = character(), DataType = character(), R2 = numeric(), MAE = numeric(), RMSE = numeric(), stringsAsFactors = FALSE)
  
  # Calculate and store metrics for each model on both training and testing data
  for (model_name in names(results[-length(results)])) {
    model <- results[[model_name]]
    
    # Training metrics
    train_metrics <- calculate_metrics(model, train_data, target_var, model_name, "Train")
    metrics_df <- rbind(metrics_df, train_metrics)
    
    # Testing metrics
    test_metrics <- calculate_metrics(model, test_data, target_var, model_name, "Test")
    metrics_df <- rbind(metrics_df, test_metrics)
  }
  
  # Save the metrics DataFrame as CSV
  write.csv(metrics_df, paste0("drift_fs/csv/results/", result_prefix, "_metrics.csv"), row.names = FALSE)
  
  # ensure the model directory exists
  if (!dir.exists("drift_fs/models/")) {
    dir.create("drift_fs/models/", recursive = TRUE)
  }
  # Save model results
  saveRDS(results, paste0("drift_fs/models/", result_prefix, "_results.rds"))
  
  return(results)
}

####Same fucntion as above but multiple models 
train_and_save_multiple_models <- function(datasets, target_vars, 
                                           train_control, result_prefixes, 
                                           test_size = 0.3) {
  
  # Check if lengths of datasets, target_vars, and result_prefixes are equal
  if (length(datasets) != length(target_vars) || length(datasets) != length(result_prefixes)) {
    stop("The number of datasets, target variables, and result prefixes must be the same.")
  }
  
  # Loop over each dataset, target variable, and result prefix
  for (i in seq_along(datasets)) {
    data <- datasets[[i]]
    target_var <- target_vars[i]
    result_prefix <- result_prefixes[i]
    
    # Split data into training and testing sets
    set.seed(123) # Ensure reproducibility
    train_indices <- sample(seq_len(nrow(data)), size = (1 - test_size) * nrow(data))
    train_data <- data[train_indices, ]
    test_data <- data[-train_indices, ]
    
    # Train models
    results <- train_all_models(train_data, target_var, train_control)
    
    # Combine importances
    feature_importance <- combine_importances(
      list(results$rf_model, results$lasso_model, 
           results$ridge_model, results$elastic_net_model, results$xgb_model),
      c("RF_Importance", "Lasso_Importance", 
        "Ridge_Importance", "Enet_Importance", "XGBoost_Importance")
    )
    
    # Ensure output directory exists
    if (!dir.exists("drift_fs/csv/results/feb20/")) {
      dir.create("drift_fs/csv/results/feb20/", recursive = TRUE)
    }
    
    write.csv(feature_importance, 
              paste0("drift_fs/csv/results/feb20/", 
                     result_prefix, "_feature_importance.csv"), 
              row.names = FALSE)
    
    # Extract best betas
    beta_coefficients <- extract_best_betas(
      list(results$lasso_model, results$ridge_model, results$elastic_net_model),
      c("Lasso_Beta", "Ridge_Beta", "Enet_Beta")
    )
    write.csv(beta_coefficients, paste0("drift_fs/csv/results/feb20/", 
                                        result_prefix, "_beta.csv"), row.names = FALSE)
    
    # Initialize an empty DataFrame to store performance metrics
    metrics_df <- data.frame(Model = character(), DataType = character(), 
                             R2 = numeric(), MAE = numeric(), 
                             RMSE = numeric(), stringsAsFactors = FALSE)
    
    # Calculate and store metrics for each model on both training and testing data
    for (model_name in names(results[-length(results)])) {
      model <- results[[model_name]]
      
      # Training metrics
      train_metrics <- calculate_metrics(model, train_data, target_var, model_name, "Train")
      metrics_df <- rbind(metrics_df, train_metrics)
      
      # Testing metrics
      test_metrics <- calculate_metrics(model, test_data, target_var, model_name, "Test")
      metrics_df <- rbind(metrics_df, test_metrics)
    }
    
    # Save the metrics DataFrame as CSV
    write.csv(metrics_df, paste0("drift_fs/csv/results/feb20/", 
                                 result_prefix, "_metrics.csv"), row.names = FALSE)
    
    # Ensure the model directory exists
    if (!dir.exists("drift_fs/models/feb20/")) {
      dir.create("drift_fs/models/feb20/", recursive = TRUE)
    }
    
    # Save model results
    saveRDS(results, paste0("drift_fs/models/feb20/", result_prefix, "_results.rds"))
  }
  
  return("Model training completed for all datasets.")
}

####

# Function to get top N important features for a given model
get_top_n_features <- function(feature_importance, model_importance_column, n = 20) {
  feature_importance %>%
    select(Variable, all_of(model_importance_column)) %>%
    arrange(desc(get(model_importance_column))) %>%
    head(n)
}

# Function to get top N important features for all models in the feature importance dataframe
get_top_n_features_all_models <- function(feature_importance, n = 20) {
  # Identify columns ending with '_Importance'
  importance_columns <- names(feature_importance)[grepl("_Importance$", names(feature_importance))]
  
  # Initialize a list to store results for each model
  top_features_list <- list()
  
  # Loop through each importance column and get top N features
  for (column in importance_columns) {
    model_name <- gsub("_Importance", "", column) # Extract model name
    top_features_list[[model_name]] <- get_top_n_features(feature_importance, column, n)
  }
  
  return(top_features_list)
}

##### 
train_multiple_models <- function(datasets, target_vars, 
                                  train_control, result_prefixes, 
                                  test_size = 0.3) {
  
  # Check if lengths of datasets, target_vars, and result_prefixes are equal
  if (length(datasets) != length(target_vars) || length(datasets) != length(result_prefixes)) {
    stop("The number of datasets, target variables, and result prefixes must be the same.")
  }
  
  # Initialize an empty list to store the results
  all_results <- list()
  
  # Loop over each dataset, target variable, and result prefix
  for (i in seq_along(datasets)) {
    data <- datasets[[i]]
    target_var <- target_vars[i]
    result_prefix <- result_prefixes[i]
    
    # Split data into training and testing sets
    set.seed(123) # Ensure reproducibility
    train_indices <- sample(seq_len(nrow(data)), size = (1 - test_size) * nrow(data))
    train_data <- data[train_indices, ]
    test_data <- data[-train_indices, ]
    
    # Train models
    results <- train_all_models(train_data, target_var, train_control)
    
    # Save results with a unique name based on result_prefix
    all_results[[result_prefix]] <- results
  }
  
  # Return the list of all results
  return(all_results)
}


#####

# Function to plot Venn diagram for top features
plot_venn_diagram <- function(feature_sets, colors = NULL, 
                              figurename = "venn_diagram.png", 
                              output_dir = "drift_fs/figures/") {
  if (is.null(colors)) {
    colors <- viridis(length(feature_sets))
  }
  
  venn.plot <- venn.diagram(
    x = feature_sets,
    category.names = names(feature_sets),
    filename = NULL, # Plot directly to the object
    output = TRUE,
    col = "transparent",
    fill = colors,
    alpha = 0.3,
    cex = 1.5,
    cat.cex = 1.2,
    cat.pos = 0,
    margin = 0.1
  )
  
  ggsave(file.path(output_dir, figurename), plot = venn.plot, dpi = 600, bg = "white")
}

# Function to plot feature importance or beta values
plot_importance_or_beta <- function(data, value_column, plot_title, y_label, figurename, palette = "viridis") {
  long_format <- data %>%
    pivot_longer(
      cols = ends_with(value_column),
      names_to = "Model",
      values_to = value_column
    )
  
  ggplot(long_format, aes(y = reorder(Variable, !!sym(value_column)), x = !!sym(value_column), fill = Model)) +
    geom_bar(stat = "identity", position = position_dodge(width = 0.9), width = 0.7) +
    scale_fill_viridis_d(option = palette) +
    labs(
      title = plot_title,
      x = paste(value_column, "Score"),
      y = y_label
    ) +
    theme_minimal() +
    theme(
      axis.text.y = element_text(size = 5), # Adjust text size
      axis.title.y = element_text(vjust = 1)
    ) +
    scale_y_discrete(expand = expansion(mult = c(0.1, 0.2))) # Add padding between features
  
  ggsave(file.path("drift_fs/figures/", figurename), dpi = 600, bg = "white")
}

# Function to plot model performance metrics
plot_performance_metrics <- function(metrics, dataset_name) {
  metrics_long <- metrics %>%
    pivot_longer(
      cols = c("R2", "MAE", "RMSE"),
      names_to = "Metric",
      values_to = "Value"
    )
  
  # Reorder factors for better readability
  metrics_long$DataType <- factor(metrics_long$DataType, levels = c("Train", "Test"))
  metrics_long$Metric <- factor(metrics_long$Metric, levels = c("R2", "MAE", "RMSE"))
  
  ggplot(metrics_long, aes(x = Model, y = Value, fill = DataType)) +
    geom_bar(stat = "identity", position = position_dodge(width = 0.8), width = 0.7) +
    labs(
      title = paste("Model Performance Metrics (R², MAE, RMSE) -", dataset_name),
      x = "Model",
      y = "Metric Value",
      fill = "Dataset"
    ) +
    scale_fill_viridis(discrete = TRUE) +
    facet_wrap(~Metric, scales = "free_y", nrow = 3) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      axis.text.y = element_text(size = 10),
      strip.text = element_text(size = 12),
      legend.position = "top",
      axis.title.x = element_text(size = 12),
      axis.title.y = element_text(size = 12)
    )
  
  ggsave(file.path("drift_fs/figures/", paste0(dataset_name, "_performance_metrics.png")), dpi = 600, bg = "white")
}


### Function to process data and generate plots for a dataset
process_and_plot_data <- function(data_list, dataset_name, n = 20) {
  beta <- data_list$beta
  feature_importance <- data_list$feature_importance
  metrics <- data_list$metrics
  
  # Get top N features for all models
  top_20_features <- get_top_n_features_all_models(feature_importance, n)
  
  # Create feature sets for Venn diagram
  feature_sets <- lapply(top_20_features, function(df) df$Variable)
  
  # Plot Venn diagram
  plot_venn_diagram(feature_sets, figurename = paste0(dataset_name, "_venn_diagram.png"))
  
  # Extract top features and sort by total importance
  all_top_features <- unique(unlist(feature_sets))
  filtered_feature_importance <- feature_importance %>%
    filter(Variable %in% all_top_features) %>%
    mutate(Total_Importance = rowSums(select(., ends_with("_Importance")), na.rm = TRUE)) %>%
    arrange(desc(Total_Importance))
  
  # Remove total importance column
  filtered_feature_importance <- select(filtered_feature_importance, -Total_Importance)
  
  # Plot Feature Importances
  plot_importance_or_beta(filtered_feature_importance, "Importance", "Top 20 Feature Importances", "Features", paste0(dataset_name, "_feature_importance.png"))
  
  # Plot model performance metrics
  plot_performance_metrics(metrics, dataset_name)
}


### Function to add points and recalculate regression
generate_plot <- function(x, y, modified_x = NULL, modified_y = NULL) {
  # Create the data frame
  plot_data <- data.frame(x = x, y = y)
  
  # Add modified points if provided
  if (!is.null(modified_x) && !is.null(modified_y)) {
    modified_data <- data.frame(x = modified_x, y = modified_y)
    plot_data <- rbind(plot_data, modified_data)
  }
  
  # Fit the linear regression model
  model <- lm(y ~ x, data = plot_data)
  r_squared <- summary(model)$r.squared
  
  # Generate the base plot
  p <- ggplot(plot_data, aes(x = x, y = y)) +
    geom_point(aes(color = ifelse(x %in% modified_x, "Modified", "Original")), size = 5) +
    scale_color_manual(values = c("Original" = "#1C7C54", "Modified" = "red")) +
    theme_minimal() +
    xlim(0, max(plot_data$x)) +
    ylim(min(plot_data$y) - 2, max(plot_data$y) + 2) +
    theme(
      legend.position = "none",
      axis.text.x = element_text(size = 20),
      axis.text.y = element_text(size = 20),
      axis.ticks.length = unit(0.3, "cm"), # Adjust tick size
      axis.title.x = element_text(size = 22, face = "bold"), # Larger x-axis title
      axis.title.y = element_text(size = 22, face = "bold") # Larger y-axis title
    )
  
  # Add regression line and R² annotation
  p1 <- p +
    geom_smooth(method = "lm", se = TRUE, color = "black", fill = "gray", alpha = 0.5) +
    annotate("text",
             x = max(plot_data$x) - 2, y = mean(plot_data$y),
             label = paste("R² =", round(r_squared, 2)),
             color = "black", size = 8
    )
  
  # Return both plots in a list
  return(list(base_plot = p, regression_plot = p1))
}

### Define a function to create plots
create_plots <- function(data_list, max_r2, titles) {
  plots <- lapply(seq_along(data_list), function(i) {
    ggplot(data_list[[i]], aes(x = R2, y = Model, fill = Model)) +
      geom_bar(stat = "identity", position = "dodge") +
      labs(
        title = titles[i],
        x = "R²",
        y = "Model",
        fill = "Model"
      ) +
      coord_cartesian(xlim = c(0, max_r2)) +
      #coord_cartesian(xlim(0, 0.5)) +
      theme_minimal() +
      scale_fill_viridis_d() +
      theme(
        axis.text.x = element_text(angle = 45, hjust = 1, size = 15),
        axis.text.y = element_text(size = 15, angle = 45),
        legend.position = "none",
        axis.title.x = element_text(size = 20),
        axis.title.y = element_text(size = 20),
        plot.title = element_text(size = 20)
      )
  })
  # Combine the plots vertically
  Reduce(`/`, plots)
}

### Updated function to create and save plots
create_feature_plot <- function(features, title, save_path) {
  # Prepare data for plotting, excluding the `Model` column from pivoting
  features_long <- features %>%
    pivot_longer(
      cols = c(-Variable), # Exclude `Variable` and `Model`
      names_to = "Model",
      values_to = "Importance"
    )
  
  # Create the plot
  feature_plot <- ggplot(features_long, aes(x = Importance, y = reorder(Variable, Importance), fill = Model)) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(
      title = title,
      x = "Importance",
      y = "Feature",
      fill = "Model"
    ) +
    scale_fill_viridis_d() +
    xlim(0, 100) +  # Set x-axis range from 0 to 100
    theme_minimal() +
    theme(
      axis.text.x = element_text(size = 13),
      axis.text.y = element_text(size = 13),
      axis.title.x = element_text(size = 15),
      axis.title.y = element_text(size = 15),
      title = element_text(size = 15),
      legend.position = "top"
    )
  
  # Print and save the plot
  print(feature_plot)
  pdf(file = save_path, width = 10, height = 10)
  print(feature_plot)
  dev.off()
}


### Function to extract top features
extract_top_features <- function(dataset, top_n = 15) {
  # Extract columns ending with '_Importance'
  importance_columns <- names(dataset$feature_importance) %>%
    grep("_Importance$", ., value = TRUE)
  
  # Check if importance columns exist
  if (length(importance_columns) == 0) {
    stop("No columns ending with '_Importance' found in feature_importance.")
  }
  
  # get a column with the total importance
  dataset$feature_importance <- dataset$feature_importance %>%
    mutate(Total_Importance = rowSums(select(., ends_with("_Importance")), na.rm = TRUE))
  
  # sort by total importance from high to low
  dataset$feature_importance <- dataset$feature_importance %>%
    arrange(desc(Total_Importance))
  
  # get the top n features
  top_features <- dataset$feature_importance %>%
    select(Variable, Total_Importance) %>%
    head(10)
  
  # remove the total importance column
  dataset$feature_importance <- select(dataset$feature_importance, -Total_Importance) %>% head(10)
  
  return(dataset$feature_importance)
}


### Function to get top 20 features for each model
get_top_features <- function(dataset, model_name, importance_col) {
  dataset$feature_importance %>%
    select(Variable, all_of(importance_col)) %>%
    arrange(desc(get(importance_col))) %>%
    head(20)
}

### Extract top 20 features for each model and dataset
get_features_for_dataset <- function(dataset_name) {
  dataset <- datasets[[dataset_name]]
  map2(
    model_names, model_names_map_to,
    ~ get_top_features(dataset, .x, .y)
  ) %>%
    set_names(model_names)
}

### Function to get top models based on R²
get_top_models <- function(dataset, top_n = 3) {
  dataset$metrics %>%
    filter(DataType == "Test") %>%
    arrange(desc(R2)) %>%
    slice_head(n = top_n) %>%
    pull(Model)
}


# Function to extract metrics and calculate max R²
extract_metrics <- function(dataset_name, datasets) {
  metrics <- datasets[[dataset_name]]$metrics %>%
    filter(DataType == "Test") %>%
    select(Model, R2)
  max_r2 <- max(metrics$R2, na.rm = TRUE)
  list(metrics = metrics, max_r2 = max_r2)
}


#### Get LASSO features
get_lasso_features <- function(lasso_model) {
  # Ensure the model is of class 'train' and that 'finalModel' is available
  if (!inherits(lasso_model, "train")) {
    stop("The provided object is not a valid 'train' model object.")
  }
  
  # Extract the best lambda value based on cross-validation
  best_lambda <- lasso_model$bestTune$lambda
  
  # Extract coefficients for the final model at the best lambda
  best_coefs <- coef(lasso_model$finalModel, s = best_lambda)
  
  # Get selected features (non-zero coefficients)
  selected_features <- rownames(best_coefs)[which(best_coefs != 0)]
  
  # Remove intercept from selected features
  selected_features <- selected_features[selected_features != "(Intercept)"]
  
  # Return the outcomes as a list
  result <- list(
    best_lambda = best_lambda,
    selected_features = selected_features,
    coefficients = best_coefs
  )
  
  return(result)
}

#### Extract best model from LASSO ###

get_lasso_features <- function(lasso_model, data, outcome_col) {
  # Ensure the model is of class 'train' and that 'finalModel' is available
  if (!inherits(lasso_model, "train")) {
    stop("The provided object is not a valid 'train' model object.")
  }
  
  # Extract the best lambda value based on cross-validation
  best_lambda <- lasso_model$bestTune$lambda
  
  # Extract coefficients for the final model at the best lambda
  best_coefs <- coef(lasso_model$finalModel, s = best_lambda)
  
  # Get selected features (non-zero coefficients)
  selected_features <- rownames(best_coefs)[which(best_coefs != 0)]
  
  # Remove intercept from selected features
  selected_features <- selected_features[selected_features != "(Intercept)"]
  
  # Extract the suffix from the model name (after the last underscore)
  model_name <- deparse(substitute(lasso_model))
  model_suffix <- sub(".*_(.*)", "\\1", model_name)
  
  # Create a new data frame with the selected features
  train_selected <- data[, c(outcome_col, selected_features)]
  
  # Create the formula for the final model using the selected features
  final_formula <- as.formula(paste(outcome_col, "~", paste(selected_features, collapse = " + ")))
  
  # Fit the final model (use lm, or modify depending on your desired model type)
  final_model <- lm(final_formula, data = train_selected)
  
  # Store the final model with a dynamic name
  assign(paste0("final_model_", model_suffix), final_model, envir = .GlobalEnv)
  
  # Return the outcomes as a list, including the model and selected features
  result <- list(
    best_lambda = best_lambda,
    selected_features = selected_features,
    coefficients = best_coefs,
    final_model = final_model
  )
  
  return(result)
}

### Get predictions 
best_lasso_predictions <- function(lasso_model, data, outcome_col, test_data) { 
  # Ensure the model is of class 'train' and that 'finalModel' is available
  if (!inherits(lasso_model, "train")) {
    stop("The provided object is not a valid 'train' model object.")
  }
  
  # Extract the best lambda value based on cross-validation
  best_lambda <- lasso_model$bestTune$lambda
  
  # Extract coefficients for the final model at the best lambda
  best_coefs <- coef(lasso_model$finalModel, s = best_lambda)
  
  # Get selected features (non-zero coefficients)
  selected_features <- rownames(best_coefs)[which(best_coefs != 0)]
  
  # Remove intercept from selected features
  selected_features <- selected_features[selected_features != "(Intercept)"]
  
  # Extract the suffix from the model name (after the last underscore)
  model_name <- deparse(substitute(lasso_model))
  model_suffix <- sub(".*_(.*)", "\\1", model_name)
  
  # Create a new data frame with the selected features
  train_selected <- data[, c(outcome_col, selected_features)]
  
  # Create the formula for the final model using the selected features
  final_formula <- as.formula(paste(outcome_col, "~", paste(selected_features, collapse = " + ")))
  
  # Fit the final model (use lm, or modify depending on your desired model type)
  final_model <- lm(final_formula, data = train_selected)
  
  # Store the final model with a dynamic name
  assign(paste0("final_model_", model_suffix), final_model, envir = .GlobalEnv)
  
  # Extract summary information of the model
  mymodsum <- summary(final_model)
  mod_coef_df <- as.data.frame(mymodsum[["coefficients"]])  # Extract coefficients from model summary
  
  # prediction on reserved validation samples
  test <- test_data[complete.cases(test_data), ]  # remove rows with missing values
  
  # Grab only the columns that we have coefficients for (starting at index 2 removes the intercept)
  test_vars <- test %>% dplyr::select(rownames(mod_coef_df)[2:nrow(mod_coef_df)])
  test_vars$Intercept <- 1  # add a column for the intercept
  test_vars <- test_vars %>% dplyr::select(Intercept, everything())
  
  # Predict the risk scores without covariates
  pred_risk_scores <- as.matrix(test_vars) %*% as.matrix(mod_coef_df$Estimate)
  
  # Combine the predicted and actual values into a data frame
  pred_df <- as.data.frame(cbind(
    test$subject_id,
    test$time,
    test$bmi_bL_6m,
    scale(pred_risk_scores)  # Scale the predictions if needed
  ))
  
  # Rename the columns for clarity
  colnames(pred_df) <- c("subject_id", "time", "actual", "predicted")
  
  # Return the prediction data frame as the result
  return(pred_df)
}

####
best_lasso_predictions <- function(lasso_model, data, outcome_col, test_data, s = NULL) {  
  # Ensure the model is of class 'train' and that 'finalModel' is available
  if (!inherits(lasso_model, "train")) {
    stop("The provided object is not a valid 'train' model object.")
  }
  
  # Extract the best lambda value based on cross-validation if s is NULL
  best_lambda <- if (is.null(s)) {
    lasso_model$bestTune$lambda
  } else {
    s  # Use the provided s if not NULL
  }
  
  # Extract coefficients for the final model at the selected lambda (s)
  best_coefs <- coef(lasso_model$finalModel, s = best_lambda)
  
  # Get selected features (non-zero coefficients)
  selected_features <- rownames(best_coefs)[which(best_coefs != 0)]
  
  # Remove intercept from selected features
  selected_features <- selected_features[selected_features != "(Intercept)"]
  
  # Extract the suffix from the model name (after the last underscore)
  model_name <- deparse(substitute(lasso_model))
  model_suffix <- sub(".*_(.*)", "\\1", model_name)
  
  # Create a new data frame with the selected features
  train_selected <- data[, c(outcome_col, selected_features)]
  
  # Create the formula for the final model using the selected features
  final_formula <- as.formula(paste(outcome_col, "~", paste(selected_features, collapse = " + ")))
  
  # Fit the final model (use lm, or modify depending on your desired model type)
  final_model <- lm(final_formula, data = train_selected)
  
  # Store the final model with a dynamic name
  assign(paste0("final_model_", model_suffix), final_model, envir = .GlobalEnv)
  
  # Extract summary information of the model
  mymodsum <- summary(final_model)
  mod_coef_df <- as.data.frame(mymodsum[["coefficients"]])  # Extract coefficients from model summary
  
  # Prediction on reserved validation samples
  test <- test_data[complete.cases(test_data), ]  # Remove rows with missing values
  
  # Grab only the columns that we have coefficients for (starting at index 2 removes the intercept)
  test_vars <- test %>% dplyr::select(rownames(mod_coef_df)[2:nrow(mod_coef_df)])
  test_vars$Intercept <- 1  # Add a column for the intercept
  test_vars <- test_vars %>% dplyr::select(Intercept, everything())
  
  # Predict the risk scores without covariates
  pred_risk_scores <- as.matrix(test_vars) %*% as.matrix(mod_coef_df$Estimate)
  
  # Combine the predicted and actual values into a data frame
  pred_df <- as.data.frame(cbind(
    test$subject_id,
    test$time,
    test$bmi_bL_6m,
    scale(pred_risk_scores)  # Scale the predictions if needed
  ))
  
  # Rename the columns for clarity
  colnames(pred_df) <- c("subject_id", "time", "actual", "predicted")
  
  # Return the prediction data frame as the result
  return(pred_df)
}

best_lasso_predextra <- function(lasso_model, data, outcome_col, test_data, extra_var, s = NULL) {   
  # Ensure the model is of class 'train' and that 'finalModel' is available 
  if (!inherits(lasso_model, "train")) { 
    stop("The provided object is not a valid 'train' model object.") 
  } 
  
  # Extract the best lambda value based on cross-validation if s is NULL
  best_lambda <- if (is.null(s)) { 
    lasso_model$bestTune$lambda 
  } else { 
    s  # Use the provided s if not NULL
  } 
  
  # Extract coefficients for the final model at the selected lambda (s)
  best_coefs <- coef(lasso_model$finalModel, s = best_lambda) 
  
  # Get selected features (non-zero coefficients)
  selected_features <- rownames(best_coefs)[which(best_coefs != 0)] 
  
  # Remove intercept from selected features
  selected_features <- selected_features[selected_features != "(Intercept)"] 
  
  # Extract the suffix from the model name (after the last underscore)
  model_name <- deparse(substitute(lasso_model)) 
  model_suffix <- sub(".*_(.*)", "\\1", model_name) 
  
  # Create a new data frame with the selected features
  train_selected <- data[, c(outcome_col, selected_features)] 
  
  # Create the formula for the final model using the selected features
  final_formula <- as.formula(paste(outcome_col, "~", paste(selected_features, collapse = " + "))) 
  
  # Fit the final model (use lm, or modify depending on your desired model type)
  final_model <- lm(final_formula, data = train_selected) 
  
  # Store the final model with a dynamic name
  assign(paste0("final_model_", model_suffix), final_model, envir = .GlobalEnv) 
  
  # Extract summary information of the model
  mymodsum <- summary(final_model) 
  mod_coef_df <- as.data.frame(mymodsum[["coefficients"]])  # Extract coefficients from model summary 
  
  # Prediction on reserved validation samples
  test <- test_data[complete.cases(test_data), ]  # Remove rows with missing values
  
  # Grab only the columns that we have coefficients for (starting at index 2 removes the intercept)
  test_vars <- test %>% dplyr::select(rownames(mod_coef_df)[2:nrow(mod_coef_df)]) 
  test_vars$Intercept <- 1  # Add a column for the intercept
  test_vars <- test_vars %>% dplyr::select(Intercept, everything()) 
  
  # Predict the risk scores without covariates
  pred_risk_scores <- as.matrix(test_vars) %*% as.matrix(mod_coef_df$Estimate) 
  
  # Combine the predicted and actual values into a data frame
  pred_df <- as.data.frame(cbind(
    test$subject_id,
    test$time,
    test[[extra_var]],  # Use the extra_var instead of "bmi_bL_6m"
    scale(pred_risk_scores)  # Scale the predictions if needed
  )) 
  
  # Rename the columns for clarity
  colnames(pred_df) <- c("subject_id", "time", "actual", "predicted") 
  
  # Return the prediction data frame as the result
  return(pred_df) 
}


### Get RF predictions
best_rf_predictions <- function(rf_model, data, outcome_col, test_data) {  
  # Ensure the model is of class 'train' and that 'finalModel' is available
  if (!inherits(rf_model, "train")) {
    stop("The provided object is not a valid 'train' model object.")
  }
  
  # Extract the best mtry value from the model
  best_mtry <- rf_model$bestTune$mtry
  
  # Extract the Random Forest model (final model)
  rf_final_model <- rf_model$finalModel
  
  # Ensure that the test data is formatted similarly to the training data (excluding outcome column)
  test_data_prepared <- test_data[, -which(names(test_data) %in% c(outcome_col, "subject_id"))]
  
  # Predict using the Random Forest model
  predictions <- predict(rf_final_model, newdata = test_data_prepared)
  
  # Combine the predictions with actual values from the test set
  pred_df <- data.frame(
    subject_id = test_data$subject_id, 
    time = test_data$time, 
    actual = test_data[[outcome_col]], 
    predicted = predictions
  )
  
  # Return the prediction data frame
  return(pred_df)
}