Sex-Specific-ACL-Injury / Git / [074d07] /README_for_Sigurdsson_2019

Datasets:
GaryG/
Sex-Specific-ACL-Injury
Downloads: 1
[074d07]: / README_for_Sigurdsson_2019_ValgusMomentClusterAnalysis.R
History
Download this file
404 lines (317 with data), 17.3 kB

##################
# Thank you for downloading this data set and analysis script. 
# This document is intended for researchers who
# are unfamiliar with R and it's coding structure
# to allow for easy reproducibility of results of a 
# paper published online in (((YEAR)))

# If you have no interest in reading the script but just want to
# generate the results, press Ctrl+A and then Ctrl+Enter,
# You will be prompted for the folder where you stored the file called 
# article2minimumdata.csv and the rest is automatic.

# If you want to read the script and are unfamiliar with R, here's
# the basics:

# All lines following the "#" sign are comments. This means they do not
# contain any code that is run by the system. This is used by me,
# the first author, to explain the procedure contained herein.

# In order to run this script unmodified, the file containing the data set,
# ("Article2MinimumData.csv") MUST be located in the same folder
# as the current project. 

# For more detailed explanations about the methods used,
# see the published paper available at (((SOURCE)))

# If you are familiar with R and just want to skim over
# the script, feel free to ignore the comments.


#Installs and loads required packages
#####--------
install.packages("factoextra", "cluster", "TSclust",
                 "NbClust", "tidyverse", "TeachingDemos",
                 "data.table")
library(factoextra)
library(cluster)
library(TSclust)
library(NbClust)
library(tidyverse)
library(data.table)
library(TeachingDemos)
#####---------

# Setting a seed such that any random process involved in selecting
# initial conditions for clustering always results in the same choice
# This ensures that the graphs generated here are identical to those
# in the article
# The seed is re-set liberally before any function with a random component
set.seed(1337)


# Prompts user for the folder where the data file is stored
setwd(choose.dir(default = "", caption = "Select folder where the data file is stored"))

# Reads the comma-seperated-values file into a data.table
CleanDT <- fread("Sigurdsson_2019_ValgusMomentClusterAnalysis.csv")
CleanDT[, TimePercent := TimePercent * 100]

# This is a custom built interpolation function to increase the length of all
# time series' to lengths equal to the longest + 2 and performs the 
# data transformation

# interpolate function that takes the sign(diff()) of each series
interpolate <- function(x){ 
  rmNull <- function(x) {
    x <- Filter(Negate(is.null), x)
    lapply(x, function(x) if (is.list(x)) rmNull(x) else x)
  }  

  lengthy <- max(x$Frames)
  lengthy <- lengthy + 2
  

  intlist <- list()
  Counter <- 1
  for(i in unique(x$identifier)){
    
    if(length(na.omit(x[identifier == i]$ZValue)) > 3){ 
      
      if(lengthy > nrow(na.omit(x[identifier == i]))){
        
        intlist[[Counter]] <- approx(x[identifier == i]$ZValue, n = lengthy)[2]
        intlist[[Counter]][[1]] <- sign(diff(intlist[[Counter]][[1]]))
        names(intlist[[Counter]]) <- i
        Counter <- Counter+1
      }}
    
  }
  
  intlist <- rmNull(intlist)
  intframe <- do.call(cbind, intlist)
  eachname <- c()
  
  for(i in 1:length(intlist)){
    eachname <- c(eachname, names(intlist[[i]]))  
  }
  
  intframe <- as.matrix(intframe)
  intframe = setNames(intframe, eachname)
  return(intframe)
}
InterpolatedSigns <- interpolate(x = CleanDT)



# Next a distance matrix is created for the standardized curves
Sign_distance_matrix <- diss(InterpolatedSigns, "EUCL")

# The next 2 lines create a folder in which to store graphs generated
pathy <- paste(getwd(), "/ClusterRepetition", "/", sep = "")
dir.create(pathy, showWarnings = FALSE, recursive = TRUE)

# This generates a heat map from the distance matrix and
# saves it on the computer. The plot is located in a
# subfolder of the project directory called ClusterAnalysisRepetition
############ HEAT MAP ############## 
plote <-  fviz_dist(Sign_distance_matrix, order = TRUE, show_labels = FALSE, 
                    gradient = list(low = "red", mid = "white", high = "blue"))
ggsave(filename = paste("1_", "HeatMap", "Signs.jpg", sep = ""), plot = plote, device = "jpeg", path = pathy,
       scale = 0.2, width = 1153, height = 700, units = c("mm"), dpi = 1200, limitsize = TRUE)


Cluster_Sign <- NbClust(diss = Sign_distance_matrix, 
                             distance = NULL, 
                             min.nc = 2, 
                             max.nc = 39, 
                             method = "ward.D2", 
                             index = "cindex")


ClusterCIndex <- c(paste("Value Index: ", Cluster_Sign$Best.nc[2], sep = ""))

#  Creates the C-Index plots
jpeg(paste(pathy, "signs", "CIndexEuclidean.jpg", sep = ""))
plot(x = names(Cluster_Sign$All.index), y = Cluster_Sign$All.index,
     xlab = "Number of Clusters",
     ylab = "C-Index",
     main = "C-Index by cluster numbers",
     type = "b")
dev.off()

ClusterName <- paste("SignEucl", sep = "")
for(i in 1:max(Cluster_Sign$Best.partition)){
  TheCluster <- names(Sign_distance_matrix)[Cluster_Sign$Best.partition == i]
  CleanDT[identifier %in% TheCluster, "SignEucl" := i]}

# For cleaner graphs, the time series with insufficient numbers of frames
# to be extrapolated are removed - they have NA as the cluster
# A total of 8 rows and 4 identifiers are removed in this step
CleanDT <- CleanDT[!is.na(SignEucl)]


# This creates the plot showing the 39 shapes
plote <- ggplot(CleanDT) + aes(x = TimePercent, y = ZValue) + 
  geom_smooth(level = 0.95) + facet_wrap(~SignEucl, scales = "free") + theme_classic() + 
  ggtitle(paste("Sign lotsa Clusters", sep = "")) +
  xlab("Stance phase duration (%)") +
  ylab(paste("Knee Valgus Moment", " (", "Scaled (Z-Score)", ")", sep = ""))

ggsave(filename = paste("Initial50Clusters.jpg", sep = ""), plot = plote, device = "jpeg", path = pathy,
       scale = 0.2, width = 1153, height = 700, units = c("mm"), dpi = 1200, limitsize = TRUE)


# extracts 20 curves from each cluster for visual inspection
for(i in unique(CleanDT$SignEucl)){
    pathy <- paste(getwd(),"/ClusterRepetition/" , i, "/", sep = "")
    dir.create(pathy, showWarnings = FALSE, recursive = TRUE)
    individuals <- unique(CleanDT[SignEucl == i]$identifier)
    set.seed(1337)
    randomindividuals <- sample(1:length(individuals), if(length(individuals) < 20){length(individuals)}else{20})
    for(skie in individuals[randomindividuals]){
      ggplot(CleanDT[identifier == skie]) +
        geom_point(aes(x = TimePercent, y = value)) +
        theme_classic() + 
        geom_hline(aes(yintercept = 0, color = "y-axis zero"))
      labs(x = "Stance Phase (%)", y = "Nm/kg", title = "Valgus moment")
      ggsave(filename = paste(skie, ".jpg", sep = ""), plot = last_plot(), device = "jpeg", path = pathy,
             scale = 0.2, width = 550, height = 350, units = c("mm"), dpi = 300, limitsize = TRUE)
    }}


# Describing the shapes:
# These are assigned based on visual inspection. A total of 1025 unique shapes are identified
# some of which form a cluster of 1 where every cluster is exactly the same. The C-Index
# curve shows a gradual decrease in C-Index, and a cut-off of 0.05 was decided upon. 
EarlyPeaks <- c(1, 3, 7, 8, 9, 10, 16, 17, 18, 22, 24, 28, 35, 37, 39)
Peaks <- c(11, 15, 19, 25, 32, 34)
Downslopes <- c(2, 4, 5, 6, 14)
Upslopes <- c(13, 26, 29, 30, 31)
EarlyTroughs <- c(20, 33)
Troughs <- c(12, 21, 23, 27, 36, 38) 

CleanDT[SignEucl %in% EarlyPeaks, "SignShape" := "EarlyPeaks"]
CleanDT[SignEucl %in% Peaks, "SignShape" := "Peaks"]
CleanDT[SignEucl %in% Downslopes, "SignShape" := "Downslopes"]
CleanDT[SignEucl %in% Upslopes, "SignShape" := "Upslopes"]
CleanDT[SignEucl %in% EarlyTroughs, "SignShape" := "EarlyTroughs"]
CleanDT[SignEucl %in% Troughs, "SignShape" := "Troughs"]

pathy <- paste(getwd(), "/ClusterRepetition", "/", sep = "")
plote <- ggplot(CleanDT) + aes(x = TimePercent, y = value) + 
  geom_smooth(level = 0.95) + facet_wrap(~SignShape, scales = "free") + theme_classic() + 
  ggtitle(paste("Sign lotsa Clusters", sep = "")) +
  xlab("Stance phase duration (%)") +
  ylab(paste("Knee Valgus Moment", " (", "Scaled (Z-Score)", ")", sep = ""))

ggsave(filename = paste("Initial6SignShapes.jpg", sep = ""), plot = plote, device = "jpeg", path = pathy,
       scale = 0.2, width = 1153, height = 700, units = c("mm"), dpi = 1200, limitsize = TRUE)

# Extracts 20 curves from each of the 6 basic shapes. These are for visual inspection
# to ensure the clustering process is working. 
for(i in unique(CleanDT$SignShape)){
    pathy <- paste(getwd(),"/ClusterRepetition/" , i, "/", sep = "")
  dir.create(pathy, showWarnings = FALSE, recursive = TRUE)
  individuals <- unique(CleanDT[SignShape == i]$identifier)
  set.seed(1337)
  randomindividuals <- sample(1:length(individuals), if(length(individuals) < 20){length(individuals)}else{20})
  for(skie in individuals[randomindividuals]){
    ggplot(CleanDT[identifier == skie]) +
      geom_point(aes(x = TimePercent, y = value)) +
      theme_classic() + 
      geom_hline(aes(yintercept = 0, color = "y-axis zero"))
    labs(x = "Stance Phase (%)", y = "Nm/kg", title = "Valgus moment")
    ggsave(filename = paste(skie, ".jpg", sep = ""), plot = last_plot(), device = "jpeg", path = pathy,
           scale = 0.2, width = 550, height = 350, units = c("mm"), dpi = 300, limitsize = TRUE)
  }}

## Second clustering step
# Each of the 6 shapes are clustered based on the magnitude of the curves to from 2-4 clusters
# for each shape. 

pathy <- paste(getwd(), "/ClusterRepetition", "/", sep = "")
interpolate <- function(x){ 
  rmNull <- function(x) {
    x <- Filter(Negate(is.null), x)
    lapply(x, function(x) if (is.list(x)) rmNull(x) else x)}  
  lengthy <- max(x$Frames)
  lengthy <- lengthy + 2
  
  intlist <- list()
  Counter <- 1
  for(i in unique(x$identifier)){
    
    if(length(na.omit(x[identifier == i]$value)) > 3){ 
      
      if(lengthy > nrow(na.omit(x[identifier == i]))){
        
        intlist[[Counter]] <- approx(x[identifier == i]$value, n = lengthy)[2]
        names(intlist[[Counter]]) <- i
        Counter <- Counter+1
      }}
    
  }
  

  intlist <- rmNull(intlist)
  intframe <- do.call(cbind, intlist)
  eachname <- c()
  
  for(i in 1:length(intlist)){
    eachname <- c(eachname, names(intlist[[i]]))  
  }
  
  intframe <- as.matrix(intframe)
  intframe = setNames(intframe, eachname)
  return(intframe)
}

# This is the loop
# Note: each sub-step of the clustering process overwrites the previous step
for(curvegroups in unique(CleanDT$SignShape)){
  # Each cluster is interpolated seperately
  InterpolatedValues <- interpolate(x = CleanDT[SignShape == curvegroups])
  
  ## Four distance matrixes are calculated
  Euclidean_distance_matrix <- diss(InterpolatedValues, "EUCL")
  
  # -----------
  ############ HEAT MAP ############## 
  plote <-  fviz_dist(Euclidean_distance_matrix, 
                      order = TRUE, 
                      show_labels = FALSE, 
                      gradient = list(low = "red", mid = "white", high = "blue"))
  
  ggsave(filename = paste("2_HeatMap", curvegroups, "Euclid.jpg", sep = ""), 
         plot = plote, 
         device = "jpeg", 
         path = pathy,
         scale = 0.2, 
         width = 1153, 
         height = 700, 
         units = c("mm"), 
         dpi = 1200, 
         limitsize = TRUE)
  set.seed(1337)
  Cluster_euclidean <- NbClust(diss = Euclidean_distance_matrix, 
                               distance = NULL, 
                               min.nc = 2, 
                               max.nc = 4, 
                               method = "ward.D2", 
                               index = "cindex")
  
  
  ClusterCIndex <- c(paste("Value Index: ", Cluster_euclidean$Best.nc[2], sep = ""))
  
  #  Creates the C-Index plots
  jpeg(paste(pathy, curvegroups, "CIndexEuclidean.jpg", sep = ""))
  plot(x = names(Cluster_euclidean$All.index), y = Cluster_euclidean$All.index,
       xlab = "Number of Clusters",
       ylab = "C-Index",
       main = "C-Index by cluster numbers",
       type = "b")
  dev.off()
  
  ClusterName <- paste("Euclidean", sep = "")
  
  # updates the data.table with the new sub-clusters
  for(i in 1:max(Cluster_euclidean$Best.partition)){
    TheCluster <- names(InterpolatedValues[Cluster_euclidean$Best.partition == i])
    CleanDT[identifier %in% TheCluster, "Euclidean" := i]}
  
  # Plots the sub-clusters created
  plote <- ggplot(CleanDT[SignShape %in% curvegroups]) + 
    aes(x = TimePercent, y = value) + 
    geom_smooth(level = 0.95) + facet_wrap(~Euclidean) + theme_classic() + 
    ggtitle(paste(ClusterName, sep = "")) +
    xlab("Stance Phase (%)") +
    ylab(paste(ClusterName, " (", "Nm/kg", ")", sep = ""))
  
  ggsave(filename = paste(curvegroups, "_OverViewEucl_TwoStep.jpg", sep = ""), plot = plote, device = "jpeg", path = pathy,
         scale = 0.2, width = 1153, height = 700, units = c("mm"), dpi = 1200, limitsize = TRUE)
  
}

# Each of the sub-clusters is then identified according to the magnitude
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 1, "CurveShape" := "EarlyPeak"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 1, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 2, "CurveShape" := "EarlyPeak"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 2, "CurveMagnitude" := "Medium"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 3, "CurveShape" := "EarlyPeak"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 3, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 4, "CurveShape" := "EarlyPeak"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 4, "CurveMagnitude" := "Large"]


CleanDT[SignShape == "Downslopes" & Euclidean == 1, "CurveShape" := "Downslopes"]
CleanDT[SignShape == "Downslopes" & Euclidean == 1, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "Downslopes" & Euclidean == 2, "CurveShape" := "Downslopes"]
CleanDT[SignShape == "Downslopes" & Euclidean == 2, "CurveMagnitude" := "Large"]

CleanDT[SignShape == "Peaks" & Euclidean == 1, "CurveShape" := "Peaks"]
CleanDT[SignShape == "Peaks" & Euclidean == 1, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "Peaks" & Euclidean == 2, "CurveShape" := "Peaks"]
CleanDT[SignShape == "Peaks" & Euclidean == 2, "CurveMagnitude" := "Large"]

CleanDT[SignShape == "Troughs" & Euclidean == 1, "CurveShape" := "Troughs"]
CleanDT[SignShape == "Troughs" & Euclidean == 1, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "Troughs" & Euclidean == 2, "CurveShape" := "Troughs"]
CleanDT[SignShape == "Troughs" & Euclidean == 2, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "Troughs" & Euclidean == 3, "CurveShape" := "Troughs"]
CleanDT[SignShape == "Troughs" & Euclidean == 3, "CurveMagnitude" := "Medium"]
CleanDT[SignShape == "Troughs" & Euclidean == 4, "CurveShape" := "Troughs"]
CleanDT[SignShape == "Troughs" & Euclidean == 4, "CurveMagnitude" := "Large"]

CleanDT[SignShape == "Upslopes" & Euclidean == 1, "CurveShape" := "Upslopes"]
CleanDT[SignShape == "Upslopes" & Euclidean == 1, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "Upslopes" & Euclidean == 2, "CurveShape" := "Upslopes"]
CleanDT[SignShape == "Upslopes" & Euclidean == 2, "CurveMagnitude" := "Large"]

CleanDT[SignShape == "EarlyTroughs" & Euclidean == 1, "CurveShape" := "EarlyTroughs"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 1, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 2, "CurveShape" := "EarlyTroughs"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 2, "CurveMagnitude" := "Medium"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 3, "CurveShape" := "EarlyTroughs"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 3, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 4, "CurveShape" := "EarlyTroughs"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 4, "CurveMagnitude" := "Large"]



# Now up to 100 curves are plotted in each shape for visual inspection

for(i in unique(CleanDT$SignShape)){
    pathy <- paste(getwd(),"/ClusterRepetition/" , i, "/", sep = "")
  dir.create(pathy, showWarnings = FALSE, recursive = TRUE)
  individuals <- unique(CleanDT[SignShape == i]$identifier)
  set.seed(1337)
  randomindividuals <- sample(1:length(individuals), if(length(individuals) < 100){length(individuals)}else{100})
  for(skie in individuals[randomindividuals]){
    ggplot(CleanDT[identifier == skie]) +
      geom_point(aes(x = TimePercent, y = value)) +
      theme_classic() + 
      geom_hline(aes(yintercept = 0, color = "y-axis zero"))
    labs(x = "Stance Phase (%)", y = "Nm/kg", title = "Valgus moment")
    ggsave(filename = paste(skie, ".jpg", sep = ""), plot = last_plot(), device = "jpeg", path = pathy,
           scale = 0.2, width = 550, height = 350, units = c("mm"), dpi = 300, limitsize = TRUE)
  }}