##################
# Thank you for downloading this data set and analysis script.
# This document is intended for researchers who
# are unfamiliar with R and it's coding structure
# to allow for easy reproducibility of results of a
# paper published online in (((YEAR)))
# If you have no interest in reading the script but just want to
# generate the results, press Ctrl+A and then Ctrl+Enter,
# You will be prompted for the folder where you stored the file called
# article2minimumdata.csv and the rest is automatic.
# If you want to read the script and are unfamiliar with R, here's
# the basics:
# All lines following the "#" sign are comments. This means they do not
# contain any code that is run by the system. This is used by me,
# the first author, to explain the procedure contained herein.
# In order to run this script unmodified, the file containing the data set,
# ("Article2MinimumData.csv") MUST be located in the same folder
# as the current project.
# For more detailed explanations about the methods used,
# see the published paper available at (((SOURCE)))
# If you are familiar with R and just want to skim over
# the script, feel free to ignore the comments.
#Installs and loads required packages
#####--------
install.packages("factoextra", "cluster", "TSclust",
"NbClust", "tidyverse", "TeachingDemos",
"data.table")
library(factoextra)
library(cluster)
library(TSclust)
library(NbClust)
library(tidyverse)
library(data.table)
library(TeachingDemos)
#####---------
# Setting a seed such that any random process involved in selecting
# initial conditions for clustering always results in the same choice
# This ensures that the graphs generated here are identical to those
# in the article
# The seed is re-set liberally before any function with a random component
set.seed(1337)
# Prompts user for the folder where the data file is stored
setwd(choose.dir(default = "", caption = "Select folder where the data file is stored"))
# Reads the comma-seperated-values file into a data.table
CleanDT <- fread("Sigurdsson_2019_ValgusMomentClusterAnalysis.csv")
CleanDT[, TimePercent := TimePercent * 100]
# This is a custom built interpolation function to increase the length of all
# time series' to lengths equal to the longest + 2 and performs the
# data transformation
# interpolate function that takes the sign(diff()) of each series
interpolate <- function(x){
rmNull <- function(x) {
x <- Filter(Negate(is.null), x)
lapply(x, function(x) if (is.list(x)) rmNull(x) else x)
}
lengthy <- max(x$Frames)
lengthy <- lengthy + 2
intlist <- list()
Counter <- 1
for(i in unique(x$identifier)){
if(length(na.omit(x[identifier == i]$ZValue)) > 3){
if(lengthy > nrow(na.omit(x[identifier == i]))){
intlist[[Counter]] <- approx(x[identifier == i]$ZValue, n = lengthy)[2]
intlist[[Counter]][[1]] <- sign(diff(intlist[[Counter]][[1]]))
names(intlist[[Counter]]) <- i
Counter <- Counter+1
}}
}
intlist <- rmNull(intlist)
intframe <- do.call(cbind, intlist)
eachname <- c()
for(i in 1:length(intlist)){
eachname <- c(eachname, names(intlist[[i]]))
}
intframe <- as.matrix(intframe)
intframe = setNames(intframe, eachname)
return(intframe)
}
InterpolatedSigns <- interpolate(x = CleanDT)
# Next a distance matrix is created for the standardized curves
Sign_distance_matrix <- diss(InterpolatedSigns, "EUCL")
# The next 2 lines create a folder in which to store graphs generated
pathy <- paste(getwd(), "/ClusterRepetition", "/", sep = "")
dir.create(pathy, showWarnings = FALSE, recursive = TRUE)
# This generates a heat map from the distance matrix and
# saves it on the computer. The plot is located in a
# subfolder of the project directory called ClusterAnalysisRepetition
############ HEAT MAP ##############
plote <- fviz_dist(Sign_distance_matrix, order = TRUE, show_labels = FALSE,
gradient = list(low = "red", mid = "white", high = "blue"))
ggsave(filename = paste("1_", "HeatMap", "Signs.jpg", sep = ""), plot = plote, device = "jpeg", path = pathy,
scale = 0.2, width = 1153, height = 700, units = c("mm"), dpi = 1200, limitsize = TRUE)
Cluster_Sign <- NbClust(diss = Sign_distance_matrix,
distance = NULL,
min.nc = 2,
max.nc = 39,
method = "ward.D2",
index = "cindex")
ClusterCIndex <- c(paste("Value Index: ", Cluster_Sign$Best.nc[2], sep = ""))
# Creates the C-Index plots
jpeg(paste(pathy, "signs", "CIndexEuclidean.jpg", sep = ""))
plot(x = names(Cluster_Sign$All.index), y = Cluster_Sign$All.index,
xlab = "Number of Clusters",
ylab = "C-Index",
main = "C-Index by cluster numbers",
type = "b")
dev.off()
ClusterName <- paste("SignEucl", sep = "")
for(i in 1:max(Cluster_Sign$Best.partition)){
TheCluster <- names(Sign_distance_matrix)[Cluster_Sign$Best.partition == i]
CleanDT[identifier %in% TheCluster, "SignEucl" := i]}
# For cleaner graphs, the time series with insufficient numbers of frames
# to be extrapolated are removed - they have NA as the cluster
# A total of 8 rows and 4 identifiers are removed in this step
CleanDT <- CleanDT[!is.na(SignEucl)]
# This creates the plot showing the 39 shapes
plote <- ggplot(CleanDT) + aes(x = TimePercent, y = ZValue) +
geom_smooth(level = 0.95) + facet_wrap(~SignEucl, scales = "free") + theme_classic() +
ggtitle(paste("Sign lotsa Clusters", sep = "")) +
xlab("Stance phase duration (%)") +
ylab(paste("Knee Valgus Moment", " (", "Scaled (Z-Score)", ")", sep = ""))
ggsave(filename = paste("Initial50Clusters.jpg", sep = ""), plot = plote, device = "jpeg", path = pathy,
scale = 0.2, width = 1153, height = 700, units = c("mm"), dpi = 1200, limitsize = TRUE)
# extracts 20 curves from each cluster for visual inspection
for(i in unique(CleanDT$SignEucl)){
pathy <- paste(getwd(),"/ClusterRepetition/" , i, "/", sep = "")
dir.create(pathy, showWarnings = FALSE, recursive = TRUE)
individuals <- unique(CleanDT[SignEucl == i]$identifier)
set.seed(1337)
randomindividuals <- sample(1:length(individuals), if(length(individuals) < 20){length(individuals)}else{20})
for(skie in individuals[randomindividuals]){
ggplot(CleanDT[identifier == skie]) +
geom_point(aes(x = TimePercent, y = value)) +
theme_classic() +
geom_hline(aes(yintercept = 0, color = "y-axis zero"))
labs(x = "Stance Phase (%)", y = "Nm/kg", title = "Valgus moment")
ggsave(filename = paste(skie, ".jpg", sep = ""), plot = last_plot(), device = "jpeg", path = pathy,
scale = 0.2, width = 550, height = 350, units = c("mm"), dpi = 300, limitsize = TRUE)
}}
# Describing the shapes:
# These are assigned based on visual inspection. A total of 1025 unique shapes are identified
# some of which form a cluster of 1 where every cluster is exactly the same. The C-Index
# curve shows a gradual decrease in C-Index, and a cut-off of 0.05 was decided upon.
EarlyPeaks <- c(1, 3, 7, 8, 9, 10, 16, 17, 18, 22, 24, 28, 35, 37, 39)
Peaks <- c(11, 15, 19, 25, 32, 34)
Downslopes <- c(2, 4, 5, 6, 14)
Upslopes <- c(13, 26, 29, 30, 31)
EarlyTroughs <- c(20, 33)
Troughs <- c(12, 21, 23, 27, 36, 38)
CleanDT[SignEucl %in% EarlyPeaks, "SignShape" := "EarlyPeaks"]
CleanDT[SignEucl %in% Peaks, "SignShape" := "Peaks"]
CleanDT[SignEucl %in% Downslopes, "SignShape" := "Downslopes"]
CleanDT[SignEucl %in% Upslopes, "SignShape" := "Upslopes"]
CleanDT[SignEucl %in% EarlyTroughs, "SignShape" := "EarlyTroughs"]
CleanDT[SignEucl %in% Troughs, "SignShape" := "Troughs"]
pathy <- paste(getwd(), "/ClusterRepetition", "/", sep = "")
plote <- ggplot(CleanDT) + aes(x = TimePercent, y = value) +
geom_smooth(level = 0.95) + facet_wrap(~SignShape, scales = "free") + theme_classic() +
ggtitle(paste("Sign lotsa Clusters", sep = "")) +
xlab("Stance phase duration (%)") +
ylab(paste("Knee Valgus Moment", " (", "Scaled (Z-Score)", ")", sep = ""))
ggsave(filename = paste("Initial6SignShapes.jpg", sep = ""), plot = plote, device = "jpeg", path = pathy,
scale = 0.2, width = 1153, height = 700, units = c("mm"), dpi = 1200, limitsize = TRUE)
# Extracts 20 curves from each of the 6 basic shapes. These are for visual inspection
# to ensure the clustering process is working.
for(i in unique(CleanDT$SignShape)){
pathy <- paste(getwd(),"/ClusterRepetition/" , i, "/", sep = "")
dir.create(pathy, showWarnings = FALSE, recursive = TRUE)
individuals <- unique(CleanDT[SignShape == i]$identifier)
set.seed(1337)
randomindividuals <- sample(1:length(individuals), if(length(individuals) < 20){length(individuals)}else{20})
for(skie in individuals[randomindividuals]){
ggplot(CleanDT[identifier == skie]) +
geom_point(aes(x = TimePercent, y = value)) +
theme_classic() +
geom_hline(aes(yintercept = 0, color = "y-axis zero"))
labs(x = "Stance Phase (%)", y = "Nm/kg", title = "Valgus moment")
ggsave(filename = paste(skie, ".jpg", sep = ""), plot = last_plot(), device = "jpeg", path = pathy,
scale = 0.2, width = 550, height = 350, units = c("mm"), dpi = 300, limitsize = TRUE)
}}
## Second clustering step
# Each of the 6 shapes are clustered based on the magnitude of the curves to from 2-4 clusters
# for each shape.
pathy <- paste(getwd(), "/ClusterRepetition", "/", sep = "")
interpolate <- function(x){
rmNull <- function(x) {
x <- Filter(Negate(is.null), x)
lapply(x, function(x) if (is.list(x)) rmNull(x) else x)}
lengthy <- max(x$Frames)
lengthy <- lengthy + 2
intlist <- list()
Counter <- 1
for(i in unique(x$identifier)){
if(length(na.omit(x[identifier == i]$value)) > 3){
if(lengthy > nrow(na.omit(x[identifier == i]))){
intlist[[Counter]] <- approx(x[identifier == i]$value, n = lengthy)[2]
names(intlist[[Counter]]) <- i
Counter <- Counter+1
}}
}
intlist <- rmNull(intlist)
intframe <- do.call(cbind, intlist)
eachname <- c()
for(i in 1:length(intlist)){
eachname <- c(eachname, names(intlist[[i]]))
}
intframe <- as.matrix(intframe)
intframe = setNames(intframe, eachname)
return(intframe)
}
# This is the loop
# Note: each sub-step of the clustering process overwrites the previous step
for(curvegroups in unique(CleanDT$SignShape)){
# Each cluster is interpolated seperately
InterpolatedValues <- interpolate(x = CleanDT[SignShape == curvegroups])
## Four distance matrixes are calculated
Euclidean_distance_matrix <- diss(InterpolatedValues, "EUCL")
# -----------
############ HEAT MAP ##############
plote <- fviz_dist(Euclidean_distance_matrix,
order = TRUE,
show_labels = FALSE,
gradient = list(low = "red", mid = "white", high = "blue"))
ggsave(filename = paste("2_HeatMap", curvegroups, "Euclid.jpg", sep = ""),
plot = plote,
device = "jpeg",
path = pathy,
scale = 0.2,
width = 1153,
height = 700,
units = c("mm"),
dpi = 1200,
limitsize = TRUE)
set.seed(1337)
Cluster_euclidean <- NbClust(diss = Euclidean_distance_matrix,
distance = NULL,
min.nc = 2,
max.nc = 4,
method = "ward.D2",
index = "cindex")
ClusterCIndex <- c(paste("Value Index: ", Cluster_euclidean$Best.nc[2], sep = ""))
# Creates the C-Index plots
jpeg(paste(pathy, curvegroups, "CIndexEuclidean.jpg", sep = ""))
plot(x = names(Cluster_euclidean$All.index), y = Cluster_euclidean$All.index,
xlab = "Number of Clusters",
ylab = "C-Index",
main = "C-Index by cluster numbers",
type = "b")
dev.off()
ClusterName <- paste("Euclidean", sep = "")
# updates the data.table with the new sub-clusters
for(i in 1:max(Cluster_euclidean$Best.partition)){
TheCluster <- names(InterpolatedValues[Cluster_euclidean$Best.partition == i])
CleanDT[identifier %in% TheCluster, "Euclidean" := i]}
# Plots the sub-clusters created
plote <- ggplot(CleanDT[SignShape %in% curvegroups]) +
aes(x = TimePercent, y = value) +
geom_smooth(level = 0.95) + facet_wrap(~Euclidean) + theme_classic() +
ggtitle(paste(ClusterName, sep = "")) +
xlab("Stance Phase (%)") +
ylab(paste(ClusterName, " (", "Nm/kg", ")", sep = ""))
ggsave(filename = paste(curvegroups, "_OverViewEucl_TwoStep.jpg", sep = ""), plot = plote, device = "jpeg", path = pathy,
scale = 0.2, width = 1153, height = 700, units = c("mm"), dpi = 1200, limitsize = TRUE)
}
# Each of the sub-clusters is then identified according to the magnitude
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 1, "CurveShape" := "EarlyPeak"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 1, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 2, "CurveShape" := "EarlyPeak"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 2, "CurveMagnitude" := "Medium"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 3, "CurveShape" := "EarlyPeak"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 3, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 4, "CurveShape" := "EarlyPeak"]
CleanDT[SignShape == "EarlyPeaks" & Euclidean == 4, "CurveMagnitude" := "Large"]
CleanDT[SignShape == "Downslopes" & Euclidean == 1, "CurveShape" := "Downslopes"]
CleanDT[SignShape == "Downslopes" & Euclidean == 1, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "Downslopes" & Euclidean == 2, "CurveShape" := "Downslopes"]
CleanDT[SignShape == "Downslopes" & Euclidean == 2, "CurveMagnitude" := "Large"]
CleanDT[SignShape == "Peaks" & Euclidean == 1, "CurveShape" := "Peaks"]
CleanDT[SignShape == "Peaks" & Euclidean == 1, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "Peaks" & Euclidean == 2, "CurveShape" := "Peaks"]
CleanDT[SignShape == "Peaks" & Euclidean == 2, "CurveMagnitude" := "Large"]
CleanDT[SignShape == "Troughs" & Euclidean == 1, "CurveShape" := "Troughs"]
CleanDT[SignShape == "Troughs" & Euclidean == 1, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "Troughs" & Euclidean == 2, "CurveShape" := "Troughs"]
CleanDT[SignShape == "Troughs" & Euclidean == 2, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "Troughs" & Euclidean == 3, "CurveShape" := "Troughs"]
CleanDT[SignShape == "Troughs" & Euclidean == 3, "CurveMagnitude" := "Medium"]
CleanDT[SignShape == "Troughs" & Euclidean == 4, "CurveShape" := "Troughs"]
CleanDT[SignShape == "Troughs" & Euclidean == 4, "CurveMagnitude" := "Large"]
CleanDT[SignShape == "Upslopes" & Euclidean == 1, "CurveShape" := "Upslopes"]
CleanDT[SignShape == "Upslopes" & Euclidean == 1, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "Upslopes" & Euclidean == 2, "CurveShape" := "Upslopes"]
CleanDT[SignShape == "Upslopes" & Euclidean == 2, "CurveMagnitude" := "Large"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 1, "CurveShape" := "EarlyTroughs"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 1, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 2, "CurveShape" := "EarlyTroughs"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 2, "CurveMagnitude" := "Medium"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 3, "CurveShape" := "EarlyTroughs"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 3, "CurveMagnitude" := "Small"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 4, "CurveShape" := "EarlyTroughs"]
CleanDT[SignShape == "EarlyTroughs" & Euclidean == 4, "CurveMagnitude" := "Large"]
# Now up to 100 curves are plotted in each shape for visual inspection
for(i in unique(CleanDT$SignShape)){
pathy <- paste(getwd(),"/ClusterRepetition/" , i, "/", sep = "")
dir.create(pathy, showWarnings = FALSE, recursive = TRUE)
individuals <- unique(CleanDT[SignShape == i]$identifier)
set.seed(1337)
randomindividuals <- sample(1:length(individuals), if(length(individuals) < 100){length(individuals)}else{100})
for(skie in individuals[randomindividuals]){
ggplot(CleanDT[identifier == skie]) +
geom_point(aes(x = TimePercent, y = value)) +
theme_classic() +
geom_hline(aes(yintercept = 0, color = "y-axis zero"))
labs(x = "Stance Phase (%)", y = "Nm/kg", title = "Valgus moment")
ggsave(filename = paste(skie, ".jpg", sep = ""), plot = last_plot(), device = "jpeg", path = pathy,
scale = 0.2, width = 550, height = 350, units = c("mm"), dpi = 300, limitsize = TRUE)
}}