Hierarchical and k-means clustering

Introduction

We use the following packages:

library(MASS) # make sure to load MASS before tidyverse to avoid conflicts!
library(tidyverse)
library(patchwork)
library(ggdendro)

In this practical, we will apply hierarchical and k-means clustering to two synthetic datasets. The data can be generated by running the code below.

- Try to understand what is happening as you run each line of the code below.
# randomly generate bivariate normal data
set.seed(123)
sigma      <- matrix(c(1, .5, .5, 1), 2, 2)
sim_matrix <- mvrnorm(n = 100, mu = c(5, 5), Sigma = sigma)
colnames(sim_matrix) <- c("x1", "x2")
# change to a data frame (tibble) and add a cluster label column
sim_df <-
  sim_matrix %>%
  as_tibble() %>%
  mutate(class = sample(c("A", "B", "C"), size = 100, replace = TRUE))
# Move the clusters to generate separation
sim_df_small <-
  sim_df %>%
  mutate(x2 = case_when(class == "A" ~ x2 + .5,
                        class == "B" ~ x2 - .5,
                        class == "C" ~ x2 + .5),
         x1 = case_when(class == "A" ~ x1 - .5,
                        class == "B" ~ x1 - 0,
                        class == "C" ~ x1 + .5))

sim_df_large <-
  sim_df %>%
  mutate(x2 = case_when(class == "A" ~ x2 + 2.5,
                        class == "B" ~ x2 - 2.5,
                        class == "C" ~ x2 + 2.5),
         x1 = case_when(class == "A" ~ x1 - 2.5,
                        class == "B" ~ x1 - 0,
                        class == "C" ~ x1 + 2.5))
- Prepare two unsupervised datasets by removing the class feature.
df_s <- sim_df_small %>% select(-class)
df_l <- sim_df_large %>% select(-class)
- For each of these datasets, create a scatterplot. Combine the two plots into a single frame (look up the “patchwork” package to see how to do this!). What is the difference between the two datasets?
# patchwork defines the "+" operator to combine entire ggplots!
df_s %>% ggplot(aes(x = x1, y = x2)) + geom_point() + ggtitle("Small") +
df_l %>% ggplot(aes(x = x1, y = x2)) + geom_point() + ggtitle("Large")
# df_s has a lot of class overlap, df_l has very little overlap
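One way to back up this visual impression with numbers (a sketch, not part of the original answer) is to compare the class centroids in the two datasets, using the true labels we removed from df_s and df_l:

# per-class means: the centroids are close together in sim_df_small
# and far apart in sim_df_large
sim_df_small %>% group_by(class) %>% summarise(across(c(x1, x2), mean))
sim_df_large %>% group_by(class) %>% summarise(across(c(x1, x2), mean))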
Hierarchical clustering
- Run a hierarchical clustering on these datasets and display the result as dendrograms. Use Euclidean distances and the complete agglomeration method. Make sure the two plots have the same y-scale. What is the difference between the dendrograms? (Hint: functions you’ll need are hclust, ggdendrogram, and ylim)
# note the spelling: dist() only accepts "euclidean"
dist_s <- dist(df_s, method = "euclidean")
dist_l <- dist(df_l, method = "euclidean")

res_s <- hclust(dist_s, method = "complete")
res_l <- hclust(dist_l, method = "complete")

ggdendrogram(res_s, labels = FALSE) + ggtitle("Small") + ylim(0, 10) +
ggdendrogram(res_l, labels = FALSE) + ggtitle("Large") + ylim(0, 10)
# the dataset with large differences segments into 3 classes much higher up.
# Interestingly, the microstructure (lower splits) is almost exactly the same
# because within the three clusters there is no difference between the datasets
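You can also see this numerically: an hclust object stores the heights at which merges happen in its height component, so the topmost merges can be compared directly. A small sketch, not part of the original answer:

# the final merges happen much higher up for the large-difference data
tail(res_s$height, 3)
tail(res_l$height, 3)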
- For the dataset with small differences, also run a complete agglomeration hierarchical clustering with Manhattan distance.
dist_s2 <- dist(df_s, method = "manhattan")
res_s2  <- hclust(dist_s2, method = "complete")
- Use the cutree() function to obtain the cluster assignments for three clusters and compare them to the 3-cluster Euclidean solution. Do this comparison by creating two scatter plots with cluster assignment mapped to the colour aesthetic. What difference do you see?
clus_1 <- as_factor(cutree(res_s, 3))
clus_2 <- as_factor(cutree(res_s2, 3))

p1 <- df_s %>%
  ggplot(aes(x = x1, y = x2, colour = clus_1)) +
  geom_point() +
  ggtitle("Euclidean") +
  theme(legend.position = "none") +
  coord_fixed()

p2 <- df_s %>%
  ggplot(aes(x = x1, y = x2, colour = clus_2)) +
  geom_point() +
  ggtitle("Manhattan") +
  theme(legend.position = "none") +
  coord_fixed()

p1 + p2
# The Manhattan distance clustering prefers more rectangular classes, whereas
# the Euclidean distance clustering prefers circular classes. The difference is
# most prominent in the very center of the plot and for the top-right cluster.
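The plots show where the two solutions differ; to count the disagreements directly, you can also cross-tabulate the two assignment vectors (a quick extra check, keeping in mind that cluster numbers are arbitrary, so matching clusters need not share the same label):

# cells pairing different cluster numbers show points the two
# distance measures assign differently
table(euclidean = clus_1, manhattan = clus_2)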
K-means clustering
- Create k-means clusterings with 2, 3, 4, and 6 classes on the large difference data. Again, create coloured scatter plots for these clusterings.
# I set the seed for reproducibility
set.seed(45)
# we can do it in a single pipeline and with faceting
# (of course there are many ways to do this, though)
df_l %>%
  mutate(
    class_2 = as_factor(kmeans(df_l, 2)$cluster),
    class_3 = as_factor(kmeans(df_l, 3)$cluster),
    class_4 = as_factor(kmeans(df_l, 4)$cluster),
    class_6 = as_factor(kmeans(df_l, 6)$cluster)
  ) %>%
  pivot_longer(cols = c(class_2, class_3, class_4, class_6),
               names_to = "class_num", values_to = "cluster") %>%
  ggplot(aes(x = x1, y = x2, colour = cluster)) +
  geom_point() +
  scale_colour_brewer(type = "qual") + # use an easy-to-distinguish scale
  facet_wrap(~class_num)
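The practical does not ask for this, but kmeans() also returns the total within-cluster sum of squares, which is one common way to compare different numbers of clusters; an "elbow" around the true k = 3 is what we would hope to see here. A sketch:

# total within-cluster sum of squares for k = 1 to 8
# (nstart = 10 reruns each k-means several times and keeps the best fit)
wss <- tibble(k = 1:8) %>%
  mutate(tot_wss = map_dbl(k, ~ kmeans(df_l, centers = .x, nstart = 10)$tot.withinss))
ggplot(wss, aes(x = k, y = tot_wss)) + geom_line() + geom_point()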
- Do the same thing again a few times. Do you see the same results every time? Where do you see differences?
# I set the seed for reproducibility
set.seed(46)
df_l %>%
  mutate(
    class_2 = as_factor(kmeans(df_l, 2)$cluster),
    class_3 = as_factor(kmeans(df_l, 3)$cluster),
    class_4 = as_factor(kmeans(df_l, 4)$cluster),
    class_6 = as_factor(kmeans(df_l, 6)$cluster)
  ) %>%
  pivot_longer(cols = c(class_2, class_3, class_4, class_6),
               names_to = "class_num", values_to = "cluster") %>%
  ggplot(aes(x = x1, y = x2, colour = cluster)) +
  geom_point() +
  scale_colour_brewer(type = "qual") + # use an easy-to-distinguish scale
  facet_wrap(~class_num)
# there is label switching in all plots. There is a different result altogether
# in the class_4 solution in my case.
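The differences arise because k-means only finds a local optimum from its random starting centroids. One standard remedy (not asked for in the practical) is the nstart argument, which runs the algorithm from several random starts and keeps the solution with the lowest total within-cluster sum of squares, so reruns are far more likely to agree, up to label switching:

# best of 25 random starts for the 4-cluster solution
class_4_stable <- as_factor(kmeans(df_l, centers = 4, nstart = 25)$cluster)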
- Find a way online to perform bootstrap stability assessment for the 3- and 6-cluster solutions.
# I decided to use the function clusterboot from the fpc package
# NB: this package needs to be installed first!
# install.packages("fpc")
library(fpc)
# you can read the documentation ?clusterboot to figure out how to use this
# function. This can take a while but is something you will need to do a lot in
# the real world!
boot_3 <- clusterboot(df_l, B = 1000, clustermethod = kmeansCBI, k = 3,
                      count = FALSE)
boot_6 <- clusterboot(df_l, B = 1000, clustermethod = kmeansCBI, k = 6,
                      count = FALSE)
# the average stability is much lower for 6 means than for 3 means:
boot_3$bootmean
[1] 0.9844165 0.9730219 0.9739694
boot_6$bootmean
[1] 0.7844248 0.5383414 0.7593547 0.7230086 0.6897734 0.7091287
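These values are clusterwise means of the Jaccard similarity between each original cluster and its best match in every bootstrap sample. As a rough guideline from the clusterboot documentation, mean Jaccard values of about 0.85 and above indicate highly stable clusters, while values of 0.5 or below indicate a "dissolved" cluster. So all three clusters of the 3-means solution are highly stable, whereas several clusters of the 6-means solution are questionable:

# overall (average) stability of each solution
mean(boot_3$bootmean)
mean(boot_6$bootmean)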