# Attach the analysis packages without printing their start-up banners.
suppressPackageStartupMessages({
    library(tidyverse)
    library(cowplot)
    library(broom)
    library(dbscan)
})

# Make the cowplot theme the default for every ggplot drawn afterwards.
theme_set(theme_cowplot())

# Plot dimensions used by the repr-based notebook renderer.
options(repr.plot.width = 15, repr.plot.height = 9)

# Clustering

## k-means clustering

# Load the penguins data set that ships with the modeldata package.
data("penguins", package = "modeldata")

# Preview the first three rows.
penguins |> head(3)

# Keep only complete rows; `data` is the table used by the clustering
# steps that follow.
data <- penguins |> na.omit()
#> A tibble: 3 × 7
#> species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex
#> <fct>    <fct>      <dbl>           <dbl>          <int>              <int>        <fct>
#> Adelie   Torgersen  39.1            18.7           181                3750         male
#> Adelie   Torgersen  39.5            17.4           186                3800         female
#> Adelie   Torgersen  40.3            18.0           195                3250         female
# Partition the penguins into three k-means clusters using only the two
# bill measurements.
# NOTE(review): no seed is set before kmeans(), so the fit (and the cluster
# numbering) may differ between runs -- confirm whether that matters here.
kmeans.obj <- data |>
    select(bill_length_mm, bill_depth_mm) |>
    kmeans(centers = 3)

# One-row summary of the fit (sums of squares and iteration count).
kmeans.obj |> glance()

# One row per cluster: centre coordinates, size and within-cluster SS.
kmeans.obj |> tidy()
#> A tibble: 1 × 4
#> totss             tot.withinss      betweenss         iter
#> <dbl>             <dbl>             <dbl>             <int>
#> 11216.3616216216  2265.06659663866  8951.29502498297  3
#> A tibble: 3 × 5
#> bill_length_mm    bill_depth_mm     size   withinss          cluster
#> <dbl>             <dbl>             <int>  <dbl>             <fct>
#> 38.4242647058823  18.2779411764706  136    904.983750000000  1
#> 45.5098214285714  15.6830357142857  112    742.096964285715  2
#> 50.9035294117647  17.3364705882353   85    617.985882352941  3
# Scatter plot of the two bill measurements coloured by assigned cluster,
# with the cluster centres overlaid and an ellipse drawn per colour group.
ggplot(
    augment(kmeans.obj, data),
    aes(x = bill_length_mm, y = bill_depth_mm, color = .cluster)
) +
    geom_point() +
    # Second point layer: the centres returned by tidy(), all mapped to the
    # constant colour label "centroid".
    geom_point(
        mapping = aes(color = "centroid"),
        data = tidy(kmeans.obj),
        shape = 4,
        size = 3,
        stroke = 2
    ) +
    stat_ellipse()

## hierarchical clustering

# Hierarchical clustering of the two bill measurements: Canberra distances
# fed to hclust() with the "ward.D2" method.
# NOTE(review): plot.hclust() appears to label the x-axis from the expression
# passed to hclust(), so the pipeline is kept as a single expression rather
# than split into named intermediates -- confirm before refactoring.
hc <- data |>
    select(bill_length_mm, bill_depth_mm) |>
    as.matrix() |>
    dist(method = "canberra") |>
    hclust(method = "ward.D2")

# Dendrogram of the fitted tree.
plot(hc)

# Cut the tree into three groups and show them on the bill measurements,
# with one ellipse per group.
data |>
    mutate(cluster = factor(cutree(hc, k = 3))) |>
    ggplot(
        aes(
            x = bill_length_mm,
            y = bill_depth_mm,
            color = cluster,
            group = cluster
        )
    ) +
    geom_point() +
    stat_ellipse()

## density clustering

# Load the DS3 example data from the dbscan package and plot the raw points.
data("DS3", package = "dbscan")
DS3 |>
    ggplot(aes(x = X, y = Y)) +
    geom_point()

# Fit HDBSCAN on the points with minPts = 25.
dbscan.obj <- hdbscan(DS3, minPts = 25)

# Plot the points coloured by assigned cluster. Cluster 0 is noise, so it is
# recoded to NA before plotting.
dbscan.obj |>
    augment(DS3) |>
    mutate(.cluster = if_else(.cluster == 0, NA, .cluster)) |>
    ggplot(aes(x = X, y = Y, color = .cluster)) +
    geom_point()