Dimensionality Reduction

# Load modelling and plotting packages, hiding their startup chatter.
# Note: load order matters for function masking (e.g. tidymodels/tidyverse).
suppressPackageStartupMessages({
library(recipes)
library(tidyverse)
library(embed)
library(ggforce)
library(tidymodels)
library(mixOmics)
library(fastICA)
library(bestNormalize)
})
# Use cowplot's minimal ggplot2 theme for every figure in this document.
theme_set(cowplot::theme_cowplot())
# Enlarge the default figure size for notebook rendering.
options(repr.plot.width=15,repr.plot.height=9)

Dimensionality Reduction#

Dataset#

# Dry bean dataset: 16 numeric shape features plus a `class` factor
# (seven bean varieties).
library(beans)
head(beans)
A tibble: 6 × 17
areaperimetermajor_axis_lengthminor_axis_lengthaspect_ratioeccentricityconvex_areaequiv_diameterextentsolidityroundnesscompactnessshape_factor_1shape_factor_2shape_factor_3shape_factor_4class
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><fct>
28395610.291208.1781173.88871.1971910.549812228715190.14110.76392250.98885600.95802710.91335780.0073315060.0031472890.83422240.9987239seker
28734638.018200.5248182.73441.0973560.411785329172191.27280.78396810.98498560.88703360.95386080.0069786590.0035636240.90985050.9984303seker
29380624.110212.8261175.93111.2097130.562727329690193.41090.77811320.98955880.94784950.90877420.0072439120.0030477330.82587060.9990661seker
30008645.884210.5580182.51651.1536380.498616030724195.46710.78268130.97669570.90393640.92832880.0070167290.0032145620.86179440.9941988seker
30140620.134201.8479190.27931.0607980.333679730417195.89650.77309800.99089330.98487710.97051550.0066970100.0036649720.94190040.9991661seker
30279634.927212.5606181.51021.1710670.520400730600196.34770.77568850.98950980.94385180.92372600.0070200650.0031527790.85326960.9992358seker
# Split beans 70/30 into training and test sets, stratified on `class`
# so both partitions keep the same class proportions.
# NOTE(review): no set.seed() is called before initial_split(), so the
# split is not reproducible across sessions -- consider seeding first.
data_split <- initial_split(beans, prop=0.7, strata=class)

train <- training(data_split)
test <- testing(data_split)
# Class counts in the held-out test set.
table(test$class)
barbunya   bombay     cali dermason    horoz    seker     sira 
     377      149      501     1068      571      616      804 
# Shared preprocessing recipe used by every dimensionality-reduction step
# below: drop zero-variance predictors, apply an ordered-quantile
# normalizing transformation (bestNormalize::orderNorm), then center and
# scale each predictor to mean 0 / sd 1.
# Uses the `train` data frame created above instead of re-deriving it
# from data_split.
rec <- recipe(class ~ ., data=train) |>
step_zv(all_numeric_predictors()) |>
step_orderNorm(all_numeric_predictors()) |>
step_normalize(all_numeric_predictors())
# Estimate the recipe on the training data and preview the result
# (bake with new_data = NULL returns the processed training set).
prep(rec) |>
bake(new_data = NULL) |>
head()
A tibble: 6 × 17
areaperimetermajor_axis_lengthminor_axis_lengthaspect_ratioeccentricityconvex_areaequiv_diameterextentsolidityroundnesscompactnessshape_factor_1shape_factor_2shape_factor_3shape_factor_4class
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><fct>
-0.22707620-0.23018275-0.16469484-0.23910970-0.20754041-0.20754041-0.22559124-0.22707620-0.8822719-0.1943821 0.38369760.22397181 0.22262276 0.185807160.22397181 0.59773080barbunya
-0.04290947 0.09303214-0.04185585-0.01631711-0.14312389-0.14312389-0.03974877-0.04290947-0.0650482-0.5539308-0.63754090.12612765 0.02526677 0.070059820.12612765-0.47992550barbunya
0.18527169 0.18152486-0.21076825 0.68236860-1.20737228-1.20737228 0.19223703 0.18527169-0.3552394-0.8085312-0.27283071.20901030-0.69437546 0.697392811.20901030 1.14281500barbunya
0.28980582 0.33791292 0.15696362 0.40932021-0.06188383-0.06188383 0.29901242 0.28980582-0.3292879-0.9801218-0.90067530.08140858-0.42626321-0.087747240.08140858 1.02670620barbunya
0.32956574 0.40303270 0.11181557 0.56562739-0.50518005-0.50518005 0.35551973 0.32956574-0.3493591-2.0537833-1.36693280.50458217-0.56562739 0.052130760.50458217 0.02026528barbunya
0.33401467 0.35664129 0.15829591 0.52864141-0.25620575-0.25620575 0.35159776 0.33401467-0.6570107-1.6255933-0.84069820.23450863-0.52139209-0.049495690.23450863-0.66848372barbunya
# Prep a recipe, bake it on `data`, and draw a facet-matrix plot of all
# resulting predictors: pairwise scatter plots off the diagonal and
# per-column densities on the diagonal, coloured by `outcome`.
#
# recipe  -- an unprepped recipes::recipe
# outcome -- unquoted outcome column (tidy-eval, embraced with {{ }})
# data    -- data frame to bake, e.g. the test set
plot_rec <- function(recipe, outcome, data) {
    baked <- bake(prep(recipe), new_data = data)
    base <- ggplot(baked, aes(x = .panel_x, y = .panel_y,
                              fill = {{outcome}}, color = {{outcome}}))
    base +
        geom_point(alpha = 0.4, size = 0.5) +
        geom_autodensity(alpha = .3) +
        facet_matrix(vars(-{{outcome}}), layer.diag = 2)
}

PCA#

Principal Components Analysis is a dimensionality reduction technique to transform data onto a new coordinate system such that the directions (principal components) capture the largest variation in the data.

For a more detailed visualization of PCA results, see PCA Visualization in R

# PCA: project the normalized predictors onto the 4 directions of
# largest variance, then visualise them on the held-out test set.
rec |>
step_pca(all_predictors(), num_comp = 4) |>
plot_rec(class, test)

PLS#

Partial Least Squares regression is a supervised technique similar to PCA; it finds a linear regression model by projecting the predicted variables and the observable variables to a new space of maximum covariance.

# PLS is supervised: the outcome column (`class`) guides the projection,
# so components maximise covariance with the outcome rather than variance.
rec |>
step_pls(all_predictors(), outcome='class', num_comp = 4) |>
plot_rec(class, test)

ICA#

Independent Component Analysis is a computational method for separating a signal into additive subcomponents. This is done by assuming that at most one subcomponent is Gaussian and that the subcomponents are statistically independent from each other.

# ICA: extract 4 statistically independent components (via fastICA)
# and plot them on the test set.
rec |>
step_ica(all_predictors(), num_comp = 4) |>
plot_rec(class, test)

UMAP#

Uniform Manifold Approximation and Projection is a dimension reduction technique that can be used for visualisation similarly to t-SNE, but also for general non-linear dimension reduction.

# Unsupervised UMAP embedding into 4 dimensions (embed::step_umap).
# NOTE(review): UMAP is stochastic; results vary without a fixed seed.
rec |>
step_umap(all_predictors(), num_comp = 4) |>
plot_rec(class, test)

UMAP can also make use of label information to perform supervised dimension reduction.

# Supervised UMAP: passing the outcome via `outcome = vars(class)` lets
# the embedding use label information to separate the classes.
rec |>
step_umap(all_predictors(), outcome=vars(class), num_comp = 4) |>
plot_rec(class, test)

NMF#

Non-negative Matrix Factorization is a technique to factorize a matrix V into two matrices W and H, such that W * H = V, with the property that all three matrices have no negative elements.

Note that for demonstration purposes the data were transformed to lie between 0 and 1; in practice, if your data contain negative values, NMF is not a good fit.

# NMF requires non-negative input, so first rescale every predictor into
# [0, 1], then factorize into 4 sparse non-negative components.
rec |>
# rescale predictors into [0, 1] so the matrix has no negative entries
step_range(all_predictors(), min=0, max=1, clipping = FALSE) |>
step_nnmf_sparse(all_predictors(), num_comp = 4) |>
plot_rec(class, test)

Further reading#

https://www.tmwr.org/dimensionality