---
title: "Perception of English Voice Quality"
subtitle: "Random Forest Analysis"
format:
html:
toc: true
code-tools: true
self-contained: true
embed-resources: true
code-fold: true
code-summary: "Show the code"
execute:
code-fold: true
warning: false
message: false
author: Mykel Brinkerhoff
date: 2025-08-27 (W)
updated: 2025-09-22 (M)
---
```{r}
#| label: load-packages
#| include: false
# Modeling process packages
library(lme4) # for creating residual H1*
library(rsample) # for resampling procedures
library(caret) # for resampling and model training
library(randomForest) # for tree generation
library(vip) # for feature interpretation
library(ranger) # for performing Random Forest CART analysis
# Helper packages
library(tidyverse) # for data manipulation, graphic, and data wrangling
library(viridis) # for colorblind friendly colors in ggplot
library(here) # for creating pathways relative to the top-level directory
library(reshape2) # for data manipulation
library(Cairo) # for saving the plots as .eps files
library(cowplot) # For creating complex plots
```
```{r}
#| label: adding_data
#| include: false
# Load in the raw vq data at data/raw/VoicesVQ_data.csv
# (acoustic measures extracted with VoiceSauce; one row per token)
vq_raw <- readr::read_csv(here::here("data", "raw", "VoicesVQ_data.csv"))
# load in the classification data at data/raw/b-c_voice_cluster_labels.csv
# (perception-experiment cluster labels, joined to vq_raw below)
vq_classification <- readr::read_csv(here::here(
  "data",
  "raw",
  "b-c_voice_cluster_labels.csv"
))
# Create a variable for colorblind palette (Okabe-Ito) used by the
# scale_color_manual calls in later chunks
colorblind <- grDevices::palette.colors(palette = "Okabe-Ito")
# Remove unneeded columns
# Drop the uncorrected / non-snack acoustic columns that are not used in
# the analysis (alternative f0 trackers, praat/other formants, bandwidths)
vq_raw <- vq_raw |>
  dplyr::select(
    -c(
      sF0_mean,
      pF0_mean,
      shrF0_mean,
      oF0_mean,
      pF1_mean,
      pF2_mean,
      pF3_mean,
      pF4_mean,
      oF1_mean,
      oF2_mean,
      oF3_mean,
      oF4_mean,
      pB1_mean,
      pB2_mean,
      pB3_mean,
      pB4_mean,
      oB1_mean,
      oB2_mean,
      oB3_mean,
      oB4_mean
    )
  )
# Attach the cluster label to every row: inner_join (NOT left_join as the
# old comment claimed -- unmatched rows are dropped) on Talker = Voice
# and Gender. The superseded rename_at() is replaced with rename().
vq_raw <- vq_raw |>
  dplyr::inner_join(vq_classification, by = c("Talker" = "Voice", "Gender")) |>
  dplyr::rename(Phonation = ClusterCoding) |>
  dplyr::relocate(Phonation, .after = Label) |>
  dplyr::select(-c(median_stan_xmouse, cluster_id))
# Adding function for calculating Mahalanobis distance
source(here::here("scripts", "functions", "vmahalanobis.R"))
# create a new dataframe for the cleaned data
vq_clean <- vq_raw
# flag for f0 outliers: z-score strF0 within each talker
vq_clean <- vq_clean |>
  dplyr::group_by(Talker) |>
  dplyr::mutate(
    # na.rm = TRUE on BOTH mean and sd; the original dropped NAs in the
    # mean but not the sd, so a single missing strF0 turned the whole
    # talker's z-scores into NA
    F0z = (strF0_mean - mean(strF0_mean, na.rm = TRUE)) /
      sd(strF0_mean, na.rm = TRUE)
  ) |>
  dplyr::ungroup()
# tokens more than 3 sd from the talker mean are flagged
vq_clean <- vq_clean |>
  dplyr::mutate(f0_outlier = dplyr::if_else(abs(F0z) > 3, "outlier", "OK"))
# Flag for formant outlier
## set distance cutoff for Mahalanobis distance
distance_cutoff <- 6
## calculate Mahalanobis distance on the formants, per vowel category
vq_clean <- vq_clean |>
  dplyr::group_by(Vowel) |>
  # dplyr::group_by(Talker) |>
  dplyr::do(vmahalanobis(.)) |>
  dplyr::ungroup() |>
  dplyr::mutate(formant_outlier = NA)
## visualize the formant outliers (F1 x F2 space, flipped axes)
vq_clean |>
  dplyr::filter(is.na(formant_outlier)) |>
  ggplot2::ggplot(aes(
    x = sF2_mean,
    y = sF1_mean,
    colour = zF1F2 > distance_cutoff
  )) +
  ggplot2::geom_point(size = 0.6) +
  ggplot2::facet_wrap(. ~ Vowel) +
  ggplot2::scale_x_reverse(limits = c(3500, 0), position = "top") +
  ggplot2::scale_y_reverse(limits = c(2000, 0), position = "right") +
  ggplot2::theme_bw()
## flag rows beyond the cutoff. Uses `distance_cutoff` -- the original
## loop hard-coded 6 here, which would silently diverge from the plot
## above if the cutoff were ever changed. The row-wise for loop is
## replaced by a vectorized assignment with an explicit NA guard.
vq_clean$formant_outlier[
  !is.na(vq_clean$zF1F2) & vq_clean$zF1F2 > distance_cutoff
] <- "outlier"
## sanity-check plot: formant space after dropping the flagged tokens
vq_clean |>
  dplyr::filter(is.na(formant_outlier)) |>
  ggplot2::ggplot(aes(x = sF2_mean, y = sF1_mean)) +
  ggplot2::geom_point(size = 0.6) +
  ggplot2::facet_wrap(. ~ Vowel) +
  ggplot2::scale_x_reverse(limits = c(3500, 0), position = "top") +
  ggplot2::scale_y_reverse(limits = c(2000, 0), position = "right") +
  ggplot2::theme_bw()
# flag energy outliers
## zero energy is treated as missing before the log transform
vq_clean$Energy_mean[vq_clean$Energy_mean == 0] <- NA
## log10 transform energy
vq_clean <- vq_clean |>
  dplyr::mutate(log_energy = log10(Energy_mean))
# drop f0, formant, and energy outliers in one filter call
vq_clean <- vq_clean |>
  dplyr::filter(
    f0_outlier == "OK",
    is.na(formant_outlier),
    !is.na(log_energy)
  )
# number of rows removed as outliers
nrow(vq_raw) - nrow(vq_clean)
# drop the helper columns created during outlier flagging
vq_clean <- vq_clean |>
  dplyr::select(-c(f0_outlier, formant_outlier, zF1F2, F0z, Energy_mean))
# Standardization across all pertinent columns. Variables are speaker
# (per-talker) normalized; new columns get a `_z` suffix.
vq_clean <- vq_clean |>
  dplyr::group_by(Talker) |>
  dplyr::mutate(
    dplyr::across(
      .cols = -c(
        Gender,
        Vowel,
        Word,
        Filename,
        Label,
        Phonation,
        seg_Start,
        seg_End
      ),
      # BUG FIX: the original formula was
      #   (. - mean(., na.rm = TRUE) / sd(., na.rm = TRUE))
      # where division binds tighter than subtraction, computing
      # x - mean/sd instead of the intended z-score (x - mean) / sd.
      .fns = ~ (. - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE),
      .names = "{.col}_z"
    )
  ) |>
  # ungroup for hygiene; the next step regroups by Talker anyway
  dplyr::ungroup()
# normalize soe following Garellek et al. 2020: log-transform (with a
# small offset to avoid log10(0)), then min-max scale within talker
vq_clean <- vq_clean |>
  dplyr::group_by(Talker) |>
  dplyr::mutate(
    log_soe = log10(soe_mean + 0.001),
    log_soe = log10(soe_mean + 0.001),
    m_log_soe = mean(log_soe, na.rm = TRUE),
    sd_log_soe = sd(log_soe, na.rm = TRUE),
    z_log_soe = (log_soe - m_log_soe) / sd_log_soe,
    # na.rm = TRUE added: the original max/min lacked it, so a single NA
    # soe value made soe_norm NA for the talker's every token
    max_soe = max(log_soe, na.rm = TRUE),
    min_soe = min(log_soe, na.rm = TRUE),
    soe_norm = (log_soe - min_soe) / (max_soe - min_soe)
  ) |>
  # drop the intermediates and the plain z-scored soe created upstream;
  # soe_norm is the measure carried forward
  dplyr::select(
    -c(log_soe, m_log_soe, sd_log_soe, z_log_soe, max_soe, min_soe, soe_mean_z)
  ) |>
  dplyr::ungroup()
# Adding function for calculating residual H1* (Chai & Garellek 2022)
source(here::here("scripts", "functions", "calc_residH1.R"))
# Applying the function to the dataframe; adds a resid_H1c column
vq_clean <- vq_clean |>
  calc_residH1(
    h1cz_col = "H1c_mean_z",
    energyz_col = "log_energy_z",
    speaker_col = "Talker"
  )
# residual H1* replaces the plain z-scored H1*, so drop the latter
# (namespace-qualified like the rest of the file, to avoid masking)
vq_resid <- vq_clean |>
  dplyr::select(-H1c_mean_z)
# Factorizing the phonation variable for classification
vq_resid$Phonation <- factor(vq_resid$Phonation)
```
## Data
The voice quality and classification data comes from `VoicesVQ_data` and `b-c_voice_cluster_labels`.
The dataset `VoicesVQ_data` contains acoustic measures extracted from the audio files used in the perception experiment. These measures were extracted using VoiceSauce.
The dataset `b-c_voice_cluster_labels` contains the classification labels that participants assigned to each speaker.
These data were combined so each line of `VoicesVQ_data` contained the classification label, joined by talker id and gender. This was stored as a new dataframe called `vq_raw`. This dataframe was cleaned of outliers following Brinkerhoff & McGuire 2025.
Data were then standardized to make comparisons easier. Energy was first log-transformed to account for its right skew before being z-scored. SoE was normalized following Garellek et al. 2020. H1* was normalized following Chai & Garellek 2022 and replaced z-scored H1*.
Uncorrected acoustic measures, the non-STRAIGHT f0, and non-snack formants and bandwidths were removed.
This resulted in a dataframe with the following columns:
- `Gender`: factor indicating the talkers' gender
- `Talker`: factor indicating the speakers' id
- `Vowel`: factor indicating which vowel was spoken
- `Word`: character indicating which word was spoken
- `Filename`: character indicating the filename of the audio file
- `Label`: character indicating the classification label
- `Phonation`: character indicating the phonation type
- `seg_Start`: numeric indicating the start time of the segment
- `seg_End`: numeric indicating the end time of the segment
- `H2c_mean_z`: numeric indicating the z-scored H2c mean
- `H4c_mean_z`: numeric indicating the z-scored H4c mean
- `A1c_mean_z`: numeric indicating the z-scored A1c mean
- `A2c_mean_z`: numeric indicating the z-scored A2c mean
- `A3c_mean_z`: numeric indicating the z-scored A3c mean
- `H2Kc_mean_z`: numeric indicating the z-scored H2Kc mean
- `H1H2c_mean_z`: numeric indicating the z-scored H1H2c mean
- `H2H4c_mean_z`: numeric indicating the z-scored H2H4c mean
- `H1A1c_mean_z`: numeric indicating the z-scored H1A1c mean
- `H1A2c_mean_z`: numeric indicating the z-scored H1A2c mean
- `H1A3c_mean_z`: numeric indicating the z-scored H1A3c mean
- `H42Kc_mean_z`: numeric indicating the z-scored H42Kc mean
- `H2KH5Kc_mean_z`: numeric indicating the z-scored H2KH5Kc mean
- `CPP_mean_z`: numeric indicating the z-scored CPP mean
- `HNR05_mean_z`: numeric indicating the z-scored HNR05 mean
- `HNR15_mean_z`: numeric indicating the z-scored HNR15 mean
- `HNR25_mean_z`: numeric indicating the z-scored HNR25 mean
- `HNR35_mean_z`: numeric indicating the z-scored HNR35 mean
- `SHR_mean_z`: numeric indicating the z-scored SHR mean
- `strF0_mean_z`: numeric indicating the z-scored strF0 mean
- `sF1_mean_z`: numeric indicating the z-scored sF1 mean
- `sF2_mean_z`: numeric indicating the z-scored sF2 mean
- `sF3_mean_z`: numeric indicating the z-scored sF3 mean
- `sF4_mean_z`: numeric indicating the z-scored sF4 mean
- `sB1_mean_z`: numeric indicating the z-scored sB1 mean
- `sB2_mean_z`: numeric indicating the z-scored sB2 mean
- `sB3_mean_z`: numeric indicating the z-scored sB3 mean
- `sB4_mean_z`: numeric indicating the z-scored sB4 mean
- `epoch_mean_z`: numeric indicating the z-scored epoch mean
- `log_energy_z`: numeric indicating the z-scored log energy mean
- `soe_norm`: numeric indicating the normalized SoE
- `resid_H1c`: numeric indicating the residual H1c
## Analysis
### Preparing for analysis
I removed the columns `Gender`, `Talker`, `Vowel`, `Word`, `Filename`, `Label`,`seg_Start`, and `seg_End` from the dataframe and factorized `Phonation` for Random Forest analysis.
### Splitting into test and training
```{r}
#| label: splitting
#| code-fold: true
# remove all columns except Phonation and the standardized data
vq_resid <- vq_resid |>
  dplyr::select(
    Phonation,
    dplyr::ends_with("_z"),
    soe_norm,
    resid_H1c
  ) |>
  # strip the normalization suffixes so model output reads cleanly
  dplyr::rename_with(~ stringr::str_remove_all(., "_mean_z$|_z$|_norm$")) |>
  # NOTE(review): drops any column whose stripped name ends in "u" --
  # cannot tell from here which predictor(s) this targets; confirm intent
  dplyr::select(-dplyr::matches("u$"))
# stratified sampling with the rsample package with respect to phonation:
# 70% training / 30% testing, class proportions preserved via strata
set.seed(123) # needed for reproducibility
resid_split <- rsample::initial_split(
  vq_resid,
  prop = 0.7,
  strata = "Phonation"
)
resid_train <- rsample::training(resid_split)
resid_test <- rsample::testing(resid_split)
```
Data were randomly split into a testing and a training dataset. The proportions of each phonation type were maintained in each dataset.
```{r}
#| label: tab_training_split
#| tbl-cap: "Phonation proportions in the training dataset"
# Class proportions in the training split (output is a table, so the
# correct Quarto caption option is tbl-cap, not fig-cap)
table(resid_train$Phonation) |>
  prop.table()
```
```{r}
#| label: tab_testing_split
#| tbl-cap: "Phonation proportions in the testing dataset"
# Class proportions in the testing split (output is a table, so the
# correct Quarto caption option is tbl-cap, not fig-cap)
table(resid_test$Phonation) |>
  prop.table()
```
### Tuning the model
Random Forest models need to have the correct parameter settings for the analysis to work. These parameters were chosen based on a grid search and the model with the lowest out-of-bag (OOB) error was chosen.
First a default model was trained to determine a baseline prediction error. Then a hypergrid of parameters was created and a model was trained for each combination of parameters. The OOB error for each model was recorded and the model with the lowest OOB error was chosen.
The parameters that were tuned were:
- `mtry`: number of variables to possibly split at in each node equal to
- 5% of the total number of features
- 15% of the total number of features
- 20% of the total number of features
- 25% of the total number of features
- 33.3% of the total number of features (default)
- 40% of the total number of features
- 100% of the total number of features
- `min.node.size`: minimal node size equal to
- 1
- 3
- 5
- 10
- `replace`: whether to sample with replacement
- TRUE
- FALSE
- `sample.fraction`: fraction of the observations sampled to grow each tree, equal to:
- 50%
- 63%
- 80%
- `num.trees`: number of trees to grow (not tuned here, but tuned in a subsequent step).
```{r}
#| label: hypergrid_search
# determine the number of features (predictors other than the outcome)
resid_features <- length(setdiff(names(vq_resid), "Phonation"))
# Train a default Random Forest model to establish a baseline OOB error
resid_default <- ranger::ranger(
  formula = Phonation ~ .,
  data = resid_train,
  num.trees = resid_features * 100,
  mtry = floor(sqrt(resid_features)),
  respect.unordered.factors = "order",
  seed = 123
)
# default prediction error for comparison against tuned models
resid_error <- resid_default$prediction.error
# generate hypergrid parameters (full factorial over the tuning values)
hypergrid_resid <- expand.grid(
  mtry = floor(resid_features * c(.05, .15, .2, .25, .333, .4, 1)),
  min.node.size = c(1, 3, 5, 10),
  replace = c(TRUE, FALSE),
  sample.fraction = c(.5, .63, .8),
  error = NA
)
# execute full grid search
for (i in seq_len(nrow(hypergrid_resid))) {
  # fit the model with the i-th hyperparameter combination
  fit_resid <- ranger::ranger(
    formula = Phonation ~ .,
    data = resid_train,
    num.trees = resid_features * 100,
    mtry = hypergrid_resid$mtry[i],
    min.node.size = hypergrid_resid$min.node.size[i],
    replace = hypergrid_resid$replace[i],
    sample.fraction = hypergrid_resid$sample.fraction[i],
    respect.unordered.factors = "order",
    seed = 123
  )
  # record the OOB misclassification error (this is a classification
  # forest, so prediction.error is an error rate, not RMSE)
  hypergrid_resid$error[i] <- fit_resid$prediction.error
}
```
This results in a hypergrid with the OOB error for each combination of parameters. The top 10 models with the lowest OOB error are shown below. The percentage gain is calculated as the reduction in error compared to the default model.
```{r}
#| label: hypergrid_results
#| tbl-cap: "Top 10 models with the lowest OOB error"
# assessing the model parameters: sort by OOB error and report the
# percentage improvement over the default model's error
hypergrid_resid |>
  dplyr::arrange(error) |>
  dplyr::mutate(perc_gain = (resid_error - error) / resid_error * 100) |>
  head(10)
```
Based on the results of the grid search, the following parameters were chosen for the final model:
- `mtry = 12`
- `min.node.size = 5`
- `replace = FALSE`
- `sample.fraction = 0.8`
There were two models with the same OOB error, but I chose the one with `mtry = 12` instead of `mtry = 32`. The rationale is that with `mtry = 32` each tree would consider all the features at each split, which would make the model equivalent to a *bagging* model. By choosing `mtry = 12`, we allow for more diversity among the trees, which can lead to better overall performance. This is further supported when we tune the number of trees for the final model, which shows that the model with `mtry = 12` performs better than the model with `mtry = 32`.
### Tuning the number of trees
The number of trees was tuned separately from the other parameters. A hypergrid was created with the number of trees ranging from 50 to 2000 in increments of 50. The `mtry` parameter was set to the same values as before. A model was trained for each combination of parameters and the OOB error was recorded. The model with the lowest OOB error was chosen.
```{r}
#| label: hypergrid_trees
#| code-fold: true
# create a hypergrid for number of trees, crossed with the mtry values
hypergrid_resid_trees <- expand.grid(
  num.trees = seq(50, 2000, 50),
  mtry = floor(resid_features * c(.05, .15, .2, .25, .333, .4, 1)),
  accuracy = NA
)
# perform grid search for correct number of trees
for (i in seq_len(nrow(hypergrid_resid_trees))) {
  # fit the model with the i-th hyperparameter combination, holding the
  # other hyperparameters at the values chosen in the earlier search
  fit_resid_trees <- ranger::ranger(
    formula = Phonation ~ .,
    data = resid_train,
    num.trees = hypergrid_resid_trees$num.trees[i],
    mtry = hypergrid_resid_trees$mtry[i],
    min.node.size = 5,
    replace = FALSE,
    sample.fraction = 0.8,
    respect.unordered.factors = "order",
    seed = 123
  )
  # record the OOB misclassification error -- NB: stored in a column
  # named `accuracy`, but it is an error rate, not accuracy
  hypergrid_resid_trees$accuracy[i] <- fit_resid_trees$prediction.error
}
```
In the table below, the top 10 models with the lowest OOB error are shown. The percentage gain is calculated as the reduction in error compared to the default model. This shows that the model with `num.trees = 650` and `mtry = 12` has the lowest OOB error.
```{r}
#| label: hypergrid_trees_results
#| tbl-cap: "OOB error for different number of trees and mtry values"
# Rank combinations by OOB error (the `accuracy` column actually holds
# the OOB error rate) and report relative gain over the default model.
# Fixed invalid chunk option: `tab-cap` is not a Quarto option; the
# correct spelling is `tbl-cap`.
hypergrid_resid_trees |>
  dplyr::arrange(accuracy) |>
  dplyr::mutate(perc_gain = (resid_error - accuracy) / resid_error * 100) |>
  head(10)
```
This is further visualized in the plot below. The model with `num.trees = 650` and `mtry = 12` has the lowest OOB error.
```{r}
#| label: hypergrid_trees_plot
#| fig-cap: "OOB error for different number of trees and mtry values"
#| fig-width: 7
#| fig-height: 5
#| fig-align: center
#| out-width: "70%"
# Visualize OOB error across num.trees, one colored line per mtry value
tree_tuning_plot <- hypergrid_resid_trees |>
  ggplot2::ggplot(
    ggplot2::aes(x = num.trees, y = accuracy, color = factor(mtry))
  ) +
  ggplot2::geom_line(linewidth = 1) +
  ggplot2::scale_color_manual(values = colorblind) +
  ggplot2::labs(
    title = "Prediction error for Random Forest Hyperparameter Tuning",
    x = "number of trees",
    y = "% incorrect",
    color = "mtry"
  ) +
  ggplot2::theme_bw()
tree_tuning_plot
```
### Final model and variable importance
The final model was trained using the optimal hyperparameters identified during the tuning process. The variable importance was assessed using both impurity and permutation-based methods.
```{r}
#| label: final_model
# Final random forest model using the tuned hyperparameters
# (num.trees = 650, mtry = 12, min.node.size = 5, no replacement,
# sample.fraction = 0.8), with impurity-based variable importance
resid_final_impurity <- ranger::ranger(
  formula = Phonation ~ .,
  data = resid_train,
  num.trees = 650,
  mtry = 12,
  min.node.size = 5,
  replace = FALSE,
  sample.fraction = 0.8,
  respect.unordered.factors = "order",
  seed = 123,
  # NOTE(review): probability = TRUE already requests a (probability)
  # classification forest; classification = TRUE is likely redundant --
  # confirm against the ranger documentation
  classification = TRUE,
  importance = "impurity",
  probability = TRUE
)
# Same model refit with permutation-based variable importance
resid_final_permutation <- ranger::ranger(
  formula = Phonation ~ .,
  data = resid_train,
  num.trees = 650,
  mtry = 12,
  min.node.size = 5,
  replace = FALSE,
  sample.fraction = 0.8,
  respect.unordered.factors = "order",
  seed = 123,
  classification = TRUE,
  importance = "permutation",
  probability = TRUE
)
```
The variable importance scores were extracted from both models and visualized using lollipop charts.
```{r}
#| label: variable_importance
#| fig-cap: "Variable importance scores for the final Random Forest model using impurity and permutation methods"
#| fig-width: 10
#| fig-height: 5
#| fig-align: center
#| out-width: "90%"
# Extract variable importance scores for impurity-based importance
resid_impurity_scores <- vip::vi(resid_final_impurity)
# Extract variable importance scores for permutation-based importance
resid_permutation_scores <- vip::vi(resid_final_permutation)
# Clean any leftover normalization suffixes from the predictor names.
# (The original piped the vi() tibble through rename_with(), which only
# renames the tibble's own columns -- Variable and Importance -- and so
# never touched the predictor names; mutate() edits the values instead.)
resid_impurity_scores <- resid_impurity_scores |>
  dplyr::mutate(
    Variable = stringr::str_remove_all(Variable, "_mean_z$|_z$|_norm$")
  )
# Lollipop chart of impurity-based importance, largest at the top
resid_impurity_plot <- resid_impurity_scores |>
  ggplot2::ggplot(
    ggplot2::aes(x = reorder(Variable, Importance), y = Importance)
  ) +
  ggplot2::geom_segment(ggplot2::aes(xend = Variable, yend = 0)) +
  ggplot2::geom_point(size = 2) +
  ggplot2::coord_flip() +
  ggplot2::labs(
    title = "Impurity Importance",
    x = "Variable",
    y = "Importance (Impurity)"
  ) +
  ggplot2::theme_bw()
# Lollipop chart of permutation-based importance
resid_permutation_plot <- resid_permutation_scores |>
  ggplot2::ggplot(
    ggplot2::aes(x = reorder(Variable, Importance), y = Importance)
  ) +
  ggplot2::geom_segment(ggplot2::aes(xend = Variable, yend = 0)) +
  ggplot2::geom_point(size = 2) +
  ggplot2::coord_flip() +
  ggplot2::labs(
    title = "Permutation Importance",
    x = "Variable",
    y = "Importance (Permutation)"
  ) +
  ggplot2::theme_bw()
# Side-by-side comparison of the two importance measures
resid_variable_importance_plot <- cowplot::plot_grid(
  resid_impurity_plot,
  resid_permutation_plot,
  nrow = 1
)
resid_variable_importance_plot
```