| Title: | Some Functions I Wrote that I Find Useful |
|---|---|
| Description: | Misc. functions. |
| Authors: | Mattan S. Ben-Shachar [cre] |
| Maintainer: | Mattan S. Ben-Shachar <[email protected]> |
| License: | GPL-3 |
| Version: | 0.0.1.15 |
| Built: | 2026-06-24 13:26:46 UTC |
| Source: | https://github.com/mattansb/MSBMisc |
Age in some units
age_in_unit( DOB, REFDATE = Sys.Date(), years = TRUE, months = TRUE, weeks = TRUE, days = TRUE )age_in_unit( DOB, REFDATE = Sys.Date(), years = TRUE, months = TRUE, weeks = TRUE, days = TRUE )
DOB, REFDATE
|
Two dates |
years, months, weeks, days
|
the units. |
DOB <- as.Date("1989-08-05") TODAY <- Sys.Date() age_in_unit(DOB, TODAY)DOB <- as.Date("1989-08-05") TODAY <- Sys.Date() age_in_unit(DOB, TODAY)
Adpated from stla/bracketify
bracketifyFile() bracketifySelection()bracketifyFile() bracketifySelection()
Follow-up for contingency table test
chisq_pairwise( Xsq, population_in_row = TRUE, adjust = stats::p.adjust.methods, effect_size = c("V", "phi"), ci = 0.95, ... ) chisq_residual( Xsq, adjust = stats::p.adjust.methods, res_type = c("pearson", "standardized"), ci = 0.95 )chisq_pairwise( Xsq, population_in_row = TRUE, adjust = stats::p.adjust.methods, effect_size = c("V", "phi"), ci = 0.95, ... ) chisq_residual( Xsq, adjust = stats::p.adjust.methods, res_type = c("pearson", "standardized"), ci = 0.95 )
Xsq |
Result from |
population_in_row |
Comparisons by row? (If not, by column.) |
adjust |
Method for correcting p-values. See |
effect_size |
Type of effect size to use. |
ci |
Confidence Interval (CI) level |
... |
Passed to |
res_type |
Type of residuals to use. |
M <- as.table(rbind( c(762, 327, 468), c(484, 239, 477) )) dimnames(M) <- list( gender = c("F", "M"), party = c("Democrat", "Independent", "Republican") ) M res <- chisq.test(M) chisq_pairwise(res) chisq_pairwise(res, population_in_row = FALSE) chisq_residual(res)M <- as.table(rbind( c(762, 327, 468), c(484, 239, 477) )) dimnames(M) <- list( gender = c("F", "M"), party = c("Democrat", "Independent", "Republican") ) M res <- chisq.test(M) chisq_pairwise(res) chisq_pairwise(res, population_in_row = FALSE) chisq_residual(res)
This function is a wrapper around psych::r.test.
compare_cor(data, r1, r2, data2 = NULL, by = NULL, ci = 0.95)compare_cor(data, r1, r2, data2 = NULL, by = NULL, ci = 0.95)
data |
A data frame |
r1, r2
|
Names of variable for the first and second correlations |
data2 |
Where to look for the |
by |
Name of column to split |
ci |
Confidence level for correlations. |
If data2 is provided, the correlation between r1 (in data) and r2 (in data2) is compared.
If r2 not provided, the correlation between r1 (in data) and r1 (in data2) is compared
If by is provided r1 (in data) is compared between the 2 groups.
Else, a test for the difference of two dependent correlations is conducted.
A list of two data frames:
The two correlations + their CIs
The test results
# Test dependent correlations ------------------ ## different variables compare_cor(mtcars, r1 = c("mpg", "hp"), r2 = c("drat", "am")) ## 1 shared variable compare_cor(mtcars, r1 = c("mpg", "hp"), r2 = c("mpg", "am")) # Test independent correlations ----------------- ## Different data sets compare_cor( data = mtcars, r1 = c("mpg", "hp"), data2 = iris, r2 = c("Sepal.Length", "Sepal.Width") ) ## Groups compare_cor(mtcars, r1 = c("mpg", "hp"), by = "am")# Test dependent correlations ------------------ ## different variables compare_cor(mtcars, r1 = c("mpg", "hp"), r2 = c("drat", "am")) ## 1 shared variable compare_cor(mtcars, r1 = c("mpg", "hp"), r2 = c("mpg", "am")) # Test independent correlations ----------------- ## Different data sets compare_cor( data = mtcars, r1 = c("mpg", "hp"), data2 = iris, r2 = c("Sepal.Length", "Sepal.Width") ) ## Groups compare_cor(mtcars, r1 = c("mpg", "hp"), by = "am")
Using stats::mcnemar.test for comparing dependent proportions.
This is function is dubious. Best not to use it.
compare_freqs(f, adjust = stats::p.adjust.methods, correct = TRUE)compare_freqs(f, adjust = stats::p.adjust.methods, correct = TRUE)
f |
A factor vector. |
adjust |
Method for correcting p-values. See |
correct |
a logical indicating whether to apply continuity correction when computing the test statistic. |
f <- c( rep("A", 12), rep("B", 45), rep("C", 42), rep("D", 20) ) compare_freqs(f)f <- c( rep("A", 12), rep("B", 45), rep("C", 42), rep("D", 20) ) compare_freqs(f)
To be used ideally with emmeans::contrast. Each contrasts is tested (sum to 0?) and scaled so that all positive weights sum to 1 (and all negative weights to -1).
contrast_weights(..., .name = "custom", .adjust = NULL) cw(..., .name = "custom", .adjust = NULL)contrast_weights(..., .name = "custom", .adjust = NULL) cw(..., .name = "custom", .adjust = NULL)
... |
Can be:
|
.name |
The label as it will appear in the results in |
.adjust |
Gives the default adjustment method for multiplicity (used in
|
Depending on input, either a vector or a data frame of scaled weights.
data(mtcars) mod <- lm(mpg ~ factor(cyl) * am, mtcars) my_contrasts <- data.frame( "squares" = c(-1, 2, -1), "4 vs 6" = c(-30, 30, 0), check.names = FALSE ) (my_contrasts2 <- cw(my_contrasts)) my_contrasts3 <- cw(my_contrasts, .adjust = "fdr") library(emmeans) (emms <- emmeans(mod, ~ cyl + am)) contrast(emms, method = my_contrasts, by = "am") contrast(emms, method = my_contrasts2, by = "am") # estimate is affected! contrast(emms, method = my_contrasts3, by = "am") # p value is affected # Also in interaction contrasts contrast(emms, interaction = list(cyl = my_contrasts2, am = "pairwise"))data(mtcars) mod <- lm(mpg ~ factor(cyl) * am, mtcars) my_contrasts <- data.frame( "squares" = c(-1, 2, -1), "4 vs 6" = c(-30, 30, 0), check.names = FALSE ) (my_contrasts2 <- cw(my_contrasts)) my_contrasts3 <- cw(my_contrasts, .adjust = "fdr") library(emmeans) (emms <- emmeans(mod, ~ cyl + am)) contrast(emms, method = my_contrasts, by = "am") contrast(emms, method = my_contrasts2, by = "am") # estimate is affected! contrast(emms, method = my_contrasts3, by = "am") # p value is affected # Also in interaction contrasts contrast(emms, interaction = list(cyl = my_contrasts2, am = "pairwise"))
Deprecated functions
crop_coord_polar(...)crop_coord_polar(...)
... |
Arguments (unused) |
Bind matrices diagonally
dbind(..., .fill = NULL)dbind(..., .fill = NULL)
... |
Matrices. |
.fill |
Value to fill the off-"diagonal" values. If |
M1 <- matrix(1:8, 2, 4) M2 <- matrix(9:14, 2, 3) dbind(M1, M2) dbind(M1, M2, .fill = NA) M1 <- matrix(letters[1:4], 2, 2) M2 <- matrix(LETTERS[5:10], 3, 2) dbind(M1, M2) dbind(M1, M2, .fill = "Banana") M1 <- matrix(TRUE, 2, 3) M2 <- matrix(NA, 3, 4) dbind(M1, M2)M1 <- matrix(1:8, 2, 4) M2 <- matrix(9:14, 2, 3) dbind(M1, M2) dbind(M1, M2, .fill = NA) M1 <- matrix(letters[1:4], 2, 2) M2 <- matrix(LETTERS[5:10], 3, 2) dbind(M1, M2) dbind(M1, M2, .fill = "Banana") M1 <- matrix(TRUE, 2, 3) M2 <- matrix(NA, 3, 4) dbind(M1, M2)
Transform means a (co)variances using the delta method
delta_method(..., .means, .V, return = c("means", "cov", "stddev", "cor"))delta_method(..., .means, .V, return = c("means", "cov", "stddev", "cor"))
... |
Unquoted transformations. See example. |
.means |
A named vector of means. |
.V |
A covariance matrix. |
return |
What should be returned? |
A list with one of (optionally named):
means of the transformed variables.
cov (co) variance matrix of the the transformed variables.
stddev standard deviations of the transformed variables (sqrt(diag(cov))).
cor correlation matrix of the transformed variables. (cov2cor(cov))
Most of this function is mostly copied from msm::deltamethod().
M <- sapply(mtcars, mean) V <- cov(mtcars) delta_method( (mpg^2) / hp, log_am = log1p(am), .means = M, .V = V, return = "cor" ) # Sobel Test ---- data("mtcars") mod.y <- lm(mpg ~ hp + cyl, mtcars[1:5, ]) mod.m <- lm(hp ~ cyl, mtcars[1:5, ]) bhat <- c(coef(mod.y), coef(mod.m))[c(2, 5)] Vhat <- dbind(vcov(mod.y), vcov(mod.m))[c(2, 5), c(2, 5)] res <- delta_method( hp * cyl, .means = bhat, .V = Vhat, return = c("means", "stddev") ) res$means / res$stddev # Compare: (bhat[1] * bhat[2]) / sqrt(bhat[1]^2 * Vhat[2, 2] + bhat[2]^2 * Vhat[1, 1]) # Special character will give you a bad time... m <- lm(mpg ~ factor(cyl), mtcars[1:5, ]) bhat <- coef(m) names(bhat) <- c("cyl4", "cyl6", "cyl8") V <- vcov(m) delta_method(cyl4, cyl4 + cyl6, cyl4 + cyl8, .means = bhat, .V = V )M <- sapply(mtcars, mean) V <- cov(mtcars) delta_method( (mpg^2) / hp, log_am = log1p(am), .means = M, .V = V, return = "cor" ) # Sobel Test ---- data("mtcars") mod.y <- lm(mpg ~ hp + cyl, mtcars[1:5, ]) mod.m <- lm(hp ~ cyl, mtcars[1:5, ]) bhat <- c(coef(mod.y), coef(mod.m))[c(2, 5)] Vhat <- dbind(vcov(mod.y), vcov(mod.m))[c(2, 5), c(2, 5)] res <- delta_method( hp * cyl, .means = bhat, .V = Vhat, return = c("means", "stddev") ) res$means / res$stddev # Compare: (bhat[1] * bhat[2]) / sqrt(bhat[1]^2 * Vhat[2, 2] + bhat[2]^2 * Vhat[1, 1]) # Special character will give you a bad time... m <- lm(mpg ~ factor(cyl), mtcars[1:5, ]) bhat <- coef(m) names(bhat) <- c("cyl4", "cyl6", "cyl8") V <- vcov(m) delta_method(cyl4, cyl4 + cyl6, cyl4 + cyl8, .means = bhat, .V = V )
Calculate reliability from E/CFA
fa_reliability(x, ...) ## S3 method for class 'fa' fa_reliability(x, keys = NULL, threshold = 0, labels = NULL, ...) ## S3 method for class 'lavaan' fa_reliability(x, ...)fa_reliability(x, ...) ## S3 method for class 'fa' fa_reliability(x, keys = NULL, threshold = 0, labels = NULL, ...) ## S3 method for class 'lavaan' fa_reliability(x, ...)
x |
E/CFA model (e.g., the result from |
... |
For |
keys |
optional, see ?psych::make.keys |
threshold |
which values from the loadings should be used? Only used if
|
labels |
optional factor labels |
data("Harman74.cor") EFA <- psych::fa(Harman74.cor$cov, 4) fa_reliability(EFA) HS.model <- " visual =~ x1 + x2 + x3 textual =~ x4 + x5 + x6 speed =~ x7 + x8 + x9 " CFA <- lavaan::cfa(HS.model, data = lavaan::HolzingerSwineford1939) fa_reliability(CFA)data("Harman74.cor") EFA <- psych::fa(Harman74.cor$cov, 4) fa_reliability(EFA) HS.model <- " visual =~ x1 + x2 + x3 textual =~ x4 + x5 + x6 speed =~ x7 + x8 + x9 " CFA <- lavaan::cfa(HS.model, data = lavaan::HolzingerSwineford1939) fa_reliability(CFA)
Get raw data for plotting with model predictions
get_data_for_grid(grid, model, residualize = FALSE, collapse_by = FALSE, ...) ## S3 method for class 'data.frame' get_data_for_grid( grid, model, residualize = FALSE, collapse_by = FALSE, pred_name, ... ) ## S3 method for class 'ggeffects' get_data_for_grid( grid, model, residualize = FALSE, collapse_by = FALSE, protect_names = TRUE, ... ) ## S3 method for class 'emmGrid' get_data_for_grid( grid, model, residualize = FALSE, collapse_by = FALSE, protect_names = TRUE, ... ) ## S3 method for class 'predictions' get_data_for_grid(grid, model, residualize = FALSE, collapse_by = FALSE, ...)get_data_for_grid(grid, model, residualize = FALSE, collapse_by = FALSE, ...) ## S3 method for class 'data.frame' get_data_for_grid( grid, model, residualize = FALSE, collapse_by = FALSE, pred_name, ... ) ## S3 method for class 'ggeffects' get_data_for_grid( grid, model, residualize = FALSE, collapse_by = FALSE, protect_names = TRUE, ... ) ## S3 method for class 'emmGrid' get_data_for_grid( grid, model, residualize = FALSE, collapse_by = FALSE, protect_names = TRUE, ... ) ## S3 method for class 'predictions' get_data_for_grid(grid, model, residualize = FALSE, collapse_by = FALSE, ...)
grid |
A data grid with predictions |
model |
The statistical model |
residualize |
Should data be residualized? |
collapse_by |
Name of grouping variable to collaple across. If |
... |
Args passed from / to other functions. |
pred_name |
Name of column that has the predictions in the data grid |
protect_names |
Logical, if |
data("mtcars") mtcars <- mtcars |> transform(cyl = factor(cyl)) mod <- lm(mpg ~ hp + cyl, data = mtcars[1:10, ]) nd <- expand.grid( hp = seq(50, 350, by = 50), cyl = "4" ) nd$predicted_mpg <- predict(mod, newdata = nd) get_data_for_grid(nd, mod) get_data_for_grid(nd, mod, residualize = TRUE, pred_name = "predicted_mpg") library(ggplot2) ggplot(nd, aes(hp, predicted_mpg)) + geom_line() + geom_point(aes(y = mpg, color = "Raw"), data = get_data_for_grid(nd, mod) ) + geom_point(aes(color = "Residualized"), data = get_data_for_grid(nd, mod, residualize = TRUE, pred_name = "predicted_mpg") ) + labs( title = "Partial residual plot", color = "Data" ) ## Support of data-grid packages ------ # - ggeffects # - emmeans # - marginaleffects pred_ggeffects <- ggeffects::ggpredict(mod, c("hp [50:350, by = 50]", "cyl [4]")) get_data_for_grid(pred_ggeffects, residualize = TRUE) at <- list(hp = seq(50, 350, by = 50), cyl = "4") pred_emmeans <- emmeans::emmeans(mod, ~ hp + cyl, at = at) get_data_for_grid(pred_emmeans, mod, residualize = TRUE) # pred_marginaleffects <- marginaleffects::predictions(mod, newdata = nd) # get_data_for_grid(pred_marginaleffects, residualize = TRUE) ## Collapes across group ------ data("cake", package = "lme4") fm1 <- lme4::lmer(angle ~ temperature + (1 | recipe), data = cake ) nd <- marginaleffects::datagrid( temperature = unique(cake$temperature), model = fm1 ) suppressWarnings(pred_marginaleffects <- marginaleffects::predictions(fm1, newdata = nd)) get_data_for_grid(pred_marginaleffects, collapse_by = TRUE) # get_data_for_grid(pred_marginaleffects, collapse_by = TRUE, residualize = TRUE)data("mtcars") mtcars <- mtcars |> transform(cyl = factor(cyl)) mod <- lm(mpg ~ hp + cyl, data = mtcars[1:10, ]) nd <- expand.grid( hp = seq(50, 350, by = 50), cyl = "4" ) nd$predicted_mpg <- predict(mod, newdata = nd) get_data_for_grid(nd, mod) get_data_for_grid(nd, mod, residualize = TRUE, pred_name = "predicted_mpg") library(ggplot2) ggplot(nd, aes(hp, predicted_mpg)) + geom_line() + geom_point(aes(y = mpg, color = "Raw"), data = get_data_for_grid(nd, mod) ) + geom_point(aes(color = "Residualized"), data = get_data_for_grid(nd, mod, residualize = TRUE, pred_name = "predicted_mpg") ) + labs( title = "Partial residual plot", color = "Data" ) ## Support of data-grid packages ------ # - ggeffects # - emmeans # - marginaleffects pred_ggeffects <- ggeffects::ggpredict(mod, c("hp [50:350, by = 50]", "cyl [4]")) get_data_for_grid(pred_ggeffects, residualize = TRUE) at <- list(hp = seq(50, 350, by = 50), cyl = "4") pred_emmeans <- emmeans::emmeans(mod, ~ hp + cyl, at = at) get_data_for_grid(pred_emmeans, mod, residualize = TRUE) # pred_marginaleffects <- marginaleffects::predictions(mod, newdata = nd) # get_data_for_grid(pred_marginaleffects, residualize = TRUE) ## Collapes across group ------ data("cake", package = "lme4") fm1 <- lme4::lmer(angle ~ temperature + (1 | recipe), data = cake ) nd <- marginaleffects::datagrid( temperature = unique(cake$temperature), model = fm1 ) suppressWarnings(pred_marginaleffects <- marginaleffects::predictions(fm1, newdata = nd)) get_data_for_grid(pred_marginaleffects, collapse_by = TRUE) # get_data_for_grid(pred_marginaleffects, collapse_by = TRUE, residualize = TRUE)
Compare multiple vectors
gt(...) lt(...) eq(...) neq(...) leq(...) geq(...) x %>>% y x %<<% y x %==% y x %!=% y x %<=% y x %>=% ygt(...) lt(...) eq(...) neq(...) leq(...) geq(...) x %>>% y x %<<% y x %==% y x %!=% y x %<=% y x %>=% y
x, y, ...
|
Vectors, typically numerical, to be compared. |
A logical vector. For the operator, a last_y attribute stores the
last RHS values from the comparisons (strip away with as.vector()). See
examples.
x <- c(1, 3, 1, 1, 2) y <- c(2, 2, 1, 1, 1) z <- c(3, 1, 1, 2, 1) lt(x, y, z) # > eq(x, y, z) # == neq(x, y, z) # != leq(x, y, z) # <= geq(x, y, z) # >= gt(x, y, z) # < # same as x %>>% y %>>% z # same as x > y & y > z # Operators can be mixed! x %>>% y %==% z # Or broken (l1 <- x %>>% y) (l2 <- l1 %==% z) # same as x > y & y == z as.vector(l2)x <- c(1, 3, 1, 1, 2) y <- c(2, 2, 1, 1, 1) z <- c(3, 1, 1, 2, 1) lt(x, y, z) # > eq(x, y, z) # == neq(x, y, z) # != leq(x, y, z) # <= geq(x, y, z) # >= gt(x, y, z) # < # same as x %>>% y %>>% z # same as x > y & y > z # Operators can be mixed! x %>>% y %==% z # Or broken (l1 <- x %>>% y) (l2 <- l1 %==% z) # same as x > y & y == z as.vector(l2)
Test rowwise if each row is missing / has all data in select columns
has_any_data(.data, ..., .name) has_all_data(.data, ..., .name) missing_any_data(.data, ..., .name) missing_all_data(.data, ..., .name)has_any_data(.data, ..., .name) has_all_data(.data, ..., .name) missing_any_data(.data, ..., .name) missing_all_data(.data, ..., .name)
.data |
A data frame, data frame extension (e.g. a tibble), or a lazy data frame (e.g. from dbplyr or dtplyr). See Methods, below, for more details. |
... |
< |
.name |
Name of the new column with the logical index. |
data(mtcars) mtcars[1, 1] <- NA mtcars[2, ] <- NA mtcars[1:3, 1:3] |> has_any_data(mpg:disp, .name = "has_any") |> has_all_data(mpg:disp, .name = "has_all") |> missing_any_data(mpg:disp, .name = "missing_any") |> missing_all_data(mpg:disp, .name = "missing_all")data(mtcars) mtcars[1, 1] <- NA mtcars[2, ] <- NA mtcars[1:3, 1:3] |> has_any_data(mpg:disp, .name = "has_any") |> has_all_data(mpg:disp, .name = "has_all") |> missing_any_data(mpg:disp, .name = "missing_any") |> missing_all_data(mpg:disp, .name = "missing_all")
It's just logical
is.TRUE(x) is.FALSE(x) allTRUE(...) anyTRUE(...)is.TRUE(x) is.FALSE(x) allTRUE(...) anyTRUE(...)
x, ...
|
Values to be tested. |
x <- list(1, TRUE, list(TRUE), FALSE, "Hello world!") is.TRUE(x) is.FALSE(x) allTRUE(TRUE, FALSE, stop("NOOOO")) anyTRUE(TRUE, FALSE, stop("NOOOO"))x <- list(1, TRUE, list(TRUE), FALSE, "Hello world!") is.TRUE(x) is.FALSE(x) allTRUE(TRUE, FALSE, stop("NOOOO")) anyTRUE(TRUE, FALSE, stop("NOOOO"))
This function estimates the stability of cluster assignments by calculating the average Jaccard index: how well do cluster assignments of new data from a pre-trained model align with cluster assignments of the same data from a new model fitted on the same data (Hennig, 2007).
jaccard_avg(object, ...) ## S3 method for class 'cluster_spec' jaccard_avg(object, ...) ## S3 method for class 'cluster_fit' jaccard_avg(object, new_data = NULL, ...) ## S3 method for class 'workflow' jaccard_avg(object, new_data = NULL, ...)jaccard_avg(object, ...) ## S3 method for class 'cluster_spec' jaccard_avg(object, ...) ## S3 method for class 'cluster_fit' jaccard_avg(object, new_data = NULL, ...) ## S3 method for class 'workflow' jaccard_avg(object, new_data = NULL, ...)
object |
A fitted tidyclust model |
... |
Not currently used. |
new_data |
A dataset to predict on. |
Note that this metric will re-fit the model on the new dataset
(new_data), and so might be computationally expensive.
A tibble with 3 columns; .metric, .estimator, and .estimate.
Hennig, C. (2007). Cluster-wise assessment of cluster stability. Computational Statistics & Data Analysis, 52(1), 258-271.
Other cluster metric:
pred_strength()
Other ML4Psych:
metric_by_event(),
pred_strength()
library(tidymodels) library(tidyclust) penguins <- drop_na(penguins) kmeans_spec <- k_means(num_clusters = tune()) res <- tune_cluster( kmeans_spec, ~ bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g, resamples = bootstraps(penguins, times = 10), metrics = cluster_metric_set(jaccard_avg) ) # What's the largest k that has high recovery rate? # (values above 0.75 are considered good, values below 0.5 are considered bad) autoplot(res)library(tidymodels) library(tidyclust) penguins <- drop_na(penguins) kmeans_spec <- k_means(num_clusters = tune()) res <- tune_cluster( kmeans_spec, ~ bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g, resamples = bootstraps(penguins, times = 10), metrics = cluster_metric_set(jaccard_avg) ) # What's the largest k that has high recovery rate? # (values above 0.75 are considered good, values below 0.5 are considered bad) autoplot(res)
Functions to convert parameters of a log-normal distribution to meaningfull values on the response scale.
lnorm_mean(meanlog, sdlog, ...) lnorm_median(meanlog, ...) lnorm_var(meanlog, sdlog, ...) lnorm_sd(meanlog, sdlog, ...)lnorm_mean(meanlog, sdlog, ...) lnorm_median(meanlog, ...) lnorm_var(meanlog, sdlog, ...) lnorm_sd(meanlog, sdlog, ...)
meanlog, sdlog
|
mean and standard deviation of the distribution
on the log scale with default values of |
... |
Not used |
x <- rlnorm(1e4, meanlog = 1.5, sdlog = 1.2) m <- lm(log(x) ~ 1) meanlog <- coef(m) sdlog <- sigma(m) lnorm_mean(meanlog, sdlog) mean(x) lnorm_median(meanlog, sdlog) median(x) lnorm_sd(meanlog, sdlog) sd(x)x <- rlnorm(1e4, meanlog = 1.5, sdlog = 1.2) m <- lm(log(x) ~ 1) meanlog <- coef(m) sdlog <- sigma(m) lnorm_mean(meanlog, sdlog) mean(x) lnorm_median(meanlog, sdlog) median(x) lnorm_sd(meanlog, sdlog) sd(x)
Log-likelihood (and by extension AIC and BIC) for log-normal models fit
with stats::lm(log(y) ~ ...) are computed with stats::dnorm(log(y), ...)
instead of with stats::dlnorm(y, ...), which makes comparing different
families difficult. This function is aimed at rectifying this. See examples.
logLik_lnorm(object, REML = FALSE) AIC_lnorm(object, k = 2, REML = FALSE) BIC_lnorm(object, REML = FALSE)logLik_lnorm(object, REML = FALSE) AIC_lnorm(object, k = 2, REML = FALSE) BIC_lnorm(object, REML = FALSE)
object |
A fitted model object. The model must meet all of the following (will throw an error if not met):
|
REML |
Only |
k |
numeric, the penalty per parameter to be used; the
default |
REML is not (yet) supported. Make sure you are comparing correct
LL/AIC/BIC values.
data("mtcars") mtcars$mpg <- floor(mtcars$mpg) model_lnorm <- lm(log(mpg) ~ factor(cyl), mtcars) model_norm <- lm(mpg ~ factor(cyl), mtcars) model_pois <- glm(mpg ~ factor(cyl), mtcars, family = poisson()) # No, that first one is wrong... (aics <- AIC(model_lnorm, model_norm, model_pois)) aics[1, "AIC"] <- AIC_lnorm(model_lnorm) aics # better! # Should support any model really... ===================== model_lnorm <- lme4::lmer(log(mpg) ~ factor(cyl) + (1 | gear), mtcars, REML = FALSE) model_norm <- lme4::lmer(mpg ~ factor(cyl) + (1 | gear), mtcars, REML = FALSE) model_pois <- lme4::glmer(mpg ~ factor(cyl) + (1 | gear), mtcars, family = poisson()) # No, that first one is wrong... (aics <- AIC(model_lnorm, model_norm, model_pois)) aics[1, "AIC"] <- AIC_lnorm(model_lnorm) aics # better!data("mtcars") mtcars$mpg <- floor(mtcars$mpg) model_lnorm <- lm(log(mpg) ~ factor(cyl), mtcars) model_norm <- lm(mpg ~ factor(cyl), mtcars) model_pois <- glm(mpg ~ factor(cyl), mtcars, family = poisson()) # No, that first one is wrong... (aics <- AIC(model_lnorm, model_norm, model_pois)) aics[1, "AIC"] <- AIC_lnorm(model_lnorm) aics # better! # Should support any model really... ===================== model_lnorm <- lme4::lmer(log(mpg) ~ factor(cyl) + (1 | gear), mtcars, REML = FALSE) model_norm <- lme4::lmer(mpg ~ factor(cyl) + (1 | gear), mtcars, REML = FALSE) model_pois <- lme4::glmer(mpg ~ factor(cyl) + (1 | gear), mtcars, family = poisson()) # No, that first one is wrong... (aics <- AIC(model_lnorm, model_norm, model_pois)) aics[1, "AIC"] <- AIC_lnorm(model_lnorm) aics # better!
This function computes class-wise metrics for a metric set in a multiclass setting by binarizing the outcome variable for each class and applying the metric set to each binary version of the data.
metric_by_event(data, metric_set, truth, estimate, ..., .all = FALSE)metric_by_event(data, metric_set, truth, estimate, ..., .all = FALSE)
data |
A data frame containing the columns specified in |
metric_set |
A metric set function (e.g., |
truth |
The column identifier for the true class results
(that is a |
estimate |
The column identifier for the predicted class
results (that is also |
... |
Not currently used. |
.all |
A logical value indicating whether to include all metrics from the metric set (including those that do not use macro/micro estimators). |
A tibble with the same columns to be expected from the metric set,
plus the following additional columns: .class indicating the class for which
the metric was computed. For these rows, .estimator is set to "event" to
indicate that these are event-level metrics.
Other ML4Psych:
jaccard_avg(),
pred_strength()
Adpated from stla/pasteAsComment
Paste(prefix = "#> ") PasteRoxygen() Copy(prefix = "#> ") CopyRoxygen()Paste(prefix = "#> ") PasteRoxygen() Copy(prefix = "#> ") CopyRoxygen()
prefix |
The prefix to use for the copied text. |
Based on Lenth (2007) Post Hoc Power: Tables and Commentary.
php.guf() give's Lenth's grand unified formula for post hoc power.
php.t(tval, p, df = Inf, alpha = 0.05, one.tailed = FALSE) php.F(Fval, p, df1, df2 = Inf, alpha = 0.05) php.z(zval, p, alpha = 0.05, one.tailed = FALSE) php.chisq(chisqval, p, df, alpha = 0.05) php.guf(p, alpha = 0.05)php.t(tval, p, df = Inf, alpha = 0.05, one.tailed = FALSE) php.F(Fval, p, df1, df2 = Inf, alpha = 0.05) php.z(zval, p, alpha = 0.05, one.tailed = FALSE) php.chisq(chisqval, p, df, alpha = 0.05) php.guf(p, alpha = 0.05)
tval, zval, Fval, chisqval
|
Observed test statistic |
p |
Observed p-value (used if test statistic not supplied) |
df, df1, df2
|
Test statistics' degrees of freedom |
alpha |
Confidence level of the test. |
one.tailed |
Is the p-value from a one-tailed test? |
lm(hp ~ am, mtcars) |> summary() php.t(p = 0.18, df = 30) php.guf(p = 0.18) # Table 1 - one tailed expand.grid( p = c(.001, .01, .05, .1, .25, .5, .75), df = c(1, 2, 5, 10, 20, 50, 200, 1000, Inf) ) |> transform(PHP = php.t(p = p, df = df, one.tailed = TRUE)) |> stats::reshape( direction = "wide", idvar = "df", timevar = "p" ) # Table 1 - two tailed expand.grid( p = c(.001, .01, .05, .1, .25, .5, .75), df = c(1, 2, 5, 10, 20, 50, 200, 1000, Inf) ) |> transform(PHP = php.t(p = p, df = df)) |> stats::reshape( direction = "wide", idvar = "df", timevar = "p" ) # Table 2 expand.grid( p = c(.001, .01, .05, .1, .25, .5, .75), df2 = c(1, 2, 5, 10, 20, 50, 200, 1000, Inf), df1 = c(2, 3, 4, 10) ) |> transform(PHP = php.F(p = p, df1 = df1, df2 = df2)) |> stats::reshape( direction = "wide", idvar = c("df1", "df2"), timevar = "p" )lm(hp ~ am, mtcars) |> summary() php.t(p = 0.18, df = 30) php.guf(p = 0.18) # Table 1 - one tailed expand.grid( p = c(.001, .01, .05, .1, .25, .5, .75), df = c(1, 2, 5, 10, 20, 50, 200, 1000, Inf) ) |> transform(PHP = php.t(p = p, df = df, one.tailed = TRUE)) |> stats::reshape( direction = "wide", idvar = "df", timevar = "p" ) # Table 1 - two tailed expand.grid( p = c(.001, .01, .05, .1, .25, .5, .75), df = c(1, 2, 5, 10, 20, 50, 200, 1000, Inf) ) |> transform(PHP = php.t(p = p, df = df)) |> stats::reshape( direction = "wide", idvar = "df", timevar = "p" ) # Table 2 expand.grid( p = c(.001, .01, .05, .1, .25, .5, .75), df2 = c(1, 2, 5, 10, 20, 50, 200, 1000, Inf), df1 = c(2, 3, 4, 10) ) |> transform(PHP = php.F(p = p, df1 = df1, df2 = df2)) |> stats::reshape( direction = "wide", idvar = c("df1", "df2"), timevar = "p" )
This function estimates the stability of cluster assignments by calculating the prediction strength: how well do cluster assignments of new data from a pre-trained model co-incide with cluster assignments of the same data from a new model fitted on the same data (Tibshirani & Walther, 2005).
pred_strength(object, ...) ## S3 method for class 'cluster_spec' pred_strength(object, ...) ## S3 method for class 'cluster_fit' pred_strength(object, new_data = NULL, ...) ## S3 method for class 'workflow' pred_strength(object, new_data = NULL, ...)pred_strength(object, ...) ## S3 method for class 'cluster_spec' pred_strength(object, ...) ## S3 method for class 'cluster_fit' pred_strength(object, new_data = NULL, ...) ## S3 method for class 'workflow' pred_strength(object, new_data = NULL, ...)
object |
A fitted tidyclust model |
... |
Not currently used. |
new_data |
A dataset to predict on. |
Note that this metric will re-fit the model on the new dataset
(new_data), and so might be computationally expensive.
A tibble with 3 columns; .metric, .estimator, and .estimate.
Tibshirani, R. and Walther, G. (2005) Cluster Validation by Prediction Strength, Journal of Computational and Graphical Statistics, 14(3), 511-528.
Other cluster metric:
jaccard_avg()
Other ML4Psych:
jaccard_avg(),
metric_by_event()
library(tidymodels) library(tidyclust) penguins <- drop_na(penguins) kmeans_spec <- k_means(num_clusters = tune()) res <- tune_cluster( kmeans_spec, ~ bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g, resamples = vfold_cv(penguins, v = 2, repeats = 10), metrics = cluster_metric_set(pred_strength) ) # What's the largest k that has a good prediction strength? # (values above 0.8 are considered good) autoplot(res)library(tidymodels) library(tidyclust) penguins <- drop_na(penguins) kmeans_spec <- k_means(num_clusters = tune()) res <- tune_cluster( kmeans_spec, ~ bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g, resamples = vfold_cv(penguins, v = 2, repeats = 10), metrics = cluster_metric_set(pred_strength) ) # What's the largest k that has a good prediction strength? # (values above 0.8 are considered good) autoplot(res)
Useful in RMarkdown.
print_library(..., .character.only = FALSE, .version = TRUE, .load = TRUE) print_require(..., .character.only = FALSE, .version = TRUE, .load = TRUE) print_library_md(..., .character.only = FALSE, .version = TRUE, .load = TRUE)print_library(..., .character.only = FALSE, .version = TRUE, .load = TRUE) print_require(..., .character.only = FALSE, .version = TRUE, .load = TRUE) print_library_md(..., .character.only = FALSE, .version = TRUE, .load = TRUE)
... |
Names of packages. |
.character.only |
Is |
.version |
Print library version? |
.load |
Load package, or just print? |
print_library(afex, tidyverse, emmeans, MASS, .load = FALSE )print_library(afex, tidyverse, emmeans, MASS, .load = FALSE )
This function is a wrapper around qqplotr::stat_pp_*(detrend = TRUE).
qq_worm_plot(x, distribution = "norm", ...)qq_worm_plot(x, distribution = "norm", ...)
x |
A numerical vector |
distribution |
Name of a distribution, matching the |
... |
Args passed to |
x <- rnorm(100) qq_worm_plot(x) x <- rbeta(100, shape1 = 2, shape2 = 3) qq_worm_plot(x, distribution = "beta", shape1 = 2, shape2 = 3) x <- rt(100, df = 3) qq_worm_plot(x, distribution = "t", df = 3) # x <- rexp(100) # qq_worm_plot(x, distribution = "exp") # x <- rpois(100, lambda = 15) # qq_worm_plot(x, distribution = "pois", lambda = 15) # x <- runif(100) # qq_worm_plot(x, distribution = "unif")x <- rnorm(100) qq_worm_plot(x) x <- rbeta(100, shape1 = 2, shape2 = 3) qq_worm_plot(x, distribution = "beta", shape1 = 2, shape2 = 3) x <- rt(100, df = 3) qq_worm_plot(x, distribution = "t", df = 3) # x <- rexp(100) # qq_worm_plot(x, distribution = "exp") # x <- rpois(100, lambda = 15) # qq_worm_plot(x, distribution = "pois", lambda = 15) # x <- runif(100) # qq_worm_plot(x, distribution = "unif")
Spearman-Brown Split half reliability
r_SB(x, y = NULL, var.equal = TRUE)r_SB(x, y = NULL, var.equal = TRUE)
x |
A correlation or numeric vector |
y |
A numeric vector |
var.equal |
Assume equal var of |
r_SB(1:30, -exp(1 / 1:30), var.equal = TRUE) r_SB(1:30, -exp(1 / 1:30), var.equal = FALSE) r_SB(0.57)r_SB(1:30, -exp(1 / 1:30), var.equal = TRUE) r_SB(1:30, -exp(1 / 1:30), var.equal = FALSE) r_SB(0.57)
OverkillWay too many ways to calculate .
R2(pred, obs, type = 1, na.rm = TRUE)R2(pred, obs, type = 1, na.rm = TRUE)
pred |
The predicted values by some model; typically the result of a
call to |
obs |
The true observed values. |
type |
Which of the 8 versions of R-square to use. See details. |
na.rm |
a logical value indicating whether NA values should be stripped before the computation proceeds. |
The types of :
squared multiple correlation coefficient between the regressand and the regressors
Kvålseth, T. O. (1985). Cautionary note about R 2. The American Statistician, 39(4), 279-285.
X <- c(1, 2, 3, 4, 5, 6) Y <- c(15, 37, 52, 59, 83, 92) m1 <- lm(Y ~ X) m2 <- lm(Y ~ 0 + X) m3 <- nls(Y ~ a * X^b, start = c(a = 1, b = 1)) # Table 2 from Kvålset0 (1985) data.frame( mod1 = sapply(1:8, R2, pred = predict(m1), obs = Y), mod2 = sapply(1:8, R2, pred = predict(m2), obs = Y), mod3 = sapply(1:8, R2, pred = 16.3757 * X^0.99, obs = Y) )X <- c(1, 2, 3, 4, 5, 6) Y <- c(15, 37, 52, 59, 83, 92) m1 <- lm(Y ~ X) m2 <- lm(Y ~ 0 + X) m3 <- nls(Y ~ a * X^b, start = c(a = 1, b = 1)) # Table 2 from Kvålset0 (1985) data.frame( mod1 = sapply(1:8, R2, pred = predict(m1), obs = Y), mod2 = sapply(1:8, R2, pred = predict(m2), obs = Y), mod3 = sapply(1:8, R2, pred = 16.3757 * X^0.99, obs = Y) )
This function computes the pseudo R-square for mixed models by quantifying the reduction on the variance-covariance between a full model and a reduced model.
r2_pseudo(mf, mr, mnull = mr) V_table(model)r2_pseudo(mf, mr, mnull = mr) V_table(model)
mf |
The full model |
mr |
The reduced model |
mnull |
The empty model, where some variance of interest is unmodeled (an "unconditional" model) |
model |
A fitted model object |
Variance components are extracted from the 3 models, and where they match (groups and terms must match exactly), we compute:
This gives the increase in variance explained by the full model compared to the reduced model. When the restricted model is the empty model, this is the same as:
{stats}: lm, glm
{lme4}: lmerMod and glmerMod
{rstanarm}: stanreg
{brms}: brmsfit
{glmmTMB}: glmmTMB
A tibble with the following columns:
grp: The name of the grouping factor
var: The name of the variance component
r2: The pseudo R-square for that variance component
mod0 <- lme4::lmer(Reaction ~ 1 + (1 | Subject), data = lme4::sleepstudy) mod1 <- lme4::lmer(Reaction ~ Days + (1 | Subject), data = lme4::sleepstudy) mod2 <- lme4::lmer(Reaction ~ Days + (Days | Subject), data = lme4::sleepstudy) r2_pseudo(mod1, mod0) # Residual variance explained by the fixed effect r2_pseudo(mod2, mod1, mod0) # Residual variance explained by the random effects r2_pseudo(mod2, mod0) # Residual variance explained by the mixed effectsmod0 <- lme4::lmer(Reaction ~ 1 + (1 | Subject), data = lme4::sleepstudy) mod1 <- lme4::lmer(Reaction ~ Days + (1 | Subject), data = lme4::sleepstudy) mod2 <- lme4::lmer(Reaction ~ Days + (Days | Subject), data = lme4::sleepstudy) r2_pseudo(mod1, mod0) # Residual variance explained by the fixed effect r2_pseudo(mod2, mod1, mod0) # Residual variance explained by the random effects r2_pseudo(mod2, mod0) # Residual variance explained by the mixed effects
Sequence Generation Based on the Values of a Vector
seq_range( x, length.out = NULL, by = NULL, along.with = NULL, na.rm = TRUE, padding = 0.05 ) seq_quantile( x, probs, length.out = NULL, by = NULL, along.with = NULL, na.rm = TRUE ) seq_IQR(x, length.out = NULL, by = NULL, along.with = NULL, na.rm = TRUE) mean_sd(x, na.rm = TRUE, out = c("vector", "data.frame")) median_mad(x, na.rm = TRUE, out = c("vector", "data.frame"))seq_range( x, length.out = NULL, by = NULL, along.with = NULL, na.rm = TRUE, padding = 0.05 ) seq_quantile( x, probs, length.out = NULL, by = NULL, along.with = NULL, na.rm = TRUE ) seq_IQR(x, length.out = NULL, by = NULL, along.with = NULL, na.rm = TRUE) mean_sd(x, na.rm = TRUE, out = c("vector", "data.frame")) median_mad(x, na.rm = TRUE, out = c("vector", "data.frame"))
x |
A numeric vector |
length.out |
desired length of the sequence. If no other arguments are valued, defaults to 20. |
by |
number: increment of the sequence. |
along.with |
take the length from the length of this argument. |
na.rm |
a logical evaluating to |
padding |
Padding factor for the range. |
probs |
numeric vector of probabilities with values in
|
out |
If |
set.seed(1) x <- rt(100, df = 3) seq_range(x, length.out = 5) seq_IQR(x, length.out = 5) seq_quantile(x, c(.05, .95), length.out = 5) mean_sd(x) library(ggplot2) ggplot(mtcars, aes(cyl, mpg)) + stat_summary(aes(color = "Mean (SD)"), fun.data = mean_sd, fun.args = list(out = "data.frame") ) + stat_summary(aes(color = "Median (MAD)"), fun.data = median_mad, fun.args = list(out = "data.frame"), position = position_nudge(x = 0.5) )set.seed(1) x <- rt(100, df = 3) seq_range(x, length.out = 5) seq_IQR(x, length.out = 5) seq_quantile(x, c(.05, .95), length.out = 5) mean_sd(x) library(ggplot2) ggplot(mtcars, aes(cyl, mpg)) + stat_summary(aes(color = "Mean (SD)"), fun.data = mean_sd, fun.args = list(out = "data.frame") ) + stat_summary(aes(color = "Median (MAD)"), fun.data = median_mad, fun.args = list(out = "data.frame"), position = position_nudge(x = 0.5) )
This is a wrapper for emmeans::joint_tests() that provides an easy way to
specify which simple effects we wish to test, and within what variable(s).
simple_effects(model, effect, inside, ...)simple_effects(model, effect, inside, ...)
model |
The model. |
effect |
The name of the required simple effect. e.g., |
inside |
A vector of the name(s) of the variable(s) within whose levels
the |
... |
Passed to |
library(afex) data(obk.long, package = "afex") A <- aov_car(value ~ treatment * gender + Error(id / (phase * hour)), data = obk.long ) simple_effects(A, effect = "treatment") simple_effects(A, effect = "treatment:phase") simple_effects(A, effect = "phase", inside = "treatment") simple_effects(A, effect = "phase", inside = c("treatment", "gender")) # simple_effects(A, effect = "phase", inside = "treatment:gender") # same simple_effects(A, effect = "phase", inside = c("treatment", "gender"), at = list(gender = "F") ) simple_effects(A, effect = "phase:treatment", inside = "gender")library(afex) data(obk.long, package = "afex") A <- aov_car(value ~ treatment * gender + Error(id / (phase * hour)), data = obk.long ) simple_effects(A, effect = "treatment") simple_effects(A, effect = "treatment:phase") simple_effects(A, effect = "phase", inside = "treatment") simple_effects(A, effect = "phase", inside = c("treatment", "gender")) # simple_effects(A, effect = "phase", inside = "treatment:gender") # same simple_effects(A, effect = "phase", inside = c("treatment", "gender"), at = list(gender = "F") ) simple_effects(A, effect = "phase:treatment", inside = "gender")
tidySEM
Plot simple mediations with tidySEM
simple_mediation_plot( a = NA, b = NA, direct = NA, indirect = NA, total = NA, X_name = "X", M_name = "M", Y_name = "Y", ... )simple_mediation_plot( a = NA, b = NA, direct = NA, indirect = NA, total = NA, X_name = "X", M_name = "M", Y_name = "Y", ... )
a, b, direct, indirect, total
|
Values or labels to put on the paths (edges). |
X_name, M_name, Y_name
|
Values or labels to put on the variables (nodes). |
... |
Passed to tidySEM::prepare_graph. |
mod_a <- lm(hp ~ gear, data = mtcars) mod_bc <- lm(mpg ~ hp + gear, data = mtcars) a <- coef(mod_a)[2] b <- coef(mod_bc)[2] direct <- coef(mod_bc)[3] indirect <- a * b total <- direct + indirect med_plot <- simple_mediation_plot( a = round(a, 3), b = round(b, 3), direct = round(direct, 3), indirect = round(indirect, 3), total = round(total, 3), X_name = "Gears", M_name = "Horse\nPower", Y_name = "Miles\nPer Gallon" ) plot(med_plot)mod_a <- lm(hp ~ gear, data = mtcars) mod_bc <- lm(mpg ~ hp + gear, data = mtcars) a <- coef(mod_a)[2] b <- coef(mod_bc)[2] direct <- coef(mod_bc)[3] indirect <- a * b total <- direct + indirect med_plot <- simple_mediation_plot( a = round(a, 3), b = round(b, 3), direct = round(direct, 3), indirect = round(indirect, 3), total = round(total, 3), X_name = "Gears", M_name = "Horse\nPower", Y_name = "Miles\nPer Gallon" ) plot(med_plot)
Some stats demos
stat_demo_apps( demo = c("paired ttest", "truncated correlation", "berksons paradox") )stat_demo_apps( demo = c("paired ttest", "truncated correlation", "berksons paradox") )
demo |
Which demo to show? (Partial matching supported) |
Shows a paired vs un-paired ttest, and how these differences are affected by the correlation.
Demo of how truncation affects correlations and doesn't affect MSE.
Demo of how How sampling bias can affect estimated correlations and effects. See related Numberphile video.
vlookup
vlookup(this, data, key, value, add = FALSE)vlookup(this, data, key, value, add = FALSE)
this |
A vector of values |
data |
A data frame to search in |
key |
Where should |
value |
Name of the column from which values should be returned. |
add |
Should |
df <- data.frame( a = letters[c(1, 1:9)], b = 51:60 ) vlookup(c("a", "e", "c"), df, key = "a", value = "b") vlookup(c("a", "e", "c"), df, key = "a", value = "b", add = TRUE)df <- data.frame( a = letters[c(1, 1:9)], b = 51:60 ) vlookup(c("a", "e", "c"), df, key = "a", value = "b") vlookup(c("a", "e", "c"), df, key = "a", value = "b", add = TRUE)