Commit

finish factors (closes #6)
leeper committed Aug 5, 2016
1 parent e3dfe3b commit e856795
Showing 5 changed files with 38 additions and 38 deletions.
3 changes: 2 additions & 1 deletion NEWS
@@ -5,7 +5,8 @@
* Added `cplot()` generic and methods for "lm" and "glm" class objects to display conditional predictions and conditional marginal effects in the style of the **interplot** and **plotMElm** packages.
* Added various variance estimation procedures for marginal effects: delta method (the default), bootstrap, and simulation (ala **Clarify**).
* Fixed estimation of marginal effect variances for generalized linear models, so that they are correct on both "link" and "response" scales.
* Exposed two internal marginal effect estimation functions. First, `build_margins()` is called by `margins()` methods (perhaps repeatedly) and actually assembles a "margins" object from a model and data. It is never necessary to call this directly, but may be useful for very simple marginal effect estimation procedures (i.e., using original data with no `at` specification). Second, `marginal_effects()` is the very low level function that differentiates a model with respect to some input data. This is the fastest way to obtain marginal effects without the overhead of creating a "margins" object (for which variance estimation is very time-consuming).
* Exposed two internal marginal effect estimation functions. First, `build_margins()` is called by `margins()` methods (perhaps repeatedly) and actually assembles a "margins" object from a model and data. It is never necessary to call this directly, but may be useful for very simple marginal effect estimation procedures (i.e., using original data with no `at` specification). Second, `marginal_effects()` is the very low-level function that differentiates a model with respect to some input data (or calculates discrete changes in the outcome with respect to factor variables). This is the fastest way to obtain marginal effects without the overhead of creating a "margins" object (for which variance estimation is fairly time-consuming).
* Implemented estimation of "discrete change" representations of marginal effects of factor variables in models, ala Stata's default settings.
* Re-implemented marginal effects estimation using numeric derivatives provided by `numDeriv::grad()` rather than symbolic differentiation. This allows `margins()` to handle almost any model that can be specified in R, including models that cannot be specified in Stata.
* Used **compiler** to byte compile prediction and gradient functions, thereby improving estimation speed.
* The internal `build_datalist()` now checks for specification of illegal factor levels in `at` and errors when these are encountered.
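
To make the `marginal_effects()` behaviour described in the bullets above concrete, here is a minimal usage sketch. It assumes the in-development API at this commit (the `model`/`data` call pattern shown in R/marginal_effects.R below); the factor column names follow the `factor(cyl)6`-style naming added to `get_pdiff()` in this commit.

library(margins)

# Numeric terms get unit-specific numerical derivatives; factor terms get
# discrete changes in the predicted outcome relative to the baseline level.
m  <- lm(mpg ~ wt + factor(cyl), data = mtcars)
me <- marginal_effects(m, data = mtcars)
head(me)      # expect columns: wt, factor(cyl)6, factor(cyl)8
colMeans(me)  # average marginal effects / average discrete changes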
5 changes: 3 additions & 2 deletions R/factories.R
@@ -16,13 +16,14 @@
return(compiler::cmpfun(FUN))
}

.build_grad_fun <- function(data, model, which_me, type = "response", method = c("Richardson", "simple", "complex")) {
.build_grad_fun <- function(data, model, type = "response", method = c("simple", "Richardson", "complex")) {
method <- match.arg(method)
# factory function to return prediction holding data constant but varying coefficients
FUN <- function(coefs) {
for (i in seq_along(coefs)) {
model[["coefficients"]][names(coefs)[i]] <- coefs[i]
}
colMeans(marginal_effects(model = model, data = data, type = type, method = method)[, which_me, drop = FALSE], na.rm = TRUE)
colMeans(marginal_effects(model = model, data = data, type = type, method = method), na.rm = TRUE)
}
return(compiler::cmpfun(FUN))
}
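
For orientation, a hedged sketch of the factory pattern used by the builders in this file: hold the model and data fixed in a closure, treat the coefficient vector as the argument, and byte-compile the result so numDeriv can differentiate it cheaply. `mean_prediction_fun()` below is a hypothetical stand-in for illustration, not one of the package's internal functions.

mean_prediction_fun <- function(model, data, type = "response") {
  FUN <- function(coefs) {
    # overwrite the stored coefficients (same order/names as coef(model))
    model[["coefficients"]][] <- coefs
    mean(predict(model, newdata = data, type = type), na.rm = TRUE)
  }
  compiler::cmpfun(FUN)
}

m <- lm(mpg ~ wt + hp, data = mtcars)
f <- mean_prediction_fun(m, mtcars)
numDeriv::grad(f, coef(m))  # for a linear model this equals colMeans(model.matrix(m))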
15 changes: 7 additions & 8 deletions R/get_effect_variances.R
@@ -1,7 +1,7 @@
get_effect_variances <-
function(data = data,
model = model,
allvars = all.vars(model[["terms"]])[-1],
which = all.vars(model[["terms"]])[-1], # which mes do we need variances of
type = c("response", "link", "terms"),
vce = c("delta", "simulation", "bootstrap"),
iterations = 50L, # if vce == "bootstrap" or "simulation"
@@ -17,7 +17,7 @@ function(data = data,
# function to calculate AME for one bootstrap subsample
bootfun <- function() {
s <- sample(seq_len(nrow(data)), nrow(data), TRUE)
colMeans(marginal_effects(model = model, data = data[s,], type = type, method = method)[, allvars, drop = FALSE], na.rm = TRUE)
colMeans(marginal_effects(model = model, data = data[s,], type = type, method = method), na.rm = TRUE)
}
# bootstrap the data and take the variance of bootstrapped AMEs
variances <- apply(replicate(iterations, bootfun()), 1, var, na.rm = TRUE)
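
A simplified re-implementation of the bootstrap branch above, assuming the exported `marginal_effects()` interface; like `bootfun()` it resamples rows and keeps the fitted coefficients. This is an illustrative sketch, not package code.

boot_ame_variance <- function(model, data, iterations = 50L) {
  reps <- replicate(iterations, {
    s <- sample(seq_len(nrow(data)), nrow(data), replace = TRUE)
    colMeans(marginal_effects(model, data = data[s, , drop = FALSE]), na.rm = TRUE)
  }, simplify = FALSE)
  # variance of each average marginal effect across bootstrap replicates
  apply(do.call("rbind", reps), 2, var, na.rm = TRUE)
}

m <- lm(mpg ~ wt + factor(cyl), data = mtcars)
boot_ame_variance(m, mtcars)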
@@ -34,13 +34,12 @@ function(data = data,
# http://www.soderbom.net/lecture10notes.pdf
# http://stats.stackexchange.com/questions/122066/how-to-use-delta-method-for-standard-errors-of-marginal-effects

# TO GET UNIT-SPECIFIC VARIANCES, NEED TO TAKE DERIVATIVE OF `.build_grad_fun()` FOR EVERY ROW SEPARATELY
# TODO: TO GET UNIT-SPECIFIC VARIANCES, NEED TO TAKE DERIVATIVE OF `.build_grad_fun()` FOR EVERY ROW SEPARATELY

gradmat <- do.call("rbind", lapply(allvars, function(thisme) {
FUN <- .build_grad_fun(data = data, model = model, which_me = thisme, type = type, method = method)
numDeriv::grad(FUN, model[["coefficients"]])
}))
FUN <- .build_grad_fun(data = data, model = model, type = type, method = method)
gradmat <- numDeriv::jacobian(FUN, model[["coefficients"]])
variances <- diag(gradmat %*% vcov(model) %*% t(gradmat))

} else if (vce == "simulation") {

# copy model for quick use in estimation
@@ -53,7 +52,7 @@ function(data = data,
# estimate AME from from each simulated coefficient vector
effectmat <- apply(coefmat, 1, function(coefrow) {
tmpmodel[["coefficients"]] <- coefrow
colMeans(marginal_effects(data, model = tmpmodel, type = type, method = method)[, allvars, drop = FALSE])
colMeans(marginal_effects(data, model = tmpmodel, type = type, method = method))
})
# calculate the variance of the simulated AMEs
variances <- apply(effectmat, 1, var, na.rm = TRUE)
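
To make the jacobian-based delta method above concrete, here is a self-contained sketch for a logit model. `ame_fun()` is a hypothetical stand-in for the function produced by `.build_grad_fun()`: it maps the coefficient vector to the average marginal effects, whose Jacobian is then sandwiched around `vcov(model)`.

m <- glm(vs ~ mpg + wt, data = mtcars, family = binomial)
X <- model.matrix(m)

ame_fun <- function(beta) {
  p <- plogis(drop(X %*% beta))
  # logit marginal effect of x_j for observation i: p_i * (1 - p_i) * beta_j
  mean(p * (1 - p)) * beta[-1]  # average over observations; drop the intercept
}

ame <- ame_fun(coef(m))
J   <- numDeriv::jacobian(ame_fun, coef(m))  # one row per AME, one column per coefficient
V   <- J %*% vcov(m) %*% t(J)                # delta-method covariance of the AMEs
cbind(AME = ame, SE = sqrt(diag(V)))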
49 changes: 24 additions & 25 deletions R/marginal_effects.R
@@ -1,12 +1,10 @@
#' @title Differentiate a Model Object
#' @description Extract marginal effects via numerical differentiation from a model object, conditional on data
#' @description Extract marginal effects (via numerical differentiation) and discrete differences in predictions for factor variables from a model object, conditional on data
#' @param data A data.frame over which to calculate marginal effects.
#' @param model A model object, perhaps returned by \code{\link[stats]{lm}} or \code{\link[stats]{glm}}
#' @param type A character string indicating the type of marginal effects to estimate. Mostly relevant for non-linear models, where the reasonable options are \dQuote{response} (the default) or \dQuote{link} (i.e., on the scale of the linear predictor in a GLM).
#' @param method A character string indicating the numeric derivative method to use when estimating marginal effects. \dQuote{simple} optimizes for speed; \dQuote{Richardson} optimizes for accuracy. See \code{\link[numDeriv]{grad}} for details.
#' @details This function uses numeric differentiation (\code{\link[numDeriv]{grad}}) to extract marginal effects from an estimated model with respect to all variables specified in \code{data} and returns a data.frame containing the unit-specific marginal effects with respect to each variable included (or not included) in the model. (Note that this is not each \emph{coefficient}.)
#'
#' Note that numerical differentiation is only used for numeric variables. For factor variables (or character variables, which are implicitly coerced to factors by modelling functions), disrete differences in outcomes are reported instead. If you want to use numerical differentiation for these variables (which you probably do not want), enter them into the original modelling function as numeric values rather than factors.
#' @details This function uses numeric differentiation (\code{\link[numDeriv]{grad}}) to extract marginal effects from an estimated model with respect to all numeric variables specified in \code{data} and returns a data.frame containing the unit-specific marginal effects with respect to each variable included (or not included) in the model. (Note that this is not each \emph{coefficient}.) For factor variables (or character variables, which are implicitly coerced to factors by modelling functions), discrete differences in predicted outcomes are reported instead (i.e., change in predicted outcome when factor is set to a given level minus the predicted outcome when the factor is set to its baseline level). If you want to use numerical differentiation for factor variables (which you probably do not want to do), enter them into the original modelling function as numeric values rather than factors.
#'
#' Variable class coercion (other than \code{factor(x)}) inside a formula passed to, for example, \code{\link[stats]{lm}} may cause weird behavior, or errors.
#'
@@ -38,41 +36,38 @@ marginal_effects <- function(model, data, type = c("response", "link"), method =
method <- match.arg(method)

# identify factors versus numeric terms in `model`
classes <- attributes(terms(model))[["dataClasses"]]
classes <- attributes(terms(model))[["dataClasses"]][-1]
classes[classes == "character"] <- "factor"
nnames <- clean_terms(names(classes)[classes != "factor"])
fnames <- clean_terms(names(classes)[classes == "factor"])
fnames2 <- names(classes)[classes == "factor"] # for checking stupid variable naming behavior by R

# setup obs-x-term data.frame of obs-specific marginal effects
out <- structure(matrix(NA_real_, nrow = nrow(data), ncol = length(nnames)),
rownames = seq_len(nrow(data)))

out <- lapply(seq_len(nrow(data)), function(datarow) {
# estimate numerical derivatives with respect to each variable (for numeric terms in the model)
for (datarow in seq_len(nrow(data))) {
# setup function through predict_factory
FUN <- .build_predict_fun(data = data[datarow, , drop = FALSE], model = model, type = type)

# initialize output object
vec <- setNames(numeric(length(nnames)), nnames)

# extract gradient at input value
vec[nnames] <- numDeriv::grad(FUN, unlist(data[datarow, nnames, drop = FALSE]), method = method)

return(vec)
})

# setup obs-x-term data.frame of obs-specific marginal effects
out <- do.call("rbind.data.frame", out)
out[] <- lapply(out, `class<-`, "marginaleffect")
out <- structure(out, names = nnames)

# add discrete differences for factor variables
out[datarow, ] <- numDeriv::grad(FUN, unlist(data[datarow, nnames, drop = FALSE]), method = method)
}
out <- setNames(as.data.frame(out, optional = TRUE), nnames)
# add discrete differences for factor terms
## exact number depends on number of factor levels
if (any(classes == "factor")) {
for (i in seq_along(fnames)) {
out <- cbind(out, get_pdiff(data = data, model = model, fnames[i], type = type))
out <- cbind(out, get_pdiff(data = data, model = model, fnames[i], type = type, fwrap = (fnames != fnames2)[i]))
}
}

for (i in seq_along(out)) {
class(out[[i]]) <- c("marginaleffect", "numeric")
}
return(out)
}

get_pdiff <- function(data, model, variable, type = c("response", "link")) {
get_pdiff <- function(data, model, variable, type = c("response", "link"), fwrap = FALSE) {
# @title Discrete change in fitted values
# @description This is an internal function used to calculate discrete change in y-hat between factor levels and base factor level. This is used by \code{marginal_effects} for factor variables.
# @param data The dataset on which to to calculate `predict(model)` (and the slope thereof)
@@ -88,7 +83,11 @@ get_pdiff <- function(data, model, variable, type = c("response", "link")) {
levs <- levs[-1]

# setup response object
outcolnames <- paste0(variable, ":", levs)
if (isTRUE(fwrap)) {
outcolnames <- paste0("factor(", variable, ")", levs)
} else {
outcolnames <- paste0(variable, levs)
}
out <- structure(rep(list(list()), length(levs)),
class = "data.frame",
names = outcolnames,
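
The discrete-change logic that `get_pdiff()` implements (described in the roxygen details above) can be illustrated by hand: predict with the factor forced to each non-baseline level and subtract the prediction at the baseline level. This is an illustrative re-derivation, not the package code; the column naming mirrors the `fwrap` behaviour added above.

m <- lm(mpg ~ wt + factor(cyl), data = mtcars)

p_base  <- predict(m, newdata = transform(mtcars, cyl = 4))    # baseline level "4"
changes <- sapply(c(6, 8), function(lev) {
  predict(m, newdata = transform(mtcars, cyl = lev)) - p_base  # unit-specific discrete change
})
colnames(changes) <- paste0("factor(cyl)", c(6, 8))            # matches get_pdiff() naming
head(changes)     # constant here (additive lm); varies by observation in a GLM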
4 changes: 2 additions & 2 deletions tests/testthat/tests-core.R
@@ -21,7 +21,7 @@ test_that("`at` behavior works", {
test_that("factor variables work", {
x1 <- lm(mpg ~ factor(cyl), data = head(mtcars))
expect_true(inherits(marginal_effects(x1), "data.frame"), label = "factors work in formula")
x2 <- lm(Sepal.Length ~ Species, data = head(iris))
x2 <- lm(Sepal.Length ~ Species, data = iris)
expect_true(inherits(marginal_effects(x2), "data.frame"), label = "natural factors work")
})

@@ -59,7 +59,7 @@ test_that("persp() method for 'lm' works", {
x <- lm(mpg ~ wt * hp, data = mtcars)
expect_true(is.list(persp(x)))
expect_true(is.list(persp(x, theta = c(30, 60))))
expect_true(is.list(persp(x, theta = c(30, 60), phi = c(0, 10)))
expect_true(is.list(persp(x, theta = c(30, 60), phi = c(0, 10))))
expect_true(is.list(persp(x, what = "effect")))
})

