Commit

finish factors (closes #6)
leeper committed Aug 5, 2016
1 parent e3dfe3b commit e856795
Showing 5 changed files with 38 additions and 38 deletions.
3 changes: 2 additions & 1 deletion NEWS
@@ -5,7 +5,8 @@
* Added `cplot()` generic and methods for "lm" and "glm" class objects to display conditional predictions and conditional marginal effects in the style of the **interplot** and **plotMElm** packages.
* Added various variance estimation procedures for marginal effects: delta method (the default), bootstrap, and simulation (ala **Clarify**).
* Fixed estimation of marginal effect variances for generalized linear models, so that they are correct on both "link" and "response" scales.
* Exposed two internal marginal effect estimation functions. First, `build_margins()` is called by `margins()` methods (perhaps repeatedly) and actually assembles a "margins" object from a model and data. It is never necessary to call this directly, but may be useful for very simple marginal effect estimation procedures (i.e., using original data with no `at` specification). Second, `marginal_effects()` is the very low level function that differentiates a model with respect to some input data. This is the fastest way to obtain marginal effects without the overhead of creating a "margins" object (for which variance estimation is very time-consuming).
* Exposed two internal marginal effect estimation functions. First, `build_margins()` is called by `margins()` methods (perhaps repeatedly) and actually assembles a "margins" object from a model and data. It is never necessary to call this directly, but may be useful for very simple marginal effect estimation procedures (i.e., using original data with no `at` specification). Second, `marginal_effects()` is the very low-level function that differentiates a model with respect to some input data (or calculates discrete changes in the outcome with respect to factor variables). This is the fastest way to obtain marginal effects without the overhead of creating a "margins" object (for which variance estimation is fairly time-consuming).
* Implemented estimation of "discrete change" representations of marginal effects of factor variables in models, ala Stata's default settings.
* Re-implemented marginal effects estimation using numeric derivatives provided by `numDeriv::grad()` rather than symbolic differentiation. This allows `margins()` to handle almost any model that can be specified in R, including models that cannot be specified in Stata.
* Used **compiler** to byte compile prediction and gradient functions, thereby improving estimation speed.
* The internal `build_datalist()` now checks for specification of illegal factor levels in `at` and errors when these are encountered.
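
To make the `marginal_effects()` behaviour described in the bullets above concrete, here is a minimal usage sketch. It assumes the in-development API at this commit (the `model`/`data` call pattern shown in R/marginal_effects.R below); the factor column names follow the `factor(cyl)6`-style naming added to `get_pdiff()` in this commit.

library(margins)

# Numeric terms get unit-specific numerical derivatives; factor terms get
# discrete changes in the predicted outcome relative to the baseline level.
m  <- lm(mpg ~ wt + factor(cyl), data = mtcars)
me <- marginal_effects(m, data = mtcars)
head(me)      # expect columns: wt, factor(cyl)6, factor(cyl)8
colMeans(me)  # average marginal effects / average discrete changes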
5 changes: 3 additions & 2 deletions R/factories.R
@@ -16,13 +16,14 @@
return(compiler::cmpfun(FUN))
}

.build_grad_fun <- function(data, model, which_me, type = "response", method = c("Richardson", "simple", "complex")) {
.build_grad_fun <- function(data, model, type = "response", method = c("simple", "Richardson", "complex")) {
method <- match.arg(method)
# factory function to return prediction holding data constant but varying coefficients
FUN <- function(coefs) {
for (i in seq_along(coefs)) {
model[["coefficients"]][names(coefs)[i]] <- coefs[i]
}
colMeans(marginal_effects(model = model, data = data, type = type, method = method)[, which_me, drop = FALSE], na.rm = TRUE)
colMeans(marginal_effects(model = model, data = data, type = type, method = method), na.rm = TRUE)
}
return(compiler::cmpfun(FUN))
}
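
For orientation, a hedged sketch of the factory pattern used by the builders in this file: hold the model and data fixed in a closure, treat the coefficient vector as the argument, and byte-compile the result so numDeriv can differentiate it cheaply. `mean_prediction_fun()` below is a hypothetical stand-in for illustration, not one of the package's internal functions.

mean_prediction_fun <- function(model, data, type = "response") {
  FUN <- function(coefs) {
    # overwrite the stored coefficients (same order/names as coef(model))
    model[["coefficients"]][] <- coefs
    mean(predict(model, newdata = data, type = type), na.rm = TRUE)
  }
  compiler::cmpfun(FUN)
}

m <- lm(mpg ~ wt + hp, data = mtcars)
f <- mean_prediction_fun(m, mtcars)
numDeriv::grad(f, coef(m))  # for a linear model this equals colMeans(model.matrix(m))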
15 changes: 7 additions & 8 deletions R/get_effect_variances.R
@@ -1,7 +1,7 @@
get_effect_variances <-
function(data = data,
model = model,
allvars = all.vars(model[["terms"]])[-1],
which = all.vars(model[["terms"]])[-1], # which mes do we need variances of
type = c("response", "link", "terms"),
vce = c("delta", "simulation", "bootstrap"),
iterations = 50L, # if vce == "bootstrap" or "simulation"
@@ -17,7 +17,7 @@ function(data = data,
# function to calculate AME for one bootstrap subsample
bootfun <- function() {
s <- sample(seq_len(nrow(data)), nrow(data), TRUE)
colMeans(marginal_effects(model = model, data = data[s,], type = type, method = method)[, allvars, drop = FALSE], na.rm = TRUE)
colMeans(marginal_effects(model = model, data = data[s,], type = type, method = method), na.rm = TRUE)
}
# bootstrap the data and take the variance of bootstrapped AMEs
variances <- apply(replicate(iterations, bootfun()), 1, var, na.rm = TRUE)
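
A simplified re-implementation of the bootstrap branch above, assuming the exported `marginal_effects()` interface; like `bootfun()` it resamples rows and keeps the fitted coefficients. This is an illustrative sketch, not package code.

boot_ame_variance <- function(model, data, iterations = 50L) {
  reps <- replicate(iterations, {
    s <- sample(seq_len(nrow(data)), nrow(data), replace = TRUE)
    colMeans(marginal_effects(model, data = data[s, , drop = FALSE]), na.rm = TRUE)
  }, simplify = FALSE)
  # variance of each average marginal effect across bootstrap replicates
  apply(do.call("rbind", reps), 2, var, na.rm = TRUE)
}

m <- lm(mpg ~ wt + factor(cyl), data = mtcars)
boot_ame_variance(m, mtcars)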
@@ -34,13 +34,12 @@ function(data = data,
# http://www.soderbom.net/lecture10notes.pdf
# http://stats.stackexchange.com/questions/122066/how-to-use-delta-method-for-standard-errors-of-marginal-effects

# TO GET UNIT-SPECIFIC VARIANCES, NEED TO TAKE DERIVATIVE OF `.build_grad_fun()` FOR EVERY ROW SEPARATELY
# TODO: TO GET UNIT-SPECIFIC VARIANCES, NEED TO TAKE DERIVATIVE OF `.build_grad_fun()` FOR EVERY ROW SEPARATELY

gradmat <- do.call("rbind", lapply(allvars, function(thisme) {
FUN <- .build_grad_fun(data = data, model = model, which_me = thisme, type = type, method = method)
numDeriv::grad(FUN, model[["coefficients"]])
}))
FUN <- .build_grad_fun(data = data, model = model, type = type, method = method)
gradmat <- numDeriv::jacobian(FUN, model[["coefficients"]])
variances <- diag(gradmat %*% vcov(model) %*% t(gradmat))

} else if (vce == "simulation") {

# copy model for quick use in estimation
@@ -53,7 +52,7 @@ function(data = data,
# estimate AME from from each simulated coefficient vector
effectmat <- apply(coefmat, 1, function(coefrow) {
tmpmodel[["coefficients"]] <- coefrow
colMeans(marginal_effects(data, model = tmpmodel, type = type, method = method)[, allvars, drop = FALSE])
colMeans(marginal_effects(data, model = tmpmodel, type = type, method = method))
})
# calculate the variance of the simulated AMEs
variances <- apply(effectmat, 1, var, na.rm = TRUE)
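
To make the jacobian-based delta method above concrete, here is a self-contained sketch for a logit model. `ame_fun()` is a hypothetical stand-in for the function produced by `.build_grad_fun()`: it maps the coefficient vector to the average marginal effects, whose Jacobian is then sandwiched around `vcov(model)`.

m <- glm(vs ~ mpg + wt, data = mtcars, family = binomial)
X <- model.matrix(m)

ame_fun <- function(beta) {
  p <- plogis(drop(X %*% beta))
  # logit marginal effect of x_j for observation i: p_i * (1 - p_i) * beta_j
  mean(p * (1 - p)) * beta[-1]  # average over observations; drop the intercept
}

ame <- ame_fun(coef(m))
J   <- numDeriv::jacobian(ame_fun, coef(m))  # one row per AME, one column per coefficient
V   <- J %*% vcov(m) %*% t(J)                # delta-method covariance of the AMEs
cbind(AME = ame, SE = sqrt(diag(V)))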
49 changes: 24 additions & 25 deletions R/marginal_effects.R
@@ -1,12 +1,10 @@
#' @title Differentiate a Model Object
#' @description Extract marginal effects via numerical differentiation from a model object, conditional on data
#' @description Extract marginal effects (via numerical differentiation) and discrete differences in predictions for factor variables from a model object, conditional on data
#' @param data A data.frame over which to calculate marginal effects.
#' @param model A model object, perhaps returned by \code{\link[stats]{lm}} or \code{\link[stats]{glm}}
#' @param type A character string indicating the type of marginal effects to estimate. Mostly relevant for non-linear models, where the reasonable options are \dQuote{response} (the default) or \dQuote{link} (i.e., on the scale of the linear predictor in a GLM).
#' @param method A character string indicating the numeric derivative method to use when estimating marginal effects. \dQuote{simple} optimizes for speed; \dQuote{Richardson} optimizes for accuracy. See \code{\link[numDeriv]{grad}} for details.
#' @details This function uses numeric differentiation (\code{\link[numDeriv]{grad}}) to extract marginal effects from an estimated model with respect to all variables specified in \code{data} and returns a data.frame containing the unit-specific marginal effects with respect to each variable included (or not included) in the model. (Note that this is not each \emph{coefficient}.)
#'
#' Note that numerical differentiation is only used for numeric variables. For factor variables (or character variables, which are implicitly coerced to factors by modelling functions), disrete differences in outcomes are reported instead. If you want to use numerical differentiation for these variables (which you probably do not want), enter them into the original modelling function as numeric values rather than factors.
#' @details This function uses numeric differentiation (\code{\link[numDeriv]{grad}}) to extract marginal effects from an estimated model with respect to all numeric variables specified in \code{data} and returns a data.frame containing the unit-specific marginal effects with respect to each variable included (or not included) in the model. (Note that this is not each \emph{coefficient}.) For factor variables (or character variables, which are implicitly coerced to factors by modelling functions), discrete differences in predicted outcomes are reported instead (i.e., change in predicted outcome when factor is set to a given level minus the predicted outcome when the factor is set to its baseline level). If you want to use numerical differentiation for factor variables (which you probably do not want to do), enter them into the original modelling function as numeric values rather than factors.
#'
#' Variable class coercion (other than \code{factor(x)}) inside a formula passed to, for example, \code{\link[stats]{lm}} may cause weird behavior, or errors.
#'
@@ -38,41 +36,38 @@ marginal_effects <- function(model, data, type = c("response", "link"), method =
method <- match.arg(method)

# identify factors versus numeric terms in `model`
classes <- attributes(terms(model))[["dataClasses"]]
classes <- attributes(terms(model))[["dataClasses"]][-1]
classes[classes == "character"] <- "factor"
nnames <- clean_terms(names(classes)[classes != "factor"])
fnames <- clean_terms(names(classes)[classes == "factor"])
fnames2 <- names(classes)[classes == "factor"] # for checking stupid variable naming behavior by R

# setup obs-x-term data.frame of obs-specific marginal effects
out <- structure(matrix(NA_real_, nrow = nrow(data), ncol = length(nnames)),
rownames = seq_len(nrow(data)))

out <- lapply(seq_len(nrow(data)), function(datarow) {
# estimate numerical derivatives with respect to each variable (for numeric terms in the model)
for (datarow in seq_len(nrow(data))) {
# setup function through predict_factory
FUN <- .build_predict_fun(data = data[datarow, , drop = FALSE], model = model, type = type)

# initialize output object
vec <- setNames(numeric(length(nnames)), nnames)

# extract gradient at input value
vec[nnames] <- numDeriv::grad(FUN, unlist(data[datarow, nnames, drop = FALSE]), method = method)

return(vec)
})

# setup obs-x-term data.frame of obs-specific marginal effects
out <- do.call("rbind.data.frame", out)
out[] <- lapply(out, `class<-`, "marginaleffect")
out <- structure(out, names = nnames)

# add discrete differences for factor variables
out[datarow, ] <- numDeriv::grad(FUN, unlist(data[datarow, nnames, drop = FALSE]), method = method)
}
out <- setNames(as.data.frame(out, optional = TRUE), nnames)
# add discrete differences for factor terms
## exact number depends on number of factor levels
if (any(classes == "factor")) {
for (i in seq_along(fnames)) {
out <- cbind(out, get_pdiff(data = data, model = model, fnames[i], type = type))
out <- cbind(out, get_pdiff(data = data, model = model, fnames[i], type = type, fwrap = (fnames != fnames2)[i]))
}
}

for (i in seq_along(out)) {
class(out[[i]]) <- c("marginaleffect", "numeric")
}
return(out)
}

get_pdiff <- function(data, model, variable, type = c("response", "link")) {
get_pdiff <- function(data, model, variable, type = c("response", "link"), fwrap = FALSE) {
# @title Discrete change in fitted values
# @description This is an internal function used to calculate discrete change in y-hat between factor levels and base factor level. This is used by \code{marginal_effects} for factor variables.
# @param data The dataset on which to to calculate `predict(model)` (and the slope thereof)
@@ -88,7 +83,11 @@ get_pdiff <- function(data, model, variable, type = c("response", "link")) {
levs <- levs[-1]

# setup response object
outcolnames <- paste0(variable, ":", levs)
if (isTRUE(fwrap)) {
outcolnames <- paste0("factor(", variable, ")", levs)
} else {
outcolnames <- paste0(variable, levs)
}
out <- structure(rep(list(list()), length(levs)),
class = "data.frame",
names = outcolnames,
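
The discrete-change logic that `get_pdiff()` implements (described in the roxygen details above) can be illustrated by hand: predict with the factor forced to each non-baseline level and subtract the prediction at the baseline level. This is an illustrative re-derivation, not the package code; the column naming mirrors the `fwrap` behaviour added above.

m <- lm(mpg ~ wt + factor(cyl), data = mtcars)

p_base  <- predict(m, newdata = transform(mtcars, cyl = 4))    # baseline level "4"
changes <- sapply(c(6, 8), function(lev) {
  predict(m, newdata = transform(mtcars, cyl = lev)) - p_base  # unit-specific discrete change
})
colnames(changes) <- paste0("factor(cyl)", c(6, 8))            # matches get_pdiff() naming
head(changes)     # constant here (additive lm); varies by observation in a GLM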
4 changes: 2 additions & 2 deletions tests/testthat/tests-core.R
@@ -21,7 +21,7 @@ test_that("`at` behavior works", {
test_that("factor variables work", {
x1 <- lm(mpg ~ factor(cyl), data = head(mtcars))
expect_true(inherits(marginal_effects(x1), "data.frame"), label = "factors work in formula")
x2 <- lm(Sepal.Length ~ Species, data = head(iris))
x2 <- lm(Sepal.Length ~ Species, data = iris)
expect_true(inherits(marginal_effects(x2), "data.frame"), label = "natural factors work")
})

@@ -59,7 +59,7 @@ test_that("persp() method for 'lm' works", {
x <- lm(mpg ~ wt * hp, data = mtcars)
expect_true(is.list(persp(x)))
expect_true(is.list(persp(x, theta = c(30, 60))))
expect_true(is.list(persp(x, theta = c(30, 60), phi = c(0, 10)))
expect_true(is.list(persp(x, theta = c(30, 60), phi = c(0, 10))))
expect_true(is.list(persp(x, what = "effect")))
})

