Skip to content

Commit

Permalink
add draft function generating simulated data
Browse files Browse the repository at this point in the history
  • Loading branch information
wenjie2wang committed Oct 8, 2017
1 parent 17d6e30 commit f4237d1
Show file tree
Hide file tree
Showing 7 changed files with 351 additions and 17 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: intsurv
Title: Integrative Survival Modeling
Version: 0.1.0.9000
Date: 2017-09-09
Date: 2017-10-08
Authors@R: c(
person(given = "Wenjie", family = "Wang",
email = "wenjie.2.wang@uconn.edu",
Expand All @@ -19,14 +19,15 @@ Depends:
R (>= 3.2.3)
License: GPL (>= 3)
LazyData: true
Collate:
Collate:
'class.R'
'Survi.R'
'bootSe.R'
'coef.R'
'iCoxph.R'
'intsurv.R'
'show.R'
'simuData.R'
'summary.R'
URL: https://github.com/wenjie2wang/intsurv
BugReports: https://github.com/wenjie2wang/intsurv/issues
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
export(Survi)
export(bootSe)
export(iCoxph)
export(simuWeibull)
exportClasses(Survi)
exportClasses(iCoxph)
exportClasses(iCoxph.summary)
Expand Down Expand Up @@ -30,6 +31,8 @@ importFrom(stats,pnorm)
importFrom(stats,printCoefmat)
importFrom(stats,qnorm)
importFrom(stats,quantile)
importFrom(stats,rnorm)
importFrom(stats,runif)
importFrom(stats,sd)
importFrom(survival,Surv)
importFrom(survival,coxph)
2 changes: 1 addition & 1 deletion R/bootSe.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ bootSe <- function(object, numBoot = 50, se = c("mad", "inter-quantile", "sd"),


### internal functions =========================================================
bootSe_start <- function(beta0, censorRate, multiStart = FALSE, ...,
bootSe_start <- function(beta0, censorRate, multiStart = TRUE, ...,
start0)
{
censorRate0 <- start0$censorRate0
Expand Down
15 changes: 7 additions & 8 deletions R/iCoxph.R
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ NULL
##' @usage
##' iCoxph(formula, data, subset, na.action, contrasts = NULL,
##' start = list(), control = list(), ...)
##'
##' @aliases iCoxph
##' @param formula \code{Survi} object specifying the covariates and response
##' variable in the model, such as \code{Survi(ID, time, event) ~ x1 + x2}.
##' @param data An optional data frame, list, or environment that contains the
Expand Down Expand Up @@ -180,19 +180,18 @@ NULL
##' Algorithm: A General Framework. \emph{Biometrika}, 80(2), 267--278.
##'
##' @examples
##'
##' ## FIXME: add function generating simulation data
##' ## and example simulated dataset for demonstration
##' ## library(intsurv)
##' ## iCoxph(Survi())
##'
##' library(intsurv)
##' set.seed(1216)
##' simuDat <- simuWeibull(nSubject = 100)
##' iCoxph(Survi(ID, obsTime, eventInd) ~ x1 + x2 + x3 + x4, simuDat,
##' control = list(noSE = TRUE))
##' @seealso
##' \code{\link{summary,iCoxph-method}} for summary of fitted model;
##' \code{\link{coef,iCoxph-method}} for estimated covariate coefficients;
##' \code{\link{bootSe}} for SE estimates from bootstrap method.
##'
##' @importFrom stats na.fail na.omit na.exclude na.pass .getXlevels
##' model.extract model.frame model.matrix nlm pnorm
##' model.extract model.frame model.matrix nlm pnorm
##' @importFrom survival coxph Surv
##' @export
iCoxph <- function(formula, data, subset, na.action, contrasts = NULL,
Expand Down
240 changes: 240 additions & 0 deletions R/simuData.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
################################################################################
##
## R package intsurv by Wenjie Wang, Kun Chen and Jun Yan
## Copyright (C) 2017
##
## This file is part of the R package intsurv.
##
## The R package intsurv is free software: You can redistribute it and/or
## modify it under the terms of the GNU General Public License as published
## by the Free Software Foundation, either version 3 of the License, or
## any later version (at your option). See the GNU General Public License
## at <http://www.gnu.org/licenses/> for details.
##
## The R package intsurv is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
##
################################################################################


##' Generate Simulated Data from Cox Model
##'
##' This function generates simulated survival data with uncertain records.
##' The baseline hazard function is parametric Weibull model
##' and the censoring times follow uniform distribution.
##'
##' @param nSubject Number of subjects
##' @param beta0Vec Time-invariant covariate coefficients.
##' @param xMat Design matrix. By default, three continuous variables following
##' standard normal distribution and one binary variable following bernoulli
##' distribution with equal probability are used.
##' @param maxNum Maximum number of uncertain records.
##' @param nRecordProb Probability of the number of uncertian records.
##' @param matchCensor The matching rate for subjects actually having censoring
##' times.
##' @param matchEvent The matching rate for subjects actually having event
##' times.
##' @param censorMin The lower boundary of the uniform distribution for
##' generating censoring time.
##' @param censorMax The upper boundary of the uniform distribution for
##' generating censoring time.
##' @param lambda A positive number, scale parameter in baseline rate function
##' for true event times.
##' @param rho A positive number, shape parameter in baseline rate function for
##' true event times.
##' @param fakeLambda1 A positive number, scale paramenter in baseline rate
##' function for fake event times from one distribution.
##' @param fakeRho1 A positive number, shape parameter in baseline rate function
##' for fake event times from one distribution.
##' @param fakeLambda2 A positive number, scale paramenter in baseline rate
##' function for fake event times from another distribution.
##' @param fakeRho2 A positive number, shape parameter in baseline rate function
##' for fake event times from another distribution.
##' @param mixture The mixture weights, i.e., the probabilies (summing up to
##' one) of fake event times coming from different mixture components.
##' @param randomMiss A boolean value specifying whether the labels of the true
##' records are missing completely at random (MCAR) or missing not at random
##' (MNAR). The default value is \code{TRUE} for MCAR.
##' @param eventOnly A boolean value specifying whether the uncertain records
##' only include possible events. The default value is \code{FALSE}, which
##' considers the censoring cases as the possible truth in addition to event
##' records.
##' @param ... Other arguments for future usage.
##'
##' @return A data frame with the following columns, \itemize{ \item \code{ID}:
##' subject ID \item \code{obsTime}: observed event times \item
##' \code{eventInd}: event indicators \item \code{latentInd}: latent label
##' indicating the true recorods } and the corresponding covariates.
##'
##' @references
##'
##' Bender, R., Augustin, T. and Blettner, M. (2005),
##' Generating survival times to simulate Cox proportional hazards models.
##' Statistics in Medicine, 24: 1713--1723. doi:10.1002/sim.2059.
##'
##' Nadarajah, S., & Kotz, S. (2006). R programs for truncated
##' distributions. Journal of Statistical Software, 16(c02).
##'
##' @examples
##' dat <- simuWeibull(nSubject = 100)
##' @importFrom stats runif rnorm
##' @export
simuWeibull <- function(nSubject = 1e3, beta0Vec, xMat, maxNum = 2,
nRecordProb = c(0.9, 0.1),
matchCensor = 0.1, matchEvent = 0.1,
censorMin = 0.5, censorMax = 12.5,
lambda = 0.005, rho = 0.7,
fakeLambda1 = lambda * exp(- 3), fakeRho1 = rho,
fakeLambda2 = lambda * exp(3), fakeRho2 = rho,
mixture = 0.5, randomMiss = TRUE,
eventOnly = FALSE, ...)
{
## 1. Generate the true dataset with unique event time per subject and
## cencoring time. The censoring rate of the truth data set can be
## controled.

## 2. Take a sample of subject ID, where the sample size depends on the
## matching rate from original data to death database.

## 3. Generate fake records for selected the subjects.

## 3.1. If the subject has event time censored, i.e. the truth is the
## censoring record, generate given number of fake event times by different
## x's or distribution parameters and only keep fake events that happened
## before censoring time. (It is possible that not much fake events left,
## which means that the alternative distribution of fake events is probably
## not reasonable for the censoring time picked.)

## 3.2. If the subject has event time recorded, i.e. the truth is the event
## time, generate event times by different x's or distribution parameters.
## Take them and the censoring time picked as all the fake
## records. Similarly, only keep event times before the censoring time.

## non-sense just to surpress complaint from R check
ID <- NULL

## set design matrix and coefficients for event times
if (missing(xMat)) {
x1 <- round(rnorm(nSubject), 3)
x2 <- round(rnorm(nSubject), 3)
x3 <- round(rnorm(nSubject), 3)
x4 <- sample(c(0, 1), size = nSubject, replace = TRUE)
xMat <- cbind(x1, x2, x3, x4)
} else {
xMat <- as.matrix(xMat)
}
beta0Vec <- if (missing(beta0Vec))
rep(1, ncol(xMat))
else
as.numeric(beta0Vec)
## event times
eventTime <- denWeib(nSubject, beta0Vec, xMat, lambda, rho, gen = TRUE)

## censoring times
censorTime <- runif(n = nSubject, min = censorMin, max = censorMax)

eventInd <- censorTime > eventTime
obsTime <- ifelse(eventInd, eventTime, censorTime)
trueDat <- data.frame(ID = seq_len(nSubject), xMat, obsTime = obsTime,
eventTime = eventTime, censorTime = censorTime,
eventInd, latentInd = 1L)

## match records for subjects actually having events
numSubFake1 <- round(matchEvent * sum(trueDat$eventInd))
subIDfake1 <- if (randomMiss) {
with(subset(trueDat, eventInd == 1L),
sample(ID, size = numSubFake1))
} else {
with(subset(trueDat, eventInd == 1L),
sample(ID, size = numSubFake1,
prob = eventTime))
}

## match records for censoring cases
numSubFake0 <- round(matchCensor * sum(! trueDat$eventInd))
subIDfake0 <- if (randomMiss) {
with(subset(trueDat, eventInd == 0L),
sample(ID, size = numSubFake0))
} else {
with(subset(trueDat, eventInd == 0L),
sample(ID, size = numSubFake0,
prob = eventTime))
}

if (maxNum == 1) nRecordProb <- 1
nRecords <- sample(maxNum, size = numSubFake0 + numSubFake1,
replace = TRUE, prob = nRecordProb)
numFake <- sum(nRecords) - numSubFake1
multiInd <- c(rep(c(subIDfake0, subIDfake1), nRecords - 1), subIDfake0)
fakeDat <- trueDat[multiInd, ]
xMat <- xMat[multiInd, ]

## fake event time from alternative dist. # 1 and # 2
fakes1 <- denWeib(numFake, beta0Vec, xMat, lambda = fakeLambda1,
rho = fakeRho1, censorTime = fakeDat$censorTime,
gen = TRUE)
fakes2 <- denWeib(numFake, beta0Vec, xMat, lambda = fakeLambda2,
rho = fakeRho2, censorTime = fakeDat$censorTime,
gen = TRUE)

## fake censored data for all selected subject with true event times
fakeCensor <- subset(trueDat, ID %in% subIDfake1 & eventInd)
if (eventOnly || ! nrow(fakeCensor)) {
fakeCensor <- NULL
} else {
fakeCensor$obsTime <- fakeCensor$censorTime
fakeCensor$eventInd <- fakeCensor$latentInd <- 0L
}

## setup mixture of alternative density
indDist1 <- runif(numFake)
fakeDat$obsTime <- ifelse(indDist1 < mixture, fakes1, fakes2)
fakeDat$eventInd <- 1L
fakeDat$latentInd <- 0L

## remove all fake events after censorTime
## fakeDat <- subset(fakeDat, obsTime <= censorTime)
## by using truncated distribution, no need to remove

## combine all parts as output
outDat <- rbind(trueDat, fakeDat, fakeCensor)
outDat$eventInd <- as.integer(outDat$eventInd)
out <- outDat[order(outDat$ID), ]
row.names(out) <- out$eventTime <- out$censorTime <- NULL
out
}


### internal function ==========================================================
denWeib <- function (nEvent = 1e3, beta0Vec, xMat, lambda, rho,
censorTime = NULL, gen = FALSE, ...) {

if (missing(beta0Vec))
beta0Vec <- c(1, 1)
if (missing(xMat))
xMat <- rep(0, length(beta0Vec))

vVec <- runif(nEvent)
if (gen) {
expXbeta <- exp(xMat %*% beta0Vec)
if (is.null(censorTime)) {
eventTime <- (- log(1 - vVec) / (lambda * expXbeta)) ^ (1 / rho)
return(eventTime)
}
## generate event times before given censoring time
## using truncated distribution
## reference: Nadarajah and Kotz (2006). JSS.
Gb <- 1 - exp(- lambda * censorTime ^ rho * expXbeta)
eventTime <- (- log(1 - vVec * Gb) / (lambda * expXbeta)) ^ (1 / rho)
idx <- eventTime >= censorTime
return(eventTime)
}

## otherwise
newScale <- lambda * exp(xMat %*% beta0Vec)
eventTime <- (- log(vVec) / newScale) ^ (1 / rho)
tVec <- seq.int(0, max(eventTime), by = 0.01)
htSt <- newScale * rho * tVec ^ (rho - 1) * exp(- newScale * tVec ^ rho)
list(time = tVec, htSt = htSt, x = xMat, beta = beta0Vec)
}
11 changes: 5 additions & 6 deletions man/iCoxph.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit f4237d1

Please sign in to comment.